# Importing Libraries

In [1]:
import random #for splitting the data set into training and testing
from math import log #for calculating entropy of splits
from sklearn import tree #for comparison
from sklearn import metrics #performance metrics

# train_test_split
Row indices are drawn at random from the data set. The rows at the selected indices become the test data; all remaining rows become the training data.

In [3]:
def train_test_split(dataset,test_size= 0.2): #handles both integer and fractional(proportion) test sizes
    """Randomly partition ``dataset`` into a training list and a testing list.

    Parameters
    ----------
    dataset : sequence
        The rows to split. Rows are not copied; both results reference them.
    test_size : float or int, default 0.2
        A float strictly between 0 and 1 is the proportion of rows sent to
        the test list (rounded down). An int between 0 and ``len(dataset)``
        is the exact number of test rows.

    Returns
    -------
    (train, test) : tuple of lists
        Both lists keep the rows in their original relative order.

    Raises
    ------
    ValueError
        With the message 'invalid test_size' when ``test_size`` is neither a
        valid proportion nor a valid row count.
    """
    total = range(len(dataset))
    if (type(test_size) == float) and (0 < test_size < 1): # if fractional value is passed
        size = int(test_size*len(dataset))
    elif (type(test_size) == int) and (0 <= test_size <= len(dataset)): # if an integer is passed
        size = test_size
    else:
        raise ValueError('invalid test_size')
    # A set gives constant-time membership tests, so building the two lists
    # is linear in the number of rows instead of quadratic.
    test_index = set(random.sample(population=total,k=size))
    test = [dataset[value] for value in total if value in test_index]
    train = [dataset[value] for value in total if value not in test_index]
    return train,test

In [4]:
def find_unique (dataset,column):
    """Return the distinct values found in one column, in ascending order.

    ``dataset`` is an iterable of rows and ``column`` is the index read from
    every row. Duplicates are collapsed through a set and the survivors are
    returned as a sorted list, which callers use to derive split points.
    """
    return sorted({row[column] for row in dataset})


# distinguishing continuous and categorical features

In [5]:
def get_feature_type(dataset,threshold=10):
    """Label every feature column as 'categorical' or 'continuous'.

    A column is categorical when the value in the first row is a string, or
    when the column holds at most ``threshold`` distinct values; otherwise it
    is continuous. The last column (the class label) is skipped.

    The threshold should be chosen by inspecting the data beforehand.
    Returns a list with one entry per feature column, in column order.
    """
    kinds = []
    for column, first_value in enumerate(dataset[0][:-1]):
        distinct = len(find_unique(dataset, column))
        is_categorical = isinstance(first_value, str) or distinct <= threshold
        kinds.append('categorical' if is_categorical else 'continuous')
    return kinds

In [6]:
def find_centres (dataset):
    """Collect the candidate split points of every feature column.

    Reads the module-level ``feature_type`` list to decide how each column is
    treated:

    * continuous column  -> every distinct value except the largest one
      (splitting on the maximum would send every row to the left side);
    * categorical column -> every distinct value, but only when the column
      has more than one category.

    Returns a dict mapping column index -> set of candidate values. The last
    column (the class label) is ignored.
    """
    candidates = {}
    for column in range(len(dataset[0]) - 1):
        distinct = find_unique(dataset, column)  # sorted ascending
        if feature_type[column] == 'continuous':
            candidates[column] = set(distinct[:-1])
        elif len(distinct) > 1:
            candidates[column] = set(distinct)
    return candidates

# def find_centres (dataset): #Finding the split points for the entire data set
#     dict = {}
#     for i in range(len(dataset[0])-1): #ignore the labels
#         #centres = set() #order is not required and unique centres are needed
#         column_values = find_unique(dataset,i) #find unique feature values
#         if feature_type[i] == 'continuous': #this will be global to this function
#             # for j in range(1,len(column_values)): #This entire for loop is just to find mid points. I think this can be avoided--->TO DO
#             #     centres.add((column_values[j-1]+column_values[j])/2)
#             # dict[i] = centres
#             column_values.pop()
#             #centres = set(column_values)
#             #dict[i] = centres
#         #elif len(column_values)>1: #categorical variable must have more than one categories. #CHeck the edge case
#             # for column_value in column_values:
#             #     centres.add(column_value)
#         centres = set(column_values)
#         dict[i] = centres
#
#     return dict

In [7]:
def split_data(dataset,feature_index,centre):
    """Partition ``dataset`` into (left, right) on one feature and one split value.

    The module-level ``feature_type`` list selects the comparison:

    * continuous feature  -> rows with ``value <= centre`` go left;
    * any other feature   -> rows with ``value == centre`` go left.

    Every remaining row goes right. Rows keep their original order and are
    not copied.
    """
    left, right = [], []
    for row in dataset:
        if feature_type[feature_index] == 'continuous':
            goes_left = row[feature_index] <= centre
        else:
            goes_left = row[feature_index] == centre
        (left if goes_left else right).append(row)
    return left,right

In [8]:
def class_counts(dataset):
    """Count how many rows belong to each class.

    The class label is taken from the last element of every row. Returns a
    dict mapping label -> number of rows, with keys in first-seen order.
    """
    tally = {}
    for row in dataset:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally

In [9]:
def gini(dataset):
    """Return the Gini impurity of ``dataset``.

    Computed as 1 minus the sum of squared class proportions, where the class
    of a row is its last element. An empty dataset yields 1 because no class
    contributes a term.
    """
    size = len(dataset)
    impurity = 1
    for members in class_counts(dataset).values():
        impurity -= (members / size) ** 2
    return impurity

In [10]:
def entropy(dataset):
    """Return the entropy, in bits, of the class labels in ``dataset``.

    Sums ``-p * log2(p)`` over the proportion ``p`` of every class present,
    where the class of a row is its last element. An empty dataset yields 0.
    """
    size = len(dataset)
    bits = 0
    for members in class_counts(dataset).values():
        share = members / size
        bits += -share * log(share, 2)
    return bits

In [11]:
def entropy_split(left,right):
    """Return the size-weighted average entropy of the two sides of a split.

    Each side's entropy is weighted by its share of the combined row count.
    Raises ZeroDivisionError when both sides are empty.
    """
    n_left, n_right = len(left), len(right)
    total = n_left + n_right
    return (n_left / total) * entropy(left) + (n_right / total) * entropy(right)

In [12]:
def gini_split (left,right):
    """Return the size-weighted average Gini impurity of the two sides of a split.

    Each side's impurity is weighted by its share of the combined row count.
    Raises ZeroDivisionError when both sides are empty.
    """
    n_left, n_right = len(left), len(right)
    total = n_left + n_right
    return (n_left / total) * gini(left) + (n_right / total) * gini(right)

In [13]:
def find_best_split(dataset,split_type):
    """Search every candidate split and return the one with the lowest score.

    ``split_type`` equal to 'gini' scores splits with ``gini_split``; any
    other value scores them with ``entropy_split``.

    Returns the tuple ``(score, centre, feature_index, left, right)``. When no
    candidate improves on the initial score the sentinels are returned
    unchanged: score 9999999999, feature index '999999999999', and 'X' for
    the centre and both sides. ``build_tree`` tests for exactly these values.
    """
    score_split = gini_split if split_type == 'gini' else entropy_split
    best_score = 9999999999
    best_feature = '999999999999'
    best_centre = best_left = best_right = 'X'
    for feature, centres in find_centres(dataset).items():  # every feature
        for centre in centres:  # every candidate split value of that feature
            left, right = split_data(dataset, feature, centre)
            score = score_split(left, right)
            if score < best_score:  # strict: the first of equal candidates wins
                best_score = score
                best_feature = feature
                best_centre = centre
                best_left, best_right = left, right
    return best_score,best_centre,best_feature,best_left,best_right

In [14]:
class Decision_Tree: # class to store the state of the decision tree. Entire tree can be identified if the root is remembered
    """One node of a binary decision tree.

    A whole tree is reachable from its root node. Two nodes compare equal
    when their values, questions and both sub-trees compare equal.
    """

    def __init__(self,value,question = None, left=None,right= None):
        # value: what the node stores (build_tree stores a string for a leaf
        # prediction and a [feature_index, operator, split_value] list otherwise)
        self.value = value
        # question: human-readable text of the test made at this node, or None
        self.question = question
        # left / right: child nodes, or None when absent
        self.left = left
        self.right = right

    def __eq__(self, other):
        """Structural equality; non-tree operands (including None) are never equal."""
        if not isinstance(other, Decision_Tree):
            # Let Python fall back to its default comparison instead of
            # failing on missing attributes of an unrelated object.
            return NotImplemented
        return (self.value == other.value) and (self.question == other.question) and (self.left == other.left) and (self.right == other.right)

In [15]:
def build_tree(dataset, header=None, split_type='gini', min_samples_split=2, max_depth=None, counter=0,
               featureType=None):  # main function to build the decision tree
    """Recursively grow a binary decision tree and return its root node.

    Parameters
    ----------
    dataset : list of rows
        The class label is the last element of every row.
    header : list of str, optional
        Column names (label column included) used in the question text of
        each node; column indices are used when omitted.
    split_type : {'gini', 'entropy'}
        Criterion handed to ``find_best_split``.
    min_samples_split : int
        A node holding this many rows or fewer becomes a leaf.
    max_depth : int, optional
        A node at this depth becomes a leaf. ``None`` means no limit.
    counter : int
        Depth of the current call. External callers leave it at 0: argument
        validation and feature-type detection run only at depth 0.
    featureType : list of str, optional
        One 'continuous' / 'categorical' entry per feature column. Inferred
        with ``get_feature_type`` when omitted.

    Returns
    -------
    Decision_Tree
        A leaf stores the majority label converted to ``str``; an internal
        node stores ``[feature_index, operator, split_value]`` plus the
        question text and both sub-trees.

    Raises
    ------
    ValueError
        For an unsupported ``split_type``, a ``featureType`` that is not a
        list with one entry per feature column, or a ``header`` whose length
        differs from the row length.

    Side effect: rebinds the module-level ``feature_type`` list, which
    ``find_centres`` and ``split_data`` read.
    """
    global feature_type
    if counter == 0:
        if split_type != 'gini' and split_type != 'entropy':
            raise ValueError("Unsupported split type")
        if featureType is None:
            feature_type = get_feature_type(dataset)
        else:
            # Check the type first so a non-list argument is reported as a
            # ValueError rather than failing inside len().
            if isinstance(featureType, list) and (len(dataset[0])-1) == len(featureType):
                feature_type = featureType
            else:
                raise ValueError('Length mismatch between feature type and the data_set')
        if header is not None:
            if len(header) != len(dataset[0]):
                raise ValueError('Length mismatch between the header passed and the data_set')
    mini_gini, mid, feature_index, left_side, right_side = find_best_split(dataset,
                                                                           split_type)  # find the best split for the database

    # Base cases of no splits: the split is already pure, find_best_split
    # returned its sentinels (no candidate split), one side is empty, the
    # node is too small, or the depth limit is reached.
    # NOTE(review): `<=` turns a node with exactly min_samples_split rows into
    # a leaf; scikit-learn's parameter of the same name still splits such a
    # node -- confirm which meaning is intended.
    if (mini_gini == 0) or (mini_gini == 9999999999) or (left_side == []) or (left_side == 'X') or (
            right_side == []) or (right_side == 'X') or (mid == 'X') or (len(dataset) <= min_samples_split) or (
            (max_depth is not None) and (counter == max_depth)):
        pred = class_counts(dataset)  # make the prediction and form a leaf node
        return Decision_Tree(str(max(pred, key=pred.get)))  # leaf labels are stored as strings
    else:
        counter += 1
        operator = '=='
        if feature_type[feature_index] == 'continuous':  # setting the operator based on the feature type
            operator = '<='
        if header:
            string = "is {} {} {}?".format(header[feature_index], operator, mid)
        else:
            string = "is {} {} {}?".format(feature_index, operator, mid)
        left_tree = build_tree(left_side, header, split_type, min_samples_split, max_depth=max_depth, counter=counter)
        right_tree = build_tree(right_side, header, split_type, min_samples_split, max_depth=max_depth, counter=counter)
        if left_tree == right_tree:  # to handle the case when left sub tree and right sub tree give same prediction
            pred = class_counts(dataset)
            return Decision_Tree(str(max(pred, key=pred.get)))  # string leaf value handles both integer and string labels
        else:
            return Decision_Tree([feature_index, operator, mid], string, left_tree,
                                 right_tree)  # create a node in the tree (not a leaf node)

In [16]:
def print_tree(root, spacing=""):
    """Print the tree rooted at ``root`` as indented text.

    A node with no children is printed as a prediction; any other node prints
    its question followed by its left and right sub-trees, each indented two
    more spaces. ``spacing`` is the indentation prefix of the current level.
    """
    # Base case: we've reached a leaf. Identity checks are used so that the
    # node's own __eq__ is never invoked with None.
    if (root.left is None) and (root.right is None):
        print (spacing + "Prediction: ", root.value)
        return

    # Print the question at this node
    print (spacing + str(root.question))

    # Call this function recursively on the true branch
    print (spacing + '--> Left:')
    print_tree(root.left, spacing + "  ")

    # Call this function recursively on the false branch
    print (spacing + '--> Right:')
    print_tree(root.right, spacing + "  ")

In [17]:
def predict (root,test):
    """Predict a label for every data point in ``test``.

    Each point is classified with ``predict_single`` starting from ``root``;
    the predictions are returned as a list in the same order as ``test``.
    """
    return [predict_single(root, point) for point in test]

In [18]:
def predict_single (root,sample):
    """Walk the tree from ``root`` and return the prediction for one sample.

    A node whose ``value`` is a string is a leaf and that string is the
    prediction. Any other node stores ``[feature_index, operator, split]``:
    with the '<=' operator the sample goes left when its feature value is at
    most the split value; with any other operator it goes left when the
    feature value equals the split value. Otherwise it goes right.
    """
    node = root
    while type(node.value) != str:
        test = node.value
        if test[1] == '<=':  # continuous feature
            goes_left = sample[test[0]] <= test[-1]
        else:  # categorical feature
            goes_left = sample[test[0]] == test[-1]
        node = node.left if goes_left else node.right
    return node.value

# Reading the data set

In [19]:
file = open('iris.data','r') #reading the iris data set. We are already aware that it is clean data set. No missing values.
line = file.readline()
iris_dataset = []

In [20]:
line

'5.1,3.5,1.4,0.2,Iris-setosa\n'

In [21]:
datapoint = line.split(',')
datapoint

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa\n']

In [22]:
# Read one record per line until the end of the file or the first blank line
# (readline() returns '' at EOF and '\n' for a blank line, both of length <= 1).
while len(line)> 1:
    line = line.rstrip('\n') # drop only a trailing newline, so a last line without one keeps its final character
    datapoint = line.split(',')
    iris_dataset.append(datapoint)
    line = file.readline()

In [23]:
file.close()

In [24]:
iris_dataset

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'],
 ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
 ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
 ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'],
 ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
 ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
 ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
 ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'],
 ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'],
 ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'],
 ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
 ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
 ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
 ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'],
 ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'],
 ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'],
 ['5.1', '3.7', '1.5', '0.4', 'Iri

In [25]:
len(iris_dataset)

150

In [26]:
# Data type conversion: turn every feature column from text into a float,
# in place. The last column (the class label) stays a string.
for datapoint in iris_dataset:
    for j, raw_value in enumerate(datapoint[:-1]):
        datapoint[j] = float(raw_value)

In [27]:
iris_dataset

[[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
 [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
 [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
 [5.0, 3.6, 1.4, 0.2, 'Iris-setosa'],
 [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'],
 [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'],
 [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'],
 [4.4, 2.9, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'],
 [5.4, 3.7, 1.5, 0.2, 'Iris-setosa'],
 [4.8, 3.4, 1.6, 0.2, 'Iris-setosa'],
 [4.8, 3.0, 1.4, 0.1, 'Iris-setosa'],
 [4.3, 3.0, 1.1, 0.1, 'Iris-setosa'],
 [5.8, 4.0, 1.2, 0.2, 'Iris-setosa'],
 [5.7, 4.4, 1.5, 0.4, 'Iris-setosa'],
 [5.4, 3.9, 1.3, 0.4, 'Iris-setosa'],
 [5.1, 3.5, 1.4, 0.3, 'Iris-setosa'],
 [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'],
 [5.1, 3.8, 1.5, 0.3, 'Iris-setosa'],
 [5.4, 3.4, 1.7, 0.2, 'Iris-setosa'],
 [5.1, 3.7, 1.5, 0.4, 'Iris-setosa'],
 [4.6, 3.6, 1.0, 0.2, 'Iris-setosa'],
 [5.1, 3.3, 1.7, 0.5, 'Iris-setosa'],
 [4.8, 3.4, 1.9, 0.2, 'Iris-setosa'],
 [5.0, 3.0, 1.6, 0.2, 'Iris-setosa'],
 [5.0, 3.4, 

In [28]:
iris_header= ['sepal_length','sepal_width','petal_length','petal_width','labels']

In [29]:
# Load the Titanic data set: one comma-separated record per line, with the
# class label in the last column.
titanic_dataset = []
with open('Titanic.csv', 'r') as file:  # the context manager closes the file even if reading fails
    for line in file:
        if len(line) <= 1:  # a blank line ends the data
            break
        # strip only a trailing newline so a last line without one is kept intact
        datapoint = line.rstrip('\n').split(',')
        titanic_dataset.append(datapoint)
for datapoint in titanic_dataset:  # data type conversion
    for j in range(len(datapoint) - 1):
        if (j != 1) and (j != 6):  # columns 1 and 6 hold text and stay strings
            datapoint[j] = float(datapoint[j])
titanic_dataset


[[3.0, 'male', 22.0, 1.0, 0.0, 7.25, 'S', '0'],
 [1.0, 'female', 38.0, 1.0, 0.0, 71.2833, 'C', '1'],
 [3.0, 'female', 26.0, 0.0, 0.0, 7.925, 'S', '1'],
 [1.0, 'female', 35.0, 1.0, 0.0, 53.1, 'S', '1'],
 [3.0, 'male', 35.0, 0.0, 0.0, 8.05, 'S', '0'],
 [3.0, 'male', 28.0, 0.0, 0.0, 8.4583, 'Q', '0'],
 [1.0, 'male', 54.0, 0.0, 0.0, 51.8625, 'S', '0'],
 [3.0, 'male', 2.0, 3.0, 1.0, 21.075, 'S', '0'],
 [3.0, 'female', 27.0, 0.0, 2.0, 11.1333, 'S', '1'],
 [2.0, 'female', 14.0, 1.0, 0.0, 30.0708, 'C', '1'],
 [3.0, 'female', 4.0, 1.0, 1.0, 16.7, 'S', '1'],
 [1.0, 'female', 58.0, 0.0, 0.0, 26.55, 'S', '1'],
 [3.0, 'male', 20.0, 0.0, 0.0, 8.05, 'S', '0'],
 [3.0, 'male', 39.0, 1.0, 5.0, 31.275, 'S', '0'],
 [3.0, 'female', 14.0, 0.0, 0.0, 7.8542, 'S', '0'],
 [2.0, 'female', 55.0, 0.0, 0.0, 16.0, 'S', '1'],
 [3.0, 'male', 2.0, 4.0, 1.0, 29.125, 'Q', '0'],
 [2.0, 'male', 28.0, 0.0, 0.0, 13.0, 'S', '1'],
 [3.0, 'female', 31.0, 1.0, 0.0, 18.0, 'S', '0'],
 [3.0, 'female', 28.0, 0.0, 0.0, 7.225, 'C', '1

In [30]:
titanic_header= ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Label',]

In [31]:
train,test = train_test_split(iris_dataset)

In [32]:
len(train)

120

In [33]:
x = find_unique(train,2)
print(x)

[1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.9, 3.0, 3.3, 3.5, 3.7, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.3, 6.4, 6.6, 6.7, 6.9]


In [34]:
feature_type = get_feature_type(titanic_dataset)
feature_type

['categorical',
 'categorical',
 'continuous',
 'categorical',
 'categorical',
 'continuous',
 'categorical']

In [35]:
left,right=split_data(titanic_dataset,1,'male')

In [36]:
left

[[3.0, 'male', 22.0, 1.0, 0.0, 7.25, 'S', '0'],
 [3.0, 'male', 35.0, 0.0, 0.0, 8.05, 'S', '0'],
 [3.0, 'male', 28.0, 0.0, 0.0, 8.4583, 'Q', '0'],
 [1.0, 'male', 54.0, 0.0, 0.0, 51.8625, 'S', '0'],
 [3.0, 'male', 2.0, 3.0, 1.0, 21.075, 'S', '0'],
 [3.0, 'male', 20.0, 0.0, 0.0, 8.05, 'S', '0'],
 [3.0, 'male', 39.0, 1.0, 5.0, 31.275, 'S', '0'],
 [3.0, 'male', 2.0, 4.0, 1.0, 29.125, 'Q', '0'],
 [2.0, 'male', 28.0, 0.0, 0.0, 13.0, 'S', '1'],
 [2.0, 'male', 35.0, 0.0, 0.0, 26.0, 'S', '0'],
 [2.0, 'male', 34.0, 0.0, 0.0, 13.0, 'S', '1'],
 [1.0, 'male', 28.0, 0.0, 0.0, 35.5, 'S', '1'],
 [3.0, 'male', 28.0, 0.0, 0.0, 7.225, 'C', '0'],
 [1.0, 'male', 19.0, 3.0, 2.0, 263.0, 'S', '0'],
 [3.0, 'male', 28.0, 0.0, 0.0, 7.8958, 'S', '0'],
 [1.0, 'male', 40.0, 0.0, 0.0, 27.7208, 'C', '0'],
 [2.0, 'male', 66.0, 0.0, 0.0, 10.5, 'S', '0'],
 [1.0, 'male', 28.0, 1.0, 0.0, 82.1708, 'C', '0'],
 [1.0, 'male', 42.0, 1.0, 0.0, 52.0, 'S', '0'],
 [3.0, 'male', 28.0, 0.0, 0.0, 7.2292, 'C', '1'],
 [3.0, 'male', 21.0

In [37]:
right

[[1.0, 'female', 38.0, 1.0, 0.0, 71.2833, 'C', '1'],
 [3.0, 'female', 26.0, 0.0, 0.0, 7.925, 'S', '1'],
 [1.0, 'female', 35.0, 1.0, 0.0, 53.1, 'S', '1'],
 [3.0, 'female', 27.0, 0.0, 2.0, 11.1333, 'S', '1'],
 [2.0, 'female', 14.0, 1.0, 0.0, 30.0708, 'C', '1'],
 [3.0, 'female', 4.0, 1.0, 1.0, 16.7, 'S', '1'],
 [1.0, 'female', 58.0, 0.0, 0.0, 26.55, 'S', '1'],
 [3.0, 'female', 14.0, 0.0, 0.0, 7.8542, 'S', '0'],
 [2.0, 'female', 55.0, 0.0, 0.0, 16.0, 'S', '1'],
 [3.0, 'female', 31.0, 1.0, 0.0, 18.0, 'S', '0'],
 [3.0, 'female', 28.0, 0.0, 0.0, 7.225, 'C', '1'],
 [3.0, 'female', 15.0, 0.0, 0.0, 8.0292, 'Q', '1'],
 [3.0, 'female', 8.0, 3.0, 1.0, 21.075, 'S', '0'],
 [3.0, 'female', 38.0, 1.0, 5.0, 31.3875, 'S', '1'],
 [3.0, 'female', 28.0, 0.0, 0.0, 7.8792, 'Q', '1'],
 [1.0, 'female', 28.0, 1.0, 0.0, 146.5208, 'C', '1'],
 [3.0, 'female', 28.0, 0.0, 0.0, 7.75, 'Q', '1'],
 [3.0, 'female', 18.0, 2.0, 0.0, 18.0, 'S', '0'],
 [3.0, 'female', 14.0, 1.0, 0.0, 11.2417, 'C', '1'],
 [3.0, 'female', 40.0,

In [38]:
y = find_centres(titanic_dataset)
y

{0: {1.0, 2.0, 3.0},
 1: {'female', 'male'},
 2: {0.42,
  0.67,
  0.75,
  0.83,
  0.92,
  1.0,
  2.0,
  3.0,
  4.0,
  5.0,
  6.0,
  7.0,
  8.0,
  9.0,
  10.0,
  11.0,
  12.0,
  13.0,
  14.0,
  14.5,
  15.0,
  16.0,
  17.0,
  18.0,
  19.0,
  20.0,
  20.5,
  21.0,
  22.0,
  23.0,
  23.5,
  24.0,
  24.5,
  25.0,
  26.0,
  27.0,
  28.0,
  28.5,
  29.0,
  30.0,
  30.5,
  31.0,
  32.0,
  32.5,
  33.0,
  34.0,
  34.5,
  35.0,
  36.0,
  36.5,
  37.0,
  38.0,
  39.0,
  40.0,
  40.5,
  41.0,
  42.0,
  43.0,
  44.0,
  45.0,
  45.5,
  46.0,
  47.0,
  48.0,
  49.0,
  50.0,
  51.0,
  52.0,
  53.0,
  54.0,
  55.0,
  55.5,
  56.0,
  57.0,
  58.0,
  59.0,
  60.0,
  61.0,
  62.0,
  63.0,
  64.0,
  65.0,
  66.0,
  70.0,
  70.5,
  71.0,
  74.0},
 3: {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0},
 4: {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0},
 5: {0.0,
  4.0125,
  5.0,
  6.2375,
  6.4375,
  6.45,
  6.4958,
  6.75,
  6.8583,
  6.95,
  6.975,
  7.0458,
  7.05,
  7.0542,
  7.125,
  7.1417,
  7.225,
  7.2292,
  7.25,
  7.3125

In [39]:
ginir = gini(train)
ginir

0.6661111111111111

In [40]:
gini_min,centre,feature,left,right= find_best_split(train,'entropy')
gini_min

0.6830400735540036

In [41]:
feature

2

In [42]:
centre

1.9

In [43]:
left

[[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
 [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
 [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
 [5.0, 3.6, 1.4, 0.2, 'Iris-setosa'],
 [4.4, 2.9, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'],
 [5.4, 3.7, 1.5, 0.2, 'Iris-setosa'],
 [4.8, 3.4, 1.6, 0.2, 'Iris-setosa'],
 [4.8, 3.0, 1.4, 0.1, 'Iris-setosa'],
 [4.3, 3.0, 1.1, 0.1, 'Iris-setosa'],
 [5.7, 4.4, 1.5, 0.4, 'Iris-setosa'],
 [5.4, 3.9, 1.3, 0.4, 'Iris-setosa'],
 [5.1, 3.5, 1.4, 0.3, 'Iris-setosa'],
 [5.1, 3.8, 1.5, 0.3, 'Iris-setosa'],
 [5.4, 3.4, 1.7, 0.2, 'Iris-setosa'],
 [5.1, 3.7, 1.5, 0.4, 'Iris-setosa'],
 [4.6, 3.6, 1.0, 0.2, 'Iris-setosa'],
 [5.0, 3.0, 1.6, 0.2, 'Iris-setosa'],
 [5.0, 3.4, 1.6, 0.4, 'Iris-setosa'],
 [5.2, 3.5, 1.5, 0.2, 'Iris-setosa'],
 [5.2, 3.4, 1.4, 0.2, 'Iris-setosa'],
 [4.8, 3.1, 1.6, 0.2, 'Iris-setosa'],
 [5.4, 3.4, 1.5, 0.4, 'Iris-setosa'],
 [5.2, 4.1, 1.5, 0.1, 'Iris-setosa'],
 [5.5, 4.2, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'],
 [5.0, 3.2, 

In [44]:
right

[[6.4, 3.2, 4.5, 1.5, 'Iris-versicolor'],
 [6.9, 3.1, 4.9, 1.5, 'Iris-versicolor'],
 [5.5, 2.3, 4.0, 1.3, 'Iris-versicolor'],
 [6.5, 2.8, 4.6, 1.5, 'Iris-versicolor'],
 [4.9, 2.4, 3.3, 1.0, 'Iris-versicolor'],
 [6.6, 2.9, 4.6, 1.3, 'Iris-versicolor'],
 [5.2, 2.7, 3.9, 1.4, 'Iris-versicolor'],
 [5.0, 2.0, 3.5, 1.0, 'Iris-versicolor'],
 [5.9, 3.0, 4.2, 1.5, 'Iris-versicolor'],
 [6.0, 2.2, 4.0, 1.0, 'Iris-versicolor'],
 [6.1, 2.9, 4.7, 1.4, 'Iris-versicolor'],
 [6.7, 3.1, 4.4, 1.4, 'Iris-versicolor'],
 [5.6, 3.0, 4.5, 1.5, 'Iris-versicolor'],
 [5.8, 2.7, 4.1, 1.0, 'Iris-versicolor'],
 [6.2, 2.2, 4.5, 1.5, 'Iris-versicolor'],
 [5.6, 2.5, 3.9, 1.1, 'Iris-versicolor'],
 [5.9, 3.2, 4.8, 1.8, 'Iris-versicolor'],
 [6.1, 2.8, 4.0, 1.3, 'Iris-versicolor'],
 [6.3, 2.5, 4.9, 1.5, 'Iris-versicolor'],
 [6.1, 2.8, 4.7, 1.2, 'Iris-versicolor'],
 [6.6, 3.0, 4.4, 1.4, 'Iris-versicolor'],
 [6.8, 2.8, 4.8, 1.4, 'Iris-versicolor'],
 [6.7, 3.0, 5.0, 1.7, 'Iris-versicolor'],
 [6.0, 2.9, 4.5, 1.5, 'Iris-versic

In [45]:
counts = class_counts(train)
print(counts['Iris-setosa'])
print(max(counts,key = counts.get))

38
Iris-versicolor


In [46]:
trial = build_tree(train,iris_header,'gini')

In [47]:
print_tree(trial)

is petal_length <= 1.9?
--> Left:
  Prediction:  Iris-setosa
--> Right:
  is petal_width <= 1.7?
  --> Left:
    is petal_length <= 4.9?
    --> Left:
      Prediction:  Iris-versicolor
    --> Right:
      is petal_width <= 1.5?
      --> Left:
        Prediction:  Iris-virginica
      --> Right:
        Prediction:  Iris-versicolor
  --> Right:
    Prediction:  Iris-virginica


In [48]:
type(trial.value) == list

True

In [49]:
test

[[4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
 [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'],
 [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'],
 [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'],
 [5.8, 4.0, 1.2, 0.2, 'Iris-setosa'],
 [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'],
 [5.1, 3.3, 1.7, 0.5, 'Iris-setosa'],
 [4.8, 3.4, 1.9, 0.2, 'Iris-setosa'],
 [4.7, 3.2, 1.6, 0.2, 'Iris-setosa'],
 [4.4, 3.2, 1.3, 0.2, 'Iris-setosa'],
 [4.6, 3.2, 1.4, 0.2, 'Iris-setosa'],
 [5.0, 3.3, 1.4, 0.2, 'Iris-setosa'],
 [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor'],
 [5.7, 2.8, 4.5, 1.3, 'Iris-versicolor'],
 [6.3, 3.3, 4.7, 1.6, 'Iris-versicolor'],
 [5.6, 2.9, 3.6, 1.3, 'Iris-versicolor'],
 [6.4, 2.9, 4.3, 1.3, 'Iris-versicolor'],
 [5.5, 2.4, 3.8, 1.1, 'Iris-versicolor'],
 [5.8, 2.7, 3.9, 1.2, 'Iris-versicolor'],
 [5.7, 2.8, 4.1, 1.3, 'Iris-versicolor'],
 [6.5, 3.0, 5.8, 2.2, 'Iris-virginica'],
 [4.9, 2.5, 4.5, 1.7, 'Iris-virginica'],
 [6.4, 3.2, 5.3, 2.3, 'Iris-virginica'],
 [7.7, 3.8, 6.7, 2.2, 'Iris-virginica'],
 [6.7, 3.3, 5.7, 2.1, 'Iris-virginica'],
 [6

In [50]:
test

[[4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
 [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'],
 [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'],
 [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'],
 [5.8, 4.0, 1.2, 0.2, 'Iris-setosa'],
 [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'],
 [5.1, 3.3, 1.7, 0.5, 'Iris-setosa'],
 [4.8, 3.4, 1.9, 0.2, 'Iris-setosa'],
 [4.7, 3.2, 1.6, 0.2, 'Iris-setosa'],
 [4.4, 3.2, 1.3, 0.2, 'Iris-setosa'],
 [4.6, 3.2, 1.4, 0.2, 'Iris-setosa'],
 [5.0, 3.3, 1.4, 0.2, 'Iris-setosa'],
 [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor'],
 [5.7, 2.8, 4.5, 1.3, 'Iris-versicolor'],
 [6.3, 3.3, 4.7, 1.6, 'Iris-versicolor'],
 [5.6, 2.9, 3.6, 1.3, 'Iris-versicolor'],
 [6.4, 2.9, 4.3, 1.3, 'Iris-versicolor'],
 [5.5, 2.4, 3.8, 1.1, 'Iris-versicolor'],
 [5.8, 2.7, 3.9, 1.2, 'Iris-versicolor'],
 [5.7, 2.8, 4.1, 1.3, 'Iris-versicolor'],
 [6.5, 3.0, 5.8, 2.2, 'Iris-virginica'],
 [4.9, 2.5, 4.5, 1.7, 'Iris-virginica'],
 [6.4, 3.2, 5.3, 2.3, 'Iris-virginica'],
 [7.7, 3.8, 6.7, 2.2, 'Iris-virginica'],
 [6.7, 3.3, 5.7, 2.1, 'Iris-virginica'],
 [6

In [51]:
prediction = predict(trial,test)
len(prediction)

30

In [52]:
true_labels = list(point[-1] for point in test)
true_labels

['Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica']

In [53]:
print('accuracy: ',metrics.accuracy_score(true_labels,prediction))
print('F1 Score: ',metrics.f1_score(true_labels,prediction,average= 'macro'))
print('precision: ',metrics.precision_score(true_labels,prediction,average= 'macro'))
print('recall: ',metrics.recall_score(true_labels,prediction,average= 'macro'))

accuracy:  0.9666666666666667
F1 Score:  0.9628482972136223
precision:  0.9629629629629629
recall:  0.9666666666666667


In [54]:
titrain,titest = train_test_split(titanic_dataset)
titree = build_tree(titrain,titanic_header,max_depth=3)
print_tree(titree)

is Sex == female?
--> Left:
  is Pclass == 3.0?
  --> Left:
    is Fare <= 22.3583?
    --> Left:
      Prediction:  1
    --> Right:
      Prediction:  0
  --> Right:
    is Age <= 2.0?
    --> Left:
      Prediction:  0
    --> Right:
      Prediction:  1
--> Right:
  is Age <= 6.0?
  --> Left:
    is Pclass == 3.0?
    --> Left:
      Prediction:  0
    --> Right:
      Prediction:  1
  --> Right:
    Prediction:  0


In [55]:
prediction = predict(titree,titest)
true_labels = list(point[-1] for point in titest)
print('accuracy: ',metrics.accuracy_score(true_labels,prediction))
print('F1 Score: ',metrics.f1_score(true_labels,prediction,average= 'macro'))
print('precision: ',metrics.precision_score(true_labels,prediction,average= 'macro'))
print('recall: ',metrics.recall_score(true_labels,prediction,average= 'macro'))

accuracy:  0.8314606741573034
F1 Score:  0.8242727392391733
precision:  0.8378488767869299
recall:  0.8190819081908192


In [56]:
#train,test = train_test_split(iris_dataset)
# Separate the training rows into feature vectors (X) and labels (y), the
# layout scikit-learn estimators expect.
X, y = [], []
for point in train:
    X.append(point[:-1])  # every column except the last
    y.append(point[-1])   # the last column is the class label

In [57]:
X

[[5.1, 3.5, 1.4, 0.2],
 [4.7, 3.2, 1.3, 0.2],
 [4.6, 3.1, 1.5, 0.2],
 [5.0, 3.6, 1.4, 0.2],
 [4.4, 2.9, 1.4, 0.2],
 [4.9, 3.1, 1.5, 0.1],
 [5.4, 3.7, 1.5, 0.2],
 [4.8, 3.4, 1.6, 0.2],
 [4.8, 3.0, 1.4, 0.1],
 [4.3, 3.0, 1.1, 0.1],
 [5.7, 4.4, 1.5, 0.4],
 [5.4, 3.9, 1.3, 0.4],
 [5.1, 3.5, 1.4, 0.3],
 [5.1, 3.8, 1.5, 0.3],
 [5.4, 3.4, 1.7, 0.2],
 [5.1, 3.7, 1.5, 0.4],
 [4.6, 3.6, 1.0, 0.2],
 [5.0, 3.0, 1.6, 0.2],
 [5.0, 3.4, 1.6, 0.4],
 [5.2, 3.5, 1.5, 0.2],
 [5.2, 3.4, 1.4, 0.2],
 [4.8, 3.1, 1.6, 0.2],
 [5.4, 3.4, 1.5, 0.4],
 [5.2, 4.1, 1.5, 0.1],
 [5.5, 4.2, 1.4, 0.2],
 [4.9, 3.1, 1.5, 0.1],
 [5.0, 3.2, 1.2, 0.2],
 [5.5, 3.5, 1.3, 0.2],
 [4.9, 3.1, 1.5, 0.1],
 [4.4, 3.0, 1.3, 0.2],
 [5.1, 3.4, 1.5, 0.2],
 [5.0, 3.5, 1.3, 0.3],
 [4.5, 2.3, 1.3, 0.3],
 [5.0, 3.5, 1.6, 0.6],
 [5.1, 3.8, 1.9, 0.4],
 [4.8, 3.0, 1.4, 0.3],
 [5.1, 3.8, 1.6, 0.2],
 [5.3, 3.7, 1.5, 0.2],
 [6.4, 3.2, 4.5, 1.5],
 [6.9, 3.1, 4.9, 1.5],
 [5.5, 2.3, 4.0, 1.3],
 [6.5, 2.8, 4.6, 1.5],
 [4.9, 2.4, 3.3, 1.0],
 [6.6, 2.9,

In [58]:
y

['Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versi

In [59]:
estimator = tree.DecisionTreeClassifier( random_state=0)
estimator = estimator.fit(X,y)
r = tree.export_text(estimator, feature_names=iris_header[:len(iris_header)-1])
print(r)

|--- petal_width <= 0.80
|   |--- class: Iris-setosa
|--- petal_width >  0.80
|   |--- petal_width <= 1.75
|   |   |--- petal_length <= 4.95
|   |   |   |--- class: Iris-versicolor
|   |   |--- petal_length >  4.95
|   |   |   |--- petal_width <= 1.55
|   |   |   |   |--- class: Iris-virginica
|   |   |   |--- petal_width >  1.55
|   |   |   |   |--- sepal_length <= 6.95
|   |   |   |   |   |--- class: Iris-versicolor
|   |   |   |   |--- sepal_length >  6.95
|   |   |   |   |   |--- class: Iris-virginica
|   |--- petal_width >  1.75
|   |   |--- petal_length <= 4.85
|   |   |   |--- sepal_length <= 5.95
|   |   |   |   |--- class: Iris-versicolor
|   |   |   |--- sepal_length >  5.95
|   |   |   |   |--- class: Iris-virginica
|   |   |--- petal_length >  4.85
|   |   |   |--- class: Iris-virginica



In [60]:
# Separate the test rows into feature vectors (TX) and true labels (Ty), then
# score the scikit-learn tree with the same metrics used for the custom tree.
TX,Ty = [],[]
for point in test:
    TX.append(point[:-1])
    Ty.append(point[-1])
prediction= estimator.predict(TX)
print('accuracy: ',metrics.accuracy_score(Ty,prediction))
print('F1 Score: ',metrics.f1_score(Ty,prediction,average= 'macro'))
print('precision: ',metrics.precision_score(Ty,prediction,average= 'macro'))
print('recall: ',metrics.recall_score(Ty,prediction,average= 'macro'))

accuracy:  0.9666666666666667
F1 Score:  0.9628482972136223
precision:  0.9629629629629629
recall:  0.9666666666666667


In [61]:
# Re-split on the Titanic data and separate the training rows into feature
# vectors (X) and labels (y). Note that this rebinds `train` and `test`.
train,test = train_test_split(titanic_dataset)
X, y = [], []
for point in train:
    X.append(point[:-1])  # every column except the last
    y.append(point[-1])   # the last column is the class label

Scikit-learn decision trees cannot directly handle categorical data, so fitting on the raw Titanic features fails with the error shown below.

In [62]:
estimator = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
estimator = estimator.fit(X,y)
r = tree.export_text(estimator, feature_names=titanic_header[:len(titanic_header)-1])
print(r)

ValueError: could not convert string to float: 'male'

In [None]:
x = [1,2,3,1,1,2,1,0,4]
y = set(x)
y