In [None]:
import numpy as np
import csv
import pprint

# Column positions inside each CSV row, in file order:
# passenger id, ticket class, name, sex, age, siblings/spouses,
# parents/children, survival flag.
(PASSENGERID, PCLASS, NAME, SEX,
 AGE, SIBSP, PARCH, SURVIVED) = range(8)

In [None]:
def read_csv(name):
  """Load a comma-separated file and split it into header and body rows.

  Args:
    name: Path of the CSV file to open.

  Returns:
    A ``(labels, passengers)`` pair. ``labels`` is a list whose only element
    is the header row (it is empty when the file has no rows), and
    ``passengers`` is the list of all following rows. Every row is the list
    of strings produced by ``csv.reader``.
  """
  with open(name, newline='') as handle:
    rows = list(csv.reader(handle, delimiter=','))
  # The header stays wrapped in a list so callers can index labels[0][column].
  return rows[:1], rows[1:]

In [None]:
def simplify(data):
  """Recode the raw string columns of every passenger row, in place.

  For each row:
    * AGE is parsed with ``int`` and replaced by 'young' (<= 20),
      'middle' (<= 40) or 'old' (anything above).
    * PCLASS, SIBSP and PARCH get a descriptive suffix appended.
    * SURVIVED becomes 'DIED' when it equals '0', otherwise 'SURVIVED'.

  Args:
    data: List of mutable rows (lists of strings) indexed by the column
      constants of this notebook.

  Returns:
    None; the rows are modified in place.

  Raises:
    ValueError: If an AGE cell is not an integer literal. This includes rows
      that were already recoded, so the function cannot be applied twice to
      the same rows.
  """
  suffixes = ((PCLASS, '. class'), (SIBSP, ' sib'), (PARCH, ' parch'))

  for passenger in data:
    age = int(passenger[AGE])
    if age <= 20:
      bucket = 'young'
    elif age <= 40:
      bucket = 'middle'
    else:
      bucket = 'old'
    passenger[AGE] = bucket

    for column, suffix in suffixes:
      passenger[column] += suffix

    passenger[SURVIVED] = 'DIED' if passenger[SURVIVED] == '0' else 'SURVIVED'

In [None]:
# Load the header row (labels[0]) and the passenger rows, then recode the
# columns in place. simplify() raises on rows it has already recoded, so
# re-run this whole cell (which reloads the file) rather than simplify() alone.
labels,data = read_csv('titanic-homework.csv')
simplify(data)

In [None]:
def entropy(data):
    """Return the base-2 Shannon entropy of the values in ``data``.

    Args:
        data: Sequence of class values; the distinct values and their
            frequencies are obtained with ``np.unique``.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value, as a NumPy float.
    """
    _, counts = np.unique(data, return_counts=True)
    probabilities = counts / np.sum(counts)
    return -np.sum(probabilities * np.log2(probabilities))

In [None]:
def conditional_entropy(data, attribute, conditional_attribute):
  """Return the entropy of one column conditioned on another.

  Args:
    data: List of rows.
    attribute: Index of the column whose distinct values partition the rows.
    conditional_attribute: Index of the column whose entropy is measured
      inside each partition.

  Returns:
    The sum, over every distinct value of ``attribute``, of that value's
    relative frequency times ``entropy`` of the ``conditional_attribute``
    values of the matching rows.
  """
  values, counts = np.unique([row[attribute] for row in data], return_counts=True)
  total = np.sum(counts)

  weighted_terms = []
  for value, count in zip(values, counts):
    subset = [row[conditional_attribute] for row in data if row[attribute] == value]
    weighted_terms.append((count / total) * entropy(subset))

  return np.sum(weighted_terms)

In [None]:
def gain(data, attribute, conditional_attribute):
  """Return the information gain of splitting on ``attribute``.

  Args:
    data: List of rows.
    attribute: Index of the column used for the split.
    conditional_attribute: Index of the target column.

  Returns:
    The entropy of the target column minus its conditional entropy given
    ``attribute``.
  """
  target_values = [row[conditional_attribute] for row in data]
  prior = entropy(target_values)
  remaining = conditional_entropy(data, attribute, conditional_attribute)
  return prior - remaining

In [None]:
def intrinsic_info(data, attribute):
  """Return the intrinsic (split) information of a column.

  Args:
    data: List of rows.
    attribute: Index of the column to measure.

  Returns:
    ``-sum(p * log2(p))`` over the relative frequency ``p`` of every distinct
    value found in that column, as a NumPy float.
  """
  column = [row[attribute] for row in data]
  _, counts = np.unique(column, return_counts=True)
  shares = counts / np.sum(counts)
  return -np.sum(shares * np.log2(shares))

In [None]:
def gain_ratio(data, attribute, conditional_attribute):
  """Return the gain ratio of splitting on ``attribute``.

  Args:
    data: List of rows.
    attribute: Index of the column used for the split.
    conditional_attribute: Index of the target column.

  Returns:
    ``gain / intrinsic_info`` for the split, or ``0.0`` when the intrinsic
    information is zero.
  """
  intrinsic_val = intrinsic_info(data, attribute)
  # A column with a single distinct value (or no rows at all) has zero
  # intrinsic information. Dividing by it produces NaN, and np.argmax treats
  # NaN as the maximum, so a caller picking the best attribute would select a
  # column that cannot split the data. Such a split is worth nothing.
  if intrinsic_val == 0:
    return 0.0
  return gain(data, attribute, conditional_attribute) / intrinsic_val

In [None]:
# Entropy of the target column over the whole dataset.
entropy_val = entropy([row[SURVIVED] for row in data])
print('Entropy: ', entropy_val)

# Report every split metric for each candidate attribute, with SURVIVED as target.
for attribute in [PCLASS, SEX, AGE, SIBSP, PARCH]:
    # labels[0] is the CSV header row, so this prints the column name.
    print('=========== ATTRIBUTE', labels[0][attribute], '===========')

    cond_entropy = conditional_entropy(data, attribute, SURVIVED)
    print('Conditional entropy: ', cond_entropy)

    gain_val = gain(data, attribute, SURVIVED)
    print('Gain: ', gain_val)

    intristic_val = intrinsic_info(data, attribute)
    print('Intristic info: ', intristic_val)

    # gain_ratio() recomputes the gain and intrinsic info internally.
    gain_ratio_val = gain_ratio(data, attribute, SURVIVED)
    print('Gain ratio: ', gain_ratio_val)

    print('')

Entropy:  0.9709505944546686
Conditional entropy:  0.8892782366556035
Gain:  0.08167235779906512
Intristic info:  1.3702289460717927
Gain ratio:  0.05960489889898

Conditional entropy:  0.5794280059252064
Gain:  0.3915225885294622
Intristic info:  0.9709505944546686
Gain ratio:  0.40323636523376316

Conditional entropy:  0.9616143464222462
Gain:  0.00933624803242239
Intristic info:  1.4907269475168268
Gain ratio:  0.006262882715023172

Conditional entropy:  0.9302951468759139
Gain:  0.04065544757875472
Intristic info:  1.619081664980563
Gain ratio:  0.025110189595805712

Conditional entropy:  0.9543693160498283
Gain:  0.01658127840484025
Intristic info:  1.1325998252597398
Gain ratio:  0.01464001497707953



In [None]:
def _majority_label(rows, attribute):
    """Return the most frequent value of column ``attribute`` in ``rows``."""
    values, counts = np.unique([row[attribute] for row in rows], return_counts=True)
    return values[np.argmax(counts)]


def ID3(data, originaldata, elements, attribute = SURVIVED, parent_leaf = None):
    """Build a decision tree recursively, choosing splits by gain ratio.

    Args:
        data: Rows reaching the current node.
        originaldata: Full dataset; its majority class is the answer for a
            node that receives no rows.
        elements: Column indices still available for splitting.
        attribute: Index of the target column.
        parent_leaf: Majority class of the parent node, returned when no
            split columns remain.

    Returns:
        A class value for a leaf, or a nested dict of the form
        ``{column_index: {column_value: subtree_or_leaf}}``.
    """
    # The empty case must be tested first: the purity check below indexes the
    # first distinct class, which does not exist for an empty node.
    if len(data) == 0:
        if len(originaldata) == 0:
            return parent_leaf
        return _majority_label(originaldata, attribute)

    classes = np.unique([row[attribute] for row in data])
    if len(classes) == 1:
        # Pure node: every row has the same class.
        return classes[0]

    if len(elements) == 0:
        # No column left to split on. Keep the parent's majority class when
        # one was supplied; a root call has none, so use this node's own.
        if parent_leaf is None:
            return _majority_label(data, attribute)
        return parent_leaf

    parent_leaf = _majority_label(data, attribute)
    scores = [gain_ratio(data, elem, attribute) for elem in elements]
    best = elements[np.argmax(scores)]
    remaining = [elem for elem in elements if elem != best]

    tree = {best: {}}
    for value in np.unique([row[best] for row in data]):
        subdata = [row for row in data if row[best] == value]
        # Forward `attribute` explicitly; otherwise a non-default target would
        # silently fall back to the default below the root.
        tree[best][value] = ID3(subdata, originaldata, remaining, attribute, parent_leaf)

    return tree

# Candidate split columns, identified by their index inside each row.
vals = [PCLASS, SEX, AGE, SIBSP, PARCH]
# Print a "column name : index" legend, because the tree printed below is
# keyed by column index rather than by name.
for i in vals:
  print(labels[0][i], ':', i)


# Build the tree from the full dataset (passed as both the working rows and
# the fallback dataset) and pretty-print the nested dict.
tree = ID3(data, data, vals)
pprint.pprint(tree)

Pclass : 1
Sex : 3
Age : 4
SibSp : 5
Parch : 6
{3: {'female': {5: {'0 sib': 'SURVIVED',
                    '1 sib': {1: {'1. class': 'SURVIVED',
                                  '2. class': 'SURVIVED',
                                  '3. class': {6: {'0 parch': 'DIED',
                                                   '1 parch': 'SURVIVED',
                                                   '5 parch': 'SURVIVED'}}}},
                    '2 sib': 'DIED',
                    '3 sib': {4: {'middle': 'SURVIVED', 'young': 'DIED'}},
                    '4 sib': 'SURVIVED',
                    '5 sib': 'DIED'}},
     'male': {1: {'1. class': {4: {'middle': {5: {'0 sib': 'SURVIVED',
                                                  '1 sib': 'DIED'}},
                                   'old': 'DIED',
                                   'young': {5: {'0 sib': 'SURVIVED',
                                                 '3 sib': 'DIED'}}}},
                  '2. class': {4: {'middle': 'DIED',

In [None]:
def entropy_xtra(data):
    """Print the distinct values of ``data`` and return their entropy terms.

    Args:
        data: Sequence of values.

    Returns:
        A one-element list wrapping the list of ``p * log2(p)`` terms, one per
        distinct value in the order reported by ``np.unique``. The terms are
        neither negated nor summed.
    """
    elements, counts = np.unique(data, return_counts=True)
    print(elements)
    shares = counts / np.sum(counts)
    # list() keeps the items as NumPy scalars, one per distinct value.
    return [list(shares * np.log2(shares))]
# Show the individual p*log2(p) terms for the AGE column of the loaded data.
entropy_xtra([row[AGE] for row in data])

['middle' 'old' 'young']


[[-0.5, -0.4728231410691525, -0.5179038064476742]]