In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("Train.csv")

In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [4]:
columns_to_drop =['name' ,'cabin', 'embarked', 'ticket', 'home.dest' ,'boat', 'body']

In [5]:
data_clean = data.drop(columns_to_drop, axis=1)

In [6]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_clean['sex'] = le.fit_transform(data_clean['sex'])

In [8]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [9]:
data_clean = data_clean.fillna(data_clean['age'].mean())

In [10]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.838978,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.838978,2.0,0.0,23.25


In [11]:
input_col= ['pclass', 'sex','age', 'sibsp', 'parch', 'fare']
output_col= ['survived']

In [12]:
X =data_clean[input_col]
Y =data_clean[output_col]

Implementing Information Gain

In [13]:
def entropy(col):
    counts = np.unique(col, return_counts= True)
    N = float(col.shape[0])
    entropy = 0.0

    for ix in counts[1]:
        p = ix/N
        entropy += (-1.0*p*np.log2(p))
    return entropy

In [14]:
def divide_data(X_data, fkey, fval):
    left_rows = []
    right_rows = []

    for ix in range(len(X_data)):
        val = X_data.iloc[ix][fkey]  # safer than .at[ix, fkey]
        if val > fval:
            right_rows.append(X_data.iloc[ix])
        else:
            left_rows.append(X_data.iloc[ix])

    x_left = pd.DataFrame(left_rows)
    x_right = pd.DataFrame(right_rows)

    return x_left, x_right

In [15]:
def information_gain(X_data, fkey, fval):
    left, right = divide_data(X_data, fkey, fval)
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1e9  

    l = float(left.shape[0] / X_data.shape[0])
    r = float(right.shape[0] / X_data.shape[0])

    y = X_data['survived']
    y_left = left['survived']
    y_right = right['survived']

    igain = entropy(y) - (l * entropy(y_left) + r * entropy(y_right))
    return igain

Decision Tree

In [16]:
class DecisionTree:
    def __init__(self, depth=0, max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None

    def train(self, X_train):
        features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
        info_gain = []

        for feature in features:
            fval = X_train[feature].mean()
            igain = information_gain(X_train, feature, fval)
            info_gain.append(igain)

        self.fkey = features[np.argmax(info_gain)]
        self.fval = X_train[self.fkey].mean()
        print(f"Making Tree feature is: {self.fkey} at value: {self.fval:.2f}")
        left, right = divide_data(X_train, self.fkey, self.fval)

        # Stopping conditions
        if left.shape[0] == 0 or right.shape[0] == 0 or self.depth >= self.max_depth:
            if X_train['survived'].mean() >= 0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return

        self.left = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.train(left)

        self.right = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.train(right)

        if X_train['survived'].mean() >= 0.5:
            self.target = "Survive"
        else:
            self.target = "Dead"

    def predict(self, test):
        if test[self.fkey] > self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

Data

In [17]:
split = int(0.7*data_clean.shape[0])
train = data_clean[:split]
test = data_clean[split:]
test = test.reset_index(drop=True)

In [18]:
print(test.shape,train.shape)

(303, 7) (706, 7)


In [19]:
dt = DecisionTree()

In [20]:
dt.train(train)

Making Tree feature is: sex at value: 0.65
Making Tree feature is: pclass at value: 2.15
Making Tree feature is: parch at value: 0.56
Making Tree feature is: fare at value: 61.57
Making Tree feature is: fare at value: 25.71
Making Tree feature is: fare at value: 14.96
Making Tree feature is: fare at value: 36.47
Making Tree feature is: age at value: 36.37
Making Tree feature is: age at value: 28.76
Making Tree feature is: age at value: 48.42
Making Tree feature is: fare at value: 78.75
Making Tree feature is: pclass at value: 1.75
Making Tree feature is: age at value: 35.88
Making Tree feature is: age at value: 23.70
Making Tree feature is: age at value: 42.78
Making Tree feature is: age at value: 26.71
Making Tree feature is: age at value: 53.00
Making Tree feature is: sibsp at value: 0.87
Making Tree feature is: fare at value: 10.46
Making Tree feature is: fare at value: 7.89
Making Tree feature is: parch at value: 0.06
Making Tree feature is: age at value: 28.38
Making Tree feature 

In [21]:
correct = 0
total = test.shape[0]

for i in range(total):
    row = test.iloc[i]
    prediction = dt.predict(row)
    actual = "Survive" if row["survived"] == 1 else "Dead"
    
    if prediction == actual:
        correct += 1

accuracy = correct / total
print(f"Accuracy on test set: {accuracy:.2f}")

Accuracy on test set: 0.76


By Scikit Learn Library

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
sk = DecisionTreeClassifier(criterion= "entropy", max_depth =5)

In [24]:
sk.fit(train[input_col], train[output_col])

In [25]:
sk.predict(test[input_col])

array([0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0.,
       0., 0., 1., 0., 1.

In [26]:
sk.score(test[input_col],test[output_col])

0.7920792079207921

Visualization

In [27]:
def print_tree(node, spacing=""):
    # Base case: leaf node
    if node.left is None and node.right is None:
        print(spacing + "Predict:", node.target)
        return

    # Print the question at this node
    print(spacing + f"{node.fkey} <= {node.fval:.2f}?")

    # Call this function recursively on the true branch
    print(spacing + '--> True:')
    print_tree(node.left, spacing + "  ")

    # Call this function recursively on the false branch
    print(spacing + '--> False:')
    print_tree(node.right, spacing + "  ")

In [28]:
print_tree(dt)

sex <= 0.65?
--> True:
  pclass <= 2.15?
  --> True:
    parch <= 0.56?
    --> True:
      fare <= 61.57?
      --> True:
        fare <= 25.71?
        --> True:
          Predict: Survive
        --> False:
          Predict: Survive
      --> False:
        age <= 36.37?
        --> True:
          Predict: Survive
        --> False:
          Predict: Survive
    --> False:
      fare <= 78.75?
      --> True:
        pclass <= 1.75?
        --> True:
          Predict: Survive
        --> False:
          Predict: Survive
      --> False:
        age <= 42.78?
        --> True:
          Predict: Survive
        --> False:
          Predict: Survive
  --> False:
    sibsp <= 0.87?
    --> True:
      fare <= 10.46?
      --> True:
        fare <= 7.89?
        --> True:
          Predict: Survive
        --> False:
          Predict: Dead
      --> False:
        age <= 26.38?
        --> True:
          Predict: Dead
        --> False:
          Predict: Dead
    --> False:
    