# Decision Tree


## Preprocessing

In [24]:
import numpy as np
import pandas as pd
train = pd.read_csv("titanic-train.csv")

In [25]:
x_train = train.iloc[:700].copy()
x_test = train.iloc[700:].copy()
train_label = x_train.Survived
test_label = x_test.Survived
train_len = train_label.size
test_len = test_label.size

In [26]:
x_test

Unnamed: 0,Survived,Pclass,Embarked,Sex,Age,SibSp,Parch,Fare
700,1,1,C,female,18.0,1,0,227.5250
701,1,1,S,male,35.0,0,0,26.2875
702,0,3,C,female,18.0,0,1,14.4542
703,0,3,Q,male,25.0,0,0,7.7417
704,0,3,S,male,26.0,1,0,7.8542
...,...,...,...,...,...,...,...,...
886,0,2,S,male,27.0,0,0,13.0000
887,1,1,S,female,19.0,0,0,30.0000
888,0,3,S,female,,1,2,23.4500
889,1,1,C,male,26.0,0,0,30.0000


In [27]:
x_train[x_train['Survived']==1].Survived.count()/x_train.Survived.count()

0.3871428571428571

In [28]:
x_train["Pclass"] = x_train["Pclass"].fillna(x_train['Pclass'].mode()[0])
x_train["Embarked"] = x_train["Embarked"].fillna(x_train['Embarked'].mode()[0])
x_train["Sex"] = x_train["Sex"].fillna(x_train['Sex'].mode()[0])
x_train["Age"] = x_train["Age"].fillna(x_train['Age'].mean())
x_train["Age"] = x_train["Age"].apply(lambda x: 20 if x<25 else 30 if x<40 else 45)
x_train["SibSp"] = x_train["SibSp"].fillna(x_train['SibSp'].mode()[0])
x_train["Parch"] = x_train["Parch"].fillna(x_train['Parch'].mode()[0])
x_train["Fare"] = x_train["Fare"].fillna(x_train['Fare'].mean())
x_train["Fare"] = x_train["Fare"].apply(lambda x: 5 if x<15 else 20 if x<40 else 50)
x_train

Unnamed: 0,Survived,Pclass,Embarked,Sex,Age,SibSp,Parch,Fare
0,0,3,S,male,20,1,0,5
1,1,1,C,female,30,1,0,50
2,1,3,S,female,30,0,0,5
3,1,1,S,female,30,1,0,50
4,0,3,S,male,30,0,0,5
...,...,...,...,...,...,...,...,...
695,0,2,S,male,45,0,0,5
696,0,3,S,male,45,0,0,5
697,1,3,Q,female,30,0,0,5
698,0,1,C,male,45,1,1,50


In [29]:
x_test["Pclass"] = x_test["Pclass"].fillna(x_test['Pclass'].mode()[0])
x_test["Embarked"] = x_test["Embarked"].fillna(x_test['Embarked'].mode()[0])
x_test["Sex"] = x_test["Sex"].fillna(x_test['Sex'].mode()[0])
x_test["Age"] = x_test["Age"].fillna(x_test['Age'].mean())
x_test["Age"] = x_test["Age"].apply(lambda x: 20 if x<25 else 30 if x<40 else 45)
x_test["SibSp"] = x_test["SibSp"].fillna(x_test['SibSp'].mode()[0])
x_test["Parch"] = x_test["Parch"].fillna(x_test['Parch'].mode()[0])
x_test["Fare"] = x_test["Fare"].fillna(x_test['Fare'].mean())
x_test["Fare"] = x_test["Fare"].apply(lambda x: 5 if x<15 else 20 if x<40 else 50)
x_test

Unnamed: 0,Survived,Pclass,Embarked,Sex,Age,SibSp,Parch,Fare
700,1,1,C,female,20,1,0,50
701,1,1,S,male,30,0,0,20
702,0,3,C,female,20,0,1,5
703,0,3,Q,male,30,0,0,5
704,0,3,S,male,30,1,0,5
...,...,...,...,...,...,...,...,...
886,0,2,S,male,30,0,0,5
887,1,1,S,female,20,0,0,20
888,0,3,S,female,30,1,2,20
889,1,1,C,male,30,0,0,20


## Implementation

In [30]:
def entropy(X):
    n = X.shape[0]
    n_unique = X.nunique()
    ans = 0
    for i in X.unique():
        p_i = X[X==i].shape[0]/n
        ans -= p_i*np.log2(p_i+1e-9)
    return ans

In [31]:
def information_gain(X,A):
    mother_entropy = entropy(X.Survived)
    groups = [x for _, x in X.groupby(X[A])]
    for i in range(len(groups)):
        mother_entropy -= (groups[i].shape[0]/X.shape[0])*entropy(groups[i].Survived)
        
    return mother_entropy

In [32]:
def best_attribute(X):
    max_gain = 0
    best_attr = X.columns[0]
    for column in X.columns[1:]:
        if information_gain(X,column) > max_gain:
            max_gain = information_gain(X,column)
            best_attr = column
    return best_attr

In [33]:
class Node:
    def __init__(self):
        self.parent = False
        self.children = []
        self.subtree = False
        self.attr = False
        self.value = False
        self.leaf = -1
        self.sub_dataset = False


In [34]:
def desicion_tree(X,depth,max_depth):
    if depth == max_depth:
        Leaf = Node()
        Leaf.leaf = X["Survived"].mode()[0]
        return Leaf

    best_attr = best_attribute(X)
    
    node = Node()
    node.attr = best_attr
    for i in X[best_attr].unique():
        child_x = X[X[best_attr]==i]
        child = Node()
        child.value = i
        child.sub_dataset = child_x.copy()
        child.subtree = desicion_tree(child_x.copy(), depth+1,max_depth)
        node.children.append(child)
        child.parent = node
    return node


## Evaluation

In [35]:
def predict(passenger, tree):
    if tree.leaf != -1:
        return tree.leaf
    attr = tree.attr
    value = passenger[attr]
    for i in range(len(tree.children)):
        if value == tree.children[i].value:
            break
    return predict(passenger,tree.children[i].subtree)


In [36]:
def score(test_label, Tree, is_forest = 0):
    preds = []
    for i in range(x_test.count()[0]):
        passenger = x_test.iloc[i]
        if is_forest == 0:
            survived = predict(passenger, Tree)
        else:
            survived = forest_predict(passenger, Tree)
        preds.append(survived)
    eval_df = pd.DataFrame({'Label': test_label, 'Predictions': preds})
    accuracy = eval_df[eval_df['Label']==eval_df['Predictions']].shape[0]/eval_df.shape[0]
    return accuracy, preds


In [37]:
def confusion_matrix(y_test,result):
    result_df = pd.DataFrame({"Label":y_test, "Predictions":result})
    Confusion_matrix = pd.crosstab(result_df['Label'], result_df['Predictions'])
    return Confusion_matrix


In [39]:
Tree = desicion_tree(x_train,0,7)
accuracy , preds = score(test_label,Tree)
print(f"Accuracy is: {accuracy*100:0.2f} %")
print(confusion_matrix(test_label,preds))

Accuracy is: 91.62 %
Predictions    0   1
Label               
0            112   8
1              8  63


# Random Forest

In [40]:
def random_forest(x_train, max_depth, n_attr, n_tree):
    columns = x_test.columns
    forest = []
    for i in range(n_tree):
        selected_columns = columns[np.random.choice(range(1,8), size = 7 - n_attr, replace=False)]
        selected_x_train = x_train.merge(pd.DataFrame(index=np.random.randint(train_len, size=train_len)), left_index=True, right_index=True, how='right').reset_index(drop=True)
        selected_x_train = selected_x_train.drop(selected_columns, axis = 1)
        Tree = desicion_tree(selected_x_train, 0, n_attr)
        forest.append(Tree)
    return forest

In [41]:
def forest_predict(passenger, forest):
    preds = []
    for i in range(len(forest)):
        preds.append(predict(passenger, forest[i]))
    return max(set(preds), key=preds.count)
    

In [51]:
My_forest = random_forest(x_train, max_depth=6, n_attr=6, n_tree=5)
accuracy , preds = score(test_label,My_forest, is_forest= True)
print(f"Accuracy is: {accuracy*100:0.2f} %")
print(confusion_matrix(test_label,preds))

Accuracy is: 84.82 %
Predictions    0   1
Label               
0            106  14
1             15  56
