In [1]:
import pandas as pd
import numpy as np

In [136]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### Drop columns which are of no use

In [137]:
columns_to_drop = ["PassengerId", "Name","Ticket","Cabin","Embarked"]
test_columns_to_drop = ["Name","Ticket","Cabin","Embarked"]
data_clean = data.drop(columns_to_drop, axis=1)
test_clean = test.drop(test_columns_to_drop, axis=1)

### Encode categorical data

In [138]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data_clean["Sex"] = le.fit_transform(data_clean["Sex"])
test_clean["Sex"] = le.transform(test_clean["Sex"])

### Handle missing data

In [5]:
data_clean = data_clean.fillna(value=data_clean["Age"].mean())

### Calculate entropy of a column
Entropy is the measure of randomness.

In [7]:
def get_entropy(col):
    """Entropy is calculated as the summation of negative of probability of each unique item in column multiplied by log of probability
       Parameters
       ----------
       col : numpy array
    """
    unique_cols = np.unique(col, return_counts=True)
    n = col.shape[0]
    entropy = 0
    for frequency in unique_cols[1]:
        probability = frequency/n
        entropy += -probability*np.log2(probability)
    return entropy

### Divide data in decision tree

In [8]:
def divide_data(x_data, key, threshold):
    x_left = pd.DataFrame([], columns=x_data.columns)
    x_right = pd.DataFrame([], columns=x_data.columns)
    
    for i in range(x_data.shape[0]):
        val = x_data[key].loc[i]
        if val < threshold:
            x_left = x_left.append(x_data.loc[i])
        else:
            x_right = x_right.append(x_data.loc[i])
    return x_left, x_right

### Calculate Information Gain

In [65]:
def get_information_gain(x_data, key, threshold):
    left_data, right_data = divide_data(x_data, key, threshold)
    
    if(left_data.shape[0]==0 or right_data.shape[0]==0):
        return -1000
    
    l = left_data.shape[0]/x_data.shape[0]
    r = right_data.shape[0]/x_data.shape[0]
    information_gain = get_entropy(x_data["Survived"]) - l*get_entropy(left_data["Survived"]) - r*get_entropy(right_data["Survived"])
    return information_gain

### Decision Tree class

In [145]:
class DecisionTree:
    def __init__(self, depth=0, max_depth=5):
        self.left = None
        self.right = None
        self.key = None
        self.threshold = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
    
    def fit(self, x_train, y_train):
        features = x_train.columns
        combined_list = x_train.join(y_train)
        info_gains= []
        for col in features:
            information_gain = get_information_gain(combined_list, col, x_train[col].mean())
            info_gains.append(information_gain)
        self.key = features[np.argmax(info_gains)]
        self.threshold = x_train[self.key].mean()
        print("divide tree along "+self.key)
        
        left_data, right_data = divide_data(combined_list, self.key, self.threshold)
        left_data.reset_index(drop=True, inplace=True)
        right_data.reset_index(drop=True, inplace=True)
        
        if left_data.shape[0]==0 or right_data.shape[0]==0 or self.depth > self.max_depth:
            if combined_list["Survived"].mean() >= 0.5:
                self.target=1
            else:
                self.target=0
            return
        
        self.left = DecisionTree(depth = self.depth+1)
        self.left.fit(left_data.drop(["Survived"], axis=True), left_data["Survived"])
        
        self.right = DecisionTree(depth=self.depth+1)
        self.right.fit(right_data.drop(["Survived"], axis=True), right_data["Survived"])
        
    def predict_row_output(self, test):
        if test[self.key] < self.threshold:
            if self.left is None:
                return self.target
            return self.left.predict(test)
        else:
            if self.right is None:
                return self.target
            return self.right.predict(test)
    
    def predict(self, test, passengerIds):
        y_preds=[]
        for i in range(test.shape[0]):
            y_preds.append({"PassengerId": passenger_ids[i], "Survived": dt.predict_row_output(test.loc[i])})
        predictions = pd.DataFrame(y_preds)
        return predictions

In [None]:
x_train = data_clean.drop(["Survived"], axis=1)
y_train = pd.DataFrame(data_clean["Survived"])

dt = DecisionTree()
dt.fit(x_train, y_train)

divide tree along Sex
divide tree along Pclass
divide tree along Pclass
divide tree along Parch
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along SibSp
divide tree along Age
divide tree along Age
divide tree along Parch
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Fare
divide tree along SibSp
divide tree along Age
divide tree along Parch
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Fare
divide tree along SibSp
divide tree along Fare
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Age
divide tree along Fare
divide tree along SibSp
divide tree along Age
divide tree along Fare
divide tree along Fare
divide tree along Age
divide tree along Parch
divide tree along Age
divide tree along Age
divide tree along SibSp
divide tree along Parch


In [141]:
x_test = test_clean.drop(["PassengerId"], axis=1)
passenger_ids = test_clean["PassengerId"]
dt.predict(x_test, passenger_ids)
predictions.to_csv('output.csv', index=False)

In [143]:
predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
