In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [4]:
data_clean = data.drop(columns=['name','ticket','cabin','boat','body','home.dest',"embarked"])
test = test.drop(columns=['name','ticket','cabin','boat','body','home.dest','embarked'])
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [5]:
data_clean = data_clean.fillna(data_clean["age"].mean())
test = test.fillna(test["age"].mean())
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,29.838978,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,29.838978,2.0,0.0,23.25


In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_clean["sex"] = le.fit_transform(data_clean["sex"])
test["sex"] = le.fit_transform(test["sex"])
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.838978,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.838978,2.0,0.0,23.25


In [7]:
input_cols = ["pclass","sex","age","sibsp","parch","fare"]
output_cols = ["survived"]

train = data_clean[input_cols]
x_test = data_clean[output_cols]
print(train.shape,x_test.shape)

(1009, 6) (1009, 1)


In [8]:
def entropy(col):
    counts = np.unique(col,return_counts=True)
    N = float(col.shape[0])
    ent = 0.0
    
    for ix in counts[1]:
        val = ix/N
        ent+= (-1.0*val*np.log2(val))
    
    return ent

In [9]:
def divide_data(x_data,fkey,fval):
    data_left = pd.DataFrame([],columns=x_data.columns)
    data_right = pd.DataFrame([],columns = x_data.columns)
    
    for ix in range(x_data.shape[0]):
        if(x_data[fkey].loc[ix]>fval):
            data_right = data_right.append(x_data.loc[ix])
        else:
            data_left = data_left.append(x_data.loc[ix])
    return data_left,data_right

def information_gain(x_data,fkey,fval):
    
    left,right = divide_data(x_data,fkey,fval)
    
    if(left.shape[0]==0 or right.shape[0]==0):
        return -100000
    
    l = float(left.shape[0]/x_data.shape[0])
    r = float(right.shape[0]/x_data.shape[0])
    
    ent = entropy(x_data.survived)
    i_gain = ent - (l*entropy(left.survived) + r*entropy(right.survived))
    
    return i_gain        

In [10]:
class DecisionTree:
    
    def __init__(self,depth=0,max_depth=7):
        self.left=None
        self.right=None
        self.depth=depth
        self.max_depth = max_depth
        self.fkey=None
        self.fval = None
        self.target = None
        
    def train(self,x_train):
        
        features = ["pclass","sex","age","sibsp","parch","fare"]
        info_gain = []
        
        for ix in features:
            i_gain = information_gain(x_train,ix,x_train[ix].mean())
            info_gain.append(i_gain)
        
        self.fkey = features[np.argmax(info_gain)]
        self.fval = x_train[self.fkey].mean()
        print("Feature is",self.fkey)
        
        data_left,data_right = divide_data(x_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
        
        if(data_left.shape[0]==0 or data_right.shape[0]==0):
            if(x_train.survived.mean()>=0.5):
                self.target = 'Survived'
            else:
                self.target = 'Dead'
            return
        
        if(self.depth>=self.max_depth):
            if(x_train.survived.mean()>=0.5):
                self.target = 'Survived'
            else:
                self.target = 'Dead'
            return
        
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        if(x_train.survived.mean()>=0.5):
            self.target = 'Survived'
        else:
            self.target = 'Dead'
        return
    
    def predict(self,test):
        if test[self.fkey]>self.fval: 
            # go to right
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            
            if self.left is None:
                return self.target
            else:
                return self.left.predict(test)

In [11]:
dt = DecisionTree()
dt.train(data_clean)

Feature is sex
Feature is pclass
Feature is pclass
Feature is fare
Feature is sibsp
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is parch
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is sibsp
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is sibsp
Feature is parch
Feature is age
Feature is age
Feature is age
Feature is age
Feature is pclass
Feature is parch
Feature is fare
Feature is sibsp
Feature is age
Feature is fare
Feature is fare
Feature is age
Feature is pclass
Feature is pclass
Feature is fare
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feature is parch
Feature is age
Feature is age
Feature is age
Feature is age
Feature is age
Feat

In [12]:
y_pred = []
for ix in range(test.shape[0]):
    y_pred.append(dt.predict(test.loc[ix]))
y_pred 

['Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Survived',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survive

In [13]:
for i in range(len(y_pred)):
    if y_pred[i]=='Survived':
        y_pred[i] = 1
    else:
        y_pred[i] = 0

In [14]:
y_pred = np.array(y_pred).reshape((-1,1))

In [None]:
df = pd.DataFrame(y_pred,columns=["survived"])
df.to_csv("predict.csv")