In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
def entropy_basic(p1,p2):
    ans = 0
    if(p1!=0):
        ans -= p1*np.log2(p1)
    if(p2!=0):
        ans -= p2*np.log2(p2)
    
    return ans
    

In [14]:
entropy_basic(0.6,0.4)

0.97095059445466858

In [13]:
entropy_basic(0,1)

0.0

In [15]:
entropy_basic(0.3,0.2)

0.98547529722733429

In [20]:
entropy_basic(0.25,0.75)*(4/14)

0.23179374984546652

In [19]:
entropy_basic(4/6,2/6)*(6/14)

0.39355535745192405

In [22]:
entropy_basic(0.5,0.5)*(4/14)

0.2857142857142857

In [24]:
df = pd.read_csv("titanic.csv")

In [28]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [31]:
not_useful = ['PassengerId',  'Name', 'Ticket', 'Cabin', 'Embarked']

In [33]:
data_clean = df.drop(columns = not_useful)

In [34]:
data_clean.shape

(891, 7)

In [35]:
data_clean.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
le = LabelEncoder()

temp = ["A","B","C","B","A"]
le.fit_transform(temp)

array([0, 1, 2, 1, 0])

In [38]:
data_clean['Sex'] = le.fit_transform(data_clean['Sex'])

In [39]:
data_clean.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [44]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


In [43]:
data_clean.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,0.383838,0.523008,0.381594,32.204208
std,0.486592,0.836071,0.47799,0.486592,1.102743,0.806057,49.693429
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,0.0,0.0,7.9104
50%,0.0,3.0,1.0,0.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,1.0,1.0,0.0,31.0
max,1.0,3.0,1.0,1.0,8.0,6.0,512.3292


In [42]:
data_clean['Age'] = data_clean.fillna(data_clean['Age'].mean())

In [45]:
def  entropy(col):
    counts = np.unique(col,return_counts=True)
    ent = 0.0
    N = float(col.shape[0])
    for ix in counts[1]:
        p = ix/N
        ent += -(p*np.log2(p))
    
    return ent

In [46]:
col = np.array([1,1,0,0,1,0,0,1])

In [47]:
entropy(col)

1.0

In [49]:
def split_data(X_data,fkey,fval):
    #Binary split
    X_Right = pd.DataFrame([],columns = X_data.columns)
    X_left = pd.DataFrame([],columns = X_data.columns)
    
    for ix in range(X_data.shape[0]):
        val = X_data[fkey].loc[ix]
        
        if (val>fval):
            X_Right = X_Right.append(X_data.loc[ix])
        else:
            X_left = X_left.append(X_data.loc[ix])
            
    return X_Right,X_left

In [52]:
R,L = split_data(data_clean,'Sex',0.5)

In [56]:
R.shape

(577, 7)

In [55]:
L.shape

(314, 7)

In [60]:
def info_gain(X_data,fkey,fval):
    right,left = split_data(X_data,fkey,fval)
    
    l = float(left.shape[0]/X_data.shape[0])
    r = float(right.shape[0]/X_data.shape[0])
    
    if (left.shape[0] == 0 or right.shape[0]==0):
        return -10000
    
    i_gain = entropy(X_data.Survived) -1*entropy(left.Survived) -r*entropy(right.Survived)
    
    return i_gain

In [61]:
info_gain(data_clean,'Sex',0.5)


-0.31572819598510127

In [63]:
data_clean.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [65]:
X = data_clean[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
Y = data_clean[['Survived']]

In [66]:
for fx in X.columns:
    print(fx)
    print(info_gain(data_clean,fx,data_clean[fx].mean()))

Pclass
-0.470003868043
Sex
-0.315728195985
Age
0.960707901876
SibSp
-0.285767020314
Parch
-0.20653035489
Fare
-0.171417911788


In [67]:
class DecisionTree:
    def __init__(self,depth = 0,max_depth = 5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
    
    
    def train(self,X_train):
        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
        
        info_gain = []
        
        for fx in features:
            i_gain = info_gain(X_train,ix,X_train[ix].mean())
            info_gain.append()
            
            selected = features[np.argmax(info_gain)]
            self.fkey = selected
            self.fval = X_train[selected].mean()
            
            data_left,data_right = split_data(X_train,self.fkey,self.fval)
            
            data_left = data_left.reset_index(drop = True)
            data_right = data_right.reset_index(drop = True)\
            
            #base Case (leaf node)
            if(data_left.shape[0]==0 or data_right.shape[0]==0):
                if(X_train.Survived.mean()>0.5):
                    self.target = "Survived"
                    
                else:
                    self.target = "Dead"
                return 
            
            #Recursive Case
            
            #create a left node 
            self.left = DecisionTree(depth = self.depth+1,max_depth = self.max_depth)
            #recursively build the left subtree
            self.left.train(data_left)
            
            #create a right node 
            self.right = DecisionTree(depth = self.depth+1,max_depth = self.max_depth)
            #recursively build the right subtree
            self.right.train(data_right)
            
            
    def predict(self,test):
        if test[self.fkey]>self.fval:
            
            if self.right is None:
                return self.target
            return self.right.predict(test)
        
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)
            

In [None]:
m