In [64]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
from sklearn.metrics import accuracy_score


# Decision Tree

In [65]:
class Decision_Tree():
  def __init__(self,criterion,max_depth,min_samples_split,min_samples_leaf,no_of_features=None):
    self.criterion=criterion
    self.max_depth=max_depth
    self.min_samples_split=min_samples_split
    self.min_samples_leaf=min_samples_leaf
    self.parent_node=None
    self.no_of_features=no_of_features

    


  def fit(self, X, y):
        self.no_of_features = X.shape[1] if not self.no_of_features else min(X.shape[1],self.no_of_features)
        
        self.parent_node = self.grow_a_tree(X, y)

  def grow_a_tree(self, X, y, depth=0):
        no_of_samples=X.shape[0]
      
        no_of_attributes=X.shape[1]
        classes=np.unique(y)
        no_of_classes=len(classes)
        

      
       
        if (depth>=self.max_depth or no_of_classes==1 or no_of_samples<self.min_samples_split):
            count = Counter(y)
            _fre_class = count.most_common(1)[0][0]
            return Node(_val=_fre_class)

        indexes = np.random.choice(no_of_attributes, self.no_of_features, replace=False)
        if(self.criterion=="entropy"):
          max_gain = -1
          s_index, s_t = None, None
          for i in indexes:
            Feature_val = X[:, i]
            unique_values_of_x = np.unique(Feature_val)

            for j in unique_values_of_x:
                G = self.gain(y, Feature_val, j)

                if G > max_gain:
                    max_gain = G
                    s_index = i
                    s_t = j
        if(self.criterion=="gini"):
          max_gini=-1
          s_index, s_t = None, None
          for i in indexes:
            Feature_val = X[:, i]
            unique_values_of_x = np.unique(Feature_val)

            for j in unique_values_of_x:
                G = self.gini(y, Feature_val, j)

                if G > max_gini:
                    max_gini = G
                    s_index = i
                    s_t = j
          
        l_split_indexes, r_split_indexes= self.split(X[:, s_index], s_t)
        #print(len(l_split_indexes))
        if (len(l_split_indexes)<=self.min_samples_leaf or len(r_split_indexes)<=self.min_samples_leaf):
             hl = Counter(y)
             _fre_class = hl.most_common(1)[0][0]
             return Node(_val=_fre_class)

        l_splits = self.grow_a_tree(X[l_split_indexes, :], y[l_split_indexes], depth+1)
        r_splits = self.grow_a_tree(X[r_split_indexes, :], y[r_split_indexes], depth+1)
        


        return Node(s_index, s_t, l_splits, r_splits)

  

  

  def split(self, Feature_val, k):
    
        l_split_indexes = np.argwhere(Feature_val <= k).flatten()
        r_split_indexes= np.argwhere(Feature_val > k).flatten()
        return l_split_indexes, r_split_indexes
  def gain(self, y, Feature_val, K):
        E_of_P = self._E(y)
        l_split_indexes, r_split_indexes= self.split(Feature_val, K)
        if len(l_split_indexes) == 0 or len(r_split_indexes) == 0:
            return 0
        noOFSamples = len(y)
        noOFSamplesOFLeft, noOFSamplesOFRight = len(l_split_indexes), len(r_split_indexes)
        e_l, e_r = self._E(y[l_split_indexes]), self._E(y[r_split_indexes])
        E_of_C = (noOFSamplesOFLeft/noOFSamples) * e_l + (noOFSamplesOFRight/noOFSamples) * e_r
        return E_of_P - E_of_C
  def gini(self,y, Feature_val, K):
    G_of_P=self._G(y)
    
    l_split_indexes, r_split_indexes= self.split(Feature_val, K)
    if len(l_split_indexes) == 0 or len(r_split_indexes) == 0:
            return 0
    noOFSamples = len(y)
    noOFSamplesOFLeft, noOFSamplesOFRight = len(l_split_indexes), len(r_split_indexes)
    g_l, g_r = self._G(y[l_split_indexes]), self._G(y[r_split_indexes])
    G_of_C = (noOFSamplesOFLeft/noOFSamples) * g_l + (noOFSamplesOFRight/noOFSamples) * g_r
    return (G_of_P-G_of_C)


  def _E(self, y):
        x = np.bincount(y)
        x1 = x / len(y)
        #for i in range(x1):
          #if(i>0):
            #y+=-(i * np.log(i) )
        return -np.sum([i * np.log(i) for i in x1 if i>0])
  def _G(self, y):
        x = np.bincount(y)
        x1 = x / len(y)
        return 1-np.sum([i * i for i in x1 if i>0])

  def T(self, x, y):
        if y.leafNode ():
            return y._val

        if x[y. _attribute] <= y. _thres:
            return self.T(x, y. _l)
        return self.T(x, y. _r)

  def predict(self, X):
        return np.array([self.T(x, self.parent_node) for x in X])
class Node:
    def __init__(self, _attribute=None,  _thres=None,  _l=None,  _r=None,*,_val=None):
        self._attribute = _attribute
        self. _thres =  _thres
        self. _l =  _l
        self. _r =  _r
        self._val = _val
        
    def leafNode(self):
        if (self._val==None):
          return False
        else: return True



# Random Forest

In [66]:
from random import *


class Random_Forest:
    def __init__(self,no_of_trees,min_no_of_features,criterion,max_depth,min_samples_split,min_samples_leaf,no_of_features=None):
      self.no_of_trees=no_of_trees
      self.criterion=criterion
      self.max_depth=max_depth
      self.min_samples_split=min_samples_split
      self.min_samples_leaf=min_samples_leaf
      self.parent_node=None
      self.no_of_features=no_of_features
      self.min_no_of_features=min_no_of_features

    def fit(self, X, y):
        self.N = []
        for _ in range(self.no_of_trees):
            R= Decision_Tree(self.criterion,self.max_depth,self.min_samples_split,self.min_samples_leaf,self.no_of_features)
            no_of_samples1 = X.shape[0]
            i = np.random.choice(no_of_samples1, no_of_samples1, replace=True)
            X=X[i]
            y=y[i]
            no_of_features1=X.shape[1]
            l = randint(self.min_no_of_features,no_of_features1)
            j= np.random.choice(l,l, replace=False)
            X=X[:,j]
            
            R.fit(X, y)
            self.N.append(R)

    

    def fre(self, y):
        k = Counter(y)
        k1 = k.most_common(1)[0][0]
        return k1

    def predict(self, X):
        out = np.array([R.predict(X) for R in self.N])
        xw = np.swapaxes(out, 0, 1)
        out= np.array([Counter(i).most_common(1)[0][0] for i in xw])
        return out

# Boosting

In [67]:
class AdaBoost():
  def __init__(self,weak_learner,num_learners,learning_rate):
    self.weak_learner=weak_learner

    self.num_learners=num_learners
    self.learning_rate = learning_rate
    self.trees=[]
    #self.weak_learner=[]
    self.errors1=[]
    self.alpha1=[]
  def fit(self,X,y):
    no_of_samples=X.shape[0]
    no_of_features=X.shape[1]
    weights=(1/no_of_samples)*np.ones(no_of_samples)
    self.weights=weights
    tree=Decision_Tree("entropy",1,2,2)
    tree.fit(X,y)
    out=tree.predict(X)
    err=self.error(out,y,self.weights)
    i=0
    while(self.num_learners> i or err==0):
      self.trees.append(tree)
      tree=Decision_Tree("entropy",1,2,2)
      tree.fit(X,y)
      out=tree.predict(X)
      err=self.error(out,y,self.weights)
      alp=self.alpha(err)
      self.errors1.append(err)
      self.alpha1.append(alp)
      self.weights=self.updateWeight(self.weights,alp,out,y)
      i=i+1
  def error(self,out,y,wieghts):
    err= (sum(wieghts * (np.not_equal(y, out)).astype(int)))/sum(wieghts)
    
    return err
  def alpha(self,error):
    return np.log((1-error)/(error))
  def updateWeight(self,wieghts,alp,out,y):
    return wieghts * np.exp(alp * (np.not_equal(y, out)).astype(int))
  def predict(self,X):
    g = pd.DataFrame(index = range(len(X)), columns = range(self.num_learners)) 
    for i in range(self.num_learners):
            out_i = self.trees[i].predict(X) * self.alpha1[i]
            g.iloc[:,i] = out_i

       
    out = (1 * np.sign(g.T.sum())).astype(int)
    return out

In [68]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
actual_y=pd.read_csv('gender_submission.csv')




# Store our test passenger IDs for easy access
PassengerId = test['PassengerId']
original_train = train.copy()
original_train = train.copy()
full_data = [train, test]
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] ;
# Feature selection: remove variables no longer containing relevant information
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)


y_train = train['Survived']


x_train = train.drop(['Survived'], axis=1).values 
drop_elements1 = ['PassengerId']
actual_y= actual_y['Survived']
y_train = y_train.to_numpy()
actual_y =  actual_y.to_numpy()
x_test = test.values
#print(actual_y)





In [69]:
model2=Decision_Tree("gini",100,4,3)
model2.fit(x_train,y_train)
out1=model2.predict(x_test)
accuracy1=accuracy_score(actual_y,out1)
print(accuracy1)

0.8397129186602871


In [70]:
model2=Decision_Tree("entropy",100,2,2)
model2.fit(x_train,y_train)
out2=model2.predict(x_test)
accuracy2=accuracy_score(actual_y,out2)
print(accuracy2)

0.7727272727272727


In [71]:
model3=Random_Forest(15,5,"entropy",10,2,2,4)
model3.fit(x_train,y_train)
out3=model3.predict(x_test)
accuracy3=accuracy_score(actual_y,out3)
print(accuracy3)

0.45215311004784686


In [72]:
model4=Random_Forest(15,5,"gini",9,2,2,4)
model4.fit(x_train,y_train)
out4=model4.predict(x_test)
accuracy4=accuracy_score(actual_y,out4)
print(accuracy4)

0.5980861244019139


In [73]:
weaklearner=Decision_Tree("entropy",1,2,2)
model5=AdaBoost(weaklearner,10,0.1)
model5.fit(x_train,y_train)
out5=model5.predict(x_test)
accuracy5=accuracy_score(actual_y,out5)
print(accuracy5)

0.937799043062201


In [74]:
print(" ACCURACY TABLE")
print()
print("          DecisionTree         RandomForest          Boosting")
print("Gini    ",accuracy1,"  ",accuracy3,"  ")
print("Entrpy  ",accuracy2,"  ",accuracy4,"  ",accuracy5)

 ACCURACY TABLE

          DecisionTree         RandomForest          Boosting
Gini     0.8397129186602871    0.45215311004784686   
Entrpy   0.7727272727272727    0.5980861244019139    0.937799043062201
