In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from random import randrange
import math
from sklearn.model_selection import KFold

In [24]:
## Feature Extraction ##
df_titanic = pd.read_csv('titanic.csv')

#transforming Sex column values: male-2, female-1
df_titanic['Sex'].replace(['female','male'], [1,2], inplace=True)

#checking for NaN values
print(df_titanic.apply(axis=0, func=lambda x : any(pd.isnull(x))))
print('\n')

#total NaN values in Age column
print(sum(df_titanic['Age'].isnull()))

#Replacing NaNs with mean of column
df_titanic['Age'] = df_titanic['Age'].fillna(df_titanic['Age'].mean())

print(sum(df_titanic['Age'].isnull()))

# combine siblings and parent/child columns and create new column named 'individual'
# this will allow us to add if the survivor was alone or he had relatives
df_titanic["relatives"] = df_titanic["SibSp"] + df_titanic["Parch"]
df_titanic.loc[df_titanic["relatives"] > 0, "individual"] = 0
df_titanic.loc[df_titanic["relatives"] == 0, "individual"] = 1
df_titanic["individual"] = df_titanic["individual"].astype(int)
        
#dropping unnecessary columns
df_titanic.drop(['Cabin', 'Name', 'Ticket', 'Embarked', 'Parch', 'SibSp', 'relatives'], axis=1, inplace=True)        
        
df_titanic.head()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool


177
0


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,individual
0,1,0,3,2,22.0,7.25,0
1,2,1,1,1,38.0,71.2833,0
2,3,1,3,1,26.0,7.925,1
3,4,1,1,1,35.0,53.1,0
4,5,0,3,2,35.0,8.05,1


In [25]:
X = df_titanic.drop(['Survived'], axis=1)
y = df_titanic['Survived']

X.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,individual
0,1,3,2,22.0,7.25,0
1,2,1,1,38.0,71.2833,0
2,3,3,1,26.0,7.925,1
3,4,1,1,35.0,53.1,0
4,5,3,2,35.0,8.05,1


In [26]:
#Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=32)
X_train['Survived'] = y_train
X_test['Survived'] = y_test
# print(X_train.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
#Gini score
def get_gini_score(instances, class_):
  gini_score = 0.0
  num_of_instances = float(sum([len(instance) for instance in instances]))
  
  for instance in instances:
    
    if len(instance) == 0:
      continue
    
    #proportions
    p_j = 0.0
    
    for cl in class_:
      p = [row['Survived'] for i, row in instance.iterrows()].count(cl) / len(instance)       
      p_j += p * p
    
    gini_score += (1.0 - p_j) * (len(instance) / num_of_instances)
  
  return gini_score

def info_gain(data, col_name, class_, n_rows):
  
  total_entropy = 0.0                                                   
  column_entropy = 0
    
  for label in class_:
      total_entropy = total_entropy + (-(len(data[data['Survived'] == label])
                                           /n_rows)*math.log2(len(data[data['Survived'] == label])/n_rows))                    
    
  unique_col_values = list(data[col_name].unique())

  for column_value in unique_col_values:
      for label in class_:

          probability_value = -len(data[data[col_name] == column_value])/n_rows
          conditional_probability_value = len(data[(data['Survived'] == label) & (data[col_name]\
                                                                     == column_value)])/len(data[col_name] == column_value)
          if conditional_probability_value != 0:
              column_entropy = column_entropy + probability_value*(conditional_probability_value*math.log2(conditional_probability_value))
          else:
              continue

  return total_entropy, column_entropy                                    

   

In [0]:
#Split a data
def get_splits(col, value, data):
  
  left_branch = right_branch = pd.DataFrame(columns=data.columns)
  for i, row in data.iterrows(): 
    if row[col] < value:
      left_branch = left_branch.append(row)
    else:
      right_branch = right_branch.append(row)
  
  return left_branch, right_branch

In [0]:
#Best split based on gini score
def get_best_split(data, impurity):
  class_ = list(set([v for v in data['Survived']]))
  
  gini_score = 1000
  info_score = 0
  n_rows = len(data)
  
  for col in data.columns:
    
    if col == 'Survived':
      continue
    
    unique_values = []
    
    for i, row in data.iterrows():
      
      if row[col] in unique_values:
        continue
        
      unique_values.append(row[col])      
      instances = get_splits(col, row[col], data)
      
      if impurity == 'gini':
        score = get_gini_score(instances, class_)
        if score < gini_score:
          split_index, split_value, gini_score, split_instances = col, row[col], score, instances
      else:
        temp = info_gain(data, col, class_, n_rows)
        score = temp[0] - temp[1]
        if score > info_score:
          split_index, split_value, info_score, split_instances = col, row[col], score, instances
      
      
  return {'position':split_index, 'value':split_value, 'instances':split_instances}

#return prediction with majority voting
def set_leaf(instance):
  preds = [row['Survived'] for i, row in instance.iterrows()]
  return max(set(preds), key=preds.count)

#split children
def split_child(node, min_size, depth, max_depth, impurity):
  left_branch, right_branch = node['instances']  
  
  del(node['instances'])
  
  #check if no split
  if len(left_branch) == 0 or len(right_branch)==0:
    if len(left_branch) == 0:
      node['left_branch'] = node['right_branch'] = set_leaf(right_branch)
    else:
      node['left_branch'] = node['right_branch'] = set_leaf(left_branch)
    return
    
  # check for max depth
  if depth >= max_depth:
    node['left_branch'], node['right_branch'] = set_leaf(left_branch), set_leaf(right_branch)
    return
  
  # add to left child
  if len(left_branch) <= min_size:
    node['left_branch'] = set_leaf(left_branch)
  else:
    node['left_branch'] = get_best_split(left_branch, impurity)
    split_child(node['left_branch'], min_size, depth+1, max_depth, impurity)
    
  # add to right child
  if len(right_branch) <= min_size:
    node['right_branch'] = set_leaf(right_branch)
  else:
    node['right_branch'] = get_best_split(right_branch, impurity)
    split_child(node['right_branch'], min_size, depth+1, max_depth, impurity)

In [0]:
#build tree from data
def build_decision_tree(train, min_size, max_depth, impurity):
  
  root = get_best_split(train, impurity)
  split_child(root, min_size, 1, max_depth, impurity)
  return root

#Make a prediction
def predict(node, row):
  
  if row[node['position']] < node['value']:
    if isinstance(node['left_branch'], dict):
      return predict(node['left_branch'], row)
    else:
      return node['left_branch']
  else:
    if isinstance(node['right_branch'], dict):
      return predict(node['right_branch'], row)
    else:
      return node['right_branch']
    
#get predictions from decision tree

def decision_tree(train, test, min_size, max_depth, impurity):
  tree = build_decision_tree(train, min_size, max_depth, impurity)
  preds = []
  for i,row in test.iterrows():
    prediction = predict(tree, row)
    preds.append(prediction)
  return(preds)

In [0]:
#check accuracy
def calculate_accuracy(ground_truth, predicted):
    correct_prediction = 0
    for i in range(len(ground_truth)):
        if ground_truth[i] == predicted[i]:
            correct_prediction = correct_prediction + 1
    return correct_prediction / float(len(ground_truth)) * 100.0

In [17]:
# to run decision tree based on gini or info gain
predictions_gini = decision_tree(X_train, X_test, 10, 5, 'gini')
predictions_ig = decision_tree(X_train, X_test, 10, 5, 'ig')

ginitree {'position': 'Sex', 'value': 2.0, 'left_branch': {'position': 'Pclass', 'value': 3.0, 'left_branch': {'position': 'Age', 'value': 4.0, 'left_branch': 0.0, 'right_branch': {'position': 'Fare', 'value': 29.0, 'left_branch': {'position': 'Age', 'value': 57.0, 'left_branch': 1.0, 'right_branch': 0.0}, 'right_branch': {'position': 'Fare', 'value': 151.55, 'left_branch': 1.0, 'right_branch': 1.0}}}, 'right_branch': {'position': 'Fare', 'value': 24.15, 'left_branch': {'position': 'Age', 'value': 32.0, 'left_branch': {'position': 'Age', 'value': 8.0, 'left_branch': 1.0, 'right_branch': 1.0}, 'right_branch': 0.0}, 'right_branch': {'position': 'Age', 'value': 38.0, 'left_branch': {'position': 'Pclass', 'value': 3.0, 'left_branch': 0.0, 'right_branch': 0.0}, 'right_branch': 0.0}}}, 'right_branch': {'position': 'Age', 'value': 7.0, 'left_branch': {'position': 'Pclass', 'value': 3.0, 'left_branch': 1.0, 'right_branch': 0.0}, 'right_branch': {'position': 'Pclass', 'value': 2.0, 'left_branch

In [0]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, predictions_gini))
print(accuracy_score(y_test, predictions_ig))

## Gini Index vs Information Gain

### Gini Index:
- Generated Tree : {'position': 'Sex', 'value': 2.0, 'left_branch': {'position': 'Pclass', 'value': 3.0, 'left_branch': {'position': 'Age', 'value': 4.0, 'left_branch': 0.0, 'right_branch': {'position': 'Fare', 'value': 29.0, 'left_branch': {'position': 'Age', 'value': 57.0, 'left_branch': 1.0, 'right_branch': 0.0}, 'right_branch': {'position': 'Fare', 'value': 151.55, 'left_branch': 1.0, 'right_branch': 1.0}}}, 'right_branch': {'position': 'Fare', 'value': 24.15, 'left_branch': {'position': 'Age', 'value': 32.0, 'left_branch': {'position': 'Age', 'value': 8.0, 'left_branch': 1.0, 'right_branch': 1.0}, 'right_branch': 0.0}, 'right_branch': {'position': 'Age', 'value': 38.0, 'left_branch': {'position': 'Pclass', 'value': 3.0, 'left_branch': 0.0, 'right_branch': 0.0}, 'right_branch': 0.0}}}, 'right_branch': {'position': 'Age', 'value': 7.0, 'left_branch': {'position': 'Pclass', 'value': 3.0, 'left_branch': 1.0, 'right_branch': 0.0}, 'right_branch': {'position': 'Pclass', 'value': 2.0, 'left_branch': {'position': 'Fare', 'value': 26.2875, 'left_branch': 0.0, 'right_branch': {'position': 'Age', 'value': 54.0, 'left_branch': 0.0, 'right_branch': 0.0}}, 'right_branch': {'position': 'Age', 'value': 14.0, 'left_branch': 0.0, 'right_branch': {'position': 'Age', 'value': 62.0, 'left_branch': 0.0, 'right_branch': 0.0}}}}}

- Root feature: 'Sex'

- accuracy : 0.78 (without cross_validation)

- Hence, when we develop a tree using gini index it is giving 'Sex' more importance over other features.

### Information Gain:
- Generated Tree : {'position': 'Fare', 'value': 8.0292, 'left_branch': {'position': 'Fare', 'value': 7.25, 'left_branch': {'position': 'Age', 'value': 51.0, 'left_branch': {'position': 'Age', 'value': 25.0, 'left_branch': {'position': 'Fare', 'value': 7.0542, 'left_branch': 0.0, 'right_branch': 0.0}, 'right_branch': {'position': 'Sex', 'value': 2.0, 'left_branch': 1.0, 'right_branch': 0.0}}, 'right_branch': 0.0}, 'right_branch': {'position': 'Fare', 'value': 7.25, 'left_branch': 0.0, 'right_branch': 0.0}}, 'right_branch': {'position': 'Fare', 'value': 8.0292, 'left_branch': 0.0, 'right_branch': 0.0}}

- Root feature: 'Fare'

- accuracy : 0.58 (without cross_validation)

- Hence, when we develop a tree using information gain it is giving 'Fare' , more importance over other features.

*********
So, we can observe that gini index is better impurity metrics than information gain to decide on best split while developing decision trees.




In [0]:
#corss_validation
# def apply_cross_validation(folds, train, test, impurity):
#     k_fold = KFold(n_splits=folds)
    
#     X_train = train.drop("Survived",axis = 1)
#     Y_train = train[["Survived","PassengerId"]]
    
    
#     X_column_values = X_train.columns
    
#     Y_column_values = Y_train.columns
    
#     X_train = np.array(X_train)
#     Y_train = np.array(Y_train)
    
    
#     X_shuffled_train , predictions_shuffled_train = shuffle(X_train, Y_train, random_state=0)
    
#     accuracy_array = []
    
#     for train_index, test_index in k_fold.split(X_shuffled_train):
#         y_train, y_test = predictions_shuffled_train[train_index], predictions_shuffled_train[test_index]
       
#         X_train, X_test = X_shuffled_train[train_index], X_shuffled_train[test_index]       
        
#         X_train = pd.DataFrame(data=X_train, columns=X_column_values, dtype =object)
#         y_train = pd.DataFrame(data=y_train, columns=Y_column_values, dtype =object)        
    
#         train_new = pd.merge(X_train, y_train, on=['PassengerId'])
#         train_new = train_new.drop(["PassengerId"], axis=1)
        
#         X_test = pd.DataFrame(data=X_test, columns=X_column_values, dtype =object)
#         y_test = pd.DataFrame(data=y_test, columns=Y_column_values, dtype =object)
        
#         test_new = pd.merge(X_test, y_test, on=['PassengerId'])
#         test_new = test_new.drop(["PassengerId"], axis=1)        
        
#         # flag to create tree based on either gini or information gain
#         if impurity == 'gini':
#           predictions = decision_tree(train_new, test_new, 10, 5, 'gini')
#         else:
#           predictions = decision_tree(train_new, test_new, 10, 5, 'ig')
          
#         accuracy = calculate_accuracy(y_test.Survived,predictions)
#         accuracy_array.append(accuracy)
        
#     return max(accuracy_array)

In [42]:
# from sklearn.utils import shuffle
# folds = 5
# impurity = 'gini' #or 'ig'
# apply_cross_validation(folds, X_train, X_test, impurity)

ValueError: ignored