# Preprocessing

In [1]:
#import packages

import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

#import data
train_data = pd.read_csv ('train.csv')
test_data = pd.read_csv ('test.csv')

#convert Sex values to 0 for Male and 1 for Female
train_data[['Sex']] = (train_data[['Sex']].apply(lambda x: pd.factorize(x)[0]))
test_data[['Sex']] = (train_data[['Sex']].apply(lambda x: pd.factorize(x)[0]))

#Convert below features to be categorical

train_data['Sex'] = pd.Categorical(train_data.Sex).codes

print (train_data)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25         0       3   
25          

In [2]:
train_data['Age'].shape

(891,)

In [3]:
# Check if Age is important 
train_data.groupby('Survived')['Age'].mean()

Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64

In [4]:
# Add a new feature FamSz for Family Size 

train_data['FamSz'] = train_data['Parch'] + train_data['SibSp'] + 1
test_data['FamSz'] = test_data['Parch'] + train_data['SibSp'] + 1
train_data.groupby('Survived')['FamSz'].value_counts()

Survived  FamSz
0         1        374
          2         72
          3         43
          6         19
          5         12
          4          8
          7          8
          11         7
          8          6
1         1        163
          2         89
          3         59
          4         21
          7          4
          5          3
          6          3
Name: FamSz, dtype: int64

In [5]:
train_data['CatFare']= pd.qcut(train_data.Fare, q=4, labels=False)

In [6]:
#Drop columns we won't be using

train_data = train_data.drop(['Name', 'Ticket','Cabin', 'Embarked', 'Age'], axis = 1)
test_data = test_data.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis = 1)



print(train_data)

     PassengerId  Survived  Pclass  Sex  SibSp  Parch      Fare  FamSz  \
0              1         0       3    0      1      0    7.2500      2   
1              2         1       1    1      1      0   71.2833      2   
2              3         1       3    1      0      0    7.9250      1   
3              4         1       1    1      1      0   53.1000      2   
4              5         0       3    0      0      0    8.0500      1   
5              6         0       3    0      0      0    8.4583      1   
6              7         0       1    0      0      0   51.8625      1   
7              8         0       3    0      3      1   21.0750      5   
8              9         1       3    1      0      2   11.1333      3   
9             10         1       2    1      1      0   30.0708      2   
10            11         1       3    1      1      1   16.7000      3   
11            12         1       1    1      0      0   26.5500      1   
12            13         0       3    

In [7]:
#Find which of the remaining features show nulls

print(train_data.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Sex            0
SibSp          0
Parch          0
Fare           0
FamSz          0
CatFare        0
dtype: int64


In [8]:
#Repalace all NaN with 0 - assuming Passengers with missing ages are babies

#train_data['Age'] = train_data['Age'].fillna(0)
#test_data['Age'] = test_data['Age'].fillna(0)

#print(train_data)

In [9]:
#Create a new feature 'Status' from the 'Names' column

train_data['Status'] = np.nan
train_data['Status'][train_data.Name.str.contains('Mr.')] = 'Mr'
train_data['Status'][train_data.Name.str.contains('Mrs.')] = 'Mrs'
train_data['Status'][train_data.Name.str.contains('Miss.')] = 'Miss'
train_data['Status'][train_data.Name.str.contains('Master.')] = 'Master'
train_data['Status'][train_data.Name.str.contains('Rev.')] = 'Rev'

In [10]:
#train_data['Survived'] = pd.Categorical(train_data.Survived).codes

# Feature Selection

In [11]:
# Select features and target for our classifier

features = ['Pclass', 'Sex', 'Fare']
target = 'Survived'

In [12]:
#View first 5 rows of features data
train_data[features].head()

#View first 5 rows of target data

train_data[target].head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [13]:
#Add a cross validation set
[train_data_tr, train_data_cv] = train_test_split(train_data, test_size = .25, random_state = 42)

Xtr = train_data_tr[features]
Ytr = train_data_tr[target]
Xtrcv = train_data_cv[features]
Ytrcv = train_data_cv[target]

In [14]:
#Create the decision tree with default parameters

clf = DecisionTreeClassifier()

#Fit Decision Tree to our training features target values
clf.fit(Xtr,Ytr)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [15]:
#Check Feature Importance

clf.feature_importances_

array([ 0.12868961,  0.39577367,  0.47553671])

# Fit Models

In [16]:
#Make predictions using the train data features

predictions = clf.predict(Xtrcv)

In [17]:
predictions

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0])

In [18]:
#Check Accuracy of our Predictions on the Train Data

accuracy = accuracy_score(Ytrcv, predictions)
accuracy

0.820627802690583

In [19]:
#Do CV with 100 folds

scores = cross_val_score(clf, train_data[features], train_data[target], cv = 100, scoring = 'accuracy')
scores.mean()

0.82513888888888887

In [20]:
#Now let's try with Logistic Regression

lr = LogisticRegression()
lr.fit(Xtr,Ytr)
lr_predictions = lr.predict(Xtrcv)

In [21]:
#Let's check accuracy for the Logistic Regression model as well

lr_accuracy = accuracy_score(Ytrcv, lr_predictions)
lr_accuracy

0.77130044843049328

In [22]:
#Do CV  with 100 folds for Logistic Regression

scores = cross_val_score(lr, train_data[features], train_data[target], cv = 100, scoring = 'accuracy')
scores.mean()

0.7810555555555555