# Used Titanic Dataset (from Kaggle)

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

#reading the csv files into dataframes
train = pd.read_csv('E:/Kaggle/train.csv')
test = pd.read_csv('E:/Kaggle/test.csv')

print(type(train))
print(type(test))

train.head()

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## No. of passengers in Training data and No. of passengers in Test data

In [2]:
print('Training dataset length : ', len(train))
print('Test dataset length : ',len(test))

Training dataset length :  891
Test dataset length :  418


## Setting Male = 1, Female = 0 ('Sex' field)

In [3]:
train['Sex'] = train['Sex'].map( {'female': 2, 'male': 1} ).astype(int)
# OR
# train['Sex'] = train['Sex'].apply(lambda x: 1 if x == 'male' else 2)

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


## Dropping some not needed fields
*Ticket*, *Cabin*, *Fare*

In [5]:
train = train.drop(['Ticket', 'Cabin', 'Fare'], axis=1)

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,C
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,S


## Adding a new field 'FamilySize' by combining SibSp and Parch 

In [7]:
train['FamilySize'] = train['SibSp'] + train['Parch']
train = train.drop(['SibSp', 'Parch'], axis=1)

In [8]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,C,1
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,S,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,S,0


In [9]:
# Getting the number of NaN values (null values) in all the fields
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
Embarked         2
FamilySize       0
dtype: int64

In [10]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,C,1
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,S,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,S,0
5,6,0,3,"Moran, Mr. James",1,,Q,0
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,S,0
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,S,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,S,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,C,1


In [11]:
# Adding mean Age in place of null values in the 'Age' field
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [12]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,C,1
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,S,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,S,0
5,6,0,3,"Moran, Mr. James",1,29.699118,Q,0
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,S,0
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,S,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,S,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,C,1


In [13]:
train['Age'] = train['Age'].astype(int)

In [14]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38,C,1
2,3,1,3,"Heikkinen, Miss. Laina",2,26,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35,S,1
4,5,0,3,"Allen, Mr. William Henry",1,35,S,0
5,6,0,3,"Moran, Mr. James",1,29,Q,0
6,7,0,1,"McCarthy, Mr. Timothy J",1,54,S,0
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2,S,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27,S,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14,C,1


In [15]:
train['Embarked'].isnull().sum()

2

In [16]:
# New Feature 1
# The RMS Titanic started from Southampton (S) to New York in its maiden voyage
# so let's assume that the 2 NaN values are people who got on the Titanic at Southampton (S)
train['Embarked'] = train['Embarked'].fillna('S')

In [17]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38,C,1
2,3,1,3,"Heikkinen, Miss. Laina",2,26,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35,S,1
4,5,0,3,"Allen, Mr. William Henry",1,35,S,0


In [18]:
train['Embarked'].isnull().sum()

0

In [19]:
embarkedDictionary = {
    'S' : 1,
    'C' : 2,
    'Q' : 3    
}
train['Embarked'] = train['Embarked'].map(embarkedDictionary)

In [20]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38,2,1
2,3,1,3,"Heikkinen, Miss. Laina",2,26,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35,1,1
4,5,0,3,"Allen, Mr. William Henry",1,35,1,0


In [21]:
# New Feature 2
train['PclassAndSex'] = train['Pclass']*train['Sex']

In [22]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize,PclassAndSex
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,1,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38,2,1,2
2,3,1,3,"Heikkinen, Miss. Laina",2,26,1,0,6
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35,1,1,2
4,5,0,3,"Allen, Mr. William Henry",1,35,1,0,3


In [23]:
train['PclassAndSex'].isnull().sum()

0

In [24]:
# New Feature 3
train['PclassAndAge'] = train['Pclass']*train['Age']

In [25]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize,PclassAndSex,PclassAndAge
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,1,3,66
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38,2,1,2,38
2,3,1,3,"Heikkinen, Miss. Laina",2,26,1,0,6,78
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35,1,1,2,35
4,5,0,3,"Allen, Mr. William Henry",1,35,1,0,3,105


In [26]:
train['PclassAndAge'].isnull().sum()

0

In [27]:
# New Feature 4
train['SexAndAge'] = train['Sex']*train['Age']

In [28]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,FamilySize,PclassAndSex,PclassAndAge,SexAndAge
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,1,3,66,22
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38,2,1,2,38,76
2,3,1,3,"Heikkinen, Miss. Laina",2,26,1,0,6,78,52
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35,1,1,2,35,70
4,5,0,3,"Allen, Mr. William Henry",1,35,1,0,3,105,35


In [29]:
train = train.drop(['Name'], axis=1)

In [30]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,FamilySize,PclassAndSex,PclassAndAge,SexAndAge
0,1,0,3,1,22,1,1,3,66,22
1,2,1,1,2,38,2,1,2,38,76
2,3,1,3,2,26,1,0,6,78,52
3,4,1,1,2,35,1,1,2,35,70
4,5,0,3,1,35,1,0,3,105,35


In [31]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train.loc[:, ('Pclass', 'Sex', 'Age', 'Embarked', 'FamilySize', 'PclassAndSex', 'PclassAndAge', 'SexAndAge')] ,train.loc[:, ('Survived')], test_size=0.30, random_state=6)


In [32]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print('RandomForestClassifier accuracy on the training dataset : ', accuracy)


RandomForestClassifier accuracy on the training dataset :  0.8395522388059702
