In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train.shape

(891, 12)

In [6]:
test.shape

(418, 11)

### Mix the training and test sets together, then extracting the title of passengers into new column "Title"

In [7]:
train_test_data = [train, test]
for t in train_test_data:
    t['Title'] = t.Name.str.extract(' ([A-Za-z]+)\.')

  This is separate from the ipykernel package so we can avoid doing imports until


### Now we are replacing all titles with some other less-in-count ones

In [8]:
for t in train_test_data:
    t['Title'] = t['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    t['Title'] = t['Title'].replace('Mlle', 'Miss')
    t['Title'] = t['Title'].replace('Ms', 'Miss')
    t['Title'] = t['Title'].replace('Mme', 'Mrs')

### Now we are converting the titles into numbers and the empty titles are set to zero

In [9]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for t in train_test_data:
    t['Title'] = t['Title'].map(title_mapping)
    t['Title'] = t['Title'].fillna(0)

In [10]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


### Now we are filling the empty "Sex" cells with male, as they are the majority

In [11]:
for d in train_test_data:
    d['Sex'] = d['Sex'].fillna('male')

### Now we are converting the Sex into numbers, mapping females into 1 and mapping males into 0

In [12]:
for d in train_test_data:
    d['Sex'] = d['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [13]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,1


### Now we are filling the empty "Embarked" cells with S, as they are the majority

In [14]:
for d in train_test_data:
    d['Embarked'] = d['Embarked'].fillna('S')

### Now we are converting the "Embarked" into numbers, mapping S to 0, C to 1, and Q to 2

In [15]:
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [16]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1


### Now we are processing the age, getting the average and standard deviation and filling in the empty age cells with them, then creating age bands to take decision upon them

In [17]:
for dataset in train_test_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Creating new column AgeBand

In [18]:
train['AgeBand'] = pd.qcut(train['Age'], 4)
print (train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean())

          AgeBand  Survived
0  (-0.001, 21.0]  0.396694
1    (21.0, 28.0]  0.336538
2    (28.0, 37.0]  0.424658
3    (37.0, 80.0]  0.373874


### Mapping Age Bands to numbers

In [19]:
for dataset in train_test_data:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

In [20]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,AgeBand
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,7.25,,0,1,"(21.0, 28.0]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,71.2833,C85,1,3,"(37.0, 80.0]"
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,7.925,,0,2,"(21.0, 28.0]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,53.1,C123,0,3,"(28.0, 37.0]"
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,8.05,,0,1,"(28.0, 37.0]"


### Similarily, we do the same to the Fare column, but filling the empty with the median instead of the mean

In [21]:
for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

### Creating a new Column "FareBand"

In [22]:
train['FareBand'] = pd.qcut(train['Fare'], 4)
print (train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean())

          FareBand  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081


### Mapping Fare bands into numbers

In [23]:
for dataset in train_test_data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

In [24]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,AgeBand,FareBand
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,0,,0,1,"(21.0, 28.0]","(-0.001, 7.91]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,3,C85,1,3,"(37.0, 80.0]","(31.0, 512.329]"
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,1,,0,2,"(21.0, 28.0]","(7.91, 14.454]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,3,C123,0,3,"(28.0, 37.0]","(31.0, 512.329]"
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,1,,0,1,"(28.0, 37.0]","(7.91, 14.454]"


### Final Step in pre-processing: we are going to approximate the siblings and parch with decision whether the passenger was alone or not, if he wasn't alone then we use the column FamilySize to indicate his family size

In [25]:
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['SibSp'] +  dataset['Parch'] + 1

### Mapping these columns into numbers, if he is alone then isAlone = 1, else isAlone =0, and so familySize feature will be equal to only 1. Then if he isn't alone i.e: isAlone=0, then we will put the number of siblings + parch + 1 in the FamilySize as described in the previous cell code

In [26]:
for dataset in train_test_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

In [27]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,AgeBand,FareBand,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,0,,0,1,"(21.0, 28.0]","(-0.001, 7.91]",2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,3,C85,1,3,"(37.0, 80.0]","(31.0, 512.329]",2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,1,,0,2,"(21.0, 28.0]","(7.91, 14.454]",1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,3,C123,0,3,"(28.0, 37.0]","(31.0, 512.329]",2,0
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,1,,0,1,"(28.0, 37.0]","(7.91, 14.454]",1,1


### Finally, we now have a lot of features that we can drop as they won't affect our decision such as : name, ticket, cabin and also columns having sibsp, parch that we used already to create new features::: so we are going to drop these features/columns from the training set

In [28]:
dropping = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
train = train.drop(dropping, axis=1)
test = test.drop(dropping, axis=1)
train = train.drop(['PassengerId', 'AgeBand', 'FareBand'], axis=1)

In [29]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone
0,0,3,0,1,0,0,1,2,0
1,1,1,1,2,3,1,3,2,0
2,1,3,1,1,1,0,2,1,1
3,1,1,1,2,3,0,3,2,0
4,0,3,0,2,1,0,1,1,1


In [30]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone
0,892,3,0,2,0,2,1,1,1
1,893,3,1,2,0,0,3,2,0
2,894,2,0,3,1,2,1,1,1
3,895,3,0,1,1,0,1,1,1
4,896,3,1,1,1,0,3,3,0


### Now, it is time to train our model: we split our training data into two sets: X-train which is all data without the labels,  and Y_train which is the labels to the features found in X_train.

In [31]:
X_train = train.drop('Survived', axis=1)
Y_train = train['Survived']
X_test = test.drop("PassengerId", axis=1)

In [32]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone
0,3,0,1,0,0,1,2,0
1,1,1,2,3,1,3,2,0
2,3,1,1,1,0,2,1,1
3,1,1,2,3,0,3,2,0
4,3,0,2,1,0,1,1,1


In [33]:
Y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [34]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone
0,3,0,2,0,2,1,1,1
1,3,1,2,0,0,3,2,0
2,2,0,3,1,2,1,1,1
3,3,0,1,1,0,1,1,1
4,3,1,1,1,0,3,3,0


### Passing our training data and its labels to the Gaussian Naive Bayes model, then predicting the output of the test sets in y_pred, and getting the accuracy on the training data

In [35]:
model = GaussianNB()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
accuracy = model.score(X_train,Y_train)*100
print ("Accuracy = ",round(accuracy,2),"%")

Accuracy =  80.25 %


### Finally, creating a submission file containing of only the passengerID and the predicted survived column for each , to submit it to kaggle to get the accuracy on the test set

In [36]:
submission = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_pred})
submission.to_csv('submission2.csv', index=False)

# Submitting to kaggle Step

## Username: https://www.kaggle.com/robsgeorge
## Display Name : Robeir George