In [79]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier

In [93]:
# Load both Kaggle splits from the working directory (train first, then test)
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Preview the first rows of the training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [94]:
# Count, per column, how many rows hold a missing value in each split
def report_missing(frame):
    """Print every column name of `frame` followed by its number of null rows."""
    for name, n_missing in frame.isnull().sum().items():
        print(name, n_missing)


print('Training set - num of missing rows:')
report_missing(train)
print('=========')
print('Test set - num of missing rows:')
report_missing(test)

Training set - num of missing rows:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
Test set - num of missing rows:
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0


In [95]:
# Impute missing values.
# The filled column is assigned back instead of calling
# `frame.col.fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary under pandas Copy-on-Write and leaves the frame unchanged.

# Age: median of the same split
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())

# Embarked: constant 'S' (presumably the most frequent port -- TODO confirm)
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

# Fare: only the test split is imputed here, with its own median
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Feature Engineering

In [97]:
# feature engineering
# Applied to both splits so that train and test end up with the same columns;
# a model trained on these features could otherwise not score the test set.
for frame in (train, test):
    # number of characters in the passenger name
    frame['name_len'] = frame['Name'].str.len()
    # 1 when a cabin is recorded, 0 when the Cabin value is missing
    frame['has_cabin'] = frame['Cabin'].notna().astype(int)

    # features taken from Sina
    # https://www.kaggle.com/sinakhorami/titanic-best-working-classifier?scriptVersionId=566580
    # siblings/spouses + parents/children + the passenger themself
    frame['family_size'] = frame['SibSp'] + frame['Parch'] + 1
    # 1 when the passenger travels without any family member, else 0
    frame['alone'] = (frame['family_size'] == 1).astype(int)

In [114]:
# Spelling variants folded into the common title, and infrequent titles pooled
# into a single 'Rare.' bucket. Titles not listed here are left unchanged.
title_mapping = {
    'Mlle.': 'Miss.',
    'Ms.': 'Miss.',
    'Lady.': 'Miss.',
    'Mme.': 'Mrs.',
    'Sir.': 'Mr.',
}
title_mapping.update(
    dict.fromkeys(
        ['Dr.', 'Rev.', 'Major.', 'Col.', 'Don.', 'Countess.', 'Jonkheer.', 'Capt.'],
        'Rare.',
    )
)


def extract_title(names):
    """Return the normalised honorific found in each passenger name.

    The title is the first run of ASCII letters that is directly followed by a
    period (e.g. 'Mr.'); names without such a token yield NaN. The extracted
    token is then rewritten through `title_mapping`.
    """
    # raw string: `\.` is a regex escape, not a Python string escape
    raw_titles = names.str.extract(r'([a-zA-Z]+\.)', expand=False).str.strip()
    return raw_titles.replace(title_mapping)


# Build the column on both splits; the label-encoding cell reads `test.title` too.
train['title'] = extract_title(train['Name'])
test['title'] = extract_title(test['Name'])

In [96]:
# Turn the categorical columns (Sex, Embarked, title) into integer codes.
def encode_shared(column):
    """Label-encode `column` in `train` and `test` with one shared encoder.

    The encoder is fitted once on the values of every frame that has the
    column, then used to transform each of them. Fitting a fresh encoder per
    split (the previous approach) can assign different integers to the same
    category whenever the two splits do not contain the same set of values.

    A frame that lacks the column is skipped rather than raising, so the cell
    still runs if an engineered column was only built on one split.

    Returns the fitted LabelEncoder so codes can be mapped back later via
    `inverse_transform`.
    """
    encoder = preprocessing.LabelEncoder()
    frames = [frame for frame in (train, test) if column in frame.columns]
    encoder.fit(pd.concat([frame[column] for frame in frames]))
    for frame in frames:
        frame[column] = encoder.transform(frame[column])
    return encoder


le_sex = encode_shared('Sex')
le_embarked = encode_shared('Embarked')
le_title = encode_shared('title')

In [73]:
# Assemble the design matrix and the target vector used for cross-validation
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'name_len']
X = train[feature_columns].to_numpy()
y = train['Survived'].to_numpy()

In [75]:
# Baseline model: logistic regression with default settings,
# scored by accuracy over 5 cross-validation folds.
clf = linear_model.LogisticRegression()
scores = cross_val_score(estimator=clf, X=X, y=y, scoring='accuracy', cv=5)

In [78]:
# Average accuracy across the cross-validation folds
np.mean(scores)

0.7934818480970371

In [80]:
# Random-forest hyper-parameters, unpacked into the estimator below.
parameters = {'n_estimators': 50,
              # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
              # for classifiers it was an alias of 'sqrt', so results are unchanged.
              'max_features': 'sqrt',
              'criterion': 'gini',
              'max_depth': 20,
              'min_samples_split': 2,
              'min_samples_leaf': 20,
              'random_state': 0,   # fixed seed so the forest is reproducible
              'n_jobs': -1         # use all available cores
              }

# Same evaluation protocol as the logistic-regression baseline: 5-fold CV accuracy.
clf = RandomForestClassifier(**parameters)
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
scores.mean()

0.7980580448097676

In [82]:
# (rows, columns) of the training frame at the time this cell is run
train.shape

(891, 13)