In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC

## Read data

In [2]:
passengers_train = pd.read_csv('train.csv', index_col='PassengerId')
passengers_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
passengers_test = pd.read_csv('test.csv', index_col='PassengerId')
passengers = passengers_train.append(passengers_test).drop('Survived', axis=1)
passengers.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450


In [4]:
passengers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 10 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Ticket      1309 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


## Data cleaning

### Extract Title from Name

In [5]:
def get_title(name):
    title_dict = {
        "Capt":       "Officer",
        "Col":        "Officer",
        "Major":      "Officer",
        "Jonkheer":   "Royalty",
        "Don":        "Royalty",
        "Sir" :       "Royalty",
        "Dr":         "Officer",
        "Rev":        "Officer",
        "the Countess":"Royalty",
        "Dona":       "Royalty",
        "Mme":        "Mrs",
        "Mlle":       "Miss",
        "Ms":         "Mrs",
        "Mr" :        "Mr",
        "Mrs" :       "Mrs",
        "Miss" :      "Miss",
        "Master" :    "Master",
        "Lady" :      "Royalty"
    }
    title = name.split(',')[1].split('.')[0].strip()
    return title_dict[title]

titles = passengers.Name.map(get_title)
passengers['Title'] = titles
passengers.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,Mr
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,Mrs
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,Miss
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,Mrs
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,Mr


### Fill in numerical features

In [6]:
ages_and_fares = passengers.groupby(['Sex', 'Pclass', 'Title'])[['Age', 'Fare']]
ages_and_fares = ages_and_fares.apply(lambda chunk: chunk.fillna(chunk.median())).reset_index(['Sex', 'Pclass', 'Title'], drop=True)
ages_and_fares.head()

Unnamed: 0_level_0,Age,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
12,58.0,26.55
62,38.0,80.0
89,23.0,263.0
137,19.0,26.2833
178,50.0,28.7125


In [7]:
unchanged = passengers[['Parch', 'SibSp']]
unchanged.head()

Unnamed: 0_level_0,Parch,SibSp
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,0,1
3,0,0
4,0,1
5,0,0


### Transform categorical features
We account for autocorrelation by intentionally leaving out 1 possible category from each categorical feature.

#### Pclass

In [8]:
pclasses = pd.get_dummies(passengers.Pclass, prefix="Pclass")
pclasses.drop('Pclass_2', axis=1, inplace=True)
pclasses.head()

Unnamed: 0_level_0,Pclass_1,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,1,0
3,0,1
4,1,0
5,0,1


#### Cabin

In [9]:
passengers.Cabin = passengers.Cabin.map(lambda x: x[0], na_action='ignore')
# passengers.fillna({'Cabin': 'U'}, inplace=True);

In [10]:
cabins = pd.get_dummies(passengers.Cabin, prefix="Cabin")
cabins.head()

Unnamed: 0_level_0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0


#### Embarked

In [11]:
embarked = pd.get_dummies(passengers.Embarked, prefix="Embarked")
embarked.drop('Embarked_Q', axis=1, inplace=True)
embarked.head()

Unnamed: 0_level_0,Embarked_C,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,1,0
3,0,1
4,0,1
5,0,1


### Sex

In [12]:
sexes = passengers.Sex.replace({'male' : 1, 'female': 0})
sexes.head()

PassengerId
1    1
2    0
3    0
4    0
5    1
Name: Sex, dtype: int64

### Title

In [13]:
titles = pd.get_dummies(passengers.Title, prefix="Title")
titles.drop('Title_Master', axis=1, inplace=True)
titles.head()

Unnamed: 0_level_0,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,1,0,0,0
2,0,0,1,0,0
3,1,0,0,0,0
4,0,0,1,0,0
5,0,1,0,0,0


### Assemble all the features

In [14]:
Xall = pd.concat([ages_and_fares, unchanged, pclasses, cabins, embarked, sexes, titles], axis=1)
Xall.head()

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Pclass_1,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,...,Cabin_G,Cabin_T,Embarked_C,Embarked_S,Sex,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,7.25,0,1,0,1,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
2,38.0,71.2833,0,1,1,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
3,26.0,7.925,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,35.0,53.1,0,1,1,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
5,35.0,8.05,0,0,0,1,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0


In [15]:
Xtrain = Xall.loc[passengers_train.index]
ytrain = passengers_train.Survived

In [16]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 22 columns):
Age              891 non-null float64
Fare             891 non-null float64
Parch            891 non-null int64
SibSp            891 non-null int64
Pclass_1         891 non-null uint8
Pclass_3         891 non-null uint8
Cabin_A          891 non-null uint8
Cabin_B          891 non-null uint8
Cabin_C          891 non-null uint8
Cabin_D          891 non-null uint8
Cabin_E          891 non-null uint8
Cabin_F          891 non-null uint8
Cabin_G          891 non-null uint8
Cabin_T          891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_S       891 non-null uint8
Sex              891 non-null int64
Title_Miss       891 non-null uint8
Title_Mr         891 non-null uint8
Title_Mrs        891 non-null uint8
Title_Officer    891 non-null uint8
Title_Royalty    891 non-null uint8
dtypes: float64(2), int64(3), uint8(17)
memory usage: 56.6 KB


### Standardize the features

In [17]:
scaler = MinMaxScaler().fit(Xtrain)
Xtrain = pd.DataFrame(scaler.transform(Xtrain), index=Xtrain.index, columns=Xtrain.columns)

In [18]:
Xtrain.head()

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Pclass_1,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,...,Cabin_G,Cabin_T,Embarked_C,Embarked_S,Sex,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.271174,0.014151,0.0,0.125,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.472229,0.139136,0.0,0.125,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.321438,0.015469,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.434531,0.103644,0.0,0.125,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.434531,0.015713,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


### Logistic regression

In [19]:
# model = LogisticRegression()
# params = {'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10]}

### K-nearest neighbors

In [20]:
# model = KNeighborsClassifier()
# params = {'n_neighbors': np.arange(1, 20, 2)}

### Support Vector Machine

In [21]:
model = SVC()
params = {'C': np.geomspace(1e-3, 10, 10),
          'gamma': np.geomspace(1e-3, 10, 10)}

### Fit parameters

In [22]:
gridsearch = GridSearchCV(model, params, cv=5).fit(Xtrain, ytrain)
print(gridsearch.best_params_, gridsearch.best_score_)
model = gridsearch.best_estimator_
model.fit(Xtrain, ytrain)

{'C': 10.0, 'gamma': 0.46415888336127775} 0.83164983165


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.46415888336127775,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

### Predict on test set

In [23]:
Xtest = Xall.loc[passengers_test.index]
Xtest = pd.DataFrame(scaler.transform(Xtest), index=Xtest.index, columns=Xtest.columns)
Xtest.head()

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Pclass_1,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,...,Cabin_G,Cabin_T,Embarked_C,Embarked_S,Sex,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,0.428248,0.015282,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
893,0.585323,0.013663,0.0,0.125,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
894,0.773813,0.018909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
895,0.334004,0.016908,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
896,0.271174,0.023984,0.166667,0.125,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
prediction = pd.DataFrame(model.predict(Xtest), index=passengers_test.index, columns=['Survived'])
prediction.head().to_clipboard()

In [25]:
prediction.to_csv('submission.csv')

## Things to add
- Try other models
- PCA?