In [1]:
import pandas as pd

In [2]:
# Read the train and test splits from the local data/ directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Number of non-null `Survived` entries per `Sex` value (group sizes, not
# survival rates).
train_df['Survived'].groupby(train_df['Sex']).count()

Sex
female    314
male      577
Name: Survived, dtype: int64

In [5]:
# Number of non-null `Survived` entries per `Embarked` value (group sizes,
# not survival rates).
train_df['Survived'].groupby(train_df['Embarked']).count()

Embarked
C    168
Q     77
S    644
Name: Survived, dtype: int64

In [6]:
# Mean of the non-missing ages in the training data.
train_df['Age'].mean()

29.69911764705882

In [7]:
# Impute missing values with constants taken from the training data explored
# in the cells above: the most frequent `Sex` ('male'), the mean `Age` rounded
# to 30, and the most frequent `Embarked` port ('S'). The same constants are
# applied to the test frame so both splits are treated identically.
#
# The fill is done at DataFrame level and assigned back. Calling
# `df[col].fillna(..., inplace=True)` mutates a temporary Series: it emits a
# chained-assignment warning on pandas 2.x and leaves the frame untouched
# once Copy-on-Write is enabled.
fill_values = {'Sex': 'male', 'Age': 30, 'Embarked': 'S'}

train_df = train_df.fillna(value=fill_values)
test_df = test_df.fillna(value=fill_values)

In [8]:
# Binary-encode the two categorical features used by the models:
#   Sex      -> 1 for 'male', 0 for anything else
#   Embarked -> 1 for 'S',    0 for anything else
# The training frame is encoded first, then the test frame, with the same rule.
train_df['Sex'] = train_df['Sex'].eq('male').astype('int64')
train_df['Embarked'] = train_df['Embarked'].eq('S').astype('int64')

test_df['Sex'] = test_df['Sex'].eq('male').astype('int64')
test_df['Embarked'] = test_df['Embarked'].eq('S').astype('int64')

In [9]:
# Persist the imputed and encoded frames; the row index is not written out.
train_df.to_csv(path_or_buf='data/clean_train.csv', index=False)
test_df.to_csv(path_or_buf='data/clean_test.csv', index=False)

In [10]:
# Feature matrix: every training column except the identifier, the target and
# the free-text columns listed here. `drop` returns a new frame, so
# `train_df` itself is left unchanged.
non_feature_columns = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
X = train_df.drop(labels=non_feature_columns, axis='columns')

# Target vector, copied so it is independent of `train_df`.
y = train_df['Survived'].copy()

In [11]:
# Collects one ROC AUC score per model, keyed by a human-readable model label.
results_dict = dict()

In [12]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score

# Shared 5-fold stratified splitter reused by every model below. Shuffling is
# off (the default), so the folds are the same for every model and every run.
validation = StratifiedKFold(n_splits=5, shuffle=False)

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters.
lr = LogisticRegression()

# Out-of-fold class probabilities: each row is predicted by a model that did
# not see that row during fitting.
lr_probabilities = cross_val_predict(
    estimator=lr, X=X, y=y, cv=validation, method='predict_proba'
)
# Keep only the second probability column (the positive class, assuming the
# labels are 0/1 as shown in the data preview).
y_pred = list(lr_probabilities[:, 1])

results_dict['LogisticRegression'] = roc_auc_score(y, y_pred)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. The seed is fixed because the
# forest is built from random bootstrap samples and feature subsets; without
# it the reported AUC changes on every run and cannot be compared reliably
# against the other models.
rf = RandomForestClassifier(random_state=42)

# Out-of-fold class probabilities from the shared stratified splitter.
rf_probabilities = cross_val_predict(rf, X, y, cv=validation, method='predict_proba')
# Second probability column = positive class (assuming 0/1 labels as shown in
# the data preview); sliced directly from the array instead of row by row.
y_pred = rf_probabilities[:, 1]

results_dict['RandomForestClassifier'] = roc_auc_score(y, y_pred)

In [15]:
from sklearn.svm import SVC

# Support vector classifier with probability estimates enabled.
# `probability=True` fits an internal, randomly shuffled cross-validation to
# calibrate the probabilities, so the seed is fixed to make the resulting
# scores reproducible across runs.
svc = SVC(probability=True, random_state=42)

# Out-of-fold class probabilities from the shared stratified splitter.
svc_probabilities = cross_val_predict(svc, X, y, cv=validation, method='predict_proba')
# Second probability column = positive class (assuming 0/1 labels as shown in
# the data preview); sliced directly from the array instead of row by row.
y_pred = svc_probabilities[:, 1]

results_dict['SVC'] = roc_auc_score(y, y_pred)

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with K=2.
knc_2 = KNeighborsClassifier(n_neighbors=2)

# Out-of-fold class probabilities from the shared stratified splitter.
knc2_probabilities = cross_val_predict(
    estimator=knc_2, X=X, y=y, cv=validation, method='predict_proba'
)
# Second probability column, used as the ranking score for ROC AUC.
y_pred = list(knc2_probabilities[:, 1])

results_dict['KNeighborsClassifier K=2'] = roc_auc_score(y, y_pred)

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with K=4.
knc_4 = KNeighborsClassifier(n_neighbors=4)

# Out-of-fold class probabilities from the shared stratified splitter.
knc4_probabilities = cross_val_predict(
    estimator=knc_4, X=X, y=y, cv=validation, method='predict_proba'
)
# Second probability column, used as the ranking score for ROC AUC.
y_pred = list(knc4_probabilities[:, 1])

results_dict['KNeighborsClassifier K=4'] = roc_auc_score(y, y_pred)

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with K=8.
knc_8 = KNeighborsClassifier(n_neighbors=8)

# Out-of-fold class probabilities from the shared stratified splitter.
knc8_probabilities = cross_val_predict(
    estimator=knc_8, X=X, y=y, cv=validation, method='predict_proba'
)
# Second probability column, used as the ranking score for ROC AUC.
y_pred = list(knc8_probabilities[:, 1])

results_dict['KNeighborsClassifier K=8'] = roc_auc_score(y, y_pred)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with K=16.
knc_16 = KNeighborsClassifier(n_neighbors=16)

# Out-of-fold class probabilities from the shared stratified splitter.
knc16_probabilities = cross_val_predict(
    estimator=knc_16, X=X, y=y, cv=validation, method='predict_proba'
)
# Second probability column, used as the ranking score for ROC AUC.
y_pred = list(knc16_probabilities[:, 1])

results_dict['KNeighborsClassifier K=16'] = roc_auc_score(y, y_pred)

In [20]:
results_dict

{'KNeighborsClassifier K=16': 0.7287998380894555,
 'KNeighborsClassifier K=2': 0.6918480171284314,
 'KNeighborsClassifier K=4': 0.7247600634859767,
 'KNeighborsClassifier K=8': 0.7435209152206562,
 'LogisticRegression': 0.8462062868160078,
 'RandomForestClassifier': 0.840976150150726,
 'SVC': 0.7389112581088422}