In [1]:
# basics
import numpy as np
import pandas as pd

# processing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# modelling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree

# misc
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# Pipeline at a glance:
load data -> cleaning/preprocessing -> feature eng -> format data for modelling -> fit model -> evaluate model -> generate submission file

## Step 1: Load data

In [2]:
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# for convenience
datasets = [df_train, df_test]

## Step 2: Basic cleaning and preprocessing

In [3]:
# Fill in missing values
mAge = pd.concat((df_train['Age'], df_test['Age']), axis=0).mean()
medFare = pd.concat((df_train['Fare'], df_test['Fare']), axis=0).median()
for df in datasets:
    df['Age'] = df['Age'].fillna(mAge)
    df['Fare'] = df['Fare'].fillna(medFare)
    df['Embarked'] = df['Embarked'].fillna('S')

## Step 3: Feature engineering

In [4]:
# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'
    
train_titles = []
test_titles = []

for name in df_train['Name']:
    title = get_title(name)
    train_titles.append(title)
    
for name in df_test['Name']:
    title = get_title(name)
    test_titles.append(title)


df_train['Title'] = train_titles
df_test['Title'] = test_titles
     
df_train.loc[(df_train['Title'] != 'Mr') & (df_train['Title'] != 'Mrs') & (df_train['Title'] != 'Miss') & (df_train['Title'] != 'Master'), 'Title'] = 'Misc'
df_test.loc[(df_test['Title'] != 'Mr') & (df_test['Title'] != 'Mrs') & (df_test['Title'] != 'Miss') & (df_test['Title'] != 'Master'), 'Title'] = 'Misc'

# See value counts of each title
pd.value_counts(df_train['Title'])

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64

In [5]:
#thing = (pd.concat((df_train, df_test), ignore_index=True)).groupby(['Title']).mean()
#print(thing)

grouped = (pd.concat((df_train, df_test), ignore_index=True)).groupby(['Title'])  

# opted for median value to iron out any weirdness title selection might've given
grouped.Age.median()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Title
Master     6.000000
Misc      43.000000
Miss      24.000000
Mr        29.881138
Mrs       33.000000
Name: Age, dtype: float64

In [6]:
# Apply these median values to all NaNs in dataset
# this works fine on train but not on test set for some reason.. -John
for df in datasets:
    df.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))

# Check that baby out
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,Mr
5,6,0,3,"Moran, Mr. James",male,29.881138,0,0,330877,8.4583,,Q,Mr
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.000000,0,0,17463,51.8625,E46,S,Mr
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.000000,3,1,349909,21.0750,,S,Master
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.000000,0,2,347742,11.1333,,S,Mrs
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.000000,1,0,237736,30.0708,,C,Mrs


In [7]:
df_test # why was mr. ylio imputed with an age of 4?

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,22.000000,0,0,330911,7.8292,,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,38.000000,1,0,363272,7.0000,,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,26.000000,0,0,240276,9.6875,,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,35.000000,0,0,315154,8.6625,,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,35.000000,1,1,3101298,12.2875,,S,Mrs
5,897,3,"Svensson, Mr. Johan Cervin",male,29.881138,0,0,7538,9.2250,,S,Mr
6,898,3,"Connolly, Miss. Kate",female,54.000000,0,0,330972,7.6292,,Q,Miss
7,899,2,"Caldwell, Mr. Albert Francis",male,2.000000,1,1,248738,29.0000,,S,Mr
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,27.000000,0,0,2657,7.2292,,C,Mrs
9,901,3,"Davies, Mr. John Samuel",male,14.000000,2,0,A/4 48871,24.1500,,S,Mr


In [8]:
# find fare bins based on frequency rather than value
garbage, fare_bins = pd.qcut(df_train['Fare'].append(df_test['Fare']), 4, retbins=True)
# loops through both train and test set for convenience
for df in datasets:
    # bin age
    df['Age_binned'] = pd.cut(df['Age'], [0,16,32,48,64,200], labels = [0,1,2,3,4], retbins=False)
    
    # bin fare
    df['Fare_binned'] = pd.cut(df['Fare'], fare_bins, labels = [0,1,2,3], include_lowest=True, retbins=False)
    
    # family features
    df['Family_size'] = df_train['SibSp'] + df_train['Parch']
    df['Is_Alone'] = (df['Family_size'] == 0).astype(int)

In [9]:
# drop unneeded rows
for df in datasets:
    df.drop(['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Fare_binned', 'Embarked', 'Is_Alone'], axis=1, inplace=True)
    
# need to keep passengerID for submission
# now dropping embarked and is_alone, since they're probably useless

## Step 4: Format data for modelling

In [10]:
# label encode sex since binary
le = LabelEncoder()
le.fit(df_train['Sex'])
df_train.loc[:,'Sex'] = le.transform(df_train['Sex'])
df_test.loc[:,'Sex'] = le.transform(df_test['Sex'])

# one hot encode title
df_train = pd.get_dummies(df_train, columns=['Title'])
df_test = pd.get_dummies(df_test, columns=['Title'])

In [11]:
# Split data into X and y, and select features to use
X = df_train.drop(['Survived', 'PassengerId'], axis=1)
y = df_train['Survived']
X_test = df_test.drop(['PassengerId'], axis=1)
X

Unnamed: 0,Pclass,Sex,Age_binned,Family_size,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,1,1,0,0,0,1,0
1,1,0,2,1,0,0,0,0,1
2,3,0,1,0,0,0,1,0,0
3,1,0,2,1,0,0,0,0,1
4,3,1,2,0,0,0,0,1,0
5,3,1,1,0,0,0,0,1,0
6,1,1,3,0,0,0,0,1,0
7,3,1,0,4,1,0,0,0,0
8,3,0,1,2,0,0,0,0,1
9,2,0,0,1,0,0,0,0,1


In [12]:
# standard scaler
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
X_test = std_scaler.transform(X_test)

# Split data X and y into train and val sets
X_train, X_val, y_train, y_val = train_test_split(
      X, y, test_size=0.2, random_state=42)

## Step 5: Train (or fit) the model(s)

In [13]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)

# fit rf
rf = RandomForestClassifier(random_state=12)
rf.fit(X_train, y_train)

# gaussian NB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# svm
svm = SVC(kernel='linear')
svm.fit(X_train,y_train)

# dtree
tre = tree.DecisionTreeClassifier()
tre = tre.fit(X_train, y_train)

In [14]:
# experimenting with tuning KNN

n_neighbors = [6,8,10,12,14,16,18,20]
algorithm = ['auto']
weights = ['uniform']
leaf_size = list(range(1,30,5))
n_jobs = [-1]
hyperparams = {'algorithm': algorithm, 'weights': weights, 'leaf_size': leaf_size, 
               'n_neighbors': n_neighbors, 'n_jobs': n_jobs}
gd_knn=GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=2, 
                cv=10, scoring = "roc_auc")
gd_knn.fit(X, y)
print(gd_knn.best_score_)
print(gd_knn.best_estimator_)

# fit the best one, takes like 3 minutes
gd_knn.best_estimator_.fit(X, y)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=6, wei

[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=1, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=1, n_jobs=-1, n_neigh

[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=8, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighb

[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=18, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=18, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=18, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=18, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=18, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=20, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=20, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=20, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=20, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=20, weights=uniform 
[CV]  algorithm=auto, leaf_size=6, n_jobs=-1, n_neighbors=20, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=6, n_jobs=-1, n_neigh

[CV]  algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=14, weights=uniform 
[CV]  algorithm=auto, leaf_size=11, n_jobs=-1, n_neighbors=14, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=11, n_jobs

[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=6, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neig

[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform 
[CV]  algorithm=auto, leaf_size=16, n_jobs=-1, n_neighbors=16, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=16, n_jobs

[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=8, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=10, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=

[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=18, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=18, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=18, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=18, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=18, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=20, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=20, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=20, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=20, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=20, weights=uniform 
[CV]  algorithm=auto, leaf_size=21, n_jobs=-1, n_neighbors=20, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=21, n_jobs

[CV]  algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform 
[CV]  algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=12, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=14, weights=uniform 
[CV]  algorithm=auto, leaf_size=26, n_jobs=-1, n_neighbors=14, weights=uniform, total=   0.1s
[CV] algorithm=auto, leaf_size=26, n_jobs

[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:  1.7min finished


KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=18, p=2,
           weights='uniform')

## Step 6: Evaluate the model

In [15]:
# define a scoring function
def acc(y: np.array, y_pred: np.array) -> float:
    return np.sum(y_pred==y)/len(y)

In [16]:
# convenience
models = {"LR":lr, "RF":rf, "GNB":gnb, "SVM":svm, "TREE":tre, "Tuned KNN":gd_knn.best_estimator_}

for name, clf in models.items():
    y_pred_trn = clf.predict(X_train)
    y_pred_val = clf.predict(X_val)
    print(name, 'train set accuracy', acc(y_train, y_pred_trn))
    print(name, 'val set accuracy', acc(y_val, y_pred_val))
    print('')

LR train set accuracy 0.8342696629213483
LR val set accuracy 0.8212290502793296

RF train set accuracy 0.8525280898876404
RF val set accuracy 0.8044692737430168

GNB train set accuracy 0.7963483146067416
GNB val set accuracy 0.7877094972067039

SVM train set accuracy 0.8314606741573034
SVM val set accuracy 0.8156424581005587

TREE train set accuracy 0.8553370786516854
TREE val set accuracy 0.8156424581005587

Tuned KNN train set accuracy 0.824438202247191
Tuned KNN val set accuracy 0.8324022346368715



## Step 7: Generate Submission

In [18]:
# take the best model (knn) and predict for test set
y_pred_test = gd_knn.best_estimator_.predict(X_test)

# Create a Kaggle submission
sub = pd.DataFrame({'PassengerId': df_test['PassengerId'],
                    'Survived': y_pred_test})

sub.to_csv('week_4_baseline.csv', index=False)