In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
cd gdrive/MyDrive/MachineLearning/HandsOnMachineLearning/titanic

/content/gdrive/MyDrive/MachineLearning/HandsOnMachineLearning/titanic


In [3]:
# np.set_printoptions(threshold=np.inf)
np.set_printoptions(threshold=1000)

NameError: name 'np' is not defined

In [75]:
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split


original_set = pd.read_csv('train.csv')
X = original_set.drop('Survived', axis=1)
y = original_set['Survived'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.2,
                                                    random_state=42)

# Transformation

In [86]:
cat_attribs = ['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']
num_attribs = ['Age', 'SibSp', 'Parch', 'Fare']
onehot_attribs = ['Pclass', 'Sex', 'Embarked', 'Deck']
scale_attribs = ['Age', 'SibSp', 'Parch', 'Fare_per_person']


class AttribsAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X, columns=(cat_attribs + num_attribs + ['PassengerId', 'Name']))
        X['Deck'] = X['Cabin'].str.slice(0, 1)
        count_per_ticket = X.groupby('Ticket').size()
        X['Fare_per_person'] = X['Fare'] / X['Ticket'].map(count_per_ticket)
        X.drop(['PassengerId', 'Name', 'Cabin', 'Ticket', 'Fare'], axis=1, inplace=True)
        print(X.columns)
        return X


impute_ct = ColumnTransformer([
    ('impute_cat', SimpleImputer(strategy='most_frequent'), cat_attribs),
    ('impute_num', SimpleImputer(strategy='mean'), num_attribs),
], remainder='passthrough')

add_cols_pipeline = Pipeline([
    ('attribs_adder', AttribsAdder()),
])

encode_ct = ColumnTransformer([
    ('onehot', OneHotEncoder(), onehot_attribs),
    ('std', StandardScaler(), scale_attribs),
], remainder='passthrough')

transform_pipeline = Pipeline([
    ('impute', impute_ct),
    ('add_cols', add_cols_pipeline),
    ('encode', encode_ct),
])
X_train_prepared = transform_pipeline.fit_transform(X_train)
print(X_train_prepared)

Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
[[ 0.          0.          1.         ... -0.46508428 -0.46618317
  -0.39100516]
 [ 0.          1.          0.         ... -0.46508428 -0.46618317
  -0.74279934]
 [ 1.          0.          0.         ... -0.46508428 -0.46618317
   7.54319351]
 ...
 [ 0.          0.          1.         ...  0.47833454  3.11571343
  -0.42172436]
 [ 1.          0.          0.         ... -0.46508428 -0.46618317
   0.69561655]
 [ 1.          0.          0.         ... -0.46508428 -0.46618317
   0.2491472 ]]


# Visualization

In [87]:
onehot_cats = transform_pipeline[2].transformers_[0][1].categories_
onehot_cats = np.concatenate(onehot_cats)
cats = np.concatenate([onehot_cats, scale_attribs])

X_train_prepared = pd.DataFrame(X_train_prepared, columns=(cats))
X_train_prepared.columns = X_train_prepared.columns.astype(str)
train_set_prepared = pd.concat([X_train_prepared, y_train], axis=1)

print(train_set_prepared.corr()['Survived'])

1                  0.025290
2                  0.064680
3                 -0.075414
female             0.079456
male              -0.079456
C                  0.043932
Q                  0.011275
S                 -0.045939
A                 -0.015642
B                  0.041643
C                  0.112061
D                 -0.046294
E                 -0.039981
F                 -0.071295
G                 -0.010684
T                 -0.033160
Age               -0.014166
SibSp             -0.060291
Parch              0.021399
Fare_per_person    0.092154
Survived           1.000000
Name: Survived, dtype: float64


# Testing models

In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


forest_clf = RandomForestClassifier(random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_prepared, y_train,
                                scoring='accuracy', cv=3)
print(forest_scores.mean())

0.7879126334077934


In [79]:
from sklearn.neighbors import KNeighborsClassifier


knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, X_train_prepared, y_train,
                             scoring='accuracy', cv=3)
print(knn_scores.mean())

0.8160302095521753


In [97]:
from sklearn.svm import SVC


svc = SVC(random_state=42)
svc_scores = cross_val_score(svc, X_train_prepared, y_train,
                             scoring='accuracy', cv=3)
print(svc_scores.mean())

0.8174248602394544


# Tuning models

## KNN

In [49]:
from sklearn.model_selection import GridSearchCV


param_grid = [{
    'weights': ['uniform', 'distance'],
    'n_neighbors': [4, 6, 8, 10, 12, 13, 15, 16, 17],
    'n_jobs': [-1]
}]

grid_search = GridSearchCV(knn_clf, param_grid, cv=15, scoring='accuracy',
                           verbose=2, return_train_score=True)
grid_search.fit(X_train_prepared, y_train)

Fitting 15 folds for each of 18 candidates, totalling 270 fits
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.0s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.2s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.1s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.0s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.1s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.0s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.1s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.1s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.2s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.0s
[CV] END ..........n_jobs=-1, n_neighbors=4, weights=uniform; total time=   0.0s
[CV] END ..........n_jobs=-1, n_neighbors=4, w

In [50]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'n_jobs': -1, 'n_neighbors': 10, 'weights': 'uniform'}
0.8135047281323876


In [51]:
from sklearn.metrics import accuracy_score


full_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('predict', KNeighborsClassifier(**grid_search.best_params_))
])
full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))

Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
0.7541899441340782


## Random forest

In [None]:
from sklearn.model_selection import GridSearchCV


param_grid = [{
    'n_estimators': [111, 112, 113, 115],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'n_jobs': [-1]
}]

forest_search = GridSearchCV(forest_clf, param_grid, cv=15, scoring='accuracy',
                             verbose=2, return_train_score=True)
forest_search.fit(X_train_prepared, y_train)

In [81]:
print(forest_search.best_params_)
print(forest_search.best_score_)

{'criterion': 'gini', 'n_estimators': 113, 'n_jobs': -1}
0.8176418439716312


In [88]:
from sklearn.metrics import accuracy_score


full_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('predict', RandomForestClassifier(**forest_search.best_params_,
                                       random_state=42))
])
full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))

Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
0.776536312849162


## SVC

In [106]:
from sklearn.model_selection import GridSearchCV


param_grid = [{
    'C': [4, 5, 6, 7],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}]

svc_search = GridSearchCV(svc, param_grid, cv=3, scoring='accuracy',
                             verbose=2, return_train_score=True)
svc_search.fit(X_train_prepared, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END ....................C=4, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ....................C=4, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ....................C=4, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ......................C=4, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ......................C=4, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ......................C=4, gamma=scale, kernel=poly; total time=   0.0s
[CV] END .......................C=4, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .......................C=4, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .......................C=4, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ...................C=4, gamma=scale, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=4, gamma=scale, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=4, gamma=scale, ke

In [107]:
print(svc_search.best_params_)
print(svc_search.best_score_)

{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}
0.8286647992530346


In [109]:
from sklearn.metrics import accuracy_score


full_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('predict', SVC(**svc_search.best_params_,
                    random_state=42))
])
full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))

Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
0.770949720670391


# Final prediction

In [110]:
full_pipeline.fit(X, y)
final_set = pd.read_csv('test.csv')
passengerIds = final_set['PassengerId'].copy()
predictions = pd.DataFrame(full_pipeline.predict(final_set),
                           columns=['Survived'], index=passengerIds)
print(predictions)

Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
Index(['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Deck',
       'Fare_per_person'],
      dtype='object')
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
...               ...
1305                0
1306                1
1307                0
1308                0
1309                0

[418 rows x 1 columns]


In [111]:
predictions.to_csv('result.csv')