In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [8]:
import os
TITANIC_PATH = os.path.join("titanic")

In [9]:
def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [10]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [11]:
def preproc(data):
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True) #1 missing in test set
    data['Sex'] = data['Sex'].astype('category')
    data['Sex'] = data['Sex'].cat.codes
    return data

train_data = preproc(train_data)
train_data = train_data.dropna() # drop 2 missing in train
encoder = preprocessing.LabelEncoder()
train_data['Embarked'] = encoder.fit_transform(train_data['Embarked'])
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [14]:
y = train_data["Survived"]
X = train_data.drop("Survived", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


kfold = StratifiedKFold(n_splits=5, random_state=None)

In [15]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC(kernel='rbf'))])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_1 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)
grid_1.fit(X_train, y_train)
grid_1.best_params_

{'classifier__C': 1,
 'classifier__gamma': 0.1,
 'preprocessing': StandardScaler()}

In [16]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC(kernel='linear'))])
param_grid = {
            'preprocessing': [StandardScaler(),],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid_3 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)
grid_3.fit(X_train, y_train)
grid_3.best_params_

{'classifier__C': 0.01, 'preprocessing': StandardScaler()}

In [17]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', LogisticRegression())])
param_grid = {
            'preprocessing': [StandardScaler()],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
}
grid_4 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)
grid_4.fit(X_train, y_train)
grid_4.best_params_

Traceback (most recent call last):
  File "/home/nikodemszyszka/miniconda3/envs/ml/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/nikodemszyszka/miniconda3/envs/ml/lib/python3.7/site-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/nikodemszyszka/miniconda3/envs/ml/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/nikodemszyszka/miniconda3/envs/ml/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/nikodemszyszka/miniconda3/envs/ml/lib/python3.7/site-packages/sklearn/model_selection/

{'classifier__C': 0.01,
 'classifier__penalty': 'l2',
 'preprocessing': StandardScaler()}

In [18]:
models = []
models.append(('SVM rbf', grid_1.best_estimator_))
models.append(('SVM linear', grid_3.best_estimator_))
models.append(('Logistic regression', grid_4.best_estimator_))


precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test, model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test, model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test, model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test, model.predict(X_test)) ))
    precision_score.append(metrics.precision_score(y_test, model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test, model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test, model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test, model.predict(X_test)))

SVM rbf
precision_score: 0.7605633802816901
recall_score: 0.782608695652174
f1_score: 0.7714285714285714
accuracy_score: 0.8202247191011236
SVM linear
precision_score: 0.726027397260274
recall_score: 0.7681159420289855
f1_score: 0.7464788732394365
accuracy_score: 0.797752808988764
Logistic regression
precision_score: 0.7777777777777778
recall_score: 0.7101449275362319
f1_score: 0.7424242424242424
accuracy_score: 0.8089887640449438


In [19]:
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['SVM rbf', 'SVM linear', 'Logistic Regression'])#, 'SVM poly'
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy_score
0,SVM rbf,0.760563,0.782609,0.771429,0.820225
1,SVM linear,0.726027,0.768116,0.746479,0.797753
2,Logistic Regression,0.777778,0.710145,0.742424,0.808989


In [27]:

tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.823
Accuracy on test set: 0.820


In [14]:
kaggle_test = preproc(test_data)
kaggle_test['Embarked'] = encoder.fit_transform(kaggle_test['Embarked'])

In [19]:
pred = grid_1.best_estimator_.predict(kaggle_test)
df = test_data["PassengerId"]
df = pd.concat([test_data["PassengerId"], pd.DataFrame(pred, columns = ["Survived"])], axis=1)
compression_opts = dict(method='zip',
                        archive_name='out.csv')  
df.to_csv('out.zip', index=False,
          compression=compression_opts) 