In [154]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as sm
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

warnings.filterwarnings("ignore")

%matplotlib inline

In [155]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1
1,1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S,0
2,1240,2,"Giles, Mr. Ralph",male,24.0,0,0,248726,13.5,,S,0
3,221,3,"Sunderland, Mr. Victor Francis",male,16.0,0,0,SOTON/OQ 392089,8.05,,S,1
4,1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60.0,1,0,24065,26.0,,S,0


In [156]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg",female,20.0,1,0,236853,26.0,,S
1,1215,1,"Rowe, Mr. Alfred G",male,33.0,0,0,113790,26.55,,S
2,823,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S
3,864,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S
4,11,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S


In [157]:
train_df["Title"] = train_df.Name.apply(lambda x: x.split(" ")[1].replace(".", ""))
test_df["Title"] = test_df.Name.apply(lambda x: x.split(" ")[1].replace(".", ""))

title_stat = train_df.groupby("Title").Name.agg(["count"])
title_stat["is_rare"] = title_stat["count"] < 10
titles = title_stat[~title_stat.is_rare].index.values

train_df.Title = train_df.Title.apply(lambda x: x if x in titles else "rare_title")
test_df.Title = test_df.Title.apply(lambda x: x if x in titles else "rare_title")

train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,Title
0,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1,Miss
1,1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S,0,Mr
2,1240,2,"Giles, Mr. Ralph",male,24.0,0,0,248726,13.5,,S,0,Mr
3,221,3,"Sunderland, Mr. Victor Francis",male,16.0,0,0,SOTON/OQ 392089,8.05,,S,1,Mr
4,1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60.0,1,0,24065,26.0,,S,0,Mrs


In [158]:
train_df.groupby("Title").PassengerId.count()

Title
Master         38
Miss          163
Mr            487
Mrs           123
rare_title     39
Name: PassengerId, dtype: int64

In [159]:
train_df["Children"] = train_df.Age.apply(lambda x: 1 if x < 18 else 0)
train_df["Mother"] = train_df.apply(lambda x: 1 if x.Sex == "female" and x.Age > 18 and x.Parch > 0 else 0, axis=1)
train_df["Family_size"] = train_df.SibSp + train_df.Parch + 1

test_df["Children"] = test_df.Age.apply(lambda x: 1 if x < 18 else 0)
test_df["Mother"] = test_df.apply(lambda x: 1 if x.Sex == "female" and x.Age > 18 and x.Parch > 0 else 0, axis=1)
test_df["Family_size"] = test_df.SibSp + test_df.Parch + 1

train_df["Deck"] = train_df.Cabin.astype("str").apply(lambda x: x[0] if x != "nan" else "no_deck")
test_df["Deck"] = test_df.Cabin.astype("str").apply(lambda x: x[0] if x != "nan" else "no_deck")

In [160]:
train_df.Age = train_df.Age.fillna(train_df.Age.dropna().median()).astype("int16")
train_df.Fare = train_df.Fare.fillna(train_df.Fare.dropna().median())
train_df.Embarked = train_df.Embarked.fillna("no_info")

test_df.Age = test_df.Age.fillna(train_df.Age.dropna().median()).astype("int16")
test_df.Fare = test_df.Fare.fillna(train_df.Fare.dropna().median())
test_df.Embarked = test_df.Embarked.fillna("no_info")

In [161]:
train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,Title,Children,Mother,Family_size,Deck
0,888,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0,B42,S,1,Miss,0,0,1,B
1,1249,3,"Lockyer, Mr. Edward",male,28,0,0,1222,7.8792,,S,0,Mr,0,0,1,no_deck
2,1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S,0,Mr,0,0,1,no_deck
3,221,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S,1,Mr,1,0,1,no_deck
4,1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26.0,,S,0,Mrs,0,0,2,no_deck


In [162]:
train_df.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [163]:
train_v1 = train_df.copy()
test_v1 = test_df.copy()

unnecessary_cols = ["Name", "Ticket", "SibSp", "Parch", "Cabin"]

train_v1 = train_v1.drop(unnecessary_cols, axis=1)
test_v1 = test_v1.drop(unnecessary_cols, axis=1)

encoder = LabelEncoder()
encoding_cols = ["Sex", "Embarked", "Title", "Deck"]

full_df = pd.concat([train_v1, test_v1], axis=0)

for col in encoding_cols:
    full_df[col] = encoder.fit_transform(full_df[col])
    
train_v1 = full_df[full_df.Survived.notnull()]
train_v1.Survived = train_v1.Survived.astype("int8")

test_v1 = full_df[full_df.Survived.isnull()]
test_v1.drop("Survived", axis=1, inplace=True)

feature_cols = [col for col in train_v1.columns if col not in {"PassengerId", "Survived"}]
target_col = "Survived"

X = train_v1[feature_cols].values
y = train_v1[target_col]

In [164]:
train_v1.head()

Unnamed: 0,Age,Children,Deck,Embarked,Family_size,Fare,Mother,PassengerId,Pclass,Sex,Survived,Title
0,19,0,1,2,1,30.0,0,888,1,0,1,1
1,28,0,8,2,1,7.8792,0,1249,3,1,0,2
2,24,0,8,2,1,13.5,0,1240,2,1,0,2
3,16,1,8,2,1,8.05,0,221,3,1,1,2
4,60,0,8,2,2,26.0,0,1105,2,0,0,3


In [165]:
from sklearn.tree import DecisionTreeClassifier

RANDOM_STATE = 42

model = DecisionTreeClassifier(max_depth=2, random_state=RANDOM_STATE)
model.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [166]:
def make_submission(ids, preds, output_path='submission.csv'):
    subm = pd.DataFrame()
    subm['PassengerId'] = ids
    subm['Survived'] = preds
    subm.to_csv(output_path, index=False)

In [167]:
X_test = test_v1[feature_cols].values

res = model.predict(X_test)

In [168]:
ids = test_v1['PassengerId']

In [150]:
# 1. скор при первой отправке
make_submission(ids, res, output_path='res1.csv')

In [169]:
# отрисовка дерева
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

In [171]:
#dot_file = StringIO()
#dot_data = export_graphviz(model, out_file=dot_file, feature_names = feature_cols)
#graph = pydotplus.graph_from_dot_data(dot_data)
#Image(graph.create_png())

In [172]:
from sklearn.model_selection import GridSearchCV

tree_params = {
    'max_depth': list(range(1, 8)),
    'min_samples_leaf': list(range(1, 10))
}

clf = GridSearchCV(model, tree_params, cv = 5)
clf.fit(X,y)
clf.best_params_

{'max_depth': 3, 'min_samples_leaf': 6}

In [173]:
from sklearn.model_selection import cross_val_score

tree_param = {
    'max_depth': [3],
    'min_samples_leaf': [6]
}

clf = GridSearchCV(model, tree_param, cv=5)
a = cross_val_score(clf, X = X, y = y, cv=5, error_score='accuracy')
a.mean()

0.7941176470588236

In [68]:
# 4. скор при второй отправке
res1 = clf.predict(X_test)

make_submission(ids, res1, output_path='res2.csv')

In [175]:
unnecessary_cols = ["Name", "Ticket", "SibSp", "Parch", "Cabin"]

train_v2 = train_df.drop(unnecessary_cols, axis=1).copy()
test_v2 = test_df.drop(unnecessary_cols, axis=1).copy()

test_v2.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Children,Mother,Family_size,Deck
0,1167,2,female,20,26.0,S,Miss,0,0,2,no_deck
1,1215,1,male,33,26.55,S,Mr,0,0,1,no_deck
2,823,1,male,38,0.0,S,rare_title,0,0,1,no_deck
3,864,3,female,28,69.55,S,Miss,0,0,11,no_deck
4,11,3,female,4,16.7,S,Miss,1,0,3,G


In [176]:
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Title', 'Mother', 'Deck']
full_df = pd.concat([train_v2, test_v2], axis=0)
dfs = []
for col in categorical_cols:
    print(f"Encoding {col}")
    df = pd.get_dummies(full_df[col], prefix=col)
    dfs.append(df)
full_df = pd.concat([full_df] + dfs, axis=1)
full_df = full_df.drop(categorical_cols, axis=1)
train_v2 = full_df[full_df.Survived.notnull()]
train_v2.Survived = train_v1.Survived.astype("int8")
test_v2 = full_df[full_df.Survived.isnull()]
test_v2 = test_v2.drop("Survived", axis=1)

Encoding Pclass
Encoding Sex
Encoding Embarked
Encoding Title
Encoding Mother
Encoding Deck


In [177]:
set(train_v2.columns) - set(test_v2.columns)

{'Survived'}

In [178]:
feature_cols = [col for col in train_v2.columns if col not in {"PassengerId", "Survived"}]
target_col = "Survived"
X = train_v2[feature_cols].values
y = train_v2[target_col]

In [179]:
tree_params = {
    'max_depth': list(range(1, 10)),
    'min_samples_leaf': list(range(1, 10))
}
clf1 = GridSearchCV(model, tree_params, cv = 5, error_score='raise')
clf1.fit(X,y)
clf1.best_params_

{'max_depth': 3, 'min_samples_leaf': 9}

In [180]:
from sklearn.model_selection import cross_val_score
tree_param = {
    'max_depth': [3],
    'min_samples_leaf': [9]
}
clf2 = GridSearchCV(model, tree_param, cv=5)
a = cross_val_score(clf2, X = X, y = y, cv=5, error_score='accuracy')
a.mean()

0.8011764705882353

In [181]:
X_test = test_v2[feature_cols].values
res3 = clf1.predict(X_test)

In [87]:
# -. скор при последней отправке
make_submission(ids, res3, output_path='res3.csv')

In [91]:
test_v2

Unnamed: 0,Age,Children,Family_size,Fare,PassengerId,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,...,Mother_1,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_no_deck
0,20,0,2,26.0000,1167,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,33,0,1,26.5500,1215,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,38,0,1,0.0000,823,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,28,0,11,69.5500,864,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
4,4,1,3,16.7000,11,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
5,13,1,5,262.3750,956,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
6,16,1,3,8.5167,996,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
7,18,0,1,11.5000,145,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
8,24,0,1,69.3000,370,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
9,28,0,2,15.5000,613,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1


In [83]:
X_test_arg = [col for col in test_v2.columns if col not in {"PassengerId"}]
X_test = test_v2[X_test_arg].values

In [84]:
res2 = clf1.predict(X_test)

make_submission(ids, res2, output_path='res3.csv')