In [1]:
import pandas as pd
import os

DATA_ROOT = os.path.join("downloads", "datasets", "titanic")

data_train = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))

# data_test don't have target value
X_test = pd.read_csv(os.path.join(DATA_ROOT, "test.csv"))

In [2]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [3]:
data_train.keys()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
X_train, y_train = data_train.drop(["Survived"], axis=1), data_train["Survived"]

In [5]:
# how many survived
y_train.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [6]:
X_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
X_train.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
X_train.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [10]:
X_train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
X_train["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [12]:
corr_mx = X_train.corr()

In [13]:
corr_mx

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Pclass,-0.035144,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,-0.5495,0.096067,0.159651,0.216225,1.0


# Preprocessing

In [14]:
def split_attribs(data):
    _NUMERIC_KINDS = set('buifc')
    num_attribs = []
    cat_attribs = []

    for attrib in data:
        if data[attrib].dtype.kind in _NUMERIC_KINDS:
            num_attribs.append(attrib)
        else:
            cat_attribs.append(attrib)
    
    return num_attribs, cat_attribs

In [15]:
INDEXES = list(X_train.keys())
def get_index_of(name, l=INDEXES):
    return l.index(name)

In [16]:
danger_group = ((X_train["Sex"] == "male") & (X_train["Age"] >= 60.)) | ((X_train["Sex"] == "female") & (X_train["Age"] > 50.))
danger_group.value_counts()

False    852
True      39
dtype: int64

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class AttribsAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_danger=True):
        self.sex_idx = 0 # like in ATTRIBS_ADDER
        self.age_idx = 1
        self.add_danger = add_danger
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.add_danger:
            danger = ((X["Sex"] == "male") & (X["Age"] >= 60.)) | ((X["Sex"] == "female") & (X["Age"] > 50.))
            danger = [1 if x else 0 for  x in danger]
        return X

class CatAttribsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, convert_ticket=True):
        self.convert_ticket = convert_ticket
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.convert_ticket:
            ticket_old = np.copy(X["Ticket"].values)

            for idx, x in enumerate(ticket_old):
                if x.isdigit():
                    X["Ticket"].values[idx] = 1
                else:
                    X["Ticket"].values[idx] = 0
        return X

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("std_scaler", StandardScaler())
])

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


cat_fill_pipeline= ColumnTransformer([
    # ("imputer_cab", SimpleImputer(strategy="constant", fill_value="unknown"), ["Cabin"]),
    ("imputer_emb", SimpleImputer(strategy="most_frequent"), ["Embarked"]),
], remainder="passthrough")

cat_change_pipeline = ColumnTransformer([
    # ("ordinar", OrdinalEncoder(), ["Cabin"]),
    ("one", OneHotEncoder(), ["Sex", "Embarked"]),
    ("adder", CatAttribsTransformer(), ["Ticket"])
])

cabin_scaler = StandardScaler()

In [20]:
def change(data, new=False):
    NUM_ATTRIBS, CAT_ATTRIBS = split_attribs(data)
    num_data = data[NUM_ATTRIBS]
    cat_data = data[CAT_ATTRIBS]

    NUM_ATTRIBS.remove("PassengerId")
    if new:
        num_data = pd.DataFrame(num_pipeline.fit_transform(num_data[NUM_ATTRIBS]), columns=NUM_ATTRIBS)
    else:
        num_data = pd.DataFrame(num_pipeline.transform(num_data[NUM_ATTRIBS]), columns=NUM_ATTRIBS)

    danger = ((cat_data["Sex"] == "male") & (num_data["Age"] >= 60.)) | ((cat_data["Sex"] == "female") & (num_data["Age"] > 50.))
    danger = pd.DataFrame({"Danger": [1 if x else 0 for  x in danger]})
    
    cat_att_0 = "Embarked"
    cat_att_1 = [cat_att_0, *[x for x in cat_data if x != cat_att_0 and x != "Cabin"]]

    if new:
        cat_data = pd.DataFrame(cat_fill_pipeline.fit_transform(cat_data[cat_att_1]), columns=cat_att_1)
        cat_data = cat_change_pipeline.fit_transform(cat_data)
    else:
        cat_data = pd.DataFrame(cat_fill_pipeline.transform(cat_data[cat_att_1]), columns=cat_att_1)
        cat_data = cat_change_pipeline.transform(cat_data)

    cat_OH = [list(x) for x in cat_change_pipeline.named_transformers_["one"].categories_]
    cat_OH = [y for x in cat_OH for y in x]
    cat_att_2 = [*cat_OH, "Ticket"]
    cat_data = pd.DataFrame(cat_data, columns=cat_att_2)
    
    # if new:
    #     cat_data["Cabin"] = cabin_scaler.fit_transform(cat_data["Cabin"].values.reshape(-1, 1))
    # else:
    #     cat_data["Cabin"] = cabin_scaler.fit(cat_data["Cabin"].values.reshape(-1, 1))

    return pd.concat([num_data, cat_data, danger], axis=1)

In [21]:
X_train = change(X_train, new=True)
X_test = change(X_test)

In [22]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    float64
 1   Age     891 non-null    float64
 2   SibSp   891 non-null    float64
 3   Parch   891 non-null    float64
 4   Fare    891 non-null    float64
 5   female  891 non-null    object 
 6   male    891 non-null    object 
 7   C       891 non-null    object 
 8   Q       891 non-null    object 
 9   S       891 non-null    object 
 10  Ticket  891 non-null    object 
 11  Danger  891 non-null    int64  
dtypes: float64(5), int64(1), object(6)
memory usage: 83.7+ KB


# Training

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn_clf = KNeighborsClassifier(n_jobs=-1)
param_grid = [
    {"weights": ["uniform", "distance"], "n_neighbors": [1, 2, 3, 5, 8, 10, 15, 30, 50]},
]

grid_srch = GridSearchCV(knn_clf, param_grid=param_grid, n_jobs=-1, cv=5)
grid_srch.fit(X_train, y_train)
grid_srch.best_score_

0.8069612704789405

In [24]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
param_grid = [
    {"criterion": ["gini", "entropy"], "n_estimators": [100, 200, 300, 500, 1000], "bootstrap": [True, False]},
]

grid_srch = GridSearchCV(rnd_clf, param_grid=param_grid, n_jobs=-1, cv=5)
grid_srch.fit(X_train, y_train)
grid_srch.best_score_

0.8103508882053857

In [27]:
from sklearn.svm import SVC

svc_clf = SVC()
param_grid = [
    {"kernel": ["linear", "poly", "sigmoid", "rbf"], "C": [0.1, 0.5, 1., 2., 5., 10., 20., 50.]},
]

grid_srch = GridSearchCV(svc_clf, param_grid=param_grid, n_jobs=-1, cv=5)
grid_srch.fit(X_train, y_train)
grid_srch.best_score_

0.8305065595380077