In [188]:
# Import needed moduls

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [189]:
# Load data

titanic_train = pd.read_csv("../data/titanic/train.csv")
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [190]:
# Delete no needed (for my opinion) columns

titanic_train.drop(labels=["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [191]:
# Fill "Age" column nones by mean value

titanic_train["Age"].fillna(titanic_train["Age"].median(), inplace=True)

# No split data for Train and test yet, and I know that necessary 
# to filling gaps each sample differently (mean() at least)

# Check result and other gaps

print(titanic_train["Age"].isna().sum())
titanic_train.info()

0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [192]:
# Check mode for object column to fill gaps
# Fill gaps and check result

titanic_train["Embarked"].fillna(titanic_train["Embarked"].mode()[0], inplace=True)
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [193]:
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [194]:
for i in range(0, int(titanic_train["Age"].max()//15)):
    titanic_train[f"Age{i}"] = titanic_train["Age"].map(lambda x: int(x >= 15*i and x < 15*(i+1)))
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age0,Age1,Age2,Age3,Age4
0,0,3,male,22.0,1,0,7.25,S,0,1,0,0,0
1,1,1,female,38.0,1,0,71.2833,C,0,0,1,0,0
2,1,3,female,26.0,0,0,7.925,S,0,1,0,0,0
3,1,1,female,35.0,1,0,53.1,S,0,0,1,0,0
4,0,3,male,35.0,0,0,8.05,S,0,0,1,0,0


In [195]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
titanic_train[["Age_scaled", "Fare_scaled"]] = scaler.fit_transform(titanic_train[["Age", "Fare"]])
titanic_train.drop(["Age", "Fare"], axis=1, inplace=True)
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age0,Age1,Age2,Age3,Age4,Age_scaled,Fare_scaled
0,0,3,male,1,0,S,0,1,0,0,0,-0.565736,-0.502445
1,1,1,female,1,0,C,0,0,1,0,0,0.663861,0.786845
2,1,3,female,0,0,S,0,1,0,0,0,-0.258337,-0.488854
3,1,1,female,1,0,S,0,0,1,0,0,0.433312,0.42073
4,0,3,male,0,0,S,0,0,1,0,0,0.433312,-0.486337


In [196]:
# Converting categorical string feature to binominal integer

titanic_train["Sex"] = titanic_train["Sex"].apply(lambda x: 1 if x=="male" else 0)
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age0,Age1,Age2,Age3,Age4,Age_scaled,Fare_scaled
0,0,3,1,1,0,S,0,1,0,0,0,-0.565736,-0.502445
1,1,1,0,1,0,C,0,0,1,0,0,0.663861,0.786845
2,1,3,0,0,0,S,0,1,0,0,0,-0.258337,-0.488854
3,1,1,0,1,0,S,0,0,1,0,0,0.433312,0.42073
4,0,3,1,0,0,S,0,0,1,0,0,0.433312,-0.486337


In [197]:

titanic_train = pd.concat([titanic_train, 
                           pd.get_dummies(titanic_train["Pclass"], prefix='Class'),
                           pd.get_dummies(titanic_train["SibSp"], prefix='SibSp'),
                           pd.get_dummies(titanic_train["Parch"], prefix='Parch'),
                           pd.get_dummies(titanic_train["Embarked"], prefix='Embarked')],
                           axis=1)
titanic_train.drop(["Embarked", "Pclass", "Parch", "SibSp"], axis=1, inplace=True)

In [198]:
titanic_train.head()

Unnamed: 0,Survived,Sex,Age0,Age1,Age2,Age3,Age4,Age_scaled,Fare_scaled,Class_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,1,0,0,0,-0.565736,-0.502445,0,...,1,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0.663861,0.786845,1,...,1,0,0,0,0,0,0,1,0,0
2,1,0,0,1,0,0,0,-0.258337,-0.488854,0,...,1,0,0,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,0.433312,0.42073,1,...,1,0,0,0,0,0,0,0,0,1
4,0,1,0,0,1,0,0,0.433312,-0.486337,0,...,1,0,0,0,0,0,0,0,0,1


In [199]:
# Splitting the dataset on training and testing

from sklearn.model_selection import train_test_split

X = titanic_train.drop("Survived", axis=1)
y = titanic_train["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [201]:
models = {"LinearSVC": LinearSVC(),
          "KNN": KNeighborsClassifier(),
          "SVC": SVC(),
          "LogisticRegression": LogisticRegression(),
          "RandomForestClassifier": RandomForestClassifier()}


# Create an empty dictionary called results
results = {}

In [202]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    results[model_name] = model.score(X_test, y_test)

# View the results
results

{'LinearSVC': 0.8044692737430168,
 'KNN': 0.8212290502793296,
 'SVC': 0.8100558659217877,
 'LogisticRegression': 0.8156424581005587,
 'RandomForestClassifier': 0.8212290502793296}

In [203]:
from sklearn.model_selection import GridSearchCV

param_RFC = {'n_estimators': [i for i in range(1, 106, 15)],
             'max_depth': [i for i in range(1, 31, 3)]}

# Setup the grid search
grid_RFC = GridSearchCV(RandomForestClassifier(random_state=42, ),
                    param_RFC,
                    cv=5)

# Fit the grid search to the data
grid_RFC.fit(X_train, y_train)

# Find the best parameters
grid_RFC.best_params_, grid_RFC.best_score_

({'max_depth': 7, 'n_estimators': 31}, 0.814596670934699)

In [204]:
#Check score of estimator (accuracy)
grid_RFC.best_estimator_.fit(X_train, y_train)
grid_RFC.best_estimator_.score(X_test, y_test)

0.8379888268156425

In [205]:
param_LR = {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'C': [10**(i-2) for i in range(1, 5, 1)],
            'solver': ['lbfgs', 'liblinear']}

# Setup the grid search
grid_LR = GridSearchCV(LogisticRegression(),
                    param_LR,
                    cv=5)

# Fit the grid search to the data
grid_LR.fit(X_train, y_train)

# Find the best parameters
grid_LR.best_params_, grid_LR.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

({'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}, 0.801930463902295)

In [206]:
#Check score of estimator (accuracy)
grid_LR.best_estimator_.fit(X_train, y_train)
grid_LR.best_estimator_.score(X_test, y_test)

0.8156424581005587

In [207]:
param_knn = {'n_neighbors': [5, 7, 8, 9, 10, 12, 15, 20],
              'weights': ['uniform', 'distance'],
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']}

# Setup the grid search
grid_knn = GridSearchCV(KNeighborsClassifier(),
                    param_knn,
                    cv=5)

# Fit the grid search to the data
grid_knn.fit(X_train, y_train)

# Find the best parameters
grid_knn.best_params_, grid_knn.best_score_

({'algorithm': 'auto', 'n_neighbors': 9, 'weights': 'uniform'},
 0.7837092484979808)

In [208]:
grid_knn.best_estimator_.fit(X_train, y_train)
grid_knn.best_estimator_.score(X_test, y_test)

0.8100558659217877

In [209]:
from sklearn.model_selection import cross_val_score

print(f"Accuracy {np.mean(cross_val_score(grid_RFC.best_estimator_, X, y, cv=5, scoring='accuracy'))}")
print(f"Recall {np.mean(cross_val_score(grid_RFC.best_estimator_, X, y, cv=5, scoring='recall'))}")
print(f"Precision {np.mean(cross_val_score(grid_RFC.best_estimator_, X, y, cv=5, scoring='precision'))}")
print(f"F1 {np.mean(cross_val_score(grid_RFC.best_estimator_, X, y, cv=5, scoring='f1'))}")

Accuracy 0.8182097796748478
Recall 0.6519607843137254
Precision 0.8407044118672026
F1 0.7323053763440859


In [211]:
# Load the test and valid data

titanic_test = pd.read_csv("../data/titanic/test.csv")
y_valid = pd.read_csv("../data/titanic/gender_submission.csv", index_col = 0).squeeze("columns")
titanic_test.head()

# Same transformations as on train dataset

# Delete no needed (for my opinion) columns
titanic_test.drop(labels=["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)

# Fill "Age" and "Fare"column nones by mean value
titanic_test["Age"].fillna(titanic_test["Age"].median(), inplace=True)
titanic_test["Fare"].fillna(titanic_test["Fare"].median(), inplace=True)

# Fill "Embarked" column nones by mode value
titanic_test["Embarked"].fillna(titanic_test["Embarked"].mode()[0], inplace=True)

# Converting categorical string feature to binominal integer
titanic_test["Sex"] = titanic_test["Sex"].apply(lambda x: 1 if x=="male" else 0)

for i in range(0, int(titanic_test["Age"].max()//15)):
    titanic_test[f"Age{i}"] = titanic_test["Age"].map(lambda x: int(x >= 15*i and x < 15*(i+1)))

titanic_test[["Age_scaled", "Fare_scaled"]] = scaler.fit_transform(titanic_test[["Age", "Fare"]])
titanic_test.drop(["Age", "Fare"], axis=1, inplace=True)


titanic_test = pd.concat([titanic_test, 
                           pd.get_dummies(titanic_test["Pclass"], prefix='Class'),
                           pd.get_dummies(titanic_test["Embarked"], prefix='Embarked'),
                           pd.get_dummies(titanic_test["Parch"], prefix='Parch'),
                           pd.get_dummies(titanic_test["SibSp"], prefix='SibSp')],
                           axis=1)
titanic_test.drop(["Embarked", "Pclass", "Parch", "SibSp", "Parch_9"], axis=1, inplace=True)

titanic_test.head()

Unnamed: 0,Sex,Age0,Age1,Age2,Age3,Age4,Age_scaled,Fare_scaled,Class_1,Class_2,...,Parch_4,Parch_5,Parch_6,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8
0,1,0,0,1,0,0,0.386231,-0.497413,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,1.37137,-0.512278,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,2.553537,-0.4641,0,1,...,0,0,0,1,0,0,0,0,0,0
3,1,0,1,0,0,0,-0.204852,-0.482475,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,-0.598908,-0.417492,0,0,...,0,0,0,0,1,0,0,0,0,0


In [212]:
# Create resulting dataframe with predictions

result_predictions = pd.DataFrame({"PassengerId":np.arange(892, 1310),
                                   "Survived" : grid_RFC.best_estimator_.predict(titanic_test)})
result_predictions

# Save result to csv file

result_predictions.to_csv("../data/titanic/my_second3_predictions.csv", index=False)

Feature names must be in the same order as they were in fit.

