In [70]:
# 2023 OCT 28

In [71]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# functions for data pre-processing

**drop features that are unnecessary for machine learning**

In [72]:
def drop_features(df, features):
    """Remove the given columns from ``df`` in place.

    Args:
        df: DataFrame to modify. It is mutated directly; nothing is returned.
        features: Column label(s) to remove.
    """
    df.drop(columns=features, inplace=True)

**label-encode categorical features**

In [73]:
from sklearn.preprocessing import LabelEncoder

def encode_features(df, features):
    """Replace each listed column of ``df`` with its label-encoded values, in place.

    A single ``LabelEncoder`` is re-fitted for every column, so each column
    gets its own independent set of codes.

    Args:
        df: DataFrame to modify. It is mutated directly; nothing is returned.
        features: Iterable of column labels to encode.
    """
    label_encoder = LabelEncoder()
    for column in features:
        values = df[column]
        # fit() returns the encoder itself, so the transform can be chained.
        df[column] = label_encoder.fit(values).transform(values)

**fill null values**

In [74]:
def fill_nulls(df, instructions):
    """Fill missing values in selected columns of ``df`` in place.

    Args:
        df: DataFrame to modify. It is mutated directly; nothing is returned.
        instructions: Mapping of column label -> fill method. Supported
            methods are ``"N"`` (fill with the literal string ``"N"``) and
            ``"mean"`` (fill with the column mean). Any other method prints
            a warning and leaves that column untouched.
    """
    for feature, method in instructions.items():
        if method == "N":
            fill_value = "N"
        elif method == "mean":
            fill_value = df[feature].mean()
        else:
            print("<!> unknown method")
            continue

        # Assign the filled column back instead of calling
        # ``df[feature].fillna(..., inplace=True)``: that chained form acts on
        # an intermediate Series, which warns on recent pandas and does not
        # update ``df`` at all once Copy-on-Write is enabled.
        df[feature] = df[feature].fillna(fill_value)

**combined pre-processing pipeline**

In [75]:
def preprocess_features(df, features_to_drop, instructions_to_fill_null, features_to_encode):
    """Run the full pre-processing sequence on ``df`` in place.

    Steps, in order: drop unwanted columns, fill missing values, report how
    many nulls are left, then label-encode the requested columns.

    Args:
        df: DataFrame to modify. It is mutated directly; nothing is returned.
        features_to_drop: Column labels passed to ``drop_features``.
        instructions_to_fill_null: Column -> method mapping passed to ``fill_nulls``.
        features_to_encode: Column labels passed to ``encode_features``.
    """
    drop_features(df, features_to_drop)
    fill_nulls(df, instructions_to_fill_null)

    # Sanity check before encoding: total count of remaining missing cells.
    remaining_nulls = df.isna().sum().sum()
    print("number of nulls in dataset:", remaining_nulls)

    encode_features(df, features_to_encode)

# load data

In [76]:
# Short column names used throughout the notebook; they replace the header
# row that comes with the CSV file.
feature_names = [
    "PaxID", "Survived", "Class", "Name", "Sex", "Age",
    "SibSpo", "ParChi", "Ticket", "Fare", "Cabin", "PortEmba",
]

# Read the training data and relabel its columns in one step.
titanic_df = pd.read_csv("../data/titanic_train.csv").set_axis(feature_names, axis=1)
display(titanic_df.head(3))

Unnamed: 0,PaxID,Survived,Class,Name,Sex,Age,SibSpo,ParChi,Ticket,Fare,Cabin,PortEmba
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


**extract features and targets**

In [77]:
# Separate the frame into the feature matrix (everything except the label
# column) and the target vector. ``drop`` returns a new frame, so
# ``titanic_df`` itself keeps the "Survived" column.
X = titanic_df.drop(columns=["Survived"])
y = titanic_df["Survived"]

display(X)

Unnamed: 0,PaxID,Class,Name,Sex,Age,SibSpo,ParChi,Ticket,Fare,Cabin,PortEmba
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# pre-processing

In [78]:
# Columns removed before modelling.
features_to_drop = ["PaxID", "Name", "Ticket"]
# Missing ages get the column mean; missing Cabin/PortEmba get the placeholder "N".
instructions_to_fill_null = dict(Age="mean", Cabin="N", PortEmba="N")
# String-valued columns that are label-encoded.
features_to_encode = ["Cabin", "Sex", "PortEmba"]

# NOTE: X is modified in place by this call.
preprocess_features(
    X,
    features_to_drop=features_to_drop,
    instructions_to_fill_null=instructions_to_fill_null,
    features_to_encode=features_to_encode,
)

display(X)

number of nulls in dataset: 0


Unnamed: 0,Class,Sex,Age,SibSpo,ParChi,Fare,Cabin,PortEmba
0,3,1,22.000000,1,0,7.2500,146,3
1,1,0,38.000000,1,0,71.2833,81,0
2,3,0,26.000000,0,0,7.9250,146,3
3,1,0,35.000000,1,0,53.1000,55,3
4,3,1,35.000000,0,0,8.0500,146,3
...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,146,3
887,1,0,19.000000,0,0,30.0000,30,3
888,3,0,29.699118,1,2,23.4500,146,3
889,1,1,26.000000,0,0,30.0000,60,0


# train & predict

**prepare estimators & metrics**

In [106]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [107]:
# Create the three classifiers that the evaluation sections below reuse.
# The two tree-based models are given a fixed random_state (11);
# the logistic regression uses the 'liblinear' solver.
dt_er = DecisionTreeClassifier(random_state=11)
rf_er = RandomForestClassifier(random_state=11)
lr_er = LogisticRegression(solver='liblinear')

# train & predict <1>: No Fold

**divide dataset**

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)
# Show the shape of every part on one line.
print(*[part.shape for part in (X_train, X_test, y_train, y_test)])

(712, 8) (179, 8) (712,) (179,)


**train, predict, evaluation (T-P-E)**

In [108]:
def run_direct(estimator):
    """Fit ``estimator`` on the hold-out split and print its test accuracy.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The estimator is fitted in place; nothing is returned.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
    """
    print("--------------")

    estimator.fit(X_train, y_train)
    test_predictions = estimator.predict(X_test)

    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"accuracy score: {test_accuracy: .3f}", "\n")

In [109]:
# Train, predict and evaluate (T-P-E) on the single hold-out split, in the order:
# DecisionTreeClassifier, RandomForestClassifier, LogisticRegression.
for estimator in (dt_er, rf_er, lr_er):
    run_direct(estimator)

--------------
accuracy score:  0.799 

--------------
accuracy score:  0.844 

--------------
accuracy score:  0.855 



# train & predict <2>: K-Fold

In [91]:
from sklearn.model_selection import KFold

def run_k_fold(estimator, num_folds):
    """Evaluate ``estimator`` with plain K-fold cross-validation.

    Reads the notebook-level ``X`` (features) and ``y`` (labels). For every
    fold the estimator is re-fitted on the training part and scored on the
    held-out part; per-fold accuracies and their mean are printed.
    Nothing is returned.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        num_folds: Number of folds (``n_splits`` of ``KFold``).
    """
    print("--------------")
    k_fold = KFold(n_splits=num_folds)
    fold_accuracies = []

    # KFold assigns rows to folds by position only, so passing the feature
    # frame alone (without the labels) to split() is sufficient.
    for iteration, (train_indices, test_indices) in enumerate(k_fold.split(X)):
        # Positional row selection for this fold.
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # Train and predict.
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        # Score once and reuse the value for both the record and the report.
        fold_accuracy = accuracy_score(y_test, predictions)
        fold_accuracies.append(fold_accuracy)
        print(f"fold #{iteration}, fold accuracy score: {fold_accuracy: .3f}")

    # Evaluation: mean of the per-fold accuracies.
    print("\n<EVALUATION RESULT>")
    print("accuracy score:", np.mean(fold_accuracies), "\n")

In [92]:
num_folds = 5

# K-fold T-P-E for each classifier, in the order:
# DecisionTreeClassifier, RandomForestClassifier, LogisticRegression.
for estimator in (dt_er, rf_er, lr_er):
    run_k_fold(estimator, num_folds)

--------------
fold #0, fold accuracy score:  0.749
fold #1, fold accuracy score:  0.764
fold #2, fold accuracy score:  0.820
fold #3, fold accuracy score:  0.781
fold #4, fold accuracy score:  0.792

<EVALUATION RESULT>
accuracy score: 0.7811813445483649 

--------------
fold #0, fold accuracy score:  0.765
fold #1, fold accuracy score:  0.809
fold #2, fold accuracy score:  0.826
fold #3, fold accuracy score:  0.770
fold #4, fold accuracy score:  0.854

<EVALUATION RESULT>
accuracy score: 0.804758018956751 

--------------
fold #0, fold accuracy score:  0.788
fold #1, fold accuracy score:  0.798
fold #2, fold accuracy score:  0.775
fold #3, fold accuracy score:  0.747
fold #4, fold accuracy score:  0.837

<EVALUATION RESULT>
accuracy score: 0.7890025735986441 



# train & predict <3>: Stratified K-Fold

In [104]:
from sklearn.model_selection import StratifiedKFold

def run_stratified_k_fold(estimator, num_folds, targets):
    """Evaluate ``estimator`` with stratified K-fold cross-validation.

    Reads the notebook-level ``X`` (features). Folds are built so that the
    class proportions of ``targets`` are kept in every fold. For every fold
    the estimator is re-fitted on the training part and scored on the
    held-out part; per-fold accuracies and their mean are printed.
    Nothing is returned.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        num_folds: Number of folds (``n_splits`` of ``StratifiedKFold``).
        targets: Label Series aligned row-for-row with ``X``; used both for
            stratification and as the training/test labels.
    """
    print("--------------")
    k_fold = StratifiedKFold(n_splits=num_folds)
    fold_accuracies = []

    # Stratification needs the labels, so they are passed to split() as well.
    # Use the ``targets`` argument rather than the global ``y`` so the
    # function honours what the caller hands in.
    for iteration, (train_indices, test_indices) in enumerate(k_fold.split(X, targets)):
        # Positional row selection for this fold.
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = targets.iloc[train_indices], targets.iloc[test_indices]

        # Train and predict.
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        # Score once and reuse the value for both the record and the report.
        fold_accuracy = accuracy_score(y_test, predictions)
        fold_accuracies.append(fold_accuracy)
        print(f"fold #{iteration}, fold accuracy score: {fold_accuracy: .3f}")

    # Evaluation: mean of the per-fold accuracies.
    print("\n<EVALUATION RESULT>")
    print("accuracy score:", np.mean(fold_accuracies), "\n")

In [105]:
num_folds = 5

# Stratified K-fold T-P-E for each classifier, in the order:
# DecisionTreeClassifier, RandomForestClassifier, LogisticRegression.
for estimator in (dt_er, rf_er, lr_er):
    run_stratified_k_fold(estimator, num_folds, y)

--------------
fold #0, fold accuracy score:  0.749
fold #1, fold accuracy score:  0.775
fold #2, fold accuracy score:  0.809
fold #3, fold accuracy score:  0.758
fold #4, fold accuracy score:  0.803

<EVALUATION RESULT>
accuracy score: 0.7789341535371289 

--------------
fold #0, fold accuracy score:  0.793
fold #1, fold accuracy score:  0.798
fold #2, fold accuracy score:  0.860
fold #3, fold accuracy score:  0.781
fold #4, fold accuracy score:  0.860

<EVALUATION RESULT>
accuracy score: 0.8182097796748478 

--------------
fold #0, fold accuracy score:  0.788
fold #1, fold accuracy score:  0.798
fold #2, fold accuracy score:  0.781
fold #3, fold accuracy score:  0.764
fold #4, fold accuracy score:  0.820

<EVALUATION RESULT>
accuracy score: 0.7901261691042621 



# train & predict <3*>: cross_val_score

In [112]:
from sklearn.model_selection import cross_val_score

def run_cross_val_score(estimator, num_folds):
    """Evaluate ``estimator`` with ``cross_val_score`` and print the results.

    Reads the notebook-level ``X`` (features) and ``y`` (labels). Prints the
    array of per-fold accuracies followed by their mean. Nothing is returned.

    Args:
        estimator: Estimator handed to ``cross_val_score``.
        num_folds: Value passed as ``cv``.
    """
    print("--------------")

    scores = cross_val_score(estimator=estimator, X=X, y=y, cv=num_folds, scoring="accuracy")
    print(scores)

    # Evaluation: mean of the per-fold accuracies.
    mean_accuracy = np.mean(scores)
    print("\n<EVALUATION RESULT>")
    print("accuracy score:", mean_accuracy, "\n")

In [113]:
num_folds = 5

# cross_val_score T-P-E for each classifier, in the order:
# DecisionTreeClassifier, RandomForestClassifier, LogisticRegression.
for estimator in (dt_er, rf_er, lr_er):
    run_cross_val_score(estimator, num_folds)

--------------
[0.74860335 0.7752809  0.80898876 0.75842697 0.80337079]

<EVALUATION RESULT>
accuracy score: 0.7789341535371289 

--------------
[0.79329609 0.79775281 0.85955056 0.78089888 0.85955056]

<EVALUATION RESULT>
accuracy score: 0.8182097796748478 

--------------
[0.7877095  0.79775281 0.78089888 0.76404494 0.82022472]

<EVALUATION RESULT>
accuracy score: 0.7901261691042621 



# train & predict <4>: GridSearchCV

In [114]:
from sklearn.model_selection import GridSearchCV

def run_GridSearchCV(estimator, params_roster, num_folds):
    """Tune ``estimator`` with a grid search and score the best model on the test set.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the best parameter combination, its mean
    cross-validated accuracy, and the accuracy of the best estimator on the
    hold-out test set. Nothing is returned.

    Args:
        estimator: Base estimator handed to ``GridSearchCV``.
        params_roster: Parameter grid (``param_grid`` of ``GridSearchCV``).
        num_folds: Value passed as ``cv``.
    """
    print("--------------")

    # Configure the search and run it on the training split only.
    search = GridSearchCV(
        estimator,
        param_grid=params_roster,
        scoring="accuracy",
        cv=num_folds,
    )
    search.fit(X_train, y_train)

    # Report the winning configuration and its cross-validated score.
    print("optimal hyper parameters:", f"{search.best_params_}")
    print("best score (accuracy score):", f"{search.best_score_: .4f}")

    # best_estimator_ is the model GridSearchCV kept for the winning
    # parameters; evaluate it on the untouched test split.
    tuned_estimator = search.best_estimator_
    test_predictions = tuned_estimator.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"accuracy score: {test_accuracy: .4f}")

In [115]:
num_folds = 5

# DecisionTreeClassifier T-P-E over a 4 x 3 x 3 grid (36 parameter combinations).
params_roster = dict(
    max_depth=[2, 3, 5, 10],
    min_samples_split=[2, 3, 5],
    min_samples_leaf=[1, 5, 8],
)
run_GridSearchCV(dt_er, params_roster, num_folds)

--------------
optimal hyper parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5}
best score (accuracy score):  0.7993
accuracy score:  0.8659
