#### Setup & Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

#### Load

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

#### Explore the Dataset

In [3]:
print(df_train.shape)
print(df_test.shape)

(891, 12)
(418, 11)


In [4]:
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [5]:
print(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None


In [6]:
print(df_train.describe(include="all"))

        PassengerId    Survived      Pclass                 Name   Sex  \
count    891.000000  891.000000  891.000000                  891   891   
unique          NaN         NaN         NaN                  891     2   
top             NaN         NaN         NaN  Dooley, Mr. Patrick  male   
freq            NaN         NaN         NaN                    1   577   
mean     446.000000    0.383838    2.308642                  NaN   NaN   
std      257.353842    0.486592    0.836071                  NaN   NaN   
min        1.000000    0.000000    1.000000                  NaN   NaN   
25%      223.500000    0.000000    2.000000                  NaN   NaN   
50%      446.000000    0.000000    3.000000                  NaN   NaN   
75%      668.500000    1.000000    3.000000                  NaN   NaN   
max      891.000000    1.000000    3.000000                  NaN   NaN   

               Age       SibSp       Parch  Ticket        Fare Cabin Embarked  
count   714.000000  891.000000 

In [7]:
print(df_test.describe(include="all"))

        PassengerId      Pclass                      Name   Sex         Age  \
count    418.000000  418.000000                       418   418  332.000000   
unique          NaN         NaN                       418     2         NaN   
top             NaN         NaN  Peter, Master. Michael J  male         NaN   
freq            NaN         NaN                         1   266         NaN   
mean    1100.500000    2.265550                       NaN   NaN   30.272590   
std      120.810458    0.841838                       NaN   NaN   14.181209   
min      892.000000    1.000000                       NaN   NaN    0.170000   
25%      996.250000    1.000000                       NaN   NaN   21.000000   
50%     1100.500000    3.000000                       NaN   NaN   27.000000   
75%     1204.750000    3.000000                       NaN   NaN   39.000000   
max     1309.000000    3.000000                       NaN   NaN   76.000000   

             SibSp       Parch    Ticket        Far

In [8]:
print(df_train.sample(5))

     PassengerId  Survived  Pclass                                       Name  \
300          301         1       3   Kelly, Miss. Anna Katherine "Annie Kate"   
583          584         0       1                        Ross, Mr. John Hugo   
813          814         0       3         Andersson, Miss. Ebba Iris Alfrida   
430          431         1       1  Bjornstrom-Steffansson, Mr. Mauritz Hakan   
86            87         0       3                     Ford, Mr. William Neal   

        Sex   Age  SibSp  Parch      Ticket    Fare Cabin Embarked  
300  female   NaN      0      0        9234   7.750   NaN        Q  
583    male  36.0      0      0       13049  40.125   A10        C  
813  female   6.0      4      2      347082  31.275   NaN        S  
430    male  28.0      0      0      110564  26.550   C52        S  
86     male  16.0      1      3  W./C. 6608  34.375   NaN        S  


In [9]:
print(df_test.sample(5))

     PassengerId  Pclass                                           Name  \
202         1094       1                         Astor, Col. John Jacob   
173         1065       3                               Torfa, Mr. Assad   
69           961       1            Fortune, Mrs. Mark (Mary McDougald)   
12           904       1  Snyder, Mrs. John Pillsbury (Nelle Stevenson)   
355         1247       1                       Julian, Mr. Henry Forbes   

        Sex   Age  SibSp  Parch    Ticket      Fare        Cabin Embarked  
202    male  47.0      1      0  PC 17757  227.5250      C62 C64        C  
173    male   NaN      0      0      2673    7.2292          NaN        C  
69   female  60.0      1      4     19950  263.0000  C23 C25 C27        S  
12   female  23.0      1      0     21228   82.2667          B45        S  
355    male  50.0      0      0    113044   26.0000          E60        S  


#### Preprocessing & Data Cleaning

Drop the Unimportant Features(Name, Tickect, Cabin, PassengerId)

In [10]:
df_test_passengers_id = df_test["PassengerId"]
df_train.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)
df_test.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)

Imputing

In [11]:
df_train["Age"].fillna(df_train["Age"].median(), inplace=True)
df_train["Embarked"].fillna(df_train["Embarked"].mode()[0], inplace=True)

df_test["Age"].fillna(df_test["Age"].median(), inplace=True)
df_test["Fare"].fillna(df_test["Fare"].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["Age"].fillna(df_train["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["Embarked"].fillna(df_train["Embarked"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme

Convert Categorical Data to Numerical & Create Extra 2 Columns(FamilySize and IsAlone)

In [12]:
df_train = pd.get_dummies(df_train, columns=["Embarked", "Sex"], drop_first=True)
df_test = pd.get_dummies(df_test, columns=["Embarked", "Sex"], drop_first=True)

In [13]:
df_train["Embarked_Q"] = df_train["Embarked_Q"].map({True: 1, False: 0})
df_train["Embarked_S"] = df_train["Embarked_S"].map({True: 1, False: 0})
df_train["Sex_male"] = df_train["Sex_male"].map({True: 1, False: 0})
df_test["Embarked_Q"] = df_test["Embarked_Q"].map({True: 1, False: 0})
df_test["Embarked_S"] = df_test["Embarked_S"].map({True: 1, False: 0})
df_test["Sex_male"] = df_test["Sex_male"].map({True: 1, False: 0})

df_train["FamilySize"] = df_train["SibSp"] + df_train["Parch"] + 1
df_train["IsAlone"] = (df_train["FamilySize"] == 1).astype(int)

df_test["FamilySize"] = df_test["SibSp"] + df_test["Parch"] + 1
df_test["IsAlone"] = (df_test["FamilySize"] == 1).astype(int)

In [14]:
print(df_train.dtypes)
print(df_train.sample(5))

Survived        int64
Pclass          int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked_Q      int64
Embarked_S      int64
Sex_male        int64
FamilySize      int64
IsAlone         int64
dtype: object
     Survived  Pclass   Age  SibSp  Parch      Fare  Embarked_Q  Embarked_S  \
158         0       3  28.0      0      0    8.6625           0           1   
340         1       2   2.0      1      1   26.0000           0           1   
483         1       3  63.0      0      0    9.5875           0           1   
269         1       1  35.0      0      0  135.6333           0           1   
120         0       2  21.0      2      0   73.5000           0           1   

     Sex_male  FamilySize  IsAlone  
158         1           1        1  
340         1           3        0  
483         0           1        1  
269         0           1        1  
120         1           3        0  


Handling Missing Values

In [15]:
df_train["Fare"] = np.log1p(df_train["Fare"])
df_train["Age"] = np.log1p(df_train["Age"])
df_train["FamilySize"] = np.log1p(df_train["FamilySize"])

In [16]:
df_test["Fare"] = np.log1p(df_test["Fare"])
df_test["Age"] = np.log1p(df_test["Age"])
df_test["FamilySize"] = np.log1p(df_test["FamilySize"])

#### Train Test Split

In [17]:
X = df_train.drop(columns=["Survived"])
y = df_train["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Shape of Training Data : {X.shape}\n"
      f"Shape of Testing Data : {y.shape}")

Shape of Training Data : (891, 10)
Shape of Testing Data : (891,)


#### Hyperparameter Tuning(Random Search) & Train Data with 5 Models

In [18]:
rf = RandomForestClassifier()

param_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

random_search_rf = RandomizedSearchCV(rf, param_rf, cv=5, scoring="accuracy", n_jobs=2)
random_search_rf.fit(X_train, y_train)

print(f"Best Parameters: {random_search_rf.best_params_}")

Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_depth': None}


In [19]:
xgb = XGBClassifier()

param_xgb = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 10],
    "subsample": [0.6, 0.8, 1.0]
}

random_search_xgb = RandomizedSearchCV(xgb, param_xgb, cv=5, scoring="accuracy", n_jobs=2)
random_search_xgb.fit(X_train, y_train)

print(f"Best Parameters: {random_search_xgb.best_params_}")

Best Parameters: {'subsample': 0.6, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}


In [20]:
lg_1 = LogisticRegression(penalty="l1")

param_lg_1 = {
    "C": np.logspace(-4, 4, 10),
    "solver": ["saga", "liblinear"]
}

random_search_lg_1 = RandomizedSearchCV(lg_1, param_lg_1, cv=5, scoring="accuracy", n_jobs=2)
random_search_lg_1.fit(X_train, y_train)

print(f"Best Parameters: {random_search_lg_1.best_params_}")

Best Parameters: {'solver': 'liblinear', 'C': np.float64(2.782559402207126)}


In [21]:
lg_2 = LogisticRegression(penalty="l2")

param_lg_2 = {
    "C": np.logspace(-4, 4, 10),
    "solver": ["saga", "liblinear", "lbfgs"]
}

random_search_lg_2 = RandomizedSearchCV(lg_2, param_lg_2, cv=5, scoring="accuracy", n_jobs=2)
random_search_lg_2.fit(X_train, y_train)

print(f"Best Parameters: {random_search_lg_2.best_params_}")

Best Parameters: {'solver': 'liblinear', 'C': np.float64(166.81005372000558)}


In [22]:
dt = DecisionTreeClassifier()

param_dt = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

random_search_dt = RandomizedSearchCV(dt, param_dt, cv=5, scoring="accuracy", n_jobs=2)
random_search_dt.fit(X_train, y_train)

print(f"Best Parameters: {random_search_dt.best_params_}")

Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'entropy'}


In [23]:
lgbm = LGBMClassifier()

param_lgbm = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "num_leaves": [20, 31, 40],
    "max_depth": [3, 5, 10]
}

random_search_lgbm = RandomizedSearchCV(lgbm, param_lgbm, cv=5, scoring="accuracy", n_jobs=2)
random_search_lgbm.fit(X_train, y_train)

print(f"Best Parameters: {random_search_lgbm.best_params_}")

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
Best Parameters: {'num_leaves': 20, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}


In [24]:
voting_clf = VotingClassifier(estimators=[
    ("rf", random_search_rf.best_estimator_),
    ("lg_1", random_search_lg_1.best_estimator_),
    ("lg_2", random_search_lg_2.best_estimator_),
    ("xgb", random_search_xgb.best_estimator_),
    ("dt", random_search_dt.best_estimator_),
    ("lgbm", random_search_lgbm.best_estimator_)
], voting="soft")
voting_clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


#### Make Prediction

In [25]:
y_pred = voting_clf.predict(X_test)
y_pred_real = voting_clf.predict(df_test)

#### Evaluate the Model

In [26]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score : {accuracy}")

f_score = f1_score(y_test, y_pred)
print(f"F1 Score : {f_score}")

Accuracy Score : 0.8100558659217877
F1 Score : 0.7571428571428571
