In [92]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from xgboost import XGBRFClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, classification_report

import os
import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [30]:
DATA_DIR = os.path.join("Dataset","2_processed_data","train_iteratoin4.csv")
df = pd.read_csv(DATA_DIR)

In [31]:
df.shape

(891, 33)

In [32]:
backward_selection_simple = ['Name_Words',
 'Name_Length',
 'Name_Init_ordinalencode',
 'Name_Init_master',
 'Name_Init_miss',
 'Name_Init_mr',
 'Name_Init_mrs',
 'Name_Init_rev',
 'Sex_labelencode',
 'Embraked_labelencoded',
 'Embarked_C',
 'Pclass',
 'SibSp',
 'Parch',
 'Age_Power_MinMax',
 'Fare_Power']

In [33]:
correlated_fe = ['Fare_Power',
 'Name_Init_mrs',
 'Name_Init_miss',
 'Name_Init_ordinalencode',
 'Sex_labelencode',
 'Name_Length',
 'Pclass',
 'Name_Init_mr']

In [34]:
TARGET_COL = ["Survived"]

## Tunning Logistic Regression

In [93]:
X = np.array(df[backward_selection_simple])
y = np.array(df[TARGET_COL])

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [95]:
model = LogisticRegression()

In [96]:
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print(f"Train f1: {f1_score(y_train, y_pred_train)}")
print(f"Test f1 : {f1_score(y_test, y_pred_test)}")

Train f1: 0.7655913978494624
Test f1 : 0.78


In [97]:
print(classification_report(y_pred_test, y_test))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87       171
           1       0.76      0.80      0.78        97

    accuracy                           0.84       268
   macro avg       0.82      0.83      0.82       268
weighted avg       0.84      0.84      0.84       268



#### Grid Search

In [98]:
# parameters
DATA = df.copy()
FEATURES = backward_selection_simple
SCORING = "f1"
CV = 3
SEED = 0
SHUFFLE = True
cv = StratifiedKFold(n_splits=CV, shuffle=SHUFFLE, random_state=SEED)

In [99]:
# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],  # 'liblinear' is required for 'l1' penalty
    'max_iter': [100, 200, 300, 400, 500]
}

In [100]:
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=SCORING, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation score: 0.7675070771073106


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [101]:
# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters found:  {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation score:  0.7675070771073106


In [102]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
# model = LogisticRegression(best_model)
best_model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print(f"Train f1: {f1_score(y_train, y_pred_train)}")
print(f"Test f1 : {f1_score(y_test, y_pred_test)}")

Train f1: 0.7655913978494624
Test f1 : 0.78


In [103]:
print(classification_report(y_pred_test, y_test))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87       171
           1       0.76      0.80      0.78        97

    accuracy                           0.84       268
   macro avg       0.82      0.83      0.82       268
weighted avg       0.84      0.84      0.84       268



#### Random Search

In [104]:
# Set up GridSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=cv, scoring=SCORING, n_jobs=-1)

# Fit the model
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

Best parameters: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 200, 'C': 1}
Best cross-validation score: 0.7675070771073106


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [105]:
# Print the best parameters and the best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 200, 'C': 1}
Best cross-validation score:  0.7675070771073106


In [106]:
# Evaluate the best model on the test set
best_model = random_search.best_estimator_
# model = LogisticRegression(best_model)
best_model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print(f"Train f1: {f1_score(y_train, y_pred_train)}")
print(f"Test f1 : {f1_score(y_test, y_pred_test)}")

Train f1: 0.7655913978494624
Test f1 : 0.78


In [107]:
print(classification_report(y_pred_test, y_test))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87       171
           1       0.76      0.80      0.78        97

    accuracy                           0.84       268
   macro avg       0.82      0.83      0.82       268
weighted avg       0.84      0.84      0.84       268



## Tunning XGB Random Forest Boost

In [108]:
X = np.array(df[correlated_fe])
y = np.array(df[TARGET_COL])

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [116]:
model = XGBRFClassifier()

In [117]:
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print(f"Train f1: {f1_score(y_train, y_pred_train)}")
print(f"Test f1 : {f1_score(y_test, y_pred_test)}")

Train f1: 0.8388520971302428
Test f1 : 0.7589743589743588


In [118]:
print(classification_report(y_pred_test, y_test))

              precision    recall  f1-score   support

           0       0.89      0.84      0.86       176
           1       0.72      0.80      0.76        92

    accuracy                           0.82       268
   macro avg       0.80      0.82      0.81       268
weighted avg       0.83      0.82      0.83       268



#### Grid Search

In [119]:
# parameters
FEATURES = correlated_fe
SCORING = "f1"
CV = 3
SEED = 0
SHUFFLE = True
cv = StratifiedKFold(n_splits=CV, shuffle=SHUFFLE, random_state=SEED)

In [144]:
from scipy.stats import uniform, randint

# Define the parameter distributions
param_distributions = {
    'n_estimators': np.arange(50, 310, 10),             # Number of boosting rounds (trees)
    'max_depth': np.arange(3, 15),                  # Maximum depth of a tree
    'learning_rate': np.arange(0.01, 0.3, 0.01),            # Learning rate
    'subsample': np.arange(0.4, 0.5, 0.1),                 # Fraction of samples used for fitting the trees
    'colsample_bytree': np.arange(0.4, 0.5, 0.1),          # Fraction of features used for each tree
    'gamma': np.arange(0, 5, 0.1),                         # Minimum loss reduction required to make a further partition
    'min_child_weight': np.arange(1, 10, 1)               # Minimum sum of instance weight (hessian) needed in a child
}

In [145]:
param_distributions

{'n_estimators': array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
        180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300]),
 'max_depth': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
        0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
        0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29]),
 'subsample': array([0.4]),
 'colsample_bytree': array([0.4]),
 'gamma': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2,
        1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5,
        2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8,
        3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9]),
 'min_child_weight': array([1, 2, 3, 4, 5, 6, 7, 8, 9])}

In [146]:
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_distributions, cv=cv, scoring=SCORING, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

KeyboardInterrupt: 