## Imports and data reading


In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import numpy as np

import polars as pl

df = pl.read_csv("../../Data/Titanic/advancedTrain.csv")

df.head()

PassengerId,Survived,Sex_female,Age,Embarked_C,IsChild,HasSmallFamily,PC1,PC2,PC3
i64,i64,i64,f64,i64,i64,i64,f64,f64,f64
1,0,0,-0.565419,0,0,1,-1.140324,-0.013866,0.054224
2,1,1,0.663488,1,0,1,2.010718,-0.259612,0.217963
3,1,1,-0.258192,0,0,0,-1.078463,0.03864,0.052638
4,1,1,0.433068,0,0,1,1.782793,-0.453069,0.223803
5,0,0,0.433068,0,0,0,-1.067523,0.047926,0.052358


## Splitting data


In [2]:
X, y = df.drop("Survived", "PassengerId"), df["Survived"] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Models


In [3]:
estimators = {
    "RandomForest": RandomForestClassifier,
    "LogisticRegression": LogisticRegression,
    "SVM": SVC,
    "GradientBoosting": GradientBoostingClassifier
    }

models = {
    'RandomForest': {
        'model': estimators["RandomForest"](random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'max_depth': np.arange(1, 10)
        }
    },
    'LogisticRegression': {
        'model': estimators["LogisticRegression"](random_state=42, max_iter=1000),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'penalty': ['l2']
        }
    },
    'SVM': {
        'model': estimators["SVM"](random_state=42),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'kernel': ['rbf', 'linear']
        }
    },
    'GradientBoosting': {
        'model': estimators["GradientBoosting"](random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'learning_rate': np.arange(1, 10)
        }
    }
}

## Gridsearch


In [4]:
results = {}
total_score = 0
for model_key, model_values in models.items():
    print(f"Training {model_key}...")

    grid_search = GridSearchCV(
        estimator=model_values["model"],
        param_grid=model_values["params"],
        cv=5,
        scoring="accuracy",
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[model_key] = {
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "test_score": accuracy,
        "total_score": (grid_search.best_score_ + 2 * accuracy) / 3
    }
    if results[model_key]["total_score"] > total_score:
        total_score = results[model_key]["total_score"]
        results["best_model"] = {model_key : results[model_key]}

Training RandomForest...
Training LogisticRegression...
Training SVM...
Training GradientBoosting...


In [5]:
for key, value in results.items():
    display(key, value)

'RandomForest'

{'best_params': {'max_depth': np.int64(8), 'n_estimators': np.int64(52)},
 'cv_score': np.float64(0.8328966807840047),
 'test_score': 0.8268156424581006,
 'total_score': np.float64(0.828842655233402)}

'best_model'

{'RandomForest': {'best_params': {'max_depth': np.int64(8),
   'n_estimators': np.int64(52)},
  'cv_score': np.float64(0.8328966807840047),
  'test_score': 0.8268156424581006,
  'total_score': np.float64(0.828842655233402)}}

'LogisticRegression'

{'best_params': {'C': np.float64(2.6000000000000014), 'penalty': 'l2'},
 'cv_score': np.float64(0.8047276666994977),
 'test_score': 0.7988826815642458,
 'total_score': np.float64(0.8008310099426631)}

'SVM'

{'best_params': {'C': np.float64(1.0), 'kernel': 'rbf'},
 'cv_score': np.float64(0.8286220821432089),
 'test_score': 0.8100558659217877,
 'total_score': np.float64(0.8162446046622615)}

'GradientBoosting'

{'best_params': {'learning_rate': np.int64(3), 'n_estimators': np.int64(1)},
 'cv_score': np.float64(0.8201910765291048),
 'test_score': 0.7988826815642458,
 'total_score': np.float64(0.8059854798858654)}

In [6]:
model_name = list(results["best_model"].keys())[0]
model_params = results[model_name]["best_params"]
model = estimators[model_name](**model_params)

model.fit(X, y)

In [7]:
from joblib import dump

filename = f'{type(model).__name__}__{"__".join([(str(feature)) for feature in model.feature_names_in_])}.joblib'
%store filename
dump(model, f'../../Models/Titanic/{filename}')

Stored 'filename' (str)


['../../Models/Titanic/RandomForestClassifier__Sex_female__Age__Embarked_C__IsChild__HasSmallFamily__PC1__PC2__PC3.joblib']