## Imports and data reading


In [48]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import numpy as np

import polars as pl

df = pl.read_csv("../Data/Titanic/advancedTrain.csv")

df.head()

PassengerId,Survived,Pclass,Sex_female,Age,Fare,Embarked_C,Embarked_Q,FamilySize,IsChild,HasCabinRegistered,HasSmallFamily
i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64
1,0,3,0,-0.565419,-0.879247,0,0,2,0,0,1
2,1,1,1,0.663488,1.360456,1,0,2,0,1,1
3,1,3,1,-0.258192,-0.798092,0,0,1,0,0,0
4,1,1,1,0.433068,1.061442,0,0,2,0,1,1
5,0,3,0,0.433068,-0.783739,0,0,1,0,0,0


## Splitting data


In [49]:
X, y = df.drop("Survived", "PassengerId"), df["Survived"] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Models


In [50]:
models = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'max_depth': np.arange(1, 10)
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'penalty': ['l2']
        }
    },
    'SVM': {
        'model': SVC(random_state=42),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'kernel': ['rbf', 'linear']
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'learning_rate': np.arange(1, 10)
        }
    }
}

## Gridsearch


In [51]:
results = {}
for model_key, model_values in models.items():
    print(f"Training {model_key}...")

    grid_search = GridSearchCV(
        estimator=model_values["model"],
        param_grid=model_values["params"],
        cv=5,
        scoring="accuracy",
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[model_key] = {
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "test_score": accuracy
    }


Training RandomForest...
Training LogisticRegression...
Training SVM...
Training GradientBoosting...


In [52]:
for key, value in results.items():
    display(key, value)

'RandomForest'

{'best_params': {'max_depth': np.int64(5), 'n_estimators': np.int64(70)},
 'cv_score': np.float64(0.8412685905643652),
 'test_score': 0.8212290502793296}

'LogisticRegression'

{'best_params': {'C': np.float64(5.200000000000004), 'penalty': 'l2'},
 'cv_score': np.float64(0.8089333202009259),
 'test_score': 0.8156424581005587}

'SVM'

{'best_params': {'C': np.float64(1.0), 'kernel': 'rbf'},
 'cv_score': np.float64(0.8300009849305624),
 'test_score': 0.8156424581005587}

'GradientBoosting'

{'best_params': {'learning_rate': np.int64(3), 'n_estimators': np.int64(1)},
 'cv_score': np.float64(0.8188023244361273),
 'test_score': 0.7988826815642458}

In [53]:
from joblib import dump

model = grid_search.best_estimator_
filename = f'{type(model).__name__}__{"__".join([(str(feature)) for feature in model.feature_names_in_])}.joblib'
dump(model, f'../Models/Titanic/{filename}')

['../Models/Titanic/GradientBoostingClassifier__Pclass__Sex_female__Age__Fare__Embarked_C__Embarked_Q__FamilySize__IsChild__HasCabinRegistered__HasSmallFamily.joblib']