## Imports and data reading

In [98]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import numpy as np

import polars as pl

df = pl.read_csv("../Data/baseDataframe.csv")

df.head()

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q
i64,i64,i64,f64,i64,i64,f64,i64,i64
0,3,1,-0.565419,1,0,-0.879247,0,0
1,1,0,0.663488,1,0,1.360456,1,0
1,3,0,-0.258192,0,0,-0.798092,0,0
1,1,0,0.433068,1,0,1.061442,0,0
0,3,1,0.433068,0,0,-0.783739,0,0


## Splitting data

In [99]:
X, y = df.drop("Survived"), df["Survived"] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Models

In [100]:
models = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'max_depth': np.arange(1, 10)
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'penalty': ['l2']
        }
    },
    'SVM': {
        'model': SVC(random_state=42),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'kernel': ['rbf', 'linear']
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'learning_rate': np.arange(1, 10)
        }
    }
}

## Gridsearch

In [101]:
results = {}
for model_key, model_values in models.items():
    print(f"Training {model_key}...")

    grid_search = GridSearchCV(
        estimator=model_values["model"],
        param_grid=model_values["params"],
        cv=5,
        scoring="accuracy",
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[model_key] = {
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "test_score": accuracy
    }


Training RandomForest...
Training LogisticRegression...
Training SVM...
Training GradientBoosting...


In [102]:
for key, value in results.items():
    display(key, value)

'RandomForest'

{'best_params': {'max_depth': np.int64(6), 'n_estimators': np.int64(13)},
 'cv_score': np.float64(0.8314389835516597),
 'test_score': 0.8156424581005587}

'LogisticRegression'

{'best_params': {'C': np.float64(1.0), 'penalty': 'l2'},
 'cv_score': np.float64(0.7892642568698907),
 'test_score': 0.7988826815642458}

'SVM'

{'best_params': {'C': np.float64(2.4000000000000012), 'kernel': 'rbf'},
 'cv_score': np.float64(0.8243967300305328),
 'test_score': 0.8156424581005587}

'GradientBoosting'

{'best_params': {'learning_rate': np.int64(1), 'n_estimators': np.int64(5)},
 'cv_score': np.float64(0.825844577957254),
 'test_score': 0.7597765363128491}

In [103]:
final_model =RandomForestClassifier(max_depth=6, n_estimators=13)
final_model.fit(X, y)

X_kaggle = pl.read_csv("../Data/Titanic/baseDataframeSolution.csv")
solution = final_model.predict(X_kaggle)

solution

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
X_solution = pl.read_csv("../Data/Titanic/test.csv")

In [None]:
pl.DataFrame({
 "PassengerId" : X_solution["PassengerId"],
 "Survived" : solution
}).write_csv("../Data/Titanic/solution.csv")