## Imports and data reading


In [28]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import numpy as np

import polars as pl

df = pl.read_csv("../Data/Titanic/advancedDataframeSolution.csv")

df.head()

PassengerId,Survived,Sex_female,Embarked_C,IsChild,HasSmallFamily,PC1,PC3
i64,i64,i64,i64,i64,i64,f64,f64
1,0,0,0,0,1,-1.140324,0.054224
2,1,1,1,0,1,2.010718,0.217963
3,1,1,0,0,0,-1.078463,0.052638
4,1,1,0,0,1,1.782793,0.223803
5,0,0,0,0,0,-1.067523,0.052358


## Splitting data


In [29]:
X, y = df.drop("Survived", "PassengerId"), df["Survived"] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Models


In [30]:
models = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'max_depth': np.arange(1, 10)
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': np.arange(1, 10, 0.1),
            'penalty': ['l2']
        }
    },
    # 'SVM': {
    #     'model': SVC(random_state=42),
    #     'params': {
    #         'C': np.arange(1, 10, 0.1),
    #         'kernel': ['rbf', 'linear']
    #     }
    # },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': np.arange(1, 100),
            'learning_rate': np.arange(1, 10)
        }
    }
}

## Gridsearch


In [31]:
results = {}
for model_key, model_values in models.items():
    print(f"Training {model_key}...")

    grid_search = GridSearchCV(
        estimator=model_values["model"],
        param_grid=model_values["params"],
        cv=5,
        scoring="accuracy",
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[model_key] = {
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "test_score": accuracy
    }


Training RandomForest...
Training LogisticRegression...
Training GradientBoosting...


In [32]:
for key, value in results.items():
    display(key, value)

'RandomForest'

{'best_params': {'max_depth': np.int64(6), 'n_estimators': np.int64(93)},
 'cv_score': np.float64(0.8342657342657344),
 'test_score': 0.8156424581005587}

'LogisticRegression'

{'best_params': {'C': np.float64(1.4000000000000004), 'penalty': 'l2'},
 'cv_score': np.float64(0.7850389047572146),
 'test_score': 0.7877094972067039}

'GradientBoosting'

{'best_params': {'learning_rate': np.int64(1), 'n_estimators': np.int64(8)},
 'cv_score': np.float64(0.823076923076923),
 'test_score': 0.8156424581005587}

In [33]:
final_model =RandomForestClassifier(max_depth=6, n_estimators=13)
final_model.fit(X, y)

X_kaggle = pl.read_csv("../Data/Titanic/baseDataframeSolution.csv")
solution = final_model.predict(X_kaggle)

solution

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Age
- Embarked_Q
- Fare
- Parch
- PassengerId
- ...
Feature names seen at fit time, yet now missing:
- HasSmallFamily
- IsChild
- PC1
- PC3
- Sex_female


In [None]:
X_solution = pl.read_csv("../Data/Titanic/test.csv")

In [None]:
pl.DataFrame({
 "PassengerId" : X_solution["PassengerId"],
 "Survived" : solution
}).write_csv("../Data/Titanic/solution.csv")