In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from bayes_opt import BayesianOptimization

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# libraries imported above -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Read the dataset CSV; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='./felix_cleaned.csv')

In [7]:
# Split the frame into predictors (X) and target (y).
# The stated intent is to exclude both 'id' and 'class' from the features, but
# only 'class' used to be dropped. 'id' is now dropped as well when present;
# errors='ignore' keeps this a no-op if the CSV no longer has that column.
X = df.drop(columns=['class', 'id'], errors='ignore')
y = df['class']  # Target variable (raises KeyError if 'class' is missing)

In [8]:
# Hold out 20% of the rows as a test set, stratified on the target so both
# partitions keep the same class proportions. The fixed random_state makes the
# split (and every metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

# Objective function using log loss
def objective_log_loss(n_estimators, max_depth, min_samples_split, max_features):
    """Score one random-forest configuration for the Bayesian optimizer.

    The optimizer proposes continuous values for every hyperparameter, so the
    integer-valued ones are truncated with ``int()`` before building the model.

    Parameters
    ----------
    n_estimators : float
        Number of trees (truncated to int).
    max_depth : float
        Maximum tree depth (truncated to int).
    min_samples_split : float
        Minimum samples required to split a node (truncated to int).
    max_features : float
        Fraction of features considered at each split; capped at 0.999.

    Returns
    -------
    float
        Mean negative log loss over 3-fold cross-validation on
        ``X_train`` / ``y_train`` (module-level globals). Higher is better,
        which is what the optimizer's ``maximize`` expects.
    """
    model = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),  # Fraction, must be <= 1.0
        # Seed the forest so the same hyperparameters always yield the same
        # score; an unseeded forest makes the objective noisy and the search
        # non-reproducible.
        random_state=42,
    )
    # Use negative log loss as the scoring parameter
    fold_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_log_loss')
    return fold_scores.mean()

# Search space handed to the optimizer: (lower, upper) bound per hyperparameter.
param_bounds = dict(
    n_estimators=(10, 250),
    max_depth=(1, 50),
    min_samples_split=(2, 25),
    max_features=(0.1, 0.999),
)

# Run the search: 5 random initial probes followed by 15 guided iterations,
# maximizing the value returned by objective_log_loss.
optimizer_log_loss = BayesianOptimization(
    f=objective_log_loss,
    pbounds=param_bounds,
    random_state=1,
)
optimizer_log_loss.maximize(init_points=5, n_iter=15)

# Hyperparameters of the best-scoring probe.
best_params_log_loss = optimizer_log_loss.max['params']

# Retrain on the full training split with the best hyperparameters found.
# Values are converted exactly as in objective_log_loss (int truncation and the
# max_features cap) so the final model matches the configuration that was scored.
optimized_rf_log_loss = RandomForestClassifier(
    n_estimators=int(best_params_log_loss['n_estimators']),
    max_depth=int(best_params_log_loss['max_depth']),
    min_samples_split=int(best_params_log_loss['min_samples_split']),
    max_features=min(best_params_log_loss['max_features'], 0.999),
    # Seeded so the fitted model and its test metric are reproducible.
    random_state=42,
)
optimized_rf_log_loss.fit(X_train, y_train)

# Predict class probabilities for the test set; columns follow
# optimized_rf_log_loss.classes_.
y_pred_probs = optimized_rf_log_loss.predict_proba(X_test)

# Calculate and print the log loss for the test set.
# Pass the model's class order explicitly: without `labels`, log_loss infers
# the label set from y_test alone, which fails if a rare class present in
# training is absent from the test split.
test_log_loss = log_loss(
    y_test,
    y_pred_probs,
    labels=optimized_rf_log_loss.classes_,
)
print(f'Test Log Loss: {test_log_loss}')


|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-2.189   [0m | [0m21.43    [0m | [0m0.7476   [0m | [0m2.003    [0m | [0m82.56    [0m |
| [95m2        [0m | [95m-1.471   [0m | [95m8.191    [0m | [95m0.183    [0m | [95m6.284    [0m | [95m92.93    [0m |
| [0m3        [0m | [0m-1.529   [0m | [0m20.44    [0m | [0m0.5844   [0m | [0m11.64    [0m | [0m174.5    [0m |
| [0m4        [0m | [0m-1.844   [0m | [0m11.02    [0m | [0m0.8894   [0m | [0m2.63     [0m | [0m170.9    [0m |
| [0m5        [0m | [0m-2.215   [0m | [0m21.45    [0m | [0m0.6023   [0m | [0m5.229    [0m | [0m57.54    [0m |
| [0m6        [0m | [0m-1.844   [0m | [0m8.084    [0m | [0m0.5801   [0m | [0m6.753    [0m | [0m94.25    [0m |
| [0m7        [0m | [0m-1.523   [0m | [0m20.09    [0m | [0m0.8619   [0m | [0m12.61    [0m | [0m174.4   