In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix,
    ConfusionMatrixDisplay, log_loss, top_k_accuracy_score, roc_auc_score
)
from sklearn.preprocessing import label_binarize

# 1) Loading the data
# NOTE(review): absolute Google Drive path; presumably requires Drive to be
# mounted at /content/drive before this cell runs. Confirm on a fresh runtime.
file_path1 = "/content/drive/MyDrive/feature_engineered_with_pca_optimal.csv"

# Positional layout relied on below (after "Unnamed" columns are removed):
# columns 0..34 are used as features, column 35 is used as the target.
N_FEATURES = 35
LABEL_COL = 35

df1 = pd.read_csv(file_path1)

# Drop every column whose name starts with "Unnamed" (pandas gives such names
# to columns with an empty header; presumably a saved index here).
# This must happen BEFORE the positional slicing, otherwise the indices shift.
df1 = df1.loc[:, ~df1.columns.astype(str).str.startswith("Unnamed")]

# Fail early with a readable message if the file does not have the expected width.
assert df1.shape[1] > LABEL_COL, (
    f"expected at least {LABEL_COL + 1} columns after dropping 'Unnamed' ones, "
    f"found {df1.shape[1]}"
)

# Feature matrix: the first N_FEATURES columns (iloc's end bound is exclusive).
X_df1 = df1.iloc[:, 0:N_FEATURES]

# Target: column LABEL_COL taken as-is. No one-hot decoding is performed here;
# the column is used directly as the class label.
y1 = df1.iloc[:, LABEL_COL]

# 2) Train/test splitting
# 80/20 split with a fixed seed; stratify=y1 keeps the class proportions of y1
# in both partitions.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df1, y1, test_size=0.2, random_state=42, stratify=y1
)

In [None]:
#Random Forest CV sweep
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score, f1_score

# Base estimator for the sweep: impute, then classify.
# The step names ("impute", "clf") matter: the search space addresses the
# forest's hyperparameters through the "clf__" prefix.
rf_pipe = Pipeline(
    steps=[
        # Missing values are replaced by each column's most frequent value.
        ("impute", SimpleImputer(strategy="most_frequent")),
        (
            "clf",
            RandomForestClassifier(
                oob_score=False,
                random_state=42,  # fixed seed so repeated fits build the same forest
                n_jobs=-1,        # use all available cores when fitting/predicting
                class_weight="balanced_subsample",
            ),
        ),
    ]
)

# Cross-validation splitter: 3 stratified folds, shuffled with a fixed seed.
cv = StratifiedKFold(shuffle=True, random_state=42, n_splits=3)
# Metrics evaluated for every candidate. Each key is also the scorer string,
# and it becomes the suffix of the "mean_train_*" / "mean_test_*" result columns.
scoring = {metric: metric for metric in ("neg_log_loss", "accuracy", "f1_macro")}

# Hyperparameter search space. Every key carries the "clf__" prefix so the
# value is routed to the pipeline step named "clf".
param_dist = dict(
    clf__n_estimators=[400, 600, 800, 1000],
    clf__max_depth=[None, 4, 6, 8, 10],
    clf__max_features=["sqrt", "log2", 0.5],
    clf__min_samples_leaf=[5, 10, 20],
    clf__min_samples_split=[10, 20, 40],
    clf__max_leaf_nodes=[None, 32, 64, 128],
    # bootstrap is pinned to True; the fractional max_samples values below
    # describe the size of each tree's bootstrap sample.
    clf__bootstrap=[True],
    clf__max_samples=[0.6, 0.7, 0.8, 0.9],
)

# Randomized sweep over param_dist: 60 sampled candidates, each scored on the
# `cv` folds with all metrics in `scoring`.
# NOTE(review): both this search and the forest inside rf_pipe request
# n_jobs=-1; check whether the nested parallelism oversubscribes the machine.
search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_dist,
    n_iter=60,
    scoring=scoring,
    # With several metrics, `refit` names the one that picks the winner;
    # the winning candidate is then refit on all the data passed to fit().
    refit="neg_log_loss",
    cv=cv,
    # Keep training-fold scores too, so train vs CV can be compared later.
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Run the sweep on the training partition only; the test set stays untouched.
search.fit(X_train_df, y_train)

# CV results table: one row per sampled candidate, best (lowest) CV log-loss first.
cvres = pd.DataFrame(search.cv_results_)

# Hyperparameter columns, in display order, with the "param_clf__" prefix removed.
param_names = [
    "n_estimators", "max_depth", "max_features",
    "min_samples_split", "min_samples_leaf", "max_leaf_nodes",
    "bootstrap", "max_samples",
]
prefixed_to_plain = {f"param_clf__{name}": name for name in param_names}
param_part = cvres[list(prefixed_to_plain)].rename(columns=prefixed_to_plain)

# Mean train/CV metrics. The log-loss scorer is negated ("neg_log_loss"),
# so the sign is flipped back to obtain a plain log-loss (lower is better).
metric_part = pd.DataFrame({
    "train_logloss":  -cvres["mean_train_neg_log_loss"],
    "cv_logloss":     -cvres["mean_test_neg_log_loss"],
    "train_acc":       cvres["mean_train_accuracy"],
    "cv_acc":          cvres["mean_test_accuracy"],
    "train_f1_macro":  cvres["mean_train_f1_macro"],
    "cv_f1_macro":     cvres["mean_test_f1_macro"],
})

rf_results = pd.concat([param_part, metric_part], axis=1)
rf_results = rf_results.sort_values("cv_logloss", ascending=True)
rf_results = rf_results.reset_index(drop=True)
display(rf_results)

# Report the winning configuration. best_score_ is the negated log-loss of the
# refit metric, so it is negated again for display.
print(
    "\nBest by CV log-loss:",
    search.best_params_,
    f"Best CV log-loss: {-search.best_score_:.5f}",
    sep="\n",
)

# One scatter plot per CV metric, each against the number of trees.
for metric_col, y_label, plot_title in (
    (
        "cv_logloss",
        "CV Log-loss (lower is better)",
        "RandomizedSearchCV — CV Log-loss vs n_estimators",
    ),
    (
        "cv_f1_macro",
        "CV F1 (macro)",
        "RandomizedSearchCV — CV F1 (macro) vs n_estimators",
    ),
):
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(rf_results["n_estimators"], rf_results[metric_col])
    ax.set_xlabel("n_estimators")
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    ax.grid(True)
    plt.show()

# Final model
# `search` was created with refit="neg_log_loss", so best_estimator_ has
# already been refit on the full (X_train_df, y_train) with the winning
# hyperparameters. Calling .fit() on it again with the same data and the same
# fixed random_state would only rebuild the same forest, so it is used as-is.
rf_final = search.best_estimator_

# Held-out evaluation: class probabilities for log-loss, hard labels for
# accuracy and macro-F1.
proba_test = rf_final.predict_proba(X_test_df)
pred_test  = rf_final.predict(X_test_df)

# The columns of predict_proba follow rf_final.classes_. Passing that exact
# array as `labels` keeps log_loss aligned with the probability columns, and
# still works if some class is absent from y_test.
test_logloss  = log_loss(y_test, proba_test, labels=rf_final.classes_)
test_acc      = accuracy_score(y_test, pred_test)
test_f1_macro = f1_score(y_test, pred_test, average="macro")

print(f"\nTest log-loss: {test_logloss:.4f}")
print(f"Test accuracy: {test_acc:.4f}")
print(f"Test F1 (macro): {test_f1_macro:.4f}")


The Random Forest results (macro-F1) were not as good as those of XGBoost, so we continue with XGBoost for the rest of the analysis.