Analysis of the Raw + SFF features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix,
    ConfusionMatrixDisplay, log_loss, top_k_accuracy_score, roc_auc_score
)
from sklearn.preprocessing import label_binarize

# 1) Loading the data
# NOTE(review): absolute Colab/Drive path — presumably requires Drive to be mounted; confirm.
file_path = "/content/drive/MyDrive/feature_engineered_with_pca_optimal.csv"

df = pd.read_csv(file_path)

# Drop columns whose name starts with "Unnamed" BEFORE previewing, so the preview
# shows exactly the column layout that the positional slices below rely on.
df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed")]
display(df.head())

# Features and target are selected by position; fail with a clear message if the
# cleaned frame is too narrow to contain the target column at position 35.
assert df.shape[1] > 35, (
    f"expected at least 36 columns after dropping 'Unnamed' columns, got {df.shape[1]}"
)

# Columns 0..34 are the model inputs for this section; column 35 is the target.
X_df = df.iloc[:, 0:35]
y = df.iloc[:, 35]

# 2) Train/test splitting
# 80/20 split, stratified on the target and seeded for reproducibility.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Install/upgrade XGBoost from inside the notebook (%pip is the IPython magic for this).
# NOTE(review): the version is not pinned (-U takes the latest), so results may change
# between runs — consider pinning a specific version.
%pip install -U xgboost
import xgboost, sys
# Report the xgboost version that was actually imported and the running interpreter.
print("xgboost", xgboost.__version__, "in", sys.executable)


In [None]:
# RandomizedSearchCV for XGBoost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, f1_score
from xgboost import XGBClassifier
from scipy.stats import randint, uniform, loguniform

# model: most-frequent-value imputation followed by a multi-class XGBoost classifier
xgb_clf = XGBClassifier(
    n_jobs=-1,
    random_state=42,
    eval_metric="mlogloss",
    tree_method="hist",
    num_class=4,
    objective="multi:softprob",
)
xgb_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("xgb", xgb_clf),
])

# Cross Validation and scoring: seeded, shuffled 3-fold stratified CV; each scorer
# is registered under its own sklearn scorer name.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=3)
scoring = {metric: metric for metric in ("neg_log_loss", "accuracy", "f1_macro")}

# Parameters: sampling distributions for the "xgb" pipeline step.
# scipy's randint excludes the upper bound; uniform(loc, scale) spans [loc, loc + scale].
param_dist = dict(
    xgb__n_estimators=randint(100, 1001),
    xgb__learning_rate=loguniform(1e-3, 3e-1),
    xgb__max_depth=randint(3, 9),
    xgb__min_child_weight=randint(1, 9),
    xgb__subsample=uniform(loc=0.6, scale=0.4),
    xgb__colsample_bytree=uniform(loc=0.6, scale=0.4),
    xgb__gamma=loguniform(1e-8, 1e1),
    xgb__reg_alpha=loguniform(1e-8, 1e1),
    xgb__reg_lambda=loguniform(1e-2, 1e2),
)

# Randomized hyper-parameter search: 50 sampled candidates, three scorers tracked,
# and the candidate with the best neg-log-loss is refit on the full training split.
search_options = dict(
    n_iter=50,
    scoring=scoring,
    refit="neg_log_loss",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    return_train_score=True,
)

# searching (fit returns the fitted search object itself)
search = RandomizedSearchCV(xgb_pipe, param_dist, **search_options).fit(
    X_train_df, y_train
)

# results table
cvres = pd.DataFrame(search.cv_results_)

# cv_results_ column of each searched hyper-parameter -> short display name.
param_cols = {
    f"param_xgb__{hp}": hp
    for hp in (
        "n_estimators", "learning_rate", "max_depth",
        "min_child_weight", "subsample", "colsample_bytree",
        "gamma", "reg_alpha", "reg_lambda",
    )
}
# Mean train / CV metrics; the log-loss columns are negated scores, so flip the sign.
metric_cols = {
    "train_logloss": -cvres["mean_train_neg_log_loss"],
    "cv_logloss": -cvres["mean_test_neg_log_loss"],
    "train_acc": cvres["mean_train_accuracy"],
    "cv_acc": cvres["mean_test_accuracy"],
    "train_f1_macro": cvres["mean_train_f1_macro"],
    "cv_f1_macro": cvres["mean_test_f1_macro"],
}
# One row per candidate, best (lowest) CV log-loss first.
results = (
    cvres[list(param_cols)]
    .rename(columns=param_cols)
    .assign(**metric_cols)
    .sort_values("cv_logloss")
    .reset_index(drop=True)
)
display(results.head(20))

best_cv_logloss = -search.best_score_
print("\nBest params (by CV log-loss):")
print(search.best_params_)
print(f"Best CV log-loss: {best_cv_logloss:.5f}")

# ----- quick visuals (optional)
# Each entry: (x values, y values, x label, y label, title) for one scatter plot.
scatter_specs = [
    (
        results["n_estimators"],
        results["cv_logloss"],
        "n_estimators",
        "CV Log-loss (lower is better)",
        "XGB RandomizedSearch — CV Log-loss vs n_estimators",
    ),
    (
        results["learning_rate"].astype(float),
        results["cv_f1_macro"],
        "learning_rate",
        "CV F1 (macro)",
        "XGB RandomizedSearch — CV F1 (macro) vs learning_rate",
    ),
]
for xs, ys, x_label, y_label, plot_title in scatter_specs:
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(xs, ys)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    ax.grid(True)
    plt.show()

# Final model
# The search was created with refit="neg_log_loss", so best_estimator_ has already
# been retrained on the whole training split with the winning parameters. Fitting it
# again on the same data only repeated that training, so no extra .fit() is done here.
final_model = search.best_estimator_

# Held-out evaluation: class probabilities for log-loss, hard labels for accuracy / F1.
proba_test = final_model.predict_proba(X_test_df)
pred_test  = final_model.predict(X_test_df)

# The label set is taken from the training split and passed explicitly to log_loss.
test_logloss  = log_loss(y_test, proba_test, labels=np.unique(y_train))
test_acc      = accuracy_score(y_test, pred_test)
test_f1_macro = f1_score(y_test, pred_test, average="macro")

print(f"\nTest log-loss: {test_logloss:.4f}")
print(f"Test accuracy: {test_acc:.4f}")
print(f"Test F1 (macro): {test_f1_macro:.4f}")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier

# best params from the search above
# NOTE(review): these are pasted literals, presumably copied from an earlier run of the
# search — confirm they still match search.best_params_ before relying on them.
tuned_xgb_values = {
    "colsample_bytree": 0.7334834444556088,
    "gamma":            1.931084870540406e-07,
    "learning_rate":    0.040957144541603416,
    "max_depth":        7,
    "min_child_weight": 2,
    "n_estimators":     443,
    "reg_alpha":        0.31044435499483225,
    "reg_lambda":       0.07068974950624607,
    "subsample":        0.6727299868828402,
}
# Prefix with the pipeline step name so Pipeline.set_params routes them to "xgb".
best_params = {f"xgb__{name}": value for name, value in tuned_xgb_values.items()}

# pipeline: same imputer + classifier layout as the one used for the search
xgb_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("xgb", XGBClassifier(
        n_jobs=-1,
        random_state=42,
        eval_metric="mlogloss",
        tree_method="hist",
        num_class=4,              # <-- set to your number of classes
        objective="multi:softprob",
    )),
])

# apply the tuned values, then fit on the training data
xgb_pipe.set_params(**best_params).fit(X_train_df, y_train)

# Predict hard labels for the held-out split.
y_pred = xgb_pipe.predict(X_test_df)

# Confusion matrix over the label set seen in training (rows = true, columns = predicted).
labels = np.unique(y_train)
cm = confusion_matrix(y_test, y_pred, labels=labels)
row_names = [f"true_{lab}" for lab in labels]
col_names = [f"pred_{lab}" for lab in labels]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
display(cm_df)

# Plot the same matrix as a heat map with integer cell annotations.
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
ax.set_title("XGBoost Confusion Matrix (best params)")
fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report

# F1 scores: macro- and weighted-averaged F1 of the test predictions
f1_macro, f1_weighted = (
    f1_score(y_test, y_pred, average=avg) for avg in ("macro", "weighted")
)

# Print both F1 values followed by the classification report, one item per line.
print(
    f"F1 (macro):    {f1_macro:.4f}",
    f"F1 (weighted): {f1_weighted:.4f}",
    "\nClassification report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:


feat_names = list(X_train_df.columns)

# SHAP global summary for interpretability.
# This step is optional and best-effort: a failure is reported, never raised.
# A missing package and a failed computation are reported separately, so the
# "pip install" tip is only shown when shap really could not be imported.
try:
    import shap
except ImportError as e:
    shap = None
    print("\nSHAP skipped (optional). Reason:", repr(e))
    print("Tip: `pip install shap` to enable SHAP plots.")

if shap is not None:
    try:
        # The tree explainer is given the bare XGBoost step, so the imputer step
        # is applied by hand to the test features first.
        X_test_imp = pd.DataFrame(
            xgb_pipe.named_steps["imp"].transform(X_test_df),
            columns=feat_names, index=X_test_df.index
        )
        explainer = shap.TreeExplainer(xgb_pipe.named_steps["xgb"])

        # Explain at most 400 test rows (seeded sample) to bound the cost.
        sample_n = min(400, len(X_test_imp))
        X_shap = X_test_imp.sample(sample_n, random_state=42)
        shap_values = explainer(X_shap, check_additivity=False)

        # NOTE(review): the model is multi-class; confirm the installed shap version
        # renders a multi-class explanation in summary_plot as intended.
        shap.summary_plot(shap_values, X_shap, show=True)
    except Exception as e:
        # shap is installed but the computation or plot failed: report the real reason.
        print("\nSHAP skipped (optional). Reason:", repr(e))


Analysis of Raw features

In [None]:
# Inputs for this section: the first 25 columns; the target is column 35.
X_df = df.iloc[:, :25]
y = df.iloc[:, 35]

# 2) Train/test splitting: 80/20, stratified on the target, seeded
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, **split_options)

# model: most-frequent-value imputation followed by a multi-class XGBoost classifier
xgb_clf = XGBClassifier(
    n_jobs=-1,
    random_state=42,
    eval_metric="mlogloss",
    tree_method="hist",
    num_class=4,
    objective="multi:softprob",
)
xgb_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("xgb", xgb_clf),
])

# Cross Validation and scoring: seeded, shuffled 3-fold stratified CV; each scorer
# is registered under its own sklearn scorer name.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=3)
scoring = {metric: metric for metric in ("neg_log_loss", "accuracy", "f1_macro")}

# Parameters: sampling distributions for the "xgb" pipeline step.
# scipy's randint excludes the upper bound; uniform(loc, scale) spans [loc, loc + scale].
param_dist = dict(
    xgb__n_estimators=randint(100, 1001),
    xgb__learning_rate=loguniform(1e-3, 3e-1),
    xgb__max_depth=randint(3, 9),
    xgb__min_child_weight=randint(1, 9),
    xgb__subsample=uniform(loc=0.6, scale=0.4),
    xgb__colsample_bytree=uniform(loc=0.6, scale=0.4),
    xgb__gamma=loguniform(1e-8, 1e1),
    xgb__reg_alpha=loguniform(1e-8, 1e1),
    xgb__reg_lambda=loguniform(1e-2, 1e2),
)

# Randomized hyper-parameter search: 50 sampled candidates, three scorers tracked,
# and the candidate with the best neg-log-loss is refit on the full training split.
search_options = dict(
    n_iter=50,
    scoring=scoring,
    refit="neg_log_loss",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    return_train_score=True,
)

# searching (fit returns the fitted search object itself)
search = RandomizedSearchCV(xgb_pipe, param_dist, **search_options).fit(
    X_train_df, y_train
)

# results table
cvres = pd.DataFrame(search.cv_results_)

# cv_results_ column of each searched hyper-parameter -> short display name.
param_cols = {
    f"param_xgb__{hp}": hp
    for hp in (
        "n_estimators", "learning_rate", "max_depth",
        "min_child_weight", "subsample", "colsample_bytree",
        "gamma", "reg_alpha", "reg_lambda",
    )
}
# Mean train / CV metrics; the log-loss columns are negated scores, so flip the sign.
metric_cols = {
    "train_logloss": -cvres["mean_train_neg_log_loss"],
    "cv_logloss": -cvres["mean_test_neg_log_loss"],
    "train_acc": cvres["mean_train_accuracy"],
    "cv_acc": cvres["mean_test_accuracy"],
    "train_f1_macro": cvres["mean_train_f1_macro"],
    "cv_f1_macro": cvres["mean_test_f1_macro"],
}
# One row per candidate, best (lowest) CV log-loss first.
results = (
    cvres[list(param_cols)]
    .rename(columns=param_cols)
    .assign(**metric_cols)
    .sort_values("cv_logloss")
    .reset_index(drop=True)
)
display(results.head(20))

best_cv_logloss = -search.best_score_
print("\nBest params (by CV log-loss):")
print(search.best_params_)
print(f"Best CV log-loss: {best_cv_logloss:.5f}")

# ----- quick visuals (optional)
# Each entry: (x values, y values, x label, y label, title) for one scatter plot.
scatter_specs = [
    (
        results["n_estimators"],
        results["cv_logloss"],
        "n_estimators",
        "CV Log-loss (lower is better)",
        "XGB RandomizedSearch — CV Log-loss vs n_estimators",
    ),
    (
        results["learning_rate"].astype(float),
        results["cv_f1_macro"],
        "learning_rate",
        "CV F1 (macro)",
        "XGB RandomizedSearch — CV F1 (macro) vs learning_rate",
    ),
]
for xs, ys, x_label, y_label, plot_title in scatter_specs:
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(xs, ys)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    ax.grid(True)
    plt.show()

# Final model
# The search was created with refit="neg_log_loss", so best_estimator_ has already
# been retrained on the whole training split with the winning parameters. Fitting it
# again on the same data only repeated that training, so no extra .fit() is done here.
final_model = search.best_estimator_

# Held-out evaluation: class probabilities for log-loss, hard labels for accuracy / F1.
proba_test = final_model.predict_proba(X_test_df)
pred_test  = final_model.predict(X_test_df)

# The label set is taken from the training split and passed explicitly to log_loss.
test_logloss  = log_loss(y_test, proba_test, labels=np.unique(y_train))
test_acc      = accuracy_score(y_test, pred_test)
test_f1_macro = f1_score(y_test, pred_test, average="macro")

print(f"\nTest log-loss: {test_logloss:.4f}")
print(f"Test accuracy: {test_acc:.4f}")
print(f"Test F1 (macro): {test_f1_macro:.4f}")



# best params from the search above
# Read them from the fitted search object rather than from pasted literals: the
# literals previously used here were the same values as in the Raw+SFF section,
# so they were not guaranteed to be the optimum found by THIS section's search.
best_params = dict(search.best_params_)

# pipeline: a fresh imputer + classifier with the same fixed settings as the search
xgb_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("xgb", XGBClassifier(
        objective="multi:softprob",
        num_class=4,              # <-- set to your number of classes
        tree_method="hist",
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1
    ))
])

# applying the best params to the pipeline (keys carry the "xgb__" step prefix)
xgb_pipe.set_params(**best_params)

# fitting on training data
xgb_pipe.fit(X_train_df, y_train)

# Predict hard labels for the held-out split.
y_pred = xgb_pipe.predict(X_test_df)

# Confusion matrix over the label set seen in training (rows = true, columns = predicted).
labels = np.unique(y_train)
cm = confusion_matrix(y_test, y_pred, labels=labels)
row_names = [f"true_{lab}" for lab in labels]
col_names = [f"pred_{lab}" for lab in labels]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
display(cm_df)

# Plot the same matrix as a heat map with integer cell annotations.
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
ax.set_title("XGBoost Confusion Matrix (best params)")
fig.tight_layout()
plt.show()


Analysis of PCA features (17 features)

In [None]:
# Inputs for this section: columns 36..52 (17 columns); the target is column 35.
X_df = df.iloc[:, 36:53]
y = df.iloc[:, 35]

# 2) Train/test splitting: 80/20, stratified on the target, seeded
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, **split_options)

# model: most-frequent-value imputation followed by a multi-class XGBoost classifier
xgb_clf = XGBClassifier(
    n_jobs=-1,
    random_state=42,
    eval_metric="mlogloss",
    tree_method="hist",
    num_class=4,
    objective="multi:softprob",
)
xgb_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("xgb", xgb_clf),
])

# Cross Validation and scoring: seeded, shuffled 3-fold stratified CV; each scorer
# is registered under its own sklearn scorer name.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=3)
scoring = {metric: metric for metric in ("neg_log_loss", "accuracy", "f1_macro")}

# Parameters: sampling distributions for the "xgb" pipeline step.
# scipy's randint excludes the upper bound; uniform(loc, scale) spans [loc, loc + scale].
param_dist = dict(
    xgb__n_estimators=randint(100, 1001),
    xgb__learning_rate=loguniform(1e-3, 3e-1),
    xgb__max_depth=randint(3, 9),
    xgb__min_child_weight=randint(1, 9),
    xgb__subsample=uniform(loc=0.6, scale=0.4),
    xgb__colsample_bytree=uniform(loc=0.6, scale=0.4),
    xgb__gamma=loguniform(1e-8, 1e1),
    xgb__reg_alpha=loguniform(1e-8, 1e1),
    xgb__reg_lambda=loguniform(1e-2, 1e2),
)

# Randomized hyper-parameter search: 50 sampled candidates, three scorers tracked,
# and the candidate with the best neg-log-loss is refit on the full training split.
search_options = dict(
    n_iter=50,
    scoring=scoring,
    refit="neg_log_loss",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    return_train_score=True,
)

# searching (fit returns the fitted search object itself)
search = RandomizedSearchCV(xgb_pipe, param_dist, **search_options).fit(
    X_train_df, y_train
)

# results table
cvres = pd.DataFrame(search.cv_results_)

# cv_results_ column of each searched hyper-parameter -> short display name.
param_cols = {
    f"param_xgb__{hp}": hp
    for hp in (
        "n_estimators", "learning_rate", "max_depth",
        "min_child_weight", "subsample", "colsample_bytree",
        "gamma", "reg_alpha", "reg_lambda",
    )
}
# Mean train / CV metrics; the log-loss columns are negated scores, so flip the sign.
metric_cols = {
    "train_logloss": -cvres["mean_train_neg_log_loss"],
    "cv_logloss": -cvres["mean_test_neg_log_loss"],
    "train_acc": cvres["mean_train_accuracy"],
    "cv_acc": cvres["mean_test_accuracy"],
    "train_f1_macro": cvres["mean_train_f1_macro"],
    "cv_f1_macro": cvres["mean_test_f1_macro"],
}
# One row per candidate, best (lowest) CV log-loss first.
results = (
    cvres[list(param_cols)]
    .rename(columns=param_cols)
    .assign(**metric_cols)
    .sort_values("cv_logloss")
    .reset_index(drop=True)
)
display(results.head(20))

best_cv_logloss = -search.best_score_
print("\nBest params (by CV log-loss):")
print(search.best_params_)
print(f"Best CV log-loss: {best_cv_logloss:.5f}")

# ----- quick visuals (optional)
# Each entry: (x values, y values, x label, y label, title) for one scatter plot.
scatter_specs = [
    (
        results["n_estimators"],
        results["cv_logloss"],
        "n_estimators",
        "CV Log-loss (lower is better)",
        "XGB RandomizedSearch — CV Log-loss vs n_estimators",
    ),
    (
        results["learning_rate"].astype(float),
        results["cv_f1_macro"],
        "learning_rate",
        "CV F1 (macro)",
        "XGB RandomizedSearch — CV F1 (macro) vs learning_rate",
    ),
]
for xs, ys, x_label, y_label, plot_title in scatter_specs:
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(xs, ys)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    ax.grid(True)
    plt.show()

# Final model
# The search was created with refit="neg_log_loss", so best_estimator_ has already
# been retrained on the whole training split with the winning parameters. Fitting it
# again on the same data only repeated that training, so no extra .fit() is done here.
final_model = search.best_estimator_

# Held-out evaluation: class probabilities for log-loss, hard labels for accuracy / F1.
proba_test = final_model.predict_proba(X_test_df)
pred_test  = final_model.predict(X_test_df)

# The label set is taken from the training split and passed explicitly to log_loss.
test_logloss  = log_loss(y_test, proba_test, labels=np.unique(y_train))
test_acc      = accuracy_score(y_test, pred_test)
test_f1_macro = f1_score(y_test, pred_test, average="macro")

print(f"\nTest log-loss: {test_logloss:.4f}")
print(f"Test accuracy: {test_acc:.4f}")
print(f"Test F1 (macro): {test_f1_macro:.4f}")



# best params from the search above
# Read them from the fitted search object rather than from pasted literals: the
# literals previously used here were the same values as in the Raw+SFF section,
# so they were not guaranteed to be the optimum found by THIS section's search.
best_params = dict(search.best_params_)

# pipeline: a fresh imputer + classifier with the same fixed settings as the search
xgb_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("xgb", XGBClassifier(
        objective="multi:softprob",
        num_class=4,              # <-- set to your number of classes
        tree_method="hist",
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1
    ))
])

# applying the best params to the pipeline (keys carry the "xgb__" step prefix)
xgb_pipe.set_params(**best_params)

# fitting on training data
xgb_pipe.fit(X_train_df, y_train)

# Predict hard labels for the held-out split.
y_pred = xgb_pipe.predict(X_test_df)

# Confusion matrix over the label set seen in training (rows = true, columns = predicted).
labels = np.unique(y_train)
cm = confusion_matrix(y_test, y_pred, labels=labels)
row_names = [f"true_{lab}" for lab in labels]
col_names = [f"pred_{lab}" for lab in labels]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
display(cm_df)

# Plot the same matrix as a heat map with integer cell annotations.
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
ax.set_title("XGBoost Confusion Matrix (best params)")
fig.tight_layout()
plt.show()
