### Import and Setup

In [None]:
pip install catboost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import seaborn as sns
import matplotlib.pyplot as plt


### Load feature dataset

In [2]:
#X = np.load("dataset/X.npy")
#y = np.load("dataset/y.npy")
#subjects = np.load("dataset/subjects.npy")  # shape: [n_epochs]
df_features = pd.read_csv('features.csv')
X = df_features.drop(columns=['stage']).values
y = df_features['stage'].values
print("Shapes:", X.shape, y.shape)

Shapes: (76468, 20) (76468,)


In [3]:
df_features.columns

Index(['mean', 'std', 'skew', 'kurtosis', 'zcr', 'hjorth_activity',
       'hjorth_mobility', 'hjorth_complexity', 'total_power',
       'spectral_entropy', 'delta', 'theta', 'alpha', 'sigma', 'beta',
       'delta_rel', 'theta_rel', 'alpha_rel', 'sigma_rel', 'beta_rel',
       'stage'],
      dtype='object')

In [4]:
df_features.head()

Unnamed: 0,mean,std,skew,kurtosis,zcr,hjorth_activity,hjorth_mobility,hjorth_complexity,total_power,spectral_entropy,...,theta,alpha,sigma,beta,delta_rel,theta_rel,alpha_rel,sigma_rel,beta_rel,stage
0,5.115091e-07,2.8e-05,1.924459e-08,3.021933e-10,0.137713,8.102265e-10,0.34948,3.749523,7.770155e-10,0.620958,...,7.277871e-11,1.644147e-11,6.56226e-12,3.217395e-11,0.04593,0.006753,0.001526,0.000609,0.002985,0
1,7.63657e-07,3.5e-05,-1.603229e-07,7.080403e-10,0.093698,1.231188e-09,0.255861,5.067197,1.05231e-09,0.700043,...,4.663916e-11,1.628946e-11,4.89784e-12,2.254003e-11,0.062462,0.00422,0.001474,0.000443,0.002039,0
2,-3.662068e-07,2.7e-05,-9.277772e-07,2.864727e-10,0.110704,7.241649e-10,0.294709,3.671449,6.605786e-10,0.578271,...,1.029808e-10,2.606369e-11,4.53155e-12,1.355609e-11,0.040626,0.00966,0.002445,0.000425,0.001272,0
3,2.066176e-07,4.3e-05,3.050465e-06,1.393325e-09,0.107369,1.848005e-09,0.315363,4.107859,1.734744e-09,1.000681,...,1.395701e-10,2.619525e-11,1.429057e-11,5.811992e-11,0.093116,0.011894,0.002232,0.001218,0.004953,0
4,2.25859e-07,3.5e-05,-4.085115e-07,5.861009e-10,0.130043,1.238705e-09,0.347631,3.711145,1.156982e-09,0.825841,...,1.178899e-10,2.127705e-11,8.778774e-12,4.383952e-11,0.073124,0.010566,0.001907,0.000787,0.003929,0


In [5]:
df_features['stage'].value_counts()

stage
0    52133
2    12917
4     5254
3     4119
1     2045
Name: count, dtype: int64

### Train test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
test_subject = subjects[-1]  # last subject as test example

train_idx = subjects != test_subject
test_idx  = subjects == test_subject

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


### Define models

In [7]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    
    "XGBoost": XGBClassifier(
        max_depth=6,
        learning_rate=0.05,
        n_estimators=300,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="multi:softmax"
    ),

    "CatBoost": CatBoostClassifier(
        iterations=300,
        depth=6,
        learning_rate=0.05,
        verbose=False
    ),

    "SVM (RBF)": SVC(kernel='rbf', C=3, gamma='scale'),

    "Logistic Regression": LogisticRegression(
        max_iter=200,
        multi_class='multinomial'
    ),

    "KNN (k=7)": KNeighborsClassifier(n_neighbors=7)
}


### Plot confusion matrix

In [14]:
def plot_cm(cm, classes, title):
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
                xticklabels=classes, yticklabels=classes)
    plt.title(title)
    plt.ylabel("True")
    plt.xlabel("Predicted")
    plt.show()


### Training loop for all models

In [15]:
results = []

#stage_labels = ["W", "N1", "N2", "N3", "R"]  # modify if needed
stage_labels = ['0', '1', '2','3', '4']


In [None]:
for name, model in models.items():
    print("==============================================")
    print(f"Training model: {name}")
    print("==============================================")

    # Many models require standardized inputs → use Pipeline
    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    macro_f1 = f1_score(y_test, preds, average="macro")
    per_class_f1 = f1_score(y_test, preds, average=None)

    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print("Per-class F1:", dict(zip(stage_labels, per_class_f1)))
    print("\nClassification Report:\n", classification_report(y_test, preds, target_names=stage_labels))

    cm = confusion_matrix(y_test, preds)
    plot_cm(cm, stage_labels, title=f"Confusion Matrix: {name}")

    # Save results
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Macro F1": macro_f1,
        **{f"F1_{s}": f for s, f in zip(stage_labels, per_class_f1)}
    })

results_df = pd.DataFrame(results)
results_df


Training model: RandomForest


### Comparison table

In [12]:
results_df.style.background_gradient(cmap="Blues").set_precision(3)


AttributeError: 'Styler' object has no attribute 'set_precision'

### Best model

In [13]:
best_model_name = results_df.sort_values("Macro F1", ascending=False).iloc[0]["Model"]
print("Best model:", best_model_name)

best_model = models[best_model_name]
joblib.dump(best_model, f"models/{best_model_name}.pkl")


Best model: XGBoost


NameError: name 'joblib' is not defined