In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Silence every warning category for the rest of the session so library
# chatter does not clutter the cell output.
# NOTE(review): this blanket filter would also hide any deprecation or
# convergence warnings raised by the estimators used below.
warnings.filterwarnings("ignore")

# Render matplotlib figures inside the notebook output cells.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format='retina'

## Read the dataset


In [None]:
# Load the feature table from disk and discard the "filename" column so that
# `data` holds only the remaining columns.
file_path = "../Data/features_3_sec.csv"
data = pd.read_csv(file_path).drop(columns="filename")

In [None]:
# The positional slice below treats the final column as the target, while the
# target itself is read by name. Make that assumption explicit so a change in
# the CSV column order fails loudly instead of leaking the label into X.
assert data.columns[-1] == "label", "expected 'label' to be the last column"

# Feature matrix: every column except the last one.
X = data.iloc[:, :-1].values
# Target vector: the raw (string) class labels.
y = data["label"].values

In [None]:
# Map every class label to an integer id. The fitted encoder is kept in `le`
# because its `classes_` attribute supplies the label names for the reports
# and plots further down.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

## Split the dataset into training and testing sets


In [None]:
# Hold out 20% of the rows for testing. Stratifying on the encoded labels
# keeps the class proportions the same in both splits, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

## Define pipeline to build a model


In [None]:
def build_pipeline(model_name, model):
    """Wrap an estimator in a standardise-then-predict pipeline.

    A progress line naming the model is printed as a side effect. Nothing is
    fitted here; the caller is responsible for fitting the returned pipeline.

    Args:
        model_name: Display name of the model; returned unchanged.
        model: Estimator instance placed in the pipeline's "model" step.

    Returns:
        A ``(model_name, pipeline)`` tuple where ``pipeline`` runs a
        ``StandardScaler`` ("scaler" step) followed by ``model`` ("model" step).
    """
    print(f"Training {model_name} model")
    steps = [("scaler", StandardScaler()), ("model", model)]
    return model_name, Pipeline(steps)

## Define the models


In [None]:
# Candidate baseline classifiers, keyed by the display name used in the
# printed summaries and in the comparison plot.
models = {
    # `multi_class` is intentionally not passed: the argument is deprecated in
    # scikit-learn 1.5+, and the default lbfgs solver already fits a
    # multinomial model when the target has more than two classes.
    "Logistic": LogisticRegression(max_iter=1000),
    "SVC": SVC(),
    # Seed the randomised tree learners (same seed as the train/test split)
    # so the cross-validation scores are repeatable between runs.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(),
}

## Train the model


In [None]:
# Cross-validate every candidate model on the training split and collect the
# per-fold accuracies for the comparison plot.
results = []  # one array of fold accuracies per model
names = []  # model display names, aligned with `results`

# Iterate over the dict without consuming it: the previous `popitem()` loop
# emptied `models`, so re-running this cell silently produced no results, and
# it also visited the models in reverse definition order.
for model_name, estimator in models.items():
    model_name, model = build_pipeline(model_name, estimator)
    cv_results = cross_val_score(
        model, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring="accuracy"
    )
    results.append(cv_results)
    names.append(model_name)
    print(f"Model: {model_name}")
    print(f"Mean accuracy: {np.mean(cv_results)}")
    print(f"Std deviation: {np.std(cv_results)}")
    print()

## Compare the baseline models


In [None]:
# Compare the models: one horizontal box per model showing the spread of its
# cross-validation fold accuracies.
fig = go.Figure()

# Descriptive loop names are used on purpose: the previous `x, y` pair
# overwrote the notebook-level label array `y`, which breaks any earlier cell
# that is re-run afterwards.
for fold_scores, model_label in zip(results, names):
    fig.add_trace(go.Box(x=fold_scores, name=model_label))
fig.show()
# Export the figure for the report.
fig.write_image("../report/graphics/baseline_models.jpg", scale=5)

## Model evaluation


In [None]:
# Fit a default XGBoost pipeline on the training split and report how it
# performs on the held-out test split.

# `build_pipeline` returns (name, pipeline); only the pipeline is needed, and
# `fit` returns the fitted pipeline itself.
model = build_pipeline("XGBoost", XGBClassifier())[1].fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Draw the confusion matrix of the most recent predictions (`y_pred`) as an
# annotated heatmap and save it for the report.
cm = confusion_matrix(y_test, y_pred)

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    ax=cm_ax,
    annot=True,  # write the count inside every cell
    fmt="d",  # counts are integers
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_ax.set_title("Confusion Matrix")
cm_fig.tight_layout()
cm_fig.savefig("../report/graphics/confusion_matrix.pdf")
plt.show()

## Hyperparameter tuning


In [None]:
# Candidate values for the randomized hyperparameter search over XGBoost.
param_grid_xgb = {
    "n_estimators": [50, 100, 200, 500],  # Number of boosting rounds
    "learning_rate": [0.01, 0.1, 0.2, 0.3],  # Step size shrinkage
    "max_depth": [3, 5, 7, 10],  # Maximum depth of trees
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples per boosting round
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features for each tree
    "gamma": [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required for a split
    "reg_alpha": [0, 0.1, 0.5, 1],  # L1 regularization
    "reg_lambda": [0, 0.1, 0.5, 1],  # L2 regularization
}
model = XGBClassifier()
# Try 100 randomly drawn parameter combinations with 3-fold cross-validation.
randomcv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid_xgb,
    n_iter=100,
    cv=3,
    verbose=2,
    n_jobs=-1,  # use every available core
    # Fix the sampler seed (same seed as the train/test split) so the same
    # candidates are drawn on every run and `best_params_` is reproducible.
    random_state=42,
)
randomcv.fit(X_train, y_train)
# Display the winning parameter combination as the cell output.
randomcv.best_params_

In [None]:
# Evaluate an XGBoost pipeline configured with fixed, hand-entered
# hyperparameters on the held-out test split.
# NOTE(review): these values are hard-coded rather than read from
# `randomcv.best_params_` - presumably copied from an earlier search run;
# confirm they still match the search result.
tuned_params = {
    "n_estimators": 500,
    "max_depth": 5,
    "learning_rate": 0.2,
    "gamma": 0,
    "colsample_bytree": 0.8,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "subsample": 0.8,
}
# `build_pipeline` returns (name, pipeline); `fit` returns the fitted pipeline.
model = build_pipeline("XGBoost", XGBClassifier(**tuned_params))[1].fit(
    X_train, y_train
)
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Plot the ten most important features of the fitted pipeline held in `model`.
# NOTE(review): this relies on `model` being a pipeline whose "model" step
# exposes `feature_importances_`, i.e. on the cell that fits it running first.

# Pull the fitted estimator out of the pipeline and read its importances.
xgb_model = model.named_steps["model"]
importance = xgb_model.feature_importances_

# Every column except the last one was used as a feature, in this order.
feature_names = data.columns[:-1]

# Pair names with scores, rank from most to least important, keep the top 10.
importance_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importance})
    .sort_values(by="Importance", ascending=False)
    .head(10)
)

# Horizontal bars: feature names on the y-axis, scores on the x-axis.
plt.figure(figsize=(20, 12))
sns.barplot(data=importance_df, x="Importance", y="Feature")
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.title("Feature Importance")
plt.savefig("../report/graphics/feature_importance.pdf")
plt.show()

In [None]:
# Fit a default SVC pipeline on the training split and report how it performs
# on the held-out test split.
# `build_pipeline` returns (name, pipeline); `fit` returns the fitted pipeline.
model = build_pipeline("SVM", SVC())[1].fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Fit a Random Forest pipeline on the training split and report how it
# performs on the held-out test split.
# The forest is seeded (same seed as the train/test split) because an unseeded
# forest gives slightly different metrics on every run, which makes the
# reported numbers impossible to reproduce.
model = build_pipeline("Random Forest", RandomForestClassifier(random_state=42))[1]
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))