# Introducing the test models



In [None]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import skfuzzy as fuzz
from sklearn.mixture import GaussianMixture
from scipy.optimize import linear_sum_assignment

from sklearn.metrics import (
    accuracy_score, f1_score, cohen_kappa_score, silhouette_score,
    confusion_matrix, classification_report, adjusted_rand_score,
    normalized_mutual_info_score
)


## 4.3 Supervised Learning Approach  
(logistic regression, decision tree, Naive Bayes and K-nearest neighbor)


Multinomial Logistic Regression


## 4.4 Unsupervised Learning Approach  
We apply (hard) k-means, fuzzy C-means and Gaussian mixture models, and use PCA
to visualize the resulting clusters in two dimensions.

##### Load and prepare the data


In [3]:
# Load the feature matrix and labels from features.csv in the working directory.
try:
    df = pd.read_csv("features.csv")
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to quit, whereas an exception stops "Run All" here with a readable traceback.
    raise FileNotFoundError(
        "Error: features.csv not found. Please ensure the file is in the current directory."
    ) from err

# Columns that describe the recording rather than the signal; they are not features.
Non_feat_columns = ['activity', 'trial', 'student', 'window_idx']

# Keep only the feature columns. errors='ignore' tolerates metadata columns that
# happen to be absent from the CSV.
X = df.drop(columns=Non_feat_columns, errors='ignore')

# The activity column is kept only for evaluation later on (each recording was
# labeled with an activity). It is not used for clustering, which is unsupervised.
y_true = df['activity'].astype('category')
Nr_clusters = y_true.nunique()  # one cluster per distinct activity


##### Performing PCA


In [4]:
# Standardize every feature (mean 0, std 1) so that no single feature dominates
# the distance computations used by the clustering models.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Project the standardized features onto the first two principal components
# (PC1, PC2); this 2-D view is only used for plotting the clusters.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

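##### Creating a Hungarian-algorithm alignment function
The assignment asks us to evaluate all models using Accuracy, Cohen's Kappa and F1 scores. These metrics are normally used for supervised learning only, because they compare predicted labels against true labels. Clustering models output arbitrary cluster IDs, so we first map each cluster ID to a ground-truth activity label (using the Hungarian algorithm to maximize the overlap) and then compute the metrics on the mapped predictions.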


In [7]:
def hungarian_align(y_true, y_pred):
    """
    Translate cluster IDs into ground-truth labels.

    A contingency table (true labels x predicted clusters) is built and the
    Hungarian algorithm picks the one-to-one pairing with the largest total
    overlap. Clusters left unpaired (more clusters than labels) fall back to
    the true label they overlap with most.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, one per sample.
    y_pred : array-like
        Cluster IDs, one per sample.

    Returns
    -------
    numpy.ndarray
        The mapped label for every entry of y_pred, in the same order.
    """
    truth = np.asarray(y_true)
    clusters = np.asarray(y_pred)

    # Rows are true labels, columns are cluster IDs; reindexing pins both axes
    # to the sorted unique values.
    table = pd.crosstab(
        pd.Series(truth, name="true"),
        pd.Series(clusters, name="pred"),
        dropna=False,
    ).reindex(index=np.unique(truth), columns=np.unique(clusters), fill_value=0)

    # linear_sum_assignment minimizes cost, so negate the counts to maximize overlap.
    true_pos, cluster_pos = linear_sum_assignment(-table.to_numpy())
    cluster_to_label = {
        table.columns[c]: table.index[r] for r, c in zip(true_pos, cluster_pos)
    }

    # Any cluster the assignment did not pair gets its majority true label.
    for cluster in table.columns:
        if cluster not in cluster_to_label:
            cluster_to_label[cluster] = table[cluster].idxmax()

    return np.array([cluster_to_label[c] for c in clusters])

def eval_supervised_style(name, y_true, y_pred_raw):
    """
    Score a clustering with supervised-style metrics and print a short report.

    The raw cluster IDs are first aligned to the ground-truth labels with
    hungarian_align; accuracy, Cohen's kappa and macro-averaged F1 are then
    computed on the aligned predictions.

    Parameters
    ----------
    name : str
        Model name shown in the report header.
    y_true : array-like
        Ground-truth labels.
    y_pred_raw : array-like
        Cluster IDs as produced by the model.

    Returns
    -------
    tuple
        (accuracy, kappa, macro_f1, aligned_predictions)
    """
    y_pred_aligned = hungarian_align(y_true, y_pred_raw)

    acc, kappa, f1 = (
        accuracy_score(y_true, y_pred_aligned),
        cohen_kappa_score(y_true, y_pred_aligned),
        f1_score(y_true, y_pred_aligned, average="macro"),
    )

    report_lines = [
        f"\n--- {name} (Supervised-style metrics) ---",
        f"Accuracy       : {acc:.4f}",
        f"Cohen's Kappa  : {kappa:.4f}",
        f"F1-Score (Macro): {f1:.4f}",
    ]
    print("\n".join(report_lines))

    return acc, kappa, f1, y_pred_aligned


##### K-means (hard) model


In [8]:
# Build the K-Means model and fit it in one step (fit returns the estimator itself).
K_means_model = KMeans(
    n_clusters=Nr_clusters,
    init='k-means++',  # spread-out initial centroids
    n_init=10,         # 10 restarts with different centroids; the best run is kept
    max_iter=300,
    random_state=33,
).fit(X_scaled)

# Hard cluster assignment of every training sample
K_means_model_labels = K_means_model.labels_

# Accuracy / kappa / macro-F1 after aligning cluster IDs to the activity labels
acc_km, kappa_km, f1_km, y_km_aligned = eval_supervised_style(
    "K-Means", y_true, K_means_model_labels
)


--- K-Means (Supervised-style metrics) ---
Accuracy       : 0.5525
Cohen's Kappa  : 0.4258
F1-Score (Macro): 0.4925


##### Fuzzy C-means model

In [None]:
# skfuzzy expects the data as (features, samples), hence the transpose.
X_scaled_Transposed = X_scaled.T

cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
    data=X_scaled_Transposed,
    c=Nr_clusters,
    m=2.0,
    error=1e-5,
    maxiter=1000,
    init=None,
    seed=33,
)

# Harden the fuzzy result: for each sample take the cluster index with the largest
# value in u (presumably the membership matrix, one row per cluster - confirm
# against the skfuzzy docs).
Fuzzy_C_means_model_labels = u.argmax(axis=0)

# Accuracy / kappa / macro-F1 after aligning cluster IDs to the activity labels
acc_fcm, kappa_fcm, f1_fcm, y_fcm_aligned = eval_supervised_style(
    "Fuzzy C-Means", y_true, Fuzzy_C_means_model_labels
)


--- Fuzzy C-Means (Supervised-style metrics) ---
Accuracy       : 0.7121
Cohen's Kappa  : 0.6401
F1-Score (Macro): 0.6992


##### Gaussian Mixture model



In [None]:
# Fit a full-covariance Gaussian mixture with one component per activity
# (fit returns the estimator itself).
gmm = GaussianMixture(
    n_components=Nr_clusters,
    covariance_type="full",
    n_init=5,
    random_state=33,
).fit(X_scaled)

# Soft assignments (one probability per component), then the most likely component
gmm_probs = gmm.predict_proba(X_scaled)
gmm_labels = gmm_probs.argmax(axis=1)

# Accuracy / kappa / macro-F1 after aligning component IDs to the activity labels
acc_gmm, kappa_gmm, f1_gmm, y_gmm_aligned = eval_supervised_style(
    "Gaussian Mixture (GMM)", y_true, gmm_labels
)



--- Gaussian Mixture (GMM) (Supervised-style metrics) ---
Accuracy       : 0.6381
Cohen's Kappa  : 0.5439
F1-Score (Macro): 0.6320


##### Model Evaluation


In [14]:
# Collect the supervised-style scores of the three clustering models, one row per model.
sup_style_summary = pd.DataFrame({
    "Method":        ["K-Means", "FCM", "GMM"],
    "Accuracy":      [acc_km, acc_fcm, acc_gmm],
    "Cohen's Kappa": [kappa_km, kappa_fcm, kappa_gmm],
    "F1-Score":      [f1_km, f1_fcm, f1_gmm],
})

print("\n================ Supervised-Style Metrics Summary ================\n")
print(sup_style_summary.to_string(index=False))



 Method  Accuracy  Cohen's Kappa  F1-Score
K-Means  0.552529       0.425804  0.492490
    FCM  0.712062       0.640083  0.699162
    GMM  0.638132       0.543883  0.631993


##### Plotting and Visualization 

In [None]:
# Name every K-Means cluster after the activity that occurs most often inside it.
# This is only an interpretation aid for the legend.
df_labeled_temp = pd.DataFrame({
    'Cluster_ID': K_means_model_labels,
    'True_Activity': y_true
})

# Contingency table: rows = cluster IDs, columns = true activities
contingency_table = pd.crosstab(
    df_labeled_temp['Cluster_ID'],
    df_labeled_temp['True_Activity']
)

# Majority activity of each cluster ID
cluster_to_activity_mapping = {
    cluster_id: contingency_table.loc[cluster_id].idxmax()
    for cluster_id in contingency_table.index
}

unique_clusters = np.unique(K_means_model_labels)  # the cluster IDs actually used

# Share of total variance captured by PC1 and PC2, used in the axis labels.
# This name was referenced below but never defined, which raised a NameError.
explained_variance = pca.explained_variance_ratio_


# Plotting and visualization

plt.figure(figsize=(12, 8))
# plt.get_cmap replaces plt.cm.get_cmap, which newer Matplotlib releases removed.
cmap = plt.get_cmap('tab10', len(unique_clusters))

# Plot each cluster separately so that every cluster gets its own legend entry
for cluster_id in unique_clusters:
    indices = K_means_model_labels == cluster_id
    activity_name = cluster_to_activity_mapping[cluster_id]

    plt.scatter(
        X_pca[indices, 0],              # PC1 on the x-axis
        X_pca[indices, 1],              # PC2 on the y-axis
        c=[cmap(cluster_id)],           # one color per cluster
        label=f'{activity_name} (ID {cluster_id})',  # activity name + cluster ID
        alpha=0.8,
        s=24,
    )

# The centroids live in the standardized feature space, so they are projected
# with the same fitted PCA before plotting.
centroids = K_means_model.cluster_centers_
centroids_pca = pca.transform(centroids)
plt.scatter(
    centroids_pca[:, 0],
    centroids_pca[:, 1],
    marker='X',
    s=150,
    linewidths=2,
    color='black',
    edgecolors='white',
    label='Centroids'
)

# Labels, title and legend
plt.xlabel(f"Principal Component 1 ({explained_variance[0]*100:.1f}% Variance)")
plt.ylabel(f"Principal Component 2 ({explained_variance[1]*100:.1f}% Variance)")
plt.title(f"K-Means Clustering (K={Nr_clusters}) Results in PCA Space")
plt.legend(title='K-Means Cluster ID', loc='best')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()


# Label-permutation-invariant agreement between the clustering and the true
# activities; no cluster-to-label alignment is needed for these two scores.
ari = adjusted_rand_score(y_true, K_means_model_labels)
nmi = normalized_mutual_info_score(y_true, K_means_model_labels)

print(f"Adjusted Rand Index (ARI): {ari:.4f} (0.0=random, 1.0=perfect match)")
print(f"Normalized Mutual Information (NMI): {nmi:.4f} (0.0=no mutual info, 1.0=perfect match)")

