In [None]:
from pathlib import Path
import json
import pandas as pd
import pickle

# NOTE: despite its name, BASE_DIR is the path of the merged participants CSV
# file itself, not a directory; it is what the loading cell hands to pd.read_csv.
BASE_DIR = Path("../data/processed/merged_participants.csv")

# Parse the JSON project configuration into a plain Python object.
config = json.loads(Path('../config/config.json').read_text())

In [2]:
# Load the merged per-participant table. BASE_DIR is a CSV file path
# (not a directory), so it can be passed to read_csv directly.
df = pd.read_csv(BASE_DIR)
# Show (rows, columns) as the cell output for a quick sanity check.
df.shape

(38, 9)

In [3]:
# Count missing values per column before any imputation.
df.isna().sum()

participant_id    0
fixation_mean     1
fixation_std      1
n_fixations       0
saccade_mean      0
saccade_std       0
n_saccades        0
pupil_mean        0
pupil_std         0
dtype: int64

In [4]:
# Mean-impute missing values: every numeric column has its NaNs replaced by
# that column's own mean; non-numeric columns are left untouched.
numeric_cols = df.select_dtypes(include='number').columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(value=column_means)

In [5]:
# Re-check missing values per column after the imputation step.
df.isna().sum()

participant_id    0
fixation_mean     0
fixation_std      0
n_fixations       0
saccade_mean      0
saccade_std       0
n_saccades        0
pupil_mean        0
pupil_std         0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [7]:
# Columns that must never be fed to the clusterer: the participant identifier
# and the label columns that later cells add to `df` ('cluster', 'strategy').
# Excluding the derived columns keeps this cell idempotent: re-running it after
# clustering no longer leaks the labels back into the feature matrix (and the
# string-valued 'strategy' column can no longer reach the scaler).
non_feature_cols = {'participant_id', 'cluster', 'strategy'}
feature_cols = [c for c in df.columns if c not in non_feature_cols]
X = df[feature_cols]

In [8]:
# Standardize the features: learn the per-column scaling parameters from X,
# then apply them to X to obtain the scaled feature matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [9]:
# All clustering hyper-parameters come from the 'model_settings' section of
# the project config so the run is reproducible from configuration alone.
model_settings = config['model_settings']
kmeans = KMeans(
    n_clusters=model_settings['n_clusters'],
    random_state=model_settings['random_state'],
    n_init=model_settings['n_init'],
)

# Fit on the scaled features and store each row's cluster label on df.
cluster_labels = kmeans.fit_predict(X_scaled)
df['cluster'] = cluster_labels

# Silhouette score of the resulting partition, computed on the scaled features.
silhouette = silhouette_score(X_scaled, cluster_labels)
print("Silhouetter score: ", round(silhouette, 3))

Silhouetter score:  0.549


In [10]:
# Per-cluster averages of the two features used to name the clusters.
# These are means of the unscaled df columns, grouped by cluster label.
summary_cols = ["fixation_mean", "n_saccades"]
cluster_means = df[['cluster'] + summary_cols].groupby('cluster').mean()
print("\nCluster centers (fixation_mean & n_saccades): ")
print(cluster_means)


Cluster centers (fixation_mean & n_saccades): 
         fixation_mean   n_saccades
cluster                            
0           359.152353  4584.363636
1           211.708757  1719.200000


In [12]:
# Decide which cluster gets the 'Holistic' label.
# Preferred candidate: the cluster with the largest mean fixation_mean, but
# only if its mean n_saccades is below the average of the per-cluster
# n_saccades means. Otherwise fall back to the cluster with the fewest saccades.
longest_fixation_cluster = cluster_means["fixation_mean"].idxmax()
candidate_saccades = cluster_means.loc[longest_fixation_cluster, "n_saccades"]
average_saccades = cluster_means["n_saccades"].mean()

if candidate_saccades < average_saccades:
    holistic_cluster = longest_fixation_cluster
else:
    holistic_cluster = cluster_means["n_saccades"].idxmin()

# Every cluster other than the chosen one is labelled 'Piecemeal'.
strategy_by_cluster = {
    cluster_id: 'Holistic' if cluster_id == holistic_cluster else 'Piecemeal'
    for cluster_id in cluster_means.index
}
df['strategy'] = df['cluster'].map(strategy_by_cluster)

print("\nStrategy assigned:")
print(df[['participant_id', 'cluster', 'strategy']])


Strategy assigned:
    participant_id  cluster   strategy
0                1        0  Piecemeal
1               10        0  Piecemeal
2               11        0  Piecemeal
3               12        0  Piecemeal
4               13        0  Piecemeal
5               14        1   Holistic
6               15        0  Piecemeal
7               16        0  Piecemeal
8               17        0  Piecemeal
9               18        1   Holistic
10              19        0  Piecemeal
11               2        0  Piecemeal
12              20        0  Piecemeal
13              21        0  Piecemeal
14              22        0  Piecemeal
15              23        0  Piecemeal
16              24        0  Piecemeal
17              25        0  Piecemeal
18              26        0  Piecemeal
19              27        0  Piecemeal
20              28        1   Holistic
21              29        0  Piecemeal
22               3        0  Piecemeal
23              30        0  Piecemeal
24   

In [None]:
# Persist the fitted artefacts under ../src/models.
model_dir = Path('../src/models')
# Create the target folder if needed; opening the file for writing fails
# with FileNotFoundError when the parent directory does not exist.
model_dir.mkdir(parents=True, exist_ok=True)

# Same file name and content as before: the pickle holds the bare KMeans
# estimator, so existing consumers of kmeans_model.pkl keep working.
with open(model_dir / 'kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

# KMeans was fitted on standardized features (X_scaled), so the fitted scaler
# is required to transform new data the same way before kmeans.predict.
# It is stored in a separate file to leave the model pickle untouched.
with open(model_dir / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print('KMeans model saved successfully using pickle!')