# Clustering using a Hidden Markov Model

### Import packages

In [None]:
from simpl_eeg import raw_voltage, eeg_objects, connectivity, topomap_2d, topomap_3d_brain, topomap_3d_head
from hmmlearn import hmm
import mne
import seaborn as sns
import numpy as np
import pandas as pd

### Read in the data

In [None]:
# Load the EEGLAB .set recording that the rest of the notebook works on.
eeglab_set_path = '../../data/927/fixica.set'
raw_full = mne.io.read_raw_eeglab(eeglab_set_path)

In [None]:
# Convert the loaded recording into a pandas DataFrame for the chunking step.
# The preprocessing cell skips column 0 (it slices `1:`) — presumably the
# "time" column; confirm against the frame's actual column order.
entire_df = raw_full.to_data_frame()

### Data preprocessing

In [None]:
# Build one feature vector per chunk: the column-wise mean of every
# consecutive window of CHUNK_SIZE rows (column 0 is skipped).

CHUNK_SIZE = 10000  # rows per chunk; described by the author as 5 seconds of data
MAX_CHUNKS = 554    # upper bound on the number of chunks that are averaged

# Keep only rows that contain at least one non-zero value. Checking along
# axis=1 gives the same mask as transposing the frame first, without
# materialising a transposed copy of the whole recording.
# NOTE(review): cleaned_df is not used by the chunking below, which reads
# entire_df directly — confirm whether the filter was meant to apply there.
cleaned_df = entire_df[(entire_df != 0).any(axis=1)]

# Never ask for more complete chunks than the recording holds; an empty
# slice would otherwise average to NaN and end up as a feature row.
n_chunks = min(MAX_CHUNKS, len(entire_df) // CHUNK_SIZE)
sliced_df = entire_df.iloc[:n_chunks * CHUNK_SIZE]

avg_chunked_list = [
    sliced_df.iloc[start:start + CHUNK_SIZE, 1:].mean().to_numpy()
    for start in range(0, n_chunks * CHUNK_SIZE, CHUNK_SIZE)
]
# Stacks the per-chunk means into one array with a row per chunk.
avg_chunked_array = np.array(avg_chunked_list)

In [None]:
# Alternative: keep every sample of each chunk (flattened) instead of the chunk average.
# Not used, because the file is too large and the operation runs out of memory.

# avg_df = entire_df.groupby("time").mean().reset_index()
# entire_list = []
# for i in range(541):
#     sliced_array = avg_df.iloc[i*10000:i*10000+10000, 1:].to_numpy().reshape(-1)
#     entire_list.append(sliced_array)
# entire_array_chunk = np.array(entire_list)

# entire_array_chunk=np.float32(entire_array_chunk)

### Hyperparameter tuning

In [None]:
# Fit one Gaussian HMM per candidate number of hidden states (1..100) and
# record the log probability of the Viterbi path on the fitting data.
# NOTE(review): the score is computed on the same data the model was fitted
# to, which tends to favour larger models — consider a penalised criterion
# (e.g. BIC) or held-out data before trusting the selected value.
tune_comp = {}   # n_components -> Viterbi log probability
model_list = []  # fitted models, in the order the candidates were tried
for n_comp in range(1, 101):
    # A fixed seed makes the random initialisation, and therefore the
    # ranking of the candidates, repeatable between runs.
    model = hmm.GaussianHMM(n_components=n_comp, random_state=0)
    model.fit(avg_chunked_array)
    model_list.append(model)
    # decode() returns (log probability, state sequence); only the log
    # probability is kept for model comparison.
    log_prob, state_sequence = model.decode(avg_chunked_array, algorithm="viterbi")
    tune_comp[n_comp] = log_prob

In [None]:
# Select the candidate with the highest recorded log probability and report it.
maximum = max(tune_comp.items(), key=lambda entry: entry[1])[0]
print(f"The best # of cluster is {maximum}, with log probability of {tune_comp[maximum]}")