### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
from pathlib import Path

In [60]:
from sklearn.preprocessing import minmax_scale
import os
import math

### Load the data

In [3]:
%%time
# Load the input table; the first CSV column is used as the dataframe index.
embedding_data = pd.read_csv('data/input/dental_cluster_input_new.csv',index_col=0)

CPU times: user 997 ms, sys: 227 ms, total: 1.22 s
Wall time: 1.39 s


In [4]:
# Preview the loaded table (columns: row, column, sentence, embedding)
embedding_data

Unnamed: 0,row,column,sentence,embedding
0,0,0,Schmerzen im Kiefergelenk,[ 3.40620205e-02 1.39922276e-02 1.42524922e-...
1,0,1,Schmerzfreiheit,[-7.25212172e-02 2.69346926e-02 -4.29749489e-...
2,0,2,"essen vor allem harte, zähe Speisen, wenn das ...",[-2.02503819e-02 8.03772509e-02 8.89117736e-...
3,0,3,Essen,[-6.40096217e-02 1.45435808e-02 1.53658316e-...
4,0,4,wenn der Kiefer entspannt ist,[-0.00921524 0.05036844 0.02311897 0.030542...
...,...,...,...,...
61707,2127,24,,[]
61708,2127,25,,[]
61709,2127,26,,[]
61710,2127,27,,[]


### Find the rows without NaN values (valid rows)

In [5]:
# Positional indices of all rows that actually contain a sentence.
# Missing sentences are NaN in the dataframe; `notna()` detects them directly
# instead of relying on NaN happening to be a Python float, and it avoids a
# label-based lookup per row.
valid_indicies = np.flatnonzero(embedding_data['sentence'].notna().to_numpy()).tolist()

In [6]:
# Keep only the valid rows and relabel them 0..n-1, so that index labels and
# row positions coincide in all later steps.
valid_dataset = embedding_data.iloc[valid_indicies].set_axis(
    np.arange(len(valid_indicies)), axis=0
)

In [7]:
# Sanity check: total number of missing cells over all columns (0 means no NaN is left)
valid_dataset.isna().sum().sum()

0

### Process the sentence embeddings

In [8]:
# Embedding column of the valid rows; the entries are still text and get parsed below
sentence_embeddings = valid_dataset['embedding']

In [9]:
def embedding_string_to_float_list(embedding_string):
    """Parse an embedding that was saved as text back into a list of floats.

    Only the text between the first '[' and the following ']' is read. The
    numbers inside may be separated by any whitespace (spaces, tabs or line
    breaks), which covers multi-line array strings.

    Parameters
    ----------
    embedding_string : str
        Text such as "[ 3.4e-02 -1.2e-01 ... ]".

    Returns
    -------
    list of float
        The parsed values; an empty list for "[]".

    Raises
    ------
    IndexError
        If the text contains no '['.
    ValueError
        If a token between the brackets is not a valid float.
    """
    # Keep only what is enclosed by the brackets
    inner_text = embedding_string.split('[')[1].split(']')[0]

    # str.split() without arguments splits on every run of whitespace and never
    # yields empty tokens, so values separated only by a line break or a tab
    # are no longer fused together or rejected.
    return [float(token) for token in inner_text.split()]

In [10]:
# Parse every stored embedding string into a list of floats (with a progress bar)
sentence_embedding_list = [
    embedding_string_to_float_list(sentence_embeddings[position])
    for position in tqdm(range(len(sentence_embeddings)))
]
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16316/16316 [00:02<00:00, 5765.83it/s]


In [11]:
# One row per sentence, one column per embedding dimension
embedding_dataframe = pd.DataFrame(sentence_embedding_list)

In [12]:
# Row n of this dataframe holds the parsed embedding of row n in valid_dataset
embedding_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.034062,0.013992,0.014252,-0.030456,0.028877,0.033559,0.017332,0.035528,-0.064723,-0.004354,...,-0.081443,-0.004202,0.000352,-0.023411,0.046932,-0.032391,-0.046974,0.059409,-0.073823,-0.023436
1,-0.072521,0.026935,-0.042975,0.031925,0.025261,0.020041,0.036362,-0.025237,0.011913,0.028049,...,0.025571,0.060598,0.021523,-0.032708,0.010081,0.019088,0.075229,-0.001422,0.061387,-0.013585
2,-0.020250,0.080377,0.008891,-0.054059,-0.061299,0.032137,-0.019862,-0.004791,-0.003684,0.029694,...,-0.004164,0.049371,-0.020510,-0.000520,0.020457,-0.005845,0.002068,-0.019667,0.048704,-0.040016
3,-0.064010,0.014544,0.015366,0.007376,0.042618,-0.004455,0.068092,-0.013669,0.001125,-0.043526,...,0.067604,-0.033537,0.022114,0.019851,0.010399,0.010969,0.074038,0.027083,-0.015570,0.033565
4,-0.009215,0.050368,0.023119,0.030543,0.028008,0.010728,0.045672,0.054670,-0.065691,-0.024182,...,-0.019737,-0.037893,0.011956,0.016581,0.018186,-0.036472,-0.006878,-0.002135,-0.009683,0.011702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16311,0.004346,0.028686,0.005832,-0.009570,-0.013020,0.059503,0.008206,-0.007141,-0.061534,0.085434,...,0.008846,0.012409,0.007637,0.000734,0.028507,0.034318,-0.004622,-0.009212,0.078548,-0.015268
16312,0.064416,0.001641,0.027482,-0.004933,-0.000269,-0.002164,-0.000324,-0.043461,0.022244,0.044613,...,-0.069417,-0.073531,-0.020159,-0.032370,0.016101,-0.037128,-0.041885,-0.020341,-0.065208,-0.070507
16313,0.024033,-0.056897,-0.009784,-0.017642,0.046868,0.050847,0.053478,-0.032478,-0.048117,-0.012153,...,-0.027825,0.004736,0.000375,0.067848,0.032955,-0.019453,-0.006210,0.010971,-0.055669,-0.066040
16314,0.007316,-0.015769,0.008998,-0.043925,0.023506,0.035210,0.027905,-0.003970,-0.041485,-0.029756,...,0.021643,0.007241,-0.025989,0.007757,0.057442,-0.004804,-0.013610,0.021214,-0.037111,-0.077928


In [13]:
# Remove the raw embedding strings from valid_dataset; the parsed values live in
# embedding_dataframe. Unlike `del`, drop(..., errors='ignore') does not raise a
# KeyError when this cell is executed a second time.
valid_dataset = valid_dataset.drop(columns=['embedding'], errors='ignore')

#### Save the embeddings into a .tsv file for use in the visualization

In [14]:
# Create the output directory (and missing parents); no error if it already exists
Path('./data/output').mkdir(parents=True, exist_ok=True)

In [15]:
# Export all valid embeddings as a tab-separated file without header and
# without index column, as input for the visualization
embedding_dataframe.to_csv(
    'data/output/model_simcse_data_dental_vectors_all.tsv',
    sep='\t',
    header=False,
    index=False,
)

### Clean '\n' from sentences and create labels dataframe

In [16]:
# Replace line breaks inside the sentences with spaces
clean_sentences = [
    sentence.replace('\n', ' ')
    for sentence in embedding_data.iloc[valid_indicies]['sentence']
]

In [17]:
# Single-column dataframe holding the cleaned sentences
labels = pd.DataFrame(clean_sentences, columns=['sentence'])

In [18]:
# Preview the cleaned sentences
labels

Unnamed: 0,sentence
0,Schmerzen im Kiefergelenk
1,Schmerzfreiheit
2,"essen vor allem harte, zähe Speisen, wenn das ..."
3,Essen
4,wenn der Kiefer entspannt ist
...,...
16311,vor ca 6 Jahren ich dachte habe loch im zahn D...
16312,naechltliches zaehneknirschen mit kopschmerzen...
16313,ueberpruefung der knirschschienen eventuelle a...
16314,Mundoeffenen schmerzhaft seelische Belsatung


In [19]:
# Store the cleaned sentences in valid_dataset.
# The 'sentence' column is selected explicitly: assigning the whole `labels`
# frame to a single column only works while `labels` has exactly one column and
# fails as soon as further columns (e.g. cluster ids) have been added to it.
valid_dataset['sentence'] = labels['sentence'].copy()

## Clustering kmeans

In [49]:
# X is the data to cluster: all sentence embeddings (same object, not a copy)
X = embedding_dataframe

### Loop over all k in the range [2, 200]
Record the Calinski-Harabasz (CH) index, the Davies-Bouldin (DB) index, and the silhouette coefficient for each k

In [58]:
# Run MiniBatchKMeans once for every k in [2, 200] and collect one
# [k, db, ch, sil] entry per run.
kmeans_mini_metrics_list = []

for i in tqdm(range(2, 201)):
    cur_kmeans = MiniBatchKMeans(n_clusters=i, random_state=7, batch_size=8192)
    predicted_labels = cur_kmeans.fit(X).labels_

    kmeans_mini_metrics_list.append([
        i,
        davies_bouldin_score(X, predicted_labels),
        calinski_harabasz_score(X, predicted_labels),
        silhouette_score(X, predicted_labels),
    ])
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 199/199 [49:37<00:00, 14.96s/it]


In [59]:
# Save the per-k metrics to CSV (columns: k, db, ch, sil)
pd.DataFrame(kmeans_mini_metrics_list, columns=['k','db','ch','sil']).to_csv('data/kmeans_metrics_list200.csv')

In [62]:
# The same metrics as a dataframe for the scaling and ranking steps below
metrics = pd.DataFrame(kmeans_mini_metrics_list, columns=['k','db','ch','sil'])

In [None]:
# Min-max scale the DB and CH indices so that they can be combined with the
# silhouette score in one index
metrics['db_scale'] = minmax_scale(metrics['db'])
metrics['ch_scale'] = minmax_scale(metrics['ch'])

In [None]:
# Combined index: silhouette and scaled CH are added, scaled DB is subtracted
# (for the Davies-Bouldin index lower values indicate a better clustering)
metrics['combined_index'] = metrics['sil'] + metrics['ch_scale'] - metrics['db_scale']

## Clustering GMM

In [176]:
# Fit a Gaussian mixture with 10 components.
# NOTE(review): `clean_embedding_dataframe` is not defined in any cell visible
# here - presumably a leftover alias of `embedding_dataframe`; confirm, otherwise
# this cell raises a NameError on a fresh kernel.
gm = GaussianMixture(n_components=10, random_state=7).fit(clean_embedding_dataframe)

In [177]:
# AIC of the 10-component model, evaluated on the data it was fitted to
gm.aic(clean_embedding_dataframe)

-75830633.61137748

In [187]:
# Store the component predicted by the 10-component model for every row
# NOTE(review): requires `labels` to have one row per row of
# `clean_embedding_dataframe` - verify, since `labels` is reassigned further below.
labels['gm'] = gm.predict(clean_embedding_dataframe)

In [178]:
# Second Gaussian mixture with 5 components, same data and seed as `gm`.
# NOTE(review): `clean_embedding_dataframe` is not defined in any cell visible
# here - presumably a leftover alias of `embedding_dataframe`; confirm.
gm1 = GaussianMixture(n_components=5, random_state=7).fit(clean_embedding_dataframe)

In [179]:
# AIC of the 5-component model, evaluated on the data it was fitted to
gm1.aic(clean_embedding_dataframe)

-74312561.90158828

# Finding optimal k per question

In [136]:
# Make sure the folder for the visualization exports exists
Path('./data/output/visualization/').mkdir(parents=True, exist_ok=True)

# All distinct question IDs (values of the 'column' field) in ascending order
q_col = sorted(pd.unique(valid_dataset['column']))

### KMeans helper functions

In [104]:
def generate_kmeans_metrics(X, k_min = 2, k_max = 10):
    """Cluster X once per k in [k_min, k_max] and score every clustering.

    Input:  X is the data, k_min / k_max are the inclusive bounds of k.
    Output: dataframe with one row per k and the columns ['k', 'db', 'ch', 'sil']
            (Davies-Bouldin index, Calinski-Harabasz index, silhouette coefficient).
    """
    rows = []

    for n_clusters in tqdm(range(k_min, k_max + 1)):
        # Below 50 clusters the full KMeans is used, from 50 on the mini-batch variant
        if n_clusters < 50:
            model = KMeans(n_clusters=n_clusters, random_state=7)
        else:
            model = MiniBatchKMeans(n_clusters=n_clusters, random_state=7, batch_size=8192)

        assignments = model.fit(X).labels_

        rows.append([
            n_clusters,
            davies_bouldin_score(X, assignments),
            calinski_harabasz_score(X, assignments),
            silhouette_score(X, assignments),
        ])

    return pd.DataFrame(rows, columns=['k','db','ch','sil'])

In [98]:
def compute_metris_kmeans_best_k(df_indicies):
    """Rank the candidate cluster counts by a combined quality index.

    The combined index is ``sil + ch_scale - db_scale``, where ``ch_scale`` and
    ``db_scale`` are the min-max scaled 'ch' and 'db' columns.

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        One row per candidate k with the columns 'k', 'db', 'ch' and 'sil'.
        The frame itself is left unmodified.

    Returns
    -------
    tuple
        ``(best_k, ranked)``: ``best_k`` is the 'k' of the best row as int;
        ``ranked`` is a copy of the input with the added columns 'db_scale',
        'ch_scale' and 'combined_index', sorted by 'combined_index' descending.
    """
    # assign() returns a new frame, so the caller's dataframe is not altered
    ranked = df_indicies.assign(
        db_scale=minmax_scale(df_indicies['db']),
        ch_scale=minmax_scale(df_indicies['ch']),
    )
    ranked['combined_index'] = ranked['sil'] + ranked['ch_scale'] - ranked['db_scale']

    ranked = ranked.sort_values(by='combined_index', ascending=False)

    # Select the column by name: an integer key on a label-indexed row
    # (`.iloc[0][0]`) relies on a deprecated positional fallback in pandas.
    return int(ranked['k'].iloc[0]), ranked

In [110]:
def kmeans_compute_optimal_k(X, question_id=None):
    """Run kmeans over a range of k and return the best number of clusters.

    The range is derived from the number of samples n:
    k_min = max(ceil(n / 100), 2) and k_max = max(floor(n / 10), 10),
    with k_max capped at n - 1. The ranked metrics are written to
    'data/output/visualization/visualization_metrics_kmeans_<question_id>.csv'.

    Parameters
    ----------
    X : pandas.DataFrame
        The embeddings to cluster, one row per sample.
    question_id : optional
        Identifier used in the name of the metrics file. When omitted, the
        notebook-level variable `i` is used, as older call sites expect.

    Returns
    -------
    int
        The k with the highest combined index.

    Raises
    ------
    ValueError
        If X has fewer than 3 rows, because no k can be scored then.
    """
    n_samples = len(X)
    if n_samples < 3:
        raise ValueError(
            f'At least 3 samples are required to evaluate a clustering, got {n_samples}'
        )

    if question_id is None:
        # Backward compatibility: existing callers do not pass an id and rely on
        # the loop variable `i` of the calling cell.
        question_id = i

    # Compute the min and max amount of clusters. The number of clusters may not
    # exceed n_samples - 1, otherwise the clustering cannot be scored.
    k_max = min(max(math.floor(n_samples/10), 10), n_samples - 1)
    k_min = max(math.ceil(n_samples/100), 2)

    # Run kMeans and compute metrics for k = [k_min,k_max]
    metrics = generate_kmeans_metrics(X,k_min,k_max)

    # Compute the optimal k based on the metrics
    k_best, vis_data = compute_metris_kmeans_best_k(metrics)

    # Save the data for the visualization
    vis_data.to_csv(f'data/output/visualization/visualization_metrics_kmeans_{question_id}.csv')

    return k_best

### KMeans computation

In [111]:
# Loop over all question columns.
# The loop variable has to be called `i`: kmeans_compute_optimal_k(X) uses the
# notebook-level `i` to name its metrics file when no id is passed.
for i in q_col:

    # Find the rows which include a response to this question
    indicies_of_question = valid_dataset[valid_dataset['column'] == i].index

    # Set X to the embeddings that represent the text responses
    # (index labels equal row positions because valid_dataset was renumbered)
    X = embedding_dataframe.iloc[indicies_of_question]

    k_best = kmeans_compute_optimal_k(X)

    # Obtain the labels by refitting with the optimal k and the same random_state.
    # The estimator must mirror generate_kmeans_metrics, which uses
    # MiniBatchKMeans for k >= 50; refitting with plain KMeans in that case would
    # produce a different clustering than the one that was scored.
    if k_best >= 50:
        best_KMeans = MiniBatchKMeans(n_clusters=k_best, random_state=7, batch_size=8192).fit(X)
    else:
        best_KMeans = KMeans(n_clusters=k_best,random_state=7).fit(X)

    # Save the IDs of the patients to the labels dataframe
    labels = pd.DataFrame(valid_dataset.iloc[indicies_of_question]['row'])
    labels.columns = ['Patient ID']

    # Add the cluster assigned to every response by the optimal clustering
    labels['k_means'] = best_KMeans.labels_

    # Write the labels and vectors to a .tsv file for visualization
    Path(f'./data/output/Q{i}/').mkdir(parents=True, exist_ok=True)
    labels.to_csv(f'data/output/Q{i}/kmeans_labels.tsv',sep="\t",header=True,index=False)
    X.to_csv(f'data/output/Q{i}/kmeans_vectors.tsv',sep="\t",header=False,index=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 191/191 [05:02<00:00,  1.58s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 191/191 [05:47<00:00,  1.82s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 183/183 [05:16<00:00,  1.73s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 176/176 [04:53<00:00,  1.67s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 179/179 [04:55<00:00,  1.65s/it]
100%|███████████████████████████████████████████████████████

### GMM helper functions

In [173]:
def generate_gmm_metrics(X, k_min = 2, k_max = 20):
    """Fit one Gaussian mixture per component count and score every fit.

    Input:  X is the data, k_min / k_max are the inclusive bounds of the
            number of components.
    Output: dataframe with one row per count and the columns
            ['k', 'db', 'ch', 'sil', 'aic', 'bic'].
    """
    records = []

    for n_components in tqdm(range(k_min, k_max + 1)):
        mixture = GaussianMixture(n_components=n_components, random_state=7)
        mixture.fit(X)

        # Hard assignment of every sample to a component
        assignments = mixture.predict(X)

        records.append([
            n_components,
            davies_bouldin_score(X, assignments),
            calinski_harabasz_score(X, assignments),
            silhouette_score(X, assignments),
            mixture.aic(X),
            mixture.bic(X),
        ])

    return pd.DataFrame(records, columns=['k','db','ch','sil','aic','bic'])

In [174]:
def compute_metris_gmm_best_k(df_indicies):
    """Rank the candidate component counts by a combined quality index.

    The combined index is
    ``sil + ch_scale - db_scale - aic_scale - bic_scale``, where the ``*_scale``
    columns are the min-max scaled 'ch', 'db', 'aic' and 'bic' columns.

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        One row per candidate k with the columns 'k', 'db', 'ch', 'sil',
        'aic' and 'bic'. The frame itself is left unmodified.

    Returns
    -------
    tuple
        ``(best_k, ranked)``: ``best_k`` is the 'k' of the best row as int;
        ``ranked`` is a copy of the input with the added scaled columns and
        'combined_index', sorted by 'combined_index' descending.
    """
    # assign() returns a new frame, so the caller's dataframe is not altered
    ranked = df_indicies.assign(
        db_scale=minmax_scale(df_indicies['db']),
        ch_scale=minmax_scale(df_indicies['ch']),
        aic_scale=minmax_scale(df_indicies['aic']),
        bic_scale=minmax_scale(df_indicies['bic']),
    )

    ranked['combined_index'] = (
        ranked['sil'] + ranked['ch_scale'] - ranked['db_scale'] -
        ranked['aic_scale'] - ranked['bic_scale']
    )

    ranked = ranked.sort_values(by='combined_index', ascending=False)

    # Select the column by name: an integer key on a label-indexed row
    # (`.iloc[0][0]`) relies on a deprecated positional fallback in pandas.
    return int(ranked['k'].iloc[0]), ranked

In [175]:
def gmm_compute_optimal_k(X, question_id=None):
    """Fit Gaussian mixtures over a range of k and return the best component count.

    The range is derived from the number of samples n:
    k_min = max(ceil(n / 100), 2) and k_max = max(floor(n / 10), 10),
    with k_max capped at n - 1. The ranked metrics are written to
    'data/output/visualization/visualization_metrics_gmm_<question_id>.csv'.

    Parameters
    ----------
    X : pandas.DataFrame
        The embeddings to cluster, one row per sample.
    question_id : optional
        Identifier used in the name of the metrics file. When omitted, the
        notebook-level variable `i` is used, as older call sites expect.

    Returns
    -------
    int
        The number of components with the highest combined index.

    Raises
    ------
    ValueError
        If X has fewer than 3 rows, because no k can be scored then.
    """
    n_samples = len(X)
    if n_samples < 3:
        raise ValueError(
            f'At least 3 samples are required to evaluate a clustering, got {n_samples}'
        )

    if question_id is None:
        # Backward compatibility: existing callers do not pass an id and rely on
        # the loop variable `i` of the calling cell.
        question_id = i

    # Compute the min and max amount of components. The number of components may
    # not exceed n_samples - 1, otherwise the clustering cannot be scored.
    k_max = min(max(math.floor(n_samples/10), 10), n_samples - 1)
    k_min = max(math.ceil(n_samples/100), 2)

    # Run gmm and compute metrics for k = [k_min,k_max]
    metrics = generate_gmm_metrics(X,k_min,k_max)

    # Compute the optimal k based on the metrics
    k_best, vis_data = compute_metris_gmm_best_k(metrics)

    # Save the data for the visualization
    vis_data.to_csv(f'data/output/visualization/visualization_metrics_gmm_{question_id}.csv')

    return k_best

### GMM computation

In [None]:
# Cluster the responses of every question separately with a Gaussian mixture
for i in q_col:

    # Index of the rows that answer question i
    indicies_of_question = valid_dataset[valid_dataset['column'] == i].index

    # Embeddings of those responses
    X = embedding_dataframe.iloc[indicies_of_question]

    # Best number of components for this question
    n_best = gmm_compute_optimal_k(X)

    # Refit the mixture with that number of components and the same random_state
    best_GMM = GaussianMixture(n_components=n_best,random_state=7)
    best_GMM = best_GMM.fit(X)

    # Patient IDs of the responses, together with the component each response
    # is assigned to by the refitted mixture
    labels = valid_dataset.iloc[indicies_of_question][['row']].rename(columns={'row': 'Patient ID'})
    labels['gmm'] = best_GMM.predict(X)

    # Write the labels and vectors to .tsv files for the visualization
    Path(f'./data/output/Q{i}/').mkdir(parents=True, exist_ok=True)
    labels.to_csv(f'data/output/Q{i}/gmm_labels.tsv', sep="\t", header=True, index=False)
    X.to_csv(f'data/output/Q{i}/gmm_vectors.tsv', sep="\t", header=False, index=False)

 16%|█████████████████████▋                                                                                                                | 31/191 [05:54<45:44, 17.15s/it]