### Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
from pathlib import Path

In [3]:
from sklearn.preprocessing import minmax_scale
import os
import math
from sklearn.decomposition import PCA

### Load the data

In [4]:
%%time
embedding_data = pd.read_csv('data/input/dental_cluster_input_new.csv',index_col=0)

CPU times: user 1.04 s, sys: 77.5 ms, total: 1.12 s
Wall time: 1.22 s


### Find rows with NaN values

In [5]:
# Collect the *positions* of all rows that actually contain a sentence.
# Missing sentences show up as NaN, so `notna()` identifies the valid rows.
# Working positionally (instead of the label-based `Series[i]` lookup) keeps
# the result correct for the later `.iloc` selection even when the index read
# from the CSV's first column is not exactly 0..n-1.
sentence_is_present = embedding_data['sentence'].notna().to_numpy()
valid_indicies = np.flatnonzero(sentence_is_present).tolist()

In [6]:
# Keep only the rows with a sentence (as an independent copy) and renumber them
# 0..n-1 so that index labels and row positions coincide from here on.
valid_dataset = (
    embedding_data
    .iloc[valid_indicies]
    .copy()
    .reset_index(drop=True)
)

In [7]:
# Verify no NaN values
# Sanity check: total number of missing cells over all columns (expected: 0)
valid_dataset.isna().sum().sum()

0

In [8]:
# Remove from memory
# The unfiltered frame is not referenced again; only valid_dataset is used from here on
del embedding_data

### Process the sentence embeddings

In [9]:
# Column of embeddings stored as strings, one per valid sentence (parsed into floats below)
sentence_embeddings = valid_dataset['embedding']

In [10]:
# Method to parse the saved embeddings
def embedding_string_to_float_list(embedding_string):
    """Convert a bracketed, whitespace-separated embedding string to floats.

    The text between the first '[' and the following ']' is split on any run
    of whitespace (spaces, tabs, line breaks) and each token is converted with
    ``float``. Splitting on whitespace, rather than deleting line breaks and
    splitting on single spaces, prevents two numbers separated only by a line
    break from being fused into one invalid token.

    Parameters
    ----------
    embedding_string : str
        Text such as "[0.1 -0.2 0.3]", possibly wrapped over several lines.

    Returns
    -------
    list of float
        The parsed values in their original order.

    Raises
    ------
    IndexError
        If the string contains no '['.
    ValueError
        If a token between the brackets is not a valid float literal.
    """
    # Text between the first '[' and the next ']'
    bracket_content = embedding_string.split('[')[1].split(']')[0]

    # str.split() without arguments drops empty tokens and handles all whitespace
    return [float(token) for token in bracket_content.split()]

In [11]:
# Parse every stored embedding string into a list of floats; tqdm reports progress
sentence_embedding_list = [
    embedding_string_to_float_list(raw_embedding)
    for raw_embedding in tqdm(sentence_embeddings)
]
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16316/16316 [00:02<00:00, 5838.61it/s]


In [12]:
# One row per parsed embedding, one column per embedding dimension
embedding_dataframe = pd.DataFrame(sentence_embedding_list)

In [13]:
# This dataframe holds the embedding of the corresponding row in valid_dataset
# (row i here was parsed from the 'embedding' value of row i in valid_dataset)
embedding_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.034062,0.013992,0.014252,-0.030456,0.028877,0.033559,0.017332,0.035528,-0.064723,-0.004354,...,-0.081443,-0.004202,0.000352,-0.023411,0.046932,-0.032391,-0.046974,0.059409,-0.073823,-0.023436
1,-0.072521,0.026935,-0.042975,0.031925,0.025261,0.020041,0.036362,-0.025237,0.011913,0.028049,...,0.025571,0.060598,0.021523,-0.032708,0.010081,0.019088,0.075229,-0.001422,0.061387,-0.013585
2,-0.020250,0.080377,0.008891,-0.054059,-0.061299,0.032137,-0.019862,-0.004791,-0.003684,0.029694,...,-0.004164,0.049371,-0.020510,-0.000520,0.020457,-0.005845,0.002068,-0.019667,0.048704,-0.040016
3,-0.064010,0.014544,0.015366,0.007376,0.042618,-0.004455,0.068092,-0.013669,0.001125,-0.043526,...,0.067604,-0.033537,0.022114,0.019851,0.010399,0.010969,0.074038,0.027083,-0.015570,0.033565
4,-0.009215,0.050368,0.023119,0.030543,0.028008,0.010728,0.045672,0.054670,-0.065691,-0.024182,...,-0.019737,-0.037893,0.011956,0.016581,0.018186,-0.036472,-0.006878,-0.002135,-0.009683,0.011702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16311,0.004346,0.028686,0.005832,-0.009570,-0.013020,0.059503,0.008206,-0.007141,-0.061534,0.085434,...,0.008846,0.012409,0.007637,0.000734,0.028507,0.034318,-0.004622,-0.009212,0.078548,-0.015268
16312,0.064416,0.001641,0.027482,-0.004933,-0.000269,-0.002164,-0.000324,-0.043461,0.022244,0.044613,...,-0.069417,-0.073531,-0.020159,-0.032370,0.016101,-0.037128,-0.041885,-0.020341,-0.065208,-0.070507
16313,0.024033,-0.056897,-0.009784,-0.017642,0.046868,0.050847,0.053478,-0.032478,-0.048117,-0.012153,...,-0.027825,0.004736,0.000375,0.067848,0.032955,-0.019453,-0.006210,0.010971,-0.055669,-0.066040
16314,0.007316,-0.015769,0.008998,-0.043925,0.023506,0.035210,0.027905,-0.003970,-0.041485,-0.029756,...,0.021643,0.007241,-0.025989,0.007757,0.057442,-0.004804,-0.013610,0.021214,-0.037111,-0.077928


In [14]:
# Remove the string 'embedding' column from valid_dataset; it is redundant now
# that the parsed vectors are held in embedding_dataframe
del valid_dataset['embedding']

#### Save the embeddings into a .tsv file for use in the visualization

In [15]:
# Create the output directory (including parents) unless it already exists
output_dir = Path('./data/output')
output_dir.mkdir(parents=True, exist_ok=True)

In [16]:
# Write the valid embeddings to a tab-separated file for the visualization:
# values only, without a header row or an index column
embedding_dataframe.to_csv(
    'data/output/model_simcse_data_dental_vectors_all.tsv',
    sep='\t',
    header=False,
    index=False,
)

### Clean '\n' from sentences and create labels dataframe

In [17]:
# Replace line breaks inside a sentence with spaces so each sentence stays on one line
clean_sentences = [text.replace('\n', ' ') for text in valid_dataset['sentence']]

In [18]:
# Single-column frame with the cleaned sentences, in the same order as valid_dataset
labels = pd.DataFrame({'sentence': clean_sentences})

In [19]:
# Display the cleaned sentences
labels

Unnamed: 0,sentence
0,Schmerzen im Kiefergelenk
1,Schmerzfreiheit
2,"essen vor allem harte, zähe Speisen, wenn das ..."
3,Essen
4,wenn der Kiefer entspannt ist
...,...
16311,vor ca 6 Jahren ich dachte habe loch im zahn D...
16312,naechltliches zaehneknirschen mit kopschmerzen...
16313,ueberpruefung der knirschschienen eventuelle a...
16314,Mundoeffenen schmerzhaft seelische Belsatung


In [20]:
# Overwrite the raw sentences in valid_dataset with their cleaned versions
valid_dataset['sentence'] = labels['sentence'].copy()

#### Save the patient IDs per embedding for use in the visualization

In [21]:
# Start the label table from a copy of the 'row' column, exposed as 'Patient ID'
pID_text = valid_dataset['row'].copy().to_frame(name='Patient ID')

In [22]:
# Attach the cleaned sentence belonging to each row
pID_text['Text'] = labels['sentence']

In [23]:
# Preview the first rows of the patient ID / text table
pID_text.head()

Unnamed: 0,Patient ID,Text
0,0,Schmerzen im Kiefergelenk
1,0,Schmerzfreiheit
2,0,"essen vor allem harte, zähe Speisen, wenn das ..."
3,0,Essen
4,0,wenn der Kiefer entspannt ist


In [24]:
# Save patient IDs and texts as a tab-separated file, including header row and index column
pID_text.to_csv(
    './data/output/dental_labels_all.tsv',
    sep='\t',
    header=True,
    index=True,
)

# Finding optimal k per question

In [26]:
# Create the output folders for the metrics and the per-question files if they don't exist yet
for output_folder in ('./data/output/visualization/', './data/output/questions/'):
    Path(output_folder).mkdir(parents=True, exist_ok=True)

# Sorted list of all distinct question IDs in the dataset
q_col = sorted(valid_dataset['column'].unique())

### KMeans helper functions

In [27]:
# Function to run kmeans on dataset and produce metrics per k
def generate_kmeans_metrics(X, k_min = 2, k_max = 10):
    """Fit KMeans for every k in [k_min, k_max] and score each clustering.

    Parameters
    ----------
    X : array-like with one row per sample
        The data to cluster.
    k_min : int, default 2
        Lower bound of k (inclusive).
    k_max : int, default 10
        Upper bound of k (inclusive). It is capped at ``len(X) - 1`` because
        the three scores are only defined for 2 <= n_clusters <= n_samples - 1;
        without the cap a small dataset raises a ValueError part-way through.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated k with the columns ['k', 'db', 'ch', 'sil']
        (Davies-Bouldin index, Calinski-Harabasz index, silhouette
        coefficient). The frame is empty when no k in the range is valid.
    """
    # Largest number of clusters for which the scores can be computed
    largest_valid_k = min(k_max, len(X) - 1)

    output = []

    for k in range(k_min, largest_valid_k + 1):

        # Fixed random_state keeps the clustering reproducible between runs
        predicted_labels = KMeans(n_clusters=k, random_state=7).fit(X).labels_

        output.append([
            k,
            davies_bouldin_score(X, predicted_labels),
            calinski_harabasz_score(X, predicted_labels),
            silhouette_score(X, predicted_labels),
        ])

    return pd.DataFrame(output, columns=['k','db','ch','sil'])

In [28]:
# Function to compute the best k clusters based on the parsed indices
def compute_metris_kmeans_best_k(df_indicies):
    """Rank the evaluated k values by a combined score and return the best one.

    The combined score is ``sil + minmax(ch) - minmax(db)``: the silhouette
    coefficient and the scaled Calinski-Harabasz index are added, the scaled
    Davies-Bouldin index is subtracted.

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        Frame with the columns 'k', 'db', 'ch' and 'sil'. It is not modified;
        the derived columns are added to a copy.

    Returns
    -------
    (int, pandas.DataFrame)
        The k with the highest combined score, and the scored frame (with the
        extra columns 'db_scale', 'ch_scale' and 'combined_index') sorted by
        'combined_index' in descending order.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    scored = df_indicies.copy()

    scored['db_scale'] = minmax_scale(scored['db'])
    scored['ch_scale'] = minmax_scale(scored['ch'])

    scored['combined_index'] = scored['sil'] + scored['ch_scale'] - scored['db_scale']

    scored = scored.sort_values(by='combined_index', ascending=False)

    # Select 'k' by name: positional `row[0]` on a label-indexed Series is deprecated
    return int(scored['k'].iloc[0]), scored

In [29]:
# Function to run kmeans and output the optimal k clusters
def kmeans_compute_optimal_k(X, i):
    """Determine the best k for KMeans on X and store the metrics of question i.

    X is the data to cluster; i is only used to build the name of the metrics
    file, 'data/output/visualization/Q{i}_kmeans_metrics.csv'.
    Returns the k with the highest combined score.
    """
    # Score KMeans for the default range k = [2,10], then rank the results
    k_best, vis_data = compute_metris_kmeans_best_k(generate_kmeans_metrics(X))

    # Save the ranked metrics for the visualization
    vis_data.to_csv(f'data/output/visualization/Q{i}_kmeans_metrics.csv')

    return k_best

### GMM helper functions

In [30]:
# Function to run gmm on dataset and produce metrics per k
def generate_gmm_metrics(X, k_min = 2, k_max = 20):
    """Fit a GaussianMixture for every k in [k_min, k_max] and score each model.

    Parameters
    ----------
    X : array-like with one row per sample
        The data to cluster.
    k_min : int, default 2
        Lower bound of the number of components (inclusive).
    k_max : int, default 20
        Upper bound of the number of components (inclusive). It is capped at
        ``len(X) - 1`` because the label-based scores are only defined for
        2 <= n_clusters <= n_samples - 1.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated k with the columns
        ['k', 'db', 'ch', 'sil', 'aic', 'bic'] (Davies-Bouldin index,
        Calinski-Harabasz index, silhouette coefficient, and the model's AIC
        and BIC on X). The frame is empty when no k in the range is valid.
    """
    # Largest number of components for which the label-based scores can be computed
    largest_valid_k = min(k_max, len(X) - 1)

    output = []

    for k in range(k_min, largest_valid_k + 1):
        # Fixed random_state keeps the fitted mixture reproducible between runs
        cur_gmm = GaussianMixture(n_components=k, random_state=7).fit(X)

        # Hard assignment of every sample to a component
        predicted_labels = cur_gmm.predict(X)

        output.append([
            k,
            davies_bouldin_score(X, predicted_labels),
            calinski_harabasz_score(X, predicted_labels),
            silhouette_score(X, predicted_labels),
            cur_gmm.aic(X),
            cur_gmm.bic(X),
        ])

    return pd.DataFrame(output, columns=['k','db','ch','sil','aic','bic'])

In [31]:
# Function to compute the best k clusters based on the parsed indices
def compute_metris_gmm_best_k(df_indicies):
    """Rank the evaluated GMM component counts by a combined score.

    The combined score is
    ``sil + minmax(ch) - minmax(db) - minmax(aic) - minmax(bic)``.

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        Frame with the columns 'k', 'db', 'ch', 'sil', 'aic' and 'bic'. It is
        not modified; the derived columns are added to a copy.

    Returns
    -------
    (int, pandas.DataFrame)
        The k with the highest combined score, and the scored frame (with the
        extra '*_scale' and 'combined_index' columns) sorted by
        'combined_index' in descending order.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    scored = df_indicies.copy()

    for metric in ('db', 'ch', 'aic', 'bic'):
        scored[f'{metric}_scale'] = minmax_scale(scored[metric])

    scored['combined_index'] = (
        scored['sil'] + scored['ch_scale'] - scored['db_scale'] -
        scored['aic_scale'] - scored['bic_scale']
    )

    scored = scored.sort_values(by='combined_index', ascending=False)

    # Select 'k' by name: positional `row[0]` on a label-indexed Series is deprecated
    return int(scored['k'].iloc[0]), scored

In [32]:
# Function to run gmm and output the optimal number of components
def gmm_compute_optimal_k(X, i):
    """Determine the best component count for a GMM on X and store the metrics of question i.

    X is the data to cluster; i is only used to build the name of the metrics
    file, 'data/output/visualization/Q{i}_gmm_metrics.csv'.
    Returns the k with the highest combined score.
    """
    # Score the GMM for k = [2,10], then rank the results
    k_best, vis_data = compute_metris_gmm_best_k(generate_gmm_metrics(X, 2, 10))

    # Save the ranked metrics for the visualization
    vis_data.to_csv(f'data/output/visualization/Q{i}_gmm_metrics.csv')

    return k_best

## Compute KMeans and GMM

In [34]:
# Cluster the responses to each question separately with KMeans and GMM
for i in tqdm(q_col):

    # Index labels of the rows answering question i; they are used as row positions below
    indicies_of_question = valid_dataset.index[(valid_dataset['column'] == i).to_numpy()]

    # X holds the embeddings of exactly those responses
    X = embedding_dataframe.iloc[indicies_of_question]

    # KMeans: choose k from the metrics, then refit with that k and the same random_state
    k_best = kmeans_compute_optimal_k(X, i)
    best_KMeans = KMeans(n_clusters=k_best, random_state=7).fit(X)

    # GMM: choose the component count from the metrics, then refit in the same way
    n_best = gmm_compute_optimal_k(X, i)
    best_GMM = GaussianMixture(n_components=n_best, random_state=7).fit(X)

    # Patient ID and text of every response, plus the cluster assigned by each model
    labels = (
        valid_dataset.iloc[indicies_of_question][['row', 'sentence']]
        .rename(columns={'row': 'Patient ID', 'sentence': 'Text'})
        .assign(k_means=best_KMeans.labels_, gmm=best_GMM.predict(X))
    )

    # Write the labels (with header) and the vectors (without header) as .tsv files for visualization
    labels.to_csv(f'data/output/questions/Q{i}_labels.tsv', sep="\t", header=True, index=False)
    X.to_csv(f'data/output/questions/Q{i}_vectors.tsv', sep="\t", header=False, index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [12:30<00:00, 25.87s/it]


## Output data for visualization with all patients (based on top 3 questions)

In [292]:
# Empty (all-NaN) lookup table: one row per distinct 'row' value, one column for each of 3 questions
n_patients = len(valid_dataset['row'].unique())
indicies_for_pca = pd.DataFrame(index=np.arange(n_patients), columns=np.arange(3))

In [293]:
# For every response to a question with ID below 3, remember its position in
# valid_dataset at [patient row, question column] of indicies_for_pca
for i, (patient_row, question_id) in enumerate(zip(valid_dataset['row'], valid_dataset['column'])):

    if question_id < 3:
        indicies_for_pca.iloc[patient_row, question_id] = i

In [294]:
# Place the embeddings referenced by indicies_for_pca side by side in one numpy
# array: each row is a patient, each block of len_embedding columns a question

len_embedding = len(embedding_dataframe.columns)
n_patients, n_questions = indicies_for_pca.shape

# Cells without a recorded response keep their all-zero segment
pca_values = np.zeros((n_patients, len_embedding * n_questions))

for i in range(n_questions):

    for j in range(n_patients):

        current_row_id = indicies_for_pca.iloc[j,i]

        # pd.isna tests the value; `is np.nan` only matches the np.nan object
        # itself and misses other NaN representations
        if not pd.isna(current_row_id):
            pca_values[j, i*len_embedding:(i+1)*len_embedding] = embedding_dataframe.iloc[int(current_row_id)]

In [287]:
# Export the concatenated embeddings of questions 0-2 as a tab-separated file
# without header row or index column
patient_vectors = pd.DataFrame(pca_values)
patient_vectors.to_csv(
    'data/output/patient_vis_vectors_all.tsv',
    sep="\t",
    header=False,
    index=False,
)

In [295]:
# Table for the response texts, initialised with empty strings:
# one row per distinct 'row' value, one column for each of 3 questions
n_patients = len(valid_dataset['row'].unique())
text_for_pca = pd.DataFrame("", index=np.arange(n_patients), columns=np.arange(3))

In [296]:
# Copy the sentence of every recorded response into text_for_pca
for i in range(indicies_for_pca.shape[1]):

    for j in range(indicies_for_pca.shape[0]):

        current_row_id = indicies_for_pca.iloc[j,i]

        # pd.isna tests the value; `is np.nan` only matches the np.nan object itself
        if not pd.isna(current_row_id):
            # Single-step assignment: the chained form `text_for_pca[i][j] = ...`
            # may write to a temporary copy and leave the frame unchanged
            text_for_pca.iloc[j, i] = valid_dataset.at[current_row_id, 'sentence']

In [297]:
# Give the three question columns short descriptive names ...
question_summaries = ['Chief complaint','Expected results', 'Factors that aggravate complaints']
text_for_pca.columns = question_summaries

# ... and put the index in front as an explicit 'Patient ID' column
text_for_pca.insert(loc=0, column='Patient ID', value=text_for_pca.index)

In [298]:
# Export the per-patient texts for visualization: tab-separated, with header, without index
text_for_pca.to_csv(
    'data/output/patient_vis_labels.tsv',
    sep="\t",
    header=True,
    index=False,
)

### Computing clustering using all sentences

In [36]:
# Identifier used in the metric file names of this run over all sentences
i = 100

# X holds the embeddings of all text responses
X = embedding_dataframe

# KMeans: choose k from the metrics, then refit with that k and the same random_state
k_best = kmeans_compute_optimal_k(X, i)
best_KMeans = KMeans(n_clusters=k_best, random_state=7).fit(X)

# GMM: choose the component count from the metrics, then refit in the same way
n_best = gmm_compute_optimal_k(X, i)
best_GMM = GaussianMixture(n_components=n_best, random_state=7).fit(X)

# Patient ID and text of every response, plus the cluster assigned by each model
labels = (
    valid_dataset[['row', 'sentence']]
    .rename(columns={'row': 'Patient ID', 'sentence': 'Text'})
    .assign(k_means=best_KMeans.labels_, gmm=best_GMM.predict(X))
)

# Write the labels (with header) and the vectors (without header) as .tsv files for visualization
labels.to_csv('data/output/all_sentence_labels.tsv', sep="\t", header=True, index=False)
X.to_csv('data/output/all_sentence_vectors.tsv', sep="\t", header=False, index=False)