### Imports

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import minmax_scale

In [2]:
import ast
from pathlib import Path

In [3]:
# Number of questions; the per-question files read below are named Q0 .. Q{amount_of_questions - 1}
amount_of_questions = 29
# Number of patients; used as the row count of the patient x question label table
amount_of_patients = 2128

In [4]:
# Function to compute the best k clusters based on the parsed indices
# Input: Pandas Dataframe with columns 'k', 'db', 'ch', 'sil'
# Output: k which is the optimal number of clusters, dataframe with the added
#         columns 'db_scale', 'ch_scale' and 'combined_index', sorted by
#         'combined_index' from best to worst
def compute_metris_kmeans_best_k(df_indicies):
    """Rank the candidate cluster counts by a combined quality score.

    The score is ``sil + minmax(ch) - minmax(db)``: 'ch' and 'db' are
    rescaled to [0, 1] first, 'sil' is used as is. The row with the highest
    score wins.
    (Presumably 'sil', 'ch' and 'db' are the silhouette, Calinski-Harabasz
    and Davies-Bouldin indices -- confirm against the code that writes them.)

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        One row per candidate with the columns 'k', 'db', 'ch' and 'sil'.
        The frame is not modified.

    Returns
    -------
    tuple
        ``(best_k, ranked)`` where ``best_k`` is the 'k' of the best row as
        an int and ``ranked`` is a copy of the input with the score columns
        added, sorted by 'combined_index' in descending order.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    ranked = df_indicies.copy()

    ranked['db_scale'] = minmax_scale(ranked['db'])
    ranked['ch_scale'] = minmax_scale(ranked['ch'])

    ranked['combined_index'] = ranked['sil'] + ranked['ch_scale'] - ranked['db_scale']

    ranked = ranked.sort_values(by='combined_index', ascending=False)

    # Look 'k' up by name: positional access on a label-indexed row is
    # deprecated in pandas and depends on the column order of the input
    return int(ranked.iloc[0]['k']), ranked

In [17]:
# For every question, rank all (model, k) candidates of both model families
# by the combined index and record the winner.
# Each entry is [question, best k, winning model, label, k of the runner-up].
best_cluster_type_list = []

for i in range(amount_of_questions):

    metrics_gmm = pd.read_csv(f'./data/output/visualization/Q{i}_gmm_metrics.csv',index_col=0)
    metrics_kme = pd.read_csv(f'./data/output/visualization/Q{i}_kmeans_metrics.csv',index_col=0)

    # Tag the origin of every row: type 0.0 = gmm, 1.0 = k_means.
    # `assign` builds a new frame, so nothing is written into a column
    # subset of the loaded table (avoids pandas' SettingWithCopyWarning).
    metrics_gmm = metrics_gmm[['k','ch','db','sil']].assign(type=0.0)
    metrics_kme = metrics_kme[['k','ch','db','sil']].assign(type=1.0)

    # The 'type' column differs between the two tables, so the outer merge
    # keeps every row of both
    metrics = metrics_gmm.merge(metrics_kme, how='outer').copy()

    # Scale over the candidates of both models together so their scores are comparable
    metrics['db_scale'] = minmax_scale(metrics['db'])
    metrics['ch_scale'] = minmax_scale(metrics['ch'])

    metrics['combined_index'] = metrics['sil'] + metrics['ch_scale'] - metrics['db_scale']

    metrics = metrics.sort_values(by='combined_index', ascending=False)

    first = metrics.iloc[0]
    second = metrics.iloc[1]

    # NOTE(review): `second` is simply the runner-up row; outside the tie case
    # it may belong to the same model as `first`, although the label below
    # names the other model -- confirm this is intended.
    if first['combined_index'] == second['combined_index'] and first['type'] != second['type']:
        # Exact tie between the two model families
        best_cluster_type_list.append([i,first['k'],'both', 'Other: ', second['k']])
    elif first['type'] == 0:
        best_cluster_type_list.append([i,first['k'],'gmm', 'Other: k_means', second['k']])
    else:
        best_cluster_type_list.append([i,first['k'],'k_means', 'Other: gmm: ', second['k']])

In [18]:
best_cluster_type_list

[[0, 2.0, 'both', 'Other: ', 2.0],
 [1, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [2, 4.0, 'k_means', 'Other: gmm: ', 3.0],
 [3, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [4, 2.0, 'k_means', 'Other: gmm: ', 3.0],
 [5, 4.0, 'k_means', 'Other: gmm: ', 5.0],
 [6, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [7, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [8, 3.0, 'k_means', 'Other: gmm: ', 4.0],
 [9, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [10, 3.0, 'k_means', 'Other: gmm: ', 4.0],
 [11, 2.0, 'gmm', 'Other: k_means', 2.0],
 [12, 2.0, 'k_means', 'Other: gmm: ', 3.0],
 [13, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [14, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [15, 2.0, 'both', 'Other: ', 2.0],
 [16, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [17, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [18, 3.0, 'k_means', 'Other: gmm: ', 3.0],
 [19, 3.0, 'k_means', 'Other: gmm: ', 3.0],
 [20, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [21, 2.0, 'k_means', 'Other: gmm: ', 2.0],
 [22, 2.0, 'k_means', 'Other: gmm: ', 10.0],
 [23, 2.0, 'k_m

In [7]:
# Build the patient x question table of cluster labels, taking for every
# question the label column of the model selected for that question
patient_classes_total = pd.DataFrame(-1,index=np.arange(amount_of_patients), columns=np.arange(amount_of_questions))

for i in range(amount_of_questions):

    cur_best_model = best_cluster_type_list[i][2]

    # Only an explicit 'gmm' verdict selects the gmm labels; every other
    # verdict falls back to the k_means labels
    label_column = 'gmm' if cur_best_model == 'gmm' else 'k_means'

    question_labels = pd.read_csv(f'./data/output/questions/Q{i}_labels.tsv', sep='\t',index_col='Patient ID')

    # Assignment aligns on the 'Patient ID' index of the file
    patient_classes_total[i] = question_labels[label_column]


In [8]:
# Missing answers (NaN) become -1 and the whole table is then shifted by one,
# so a missing answer ends up with the value 0.0
patient_classes_total = patient_classes_total.fillna(-1).add(1)

In [9]:
patient_classes_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,8.0,2.0,1.0,2.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,1,7.0,4.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,7.0,4.0,1.0,1.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,6.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,2.0,0.0,1.0,3.0,0.0,0.0,...,0.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2123,2,8.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2124,1,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2125,2,5.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2126,2,3.0,4.0,0.0,2.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Helper functions

In [14]:
# Method to remove entries including '0' from list, which implies nan column
def remove_0_strings_in_list(string_list):
    """Remove, in place, every entry that contains the character '0'.

    Parameters
    ----------
    string_list : list of str
        Labels to filter. The list object itself is modified.

    Returns
    -------
    list of str
        The same list object that was passed in, holding only the entries
        without a '0', in their original order.
    """
    # Slice assignment keeps the caller's list object (the removal stays
    # in place) while a single pass replaces the index bookkeeping and the
    # repeated element shifting of deleting one entry at a time
    string_list[:] = [entry for entry in string_list if '0' not in entry]

    return string_list

In [15]:
# Method to determine if new proposed cluster adheres to minimum amount
def patient_cluster_settle(df, minimum):
    """Accept or reject the proposed cluster of every patient that is still open.

    For each row whose 'assigned' flag is not 1, the proposed label in 'new'
    is accepted (copied to 'cur') only if the proposed cluster, and every
    proposed cluster whose label starts with the patient's current label and
    contains no '0', holds at least ``minimum`` patients. Otherwise the
    patient keeps its current label and is marked as assigned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'cur', 'new' (string labels) and 'assigned'.
        Modified in place. Rows are read by position and written by label,
        so the index is expected to be the default 0..n-1 range.
    minimum : int
        Smallest number of patients a cluster may contain.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    # Size of every proposed cluster, computed once before any row changes.
    # Kept as a Series so the lookups below are label-based and do not
    # depend on how pandas names the column of a value_counts() result.
    new_counts = df['new'].value_counts()

    for i in range(len(df)):

        row = df.iloc[i]

        # Patients that already settled are left untouched
        if row['assigned'] == 1:
            continue

        cur_old = row['cur']
        cur_new = row['new']

        # Proposed clusters that extend the patient's current cluster; labels
        # containing '0' are left out (same filter as remove_0_strings_in_list)
        extends_current = new_counts.index.str.startswith(cur_old)
        shared_roots = [label for label in new_counts.index[extends_current] if '0' not in label]

        too_small = new_counts.loc[cur_new] < minimum

        if shared_roots:
            # One under-sized sibling blocks the split for the whole group
            too_small = too_small or new_counts.loc[shared_roots].min() < minimum

        if too_small:
            df.loc[i, 'assigned'] = 1
        else:
            df.loc[i, 'cur'] = cur_new

    return df

In [16]:
# Function to find clusters per patient
def cluster_finder(patient_classes_total, minimum_amount):
    """Group patients by their per-question classes, one question at a time.

    Every patient starts with the class of column 0 as its label. For each
    following column the class of that column is appended to the label of
    every patient that is still open, and patient_cluster_settle decides
    whether the longer label is accepted or the patient is settled on its
    current label. The walk stops after the last column or as soon as every
    patient is settled.

    Parameters
    ----------
    patient_classes_total : pandas.DataFrame
        One row per patient, one column per question. Columns are addressed
        by the labels 0, 1, 2, ... and rows by the labels 0..n-1. Values
        must be convertible to int.
    minimum_amount : int
        Smallest number of patients a cluster may contain.

    Returns
    -------
    pandas.Series
        The final cluster label (a string of concatenated class numbers)
        of every patient, named 'cur'.
    """
    # NOTE(review): labels are plain digit concatenations, so classes >= 10
    # would make labels ambiguous ('1' + '10' vs '11' + '0') -- confirm the
    # class values stay single-digit.
    n_patients = len(patient_classes_total)

    # Declare a dataframe to store the result
    pID_classes = pd.DataFrame('', index=np.arange(n_patients), columns=['cur', 'new', 'assigned'])

    pID_classes['assigned'] = np.zeros(n_patients)

    pID_classes['cur'] = patient_classes_total[0].astype(str).copy()

    for i in range(1, len(patient_classes_total.columns)):

        # Stop early once no patient is left to refine
        if pID_classes['assigned'].sum() == n_patients:
            break

        # Extend the label of all open patients in one vectorized step
        # instead of one scalar lookup per patient
        open_rows = pID_classes['assigned'] == 0
        pID_classes.loc[open_rows, 'new'] = (
            pID_classes.loc[open_rows, 'cur']
            + patient_classes_total.loc[open_rows, i].astype(int).astype(str)
        )

        pID_classes = patient_cluster_settle(pID_classes, minimum_amount)

    return pID_classes['cur']

In [36]:
# Minimum number of patients per cluster; passed to cluster_finder and used
# as the threshold for merging under-sized clusters
min_amount = 25

In [37]:
# Cluster label of every patient for the chosen minimum group size
cur_found_clusters = cluster_finder(patient_classes_total, min_amount)

# Maps the label of every under-sized cluster to the label it is merged into
correct_index_dict = {}

# Name the count column explicitly: pandas >= 2.0 calls the result of
# value_counts() 'count', older releases reuse the name of the Series
cur_cluster_amounts = cur_found_clusters.value_counts().to_frame('cur')

# Clusters that still hold fewer patients than allowed
nan_exceptions = cur_cluster_amounts[cur_cluster_amounts['cur'] < min_amount]

for small_cluster in nan_exceptions.index:
    # All clusters whose label starts with the label of the under-sized cluster
    same_start_index = cur_cluster_amounts[cur_cluster_amounts.index.str.startswith(small_cluster)]

    # value_counts() sorts from largest to smallest, so the last row that
    # meets the minimum is the smallest cluster that is still large enough.
    # NOTE(review): raises IndexError when no cluster with this prefix
    # reaches min_amount -- decide how that case should be handled.
    target = same_start_index[same_start_index['cur'] >= min_amount].iloc[-1].name

    correct_index_dict[small_cluster] = target

In [38]:
cur_found_clusters.value_counts()[-50:]

17      214
11      204
23      185
13      168
15      151
25      149
12      133
2612    110
16      105
21       98
24       95
18       88
14       73
264      64
27       62
22       61
262      50
263      40
28       37
2611     29
26        7
261       4
1         1
Name: cur, dtype: int64

In [39]:
correct_index_dict

{'26': '2611', '261': '2611', '1': '14'}

In [40]:
# Merge every under-sized cluster into its replacement; labels without an
# entry in correct_index_dict are kept unchanged. Each label is looked up
# exactly once, so a replacement value is never replaced again.
cur_found_clusters = cur_found_clusters.map(
    lambda cluster: correct_index_dict.get(cluster, cluster)
)

In [41]:
cur_found_clusters.value_counts()

17      214
11      204
23      185
13      168
15      151
25      149
12      133
2612    110
16      105
21       98
24       95
18       88
14       74
264      64
27       62
22       61
262      50
2611     40
263      40
28       37
Name: cur, dtype: int64

In [42]:
# Give every remaining cluster label a consecutive integer id, numbered in
# the order in which the labels first appear
cluster_final = {
    cluster: cluster_id
    for cluster_id, cluster in enumerate(cur_found_clusters.unique())
}

# Number of distinct clusters
count = len(cluster_final)

In [43]:
cluster_final

{'18': 0,
 '17': 1,
 '2611': 2,
 '11': 3,
 '25': 4,
 '21': 5,
 '262': 6,
 '23': 7,
 '13': 8,
 '27': 9,
 '22': 10,
 '2612': 11,
 '15': 12,
 '14': 13,
 '264': 14,
 '16': 15,
 '24': 16,
 '12': 17,
 '263': 18,
 '28': 19}

In [44]:
# Replace every cluster label by its integer id. cluster_final was built from
# the labels present in the Series, so every label has an id and the result
# is an integer Series rather than a mix of strings and ints.
cur_found_clusters = cur_found_clusters.map(cluster_final)

In [45]:
# Keep the result under its own name (this is the same object as
# cur_found_clusters, not a copy)
final_assignment25 = cur_found_clusters

In [46]:
# Single-column frame whose header states the minimum group size that was used
output25 = final_assignment25.to_frame(name=f'Min group of {min_amount}')

In [47]:
output25

Unnamed: 0,Min group of 25
0,0
1,1
2,1
3,2
4,3
...,...
2123,19
2124,3
2125,4
2126,7


In [48]:
# Make sure the output directory exists: it is created together with any
# missing parent directories, and an already existing directory is not an error
Path('./data/output').mkdir(parents=True, exist_ok=True)

In [49]:
# Write the cluster assignment to a tab-separated file, including the header
# row and the row index (the path has no placeholders, so a plain string is used)
output25.to_csv('data/output/min_group_25_clusters',sep="\t",header=True,index=True)

#### Method to find the patients based on cluster groups per question (ordered)

In [50]:
# Given list of classes in order from question 1, return all patient ids
def find_patient_ids(question_id, cluster_ids, selected_data):
    """Return the index of the rows that match a run of per-question classes.

    ``cluster_ids[0]`` is matched against column ``question_id``,
    ``cluster_ids[1]`` against column ``question_id + 1`` and so on; a row is
    kept only if it matches every one of them.

    Parameters
    ----------
    question_id : int
        Label of the column the first class belongs to.
    cluster_ids : sequence
        Required class per consecutive column, starting at ``question_id``.
    selected_data : pandas.DataFrame
        Table with one column per question, labelled by consecutive integers.

    Returns
    -------
    pandas.Index or None
        Index of the matching rows. For an empty ``cluster_ids`` a message
        is printed and None is returned.
    """
    if len(cluster_ids) == 0:
        print("Won't accept empty list of cluster_ids Method: find_patient_ids")
        return None

    # Narrow the table down one question at a time
    remaining = selected_data
    for offset, cluster_id in enumerate(cluster_ids):
        remaining = remaining[remaining[question_id + offset] == cluster_id]

    return remaining.index
        

In [51]:
patient_classes_total.loc[np.array(find_patient_ids(0,[2,6,1,0],patient_classes_total))]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
548,2,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
1312,2,6.0,1.0,0.0,2.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1334,2,6.0,1.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1832,2,6.0,1.0,0.0,2.0,4.0,1.0,2.0,2.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
