In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import minmax_scale

In [850]:
import ast
from pathlib import Path

In [3]:
amount_of_questions = 29
amount_of_patients = 2128

In [4]:
def compute_metris_kmeans_best_k(df_indicies):
    """Rank candidate cluster counts by a combined validity index.

    The 'db' and 'ch' columns are min-max scaled and combined with 'sil' as
    ``sil + ch_scale - db_scale``; the rows are then sorted from the highest
    to the lowest combined value.

    Input: pandas DataFrame with columns 'k', 'db', 'ch', 'sil'.
           The helper columns 'db_scale', 'ch_scale' and 'combined_index'
           are added to the passed frame.
    Output: tuple of (k of the best-ranked row as int, the frame sorted by
            'combined_index' in descending order).
    """
    df_indicies['db_scale'] = minmax_scale(df_indicies['db'])
    df_indicies['ch_scale'] = minmax_scale(df_indicies['ch'])

    df_indicies['combined_index'] = (
        df_indicies['sil'] + df_indicies['ch_scale'] - df_indicies['db_scale']
    )

    df_indicies = df_indicies.sort_values(by='combined_index', ascending=False)

    # Read 'k' by name: a positional `row[0]` relies on a deprecated Series
    # fallback and returns the wrong value when 'k' is not the first column.
    best_k = int(df_indicies.iloc[0]['k'])

    return best_k, df_indicies

In [5]:
# For every question, pick the (model, k) pair with the highest combined index.
# Each entry is [question number, k, 'gmm' | 'k_means' | 'both'].
best_cluster_type_list = []

metric_columns = ['k', 'ch', 'db', 'sil']

for i in range(amount_of_questions):

    metrics_gmm = pd.read_csv(f'./data/output/visualization/Q{i}_gmm_metrics.csv',index_col=0)
    metrics_kme = pd.read_csv(f'./data/output/visualization/Q{i}_kmeans_metrics.csv',index_col=0)

    # Explicit copies: adding a column to a frame obtained by column selection
    # otherwise raises SettingWithCopyWarning.
    metrics_gmm = metrics_gmm[metric_columns].copy()
    metrics_kme = metrics_kme[metric_columns].copy()

    # Model marker: 0.0 = GMM, 1.0 = k-means
    metrics_gmm['type'] = np.zeros(len(metrics_gmm))
    metrics_kme['type'] = np.ones(len(metrics_kme))

    metrics = metrics_gmm.merge(metrics_kme, how='outer').copy()

    # Scale over both models together so their scores are comparable
    metrics['db_scale'] = minmax_scale(metrics['db'])
    metrics['ch_scale'] = minmax_scale(metrics['ch'])

    metrics['combined_index'] = metrics['sil'] + metrics['ch_scale'] - metrics['db_scale']

    metrics = metrics.sort_values(by='combined_index', ascending=False)

    first = metrics.iloc[0]
    second = metrics.iloc[1]

    # 'both' only when the two best rows tie and come from different models
    tied_across_models = (
        first['combined_index'] == second['combined_index']
        and first['type'] != second['type']
    )

    if tied_across_models:
        best_model = 'both'
    elif first['type'] == 0:
        best_model = 'gmm'
    else:
        best_model = 'k_means'

    best_cluster_type_list.append([i, first['k'], best_model])

In [6]:
best_cluster_type_list

[[0, 2.0, 'both'],
 [1, 2.0, 'k_means'],
 [2, 4.0, 'k_means'],
 [3, 2.0, 'k_means'],
 [4, 2.0, 'k_means'],
 [5, 4.0, 'k_means'],
 [6, 2.0, 'k_means'],
 [7, 2.0, 'k_means'],
 [8, 3.0, 'k_means'],
 [9, 2.0, 'k_means'],
 [10, 3.0, 'k_means'],
 [11, 2.0, 'gmm'],
 [12, 2.0, 'k_means'],
 [13, 2.0, 'k_means'],
 [14, 2.0, 'k_means'],
 [15, 2.0, 'both'],
 [16, 2.0, 'k_means'],
 [17, 2.0, 'k_means'],
 [18, 3.0, 'k_means'],
 [19, 3.0, 'k_means'],
 [20, 2.0, 'k_means'],
 [21, 2.0, 'k_means'],
 [22, 2.0, 'k_means'],
 [23, 2.0, 'k_means'],
 [24, 2.0, 'k_means'],
 [25, 10.0, 'k_means'],
 [26, 7.0, 'k_means'],
 [27, 5.0, 'k_means'],
 [28, 2.0, 'k_means']]

In [721]:
# Build one frame with the labels of the selected model per question
# (rows: patients, columns: question numbers); cells start out as -1.
patient_classes_total = pd.DataFrame(
    -1,
    index=np.arange(amount_of_patients),
    columns=np.arange(amount_of_questions),
)

for i in range(amount_of_questions):

    cur_best_model = best_cluster_type_list[i][2]

    # Only an outright 'gmm' winner uses the GMM labels; 'k_means' and 'both'
    # both fall back to the k-means labels.
    label_column = 'gmm' if cur_best_model == 'gmm' else 'k_means'

    question_labels = pd.read_csv(f'./data/output/questions/Q{i}_labels.tsv', sep='\t',index_col='Patient ID')

    # Assignment aligns on the index ('Patient ID' against the row labels)
    patient_classes_total[i] = question_labels[label_column]


In [722]:
# Replace NaN by -1, then shift every label up by one so that missing
# responses end up as 0.0 and real labels start at 1.
# NOTE: this rebinds the same name, so running the cell twice shifts twice.
patient_classes_total = patient_classes_total.fillna(-1).add(1)

In [926]:
patient_classes_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,8.0,2.0,1.0,2.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,1,7.0,4.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,7.0,4.0,1.0,1.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,6.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,2.0,0.0,1.0,3.0,0.0,0.0,...,0.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2123,2,8.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2124,1,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2125,2,5.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2126,2,3.0,4.0,0.0,2.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [723]:
def cluster_finder(patient_classes_total, minimum_amount):
    """Grow a cluster id per patient by appending one question label at a time.

    Every patient starts with the label of column 0 as id string. For each
    following column the label (as integer digits) is appended to the id of
    every patient that is still open, and `patient_cluster_settle` decides
    which proposals are accepted and which patients are frozen. The loop stops
    once all columns are used or every patient is frozen.

    Input: DataFrame of numeric labels with rows labelled 0..len-1 and columns
           labelled 0..n-1; minimum group size handed to
           patient_cluster_settle.
    Output: Series 'cur' holding the final id string per patient.

    Note: ids are plain digit concatenations without separator, so labels
    of 10 or more make the resulting strings ambiguous.
    """
    n_patients = len(patient_classes_total)

    # Working frame: accepted id ('cur'), proposed id ('new'), frozen flag
    pID_classes = pd.DataFrame('', index=np.arange(n_patients), columns=['cur', 'new', 'assigned'])
    pID_classes['assigned'] = np.zeros(n_patients)
    pID_classes['cur'] = patient_classes_total[0].astype(str).copy()

    i = 1

    while i < len(patient_classes_total.columns) and pID_classes['assigned'].sum() != n_patients:

        # Extend the id of all open patients in one vectorized step; frozen
        # patients keep whatever proposal they had before.
        open_rows = pID_classes['assigned'] == 0
        next_digits = patient_classes_total.loc[open_rows, i].astype(int).astype(str)
        pID_classes.loc[open_rows, 'new'] = pID_classes.loc[open_rows, 'cur'] + next_digits

        pID_classes = patient_cluster_settle(pID_classes, minimum_amount)

        i += 1

    return pID_classes['cur']

In [927]:
patient_classes_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,8.0,2.0,1.0,2.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,1,7.0,4.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,7.0,4.0,1.0,1.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,6.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,2.0,0.0,1.0,3.0,0.0,0.0,...,0.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2123,2,8.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2124,1,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2125,2,5.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2126,2,3.0,4.0,0.0,2.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [928]:
# Minimum group size: passed to cluster_finder and used below as the
# threshold under which a found cluster is folded into a larger one.
min_amount = 10

In [929]:
cur_found_clusters = cluster_finder(patient_classes_total, min_amount)

# Maps every cluster id with fewer than min_amount patients to the id of a
# sufficiently large cluster that starts with the same digits.
correct_index_dict = {}

# Name the count column explicitly: pandas >= 2.0 calls it 'count' instead of
# reusing the Series name, which made the former ['cur'] lookup fail.
cur_cluster_amounts = cur_found_clusters.value_counts().to_frame(name='cur')

nan_exceptions = cur_cluster_amounts[cur_cluster_amounts['cur'] < min_amount]

for small_cluster in nan_exceptions.index:
    same_start_index = cur_cluster_amounts[cur_cluster_amounts.index.str.startswith(small_cluster)]

    # value_counts is sorted by descending size, so the last qualifying row is
    # the smallest cluster that still reaches min_amount.
    # Raises IndexError when no cluster with this prefix is large enough.
    target = same_start_index[same_start_index['cur'] >= min_amount].index[-1]

    correct_index_dict[small_cluster] = target

In [947]:
cur_found_clusters.value_counts()[-50:]

2441     11
1522     11
1642     11
1701     11
26121    11
2122     11
23212    11
1611     10
1621     10
2422     10
2111     10
223      10
114       8
25        8
18        8
15        7
26        7
13        6
132       6
112       6
170       6
12        6
264       5
24        5
134       5
152       4
261       4
252       4
234       4
23        4
22        4
154       3
14        3
251       3
244       3
214       3
254       3
162       2
164       2
231       2
232       2
211       2
1321      1
131       1
262       1
1         1
21        1
151       1
2321      1
1121      1
Name: cur, dtype: int64

In [948]:
correct_index_dict

{'114': '1142',
 '25': '253',
 '18': '181',
 '15': '1522',
 '26': '26121',
 '13': '1311',
 '132': '13211',
 '112': '11211',
 '170': '1701',
 '12': '123',
 '264': '2641',
 '24': '2422',
 '134': '1342',
 '152': '1522',
 '261': '26121',
 '252': '2522',
 '234': '2342',
 '23': '23212',
 '22': '223',
 '154': '1542',
 '14': '143',
 '251': '2511',
 '244': '2441',
 '214': '21422',
 '254': '2541',
 '162': '1621',
 '164': '1642',
 '231': '2311',
 '232': '23212',
 '211': '2111',
 '1321': '13211',
 '131': '1311',
 '262': '26212',
 '1': '1621',
 '21': '2111',
 '151': '15112',
 '2321': '23212',
 '1121': '11211'}

In [949]:
# Move patients of undersized clusters to their replacement cluster; ids
# without an entry in correct_index_dict are kept as they are.
# A single map avoids the former mix of positional reads (iloc[i]) and
# label-based writes ([i]) on a Series taken from a frame.
cur_found_clusters = cur_found_clusters.map(
    lambda cluster: correct_index_dict.get(cluster, cluster)
)

In [950]:
cur_found_clusters.value_counts()

174      148
26122     99
27        62
2312      53
124       51
        ... 
2122      11
15211     11
173       11
2112      11
1611      10
Name: cur, Length: 90, dtype: int64

In [951]:
# Number the remaining cluster ids consecutively, in order of first appearance
cluster_final = {
    cluster: number
    for number, cluster in enumerate(cur_found_clusters.unique())
}

# Total number of distinct clusters
count = len(cluster_final)

In [952]:
cluster_final

{'182': 0,
 '174': 1,
 '2611': 2,
 '111': 3,
 '2541': 4,
 '2121': 5,
 '2141': 6,
 '2622': 7,
 '23212': 8,
 '1342': 9,
 '27': 10,
 '224': 11,
 '1341': 12,
 '181': 13,
 '1122': 14,
 '1141': 15,
 '26122': 16,
 '221': 17,
 '1322': 18,
 '1522': 19,
 '13212': 20,
 '21422': 21,
 '143': 22,
 '15212': 23,
 '2641': 24,
 '1641': 25,
 '2322': 26,
 '142': 27,
 '2112': 28,
 '183': 29,
 '2311': 30,
 '173': 31,
 '184': 32,
 '243': 33,
 '241': 34,
 '2542': 35,
 '124': 36,
 '222': 37,
 '13211': 38,
 '2341': 39,
 '2522': 40,
 '15112': 41,
 '1542': 42,
 '2312': 43,
 '172': 44,
 '253': 45,
 '133': 46,
 '1621': 47,
 '121': 48,
 '11212': 49,
 '113': 50,
 '263': 51,
 '122': 52,
 '144': 53,
 '171': 54,
 '2512': 55,
 '11211': 56,
 '26212': 57,
 '1701': 58,
 '2442': 59,
 '2642': 60,
 '1311': 61,
 '28': 62,
 '2511': 63,
 '2122': 64,
 '2111': 65,
 '1642': 66,
 '1541': 67,
 '1612': 68,
 '123': 69,
 '141': 70,
 '1622': 71,
 '1142': 72,
 '23211': 73,
 '2342': 74,
 '233': 75,
 '2421': 76,
 '223': 77,
 '153': 78,
 '213

In [953]:
# Replace every cluster id string by its consecutive integer number.
# map() does this in one pass and yields an integer Series instead of an
# object Series that is overwritten element by element.
# NOTE: run once per kernel state; after the conversion the values are no
# longer keys of cluster_final.
cur_found_clusters = cur_found_clusters.map(cluster_final)

In [955]:
# Keep the current cluster assignment under its own name (same object as
# cur_found_clusters, not a copy).
final_assignment10 = cur_found_clusters

In [956]:
# One-column frame with the assignment, labelled with the minimum group size
output10 = final_assignment10.to_frame(name=f'Min group of {min_amount}')

In [957]:
output10

Unnamed: 0,Min group of 10
0,0
1,1
2,1
3,2
4,3
...,...
2123,62
2124,56
2125,63
2126,74


In [958]:
# Make sure the output directory exists: it is created together with any
# missing parent directories, and an already existing directory is accepted.
Path('./data/output').mkdir(parents=True, exist_ok=True)

In [959]:
# Write the cluster assignment per patient as a tab-separated file.
# The file name is derived from min_amount (no file extension), which yields
# 'data/output/min_group_10_clusters' for the configured minimum of 10.
output10.to_csv(f'data/output/min_group_{min_amount}_clusters',sep="\t",header=True,index=True)

In [647]:
def remove_0_strings_in_list(string_list):
    """Remove, in place, every entry that contains the character '0'.

    In this notebook the label 0 marks a missing (NaN) response, so any id
    string containing a '0' involves a missing answer.

    Input: list of strings (modified in place).
    Output: the same list object, without the entries containing '0'.
    """
    # Slice assignment keeps the caller's list object while replacing its content
    string_list[:] = [entry for entry in string_list if '0' not in entry]

    return string_list

In [699]:
def patient_cluster_settle(df, minimum):
    """Accept or reject the proposed cluster id of every open patient.

    A proposal ('new') is rejected, and the patient frozen with its current
    id ('cur'), when the proposed cluster has fewer than `minimum` members or
    when any zero-free proposed cluster starting with the patient's current id
    has fewer than `minimum` members. Otherwise 'cur' is replaced by 'new'.

    Input: DataFrame with columns 'cur', 'new', 'assigned' (1 = frozen),
           modified in place; rows are read by position and written by label
           i, so the frame is expected to carry the default 0..n-1 index.
           minimum: smallest allowed cluster size.
    Output: the same DataFrame.
    """
    # Size of every proposed cluster, including stale proposals of frozen rows
    counts = df['new'].value_counts()
    labels = counts.index

    # The prefix scan only depends on the current id, so compute it once per id
    smallest_sibling_by_prefix = {}

    def smallest_sibling(prefix):
        # Smallest size among zero-free proposals starting with prefix, or None
        if prefix not in smallest_sibling_by_prefix:
            shared_roots = remove_0_strings_in_list(list(labels[labels.str.startswith(prefix)]))
            smallest_sibling_by_prefix[prefix] = (
                counts.loc[shared_roots].min() if shared_roots else None
            )
        return smallest_sibling_by_prefix[prefix]

    for i in range(len(df)):

        row = df.iloc[i]

        # Frozen patients are left untouched
        if row['assigned'] == 1:
            continue

        cur_old = row['cur']
        cur_new = row['new']

        shared_min = smallest_sibling(cur_old)

        # Label lookup on the counts Series replaces the deprecated positional
        # `df_counts.loc[cur_new][0]` access.
        too_small = counts.loc[cur_new] < minimum or (
            shared_min is not None and shared_min < minimum
        )

        if too_small:
            df.loc[i, 'assigned'] = 1
        else:
            df.loc[i, 'cur'] = cur_new

    return df

#### Method to find the patients based on cluster groups per question (ordered)

In [108]:
def find_patient_ids(question_id, cluster_ids, selected_data):
    """Return the index of all rows matching a sequence of labels.

    cluster_ids[0] is compared with column `question_id`, cluster_ids[1] with
    column `question_id + 1`, and so on; a row must match every one of them.

    Input: first column label, non-empty sequence of labels, DataFrame to
           search in.
    Output: index of the matching rows; for an empty `cluster_ids` a message
            is printed and None is returned.
    """
    if len(cluster_ids) < 1:
        print("Won't accept empty list of cluster_ids Method: find_patient_ids")
        return None

    # Narrow the selection one question at a time
    matches = selected_data
    for offset, wanted in enumerate(cluster_ids):
        matches = matches[matches[question_id + offset] == wanted]

    return matches.index
        

In [716]:
patient_classes_total.loc[np.array(find_patient_ids(0,[2,6,1,0],patient_classes_total))]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
548,2,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
1312,2,6.0,1.0,0.0,2.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1334,2,6.0,1.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1832,2,6.0,1.0,0.0,2.0,4.0,1.0,2.0,2.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
