In [54]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import minmax_scale

In [107]:
# Number of rows allocated in the combined patient/label table.
amount_of_patients = 2128
# Number of questions; the per-question files are addressed as Q0 ... Q{n-1}.
amount_of_questions = 29

In [None]:
def compute_metris_kmeans_best_k(df_indicies):
    """Rank candidate cluster counts by a combined validity score.

    The score of a row is ``sil + minmax(ch) - minmax(db)``, where 'ch' and
    'db' are rescaled with ``minmax_scale`` over the rows of the frame, so a
    larger score is treated as better.

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        One row per candidate with the columns 'k', 'db', 'ch' and 'sil'
        (presumably Davies-Bouldin, Calinski-Harabasz and silhouette indices;
        confirm against the code that writes them). The frame passed in is
        left untouched.

    Returns
    -------
    tuple[int, pandas.DataFrame]
        The 'k' of the highest-scoring row, and a copy of the input sorted by
        'combined_index' in descending order with the extra columns
        'db_scale', 'ch_scale' and 'combined_index'.

    Raises
    ------
    ValueError
        If ``df_indicies`` has no rows.
    """
    if len(df_indicies) == 0:
        raise ValueError("df_indicies must contain at least one row")

    # Work on a copy: writing the helper columns into the caller's frame
    # mutates shared state and warns when the caller passed a slice.
    scored = df_indicies.copy()

    scored['db_scale'] = minmax_scale(scored['db'])
    scored['ch_scale'] = minmax_scale(scored['ch'])

    scored['combined_index'] = scored['sil'] + scored['ch_scale'] - scored['db_scale']

    ranked = scored.sort_values(by='combined_index', ascending=False)

    # Look 'k' up by label; positional integer access on a label-indexed row
    # is deprecated in pandas and silently depends on the column order.
    return int(ranked['k'].iloc[0]), ranked

In [96]:
# For every question, score all (model, k) candidates from the GMM and k-means
# metric files and record which model wins: [question index, k, model], where
# model is 'gmm', 'k_means', or 'both' when the two top rows tie across models.
best_cluster_type_list = []
metric_columns = ['k', 'ch', 'db', 'sil']

for i in range(amount_of_questions):

    metrics_gmm = pd.read_csv(f'./data/output/visualization/Q{i}_gmm_metrics.csv',index_col=0)
    metrics_kme = pd.read_csv(f'./data/output/visualization/Q{i}_kmeans_metrics.csv',index_col=0)

    # .assign builds a new frame, so the model flag is never written into a
    # column-subset slice (no SettingWithCopyWarning). 0.0 = gmm, 1.0 = k-means.
    metrics_gmm = metrics_gmm[metric_columns].assign(type=0.0)
    metrics_kme = metrics_kme[metric_columns].assign(type=1.0)

    # Stack both candidate sets; a key-less outer merge only behaved like a
    # concatenation because the 'type' values never match.
    metrics = pd.concat([metrics_gmm, metrics_kme], ignore_index=True)

    metrics['db_scale'] = minmax_scale(metrics['db'])
    metrics['ch_scale'] = minmax_scale(metrics['ch'])

    metrics['combined_index'] = metrics['sil'] + metrics['ch_scale'] - metrics['db_scale']

    # Highest score first; equal scores are ordered by smaller k, then gmm
    # before k-means, so the outcome does not depend on sort internals.
    metrics = metrics.sort_values(
        by=['combined_index', 'k', 'type'],
        ascending=[False, True, True],
    )

    first = metrics.iloc[0]
    # Guard against a question with a single candidate row.
    second = metrics.iloc[1] if len(metrics) > 1 else None

    tied_across_models = (
        second is not None
        and first['combined_index'] == second['combined_index']
        and first['type'] != second['type']
    )

    if tied_across_models:
        best_model = 'both'
    elif first['type'] == 0:
        best_model = 'gmm'
    else:
        best_model = 'k_means'

    best_cluster_type_list.append([i, first['k'], best_model])

In [97]:
# Show the [question index, k, model] entry recorded for each question.
best_cluster_type_list

[[0, 2.0, 'both'],
 [1, 2.0, 'k_means'],
 [2, 4.0, 'k_means'],
 [3, 2.0, 'k_means'],
 [4, 2.0, 'k_means'],
 [5, 4.0, 'k_means'],
 [6, 2.0, 'k_means'],
 [7, 2.0, 'k_means'],
 [8, 3.0, 'k_means'],
 [9, 2.0, 'k_means'],
 [10, 3.0, 'k_means'],
 [11, 2.0, 'gmm'],
 [12, 2.0, 'k_means'],
 [13, 2.0, 'k_means'],
 [14, 2.0, 'k_means'],
 [15, 2.0, 'both'],
 [16, 2.0, 'k_means'],
 [17, 2.0, 'k_means'],
 [18, 3.0, 'k_means'],
 [19, 3.0, 'k_means'],
 [20, 2.0, 'k_means'],
 [21, 2.0, 'k_means'],
 [22, 2.0, 'k_means'],
 [23, 2.0, 'k_means'],
 [24, 2.0, 'k_means'],
 [25, 10.0, 'k_means'],
 [26, 7.0, 'k_means'],
 [27, 5.0, 'k_means'],
 [28, 2.0, 'k_means']]

In [110]:
# Table with one row per patient index and one column per question index;
# every cell starts out as the placeholder value -1.
patient_classes_total = pd.DataFrame(
    data=-1,
    index=np.arange(amount_of_patients),
    columns=np.arange(amount_of_questions),
)

In [136]:
# Fill column i of the combined table with the labels of the model that was
# selected for question i.
for i in range(amount_of_questions):

    cur_best_model = best_cluster_type_list[i][2]

    # Only an explicit 'gmm' selects the GMM labels; every other value
    # ('k_means' or the tie marker 'both') uses the k-means labels.
    label_column = 'gmm' if cur_best_model == 'gmm' else 'k_means'

    question_labels = pd.read_csv(f'./data/output/questions/Q{i}_labels.tsv', sep='\t',index_col='Patient ID')

    # The assignment aligns on the index (Patient ID); table rows that have
    # no entry in the file end up as NaN.
    patient_classes_total[i] = question_labels[label_column]


In [154]:
# Keep only the patients whose label for question 0 is 0.
temp = patient_classes_total.loc[patient_classes_total[0].eq(0)]

In [155]:
# Of those, keep the patients whose label for question 1 is 0.
temp1 = temp.loc[temp[1].eq(0)]

In [156]:
# Of those, keep the patients whose label for question 2 is 0.
temp2 = temp1.loc[temp1[2].eq(0)]

In [161]:
# Of those, keep the patients whose label for question 3 is 0.
temp3 = temp2.loc[temp2[3].eq(0)]

In [165]:
# Display the remaining patients whose label for question 4 is 1.
temp3.loc[temp3[4].eq(1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
4,0,0.0,0.0,0.0,1.0,,0.0,2.0,,,...,,,1.0,0.0,0.0,1.0,,,,1.0
78,0,0.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,,
155,0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,,...,,,,,,,,,,
182,0,0.0,0.0,0.0,1.0,,1.0,,,,...,,,,,,,,,,
511,0,0.0,0.0,0.0,1.0,3.0,,,1.0,,...,,,,,,,,,,
543,0,0.0,0.0,0.0,1.0,3.0,,,,1.0,...,,,,,,,,,,
686,0,0.0,0.0,0.0,1.0,0.0,,,,,...,,,,,,,,,,
868,0,0.0,0.0,0.0,1.0,,0.0,4.0,,1.0,...,,,,,,,,,,
894,0,0.0,0.0,0.0,1.0,,1.0,1.0,,,...,,,,,,,,,,
918,0,0.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,,


In [177]:
# Number of distinct labels in each of the first three question columns.
col0_nunique, col1_nunique, col2_nunique = (
    patient_classes_total[column].nunique() for column in range(3)
)

# Print how many patients fall into every (Q0, Q1, Q2) label combination.
# Iterating over range(nunique) assumes the labels of a column are exactly
# 0 ... nunique-1 -- TODO confirm.
for i in range(col0_nunique):
    first_selection = patient_classes_total.loc[patient_classes_total[0].eq(i)]
    for j in range(col1_nunique):
        second_selection = first_selection.loc[first_selection[1].eq(j)]
        for k in range(col2_nunique):
            third_selection = second_selection.loc[second_selection[2].eq(k)]

            print([i, j, k], third_selection.shape[0])
            
    

[0, 0, 0] 36
[0, 0, 1] 59
[0, 0, 2] 23
[0, 0, 3] 71
[0, 1, 0] 17
[0, 1, 1] 47
[0, 1, 2] 12
[0, 1, 3] 51
[0, 2, 0] 30
[0, 2, 1] 59
[0, 2, 2] 20
[0, 2, 3] 53
[0, 3, 0] 13
[0, 3, 1] 22
[0, 3, 2] 12
[0, 3, 3] 23
[0, 4, 0] 32
[0, 4, 1] 42
[0, 4, 2] 16
[0, 4, 3] 54
[0, 5, 0] 33
[0, 5, 1] 27
[0, 5, 2] 18
[0, 5, 3] 27
[0, 6, 0] 21
[0, 6, 1] 17
[0, 6, 2] 11
[0, 6, 3] 148
[0, 7, 0] 16
[0, 7, 1] 24
[0, 7, 2] 18
[0, 7, 3] 22
[1, 0, 0] 23
[1, 0, 1] 30
[1, 0, 2] 12
[1, 0, 3] 32
[1, 1, 0] 13
[1, 1, 1] 19
[1, 1, 2] 10
[1, 1, 3] 15
[1, 2, 0] 73
[1, 2, 1] 59
[1, 2, 2] 13
[1, 2, 3] 36
[1, 3, 0] 28
[1, 3, 1] 23
[1, 3, 2] 12
[1, 3, 3] 27
[1, 4, 0] 55
[1, 4, 1] 34
[1, 4, 2] 13
[1, 4, 3] 39
[1, 5, 0] 143
[1, 5, 1] 50
[1, 5, 2] 40
[1, 5, 3] 64
[1, 6, 0] 23
[1, 6, 1] 14
[1, 6, 2] 3
[1, 6, 3] 19
[1, 7, 0] 8
[1, 7, 1] 10
[1, 7, 2] 4
[1, 7, 3] 14


In [169]:
# Number of distinct labels present in the question-0 column.
patient_classes_total[0].nunique()

2

In [175]:
# Inspect the distinct-label count that was computed for column 2.
col2_nunique

4

In [138]:
# Inspect the k-means labels of question 3, indexed by Patient ID.
pd.read_csv(
    f'./data/output/questions/Q3_labels.tsv',
    sep='\t',
    index_col='Patient ID',
).loc[:, 'k_means']

Patient ID
0       0
1       0
2       0
3       0
4       0
       ..
2116    0
2117    0
2118    0
2119    1
2120    1
Name: k_means, Length: 1952, dtype: int64

In [132]:
# Inspect the Patient ID / k-means label pairs stored for question 4.
pd.read_csv(
    f'./data/output/questions/Q4_labels.tsv',
    sep='\t',
).loc[:, ['Patient ID', 'k_means']]

Unnamed: 0,Patient ID,k_means
0,0,1
1,1,1
2,2,0
3,3,1
4,4,1
...,...,...
1981,2123,1
1982,2124,1
1983,2125,1
1984,2126,1


In [102]:
# Model name recorded for question i.
# NOTE(review): `i` is not set in this cell; it is whatever value an earlier
# loop left behind, so the result depends on execution order.
best_cluster_type_list[i][2]

'both'

In [104]:
# Inspect the Patient ID / k-means label pairs stored for question i.
# NOTE(review): `i` is not set in this cell; it is whatever value an earlier
# loop left behind, so the file that is read depends on execution order.
pd.read_csv(
    f'./data/output/questions/Q{i}_labels.tsv',
    sep='\t',
).loc[:, ['Patient ID', 'k_means']]

Unnamed: 0,Patient ID,k_means
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
...,...,...
2123,2123,1
2124,2124,0
2125,2125,1
2126,2126,1


In [84]:
# Inspect the current contents of the per-question selection list.
best_cluster_type_list

[[0, 2.0, 'both']]