In [54]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import minmax_scale

In [283]:
import ast

In [107]:
# Number of questions; the loops in this notebook read one set of Q{i} files per question index.
amount_of_questions = 29
# Number of patients; used as the row count of the patient label table.
amount_of_patients = 2128

In [None]:
# Function to compute the best k clusters based on the parsed indices
# Input: Pandas Dataframe with columns 'k', 'db', 'ch', 'sil'
# Output: k which is the optimal number of clusters, dataframe with the scaled and combined index columns added, sorted best-first
def compute_metris_kmeans_best_k(df_indicies):
    """Select the best number of clusters from per-k cluster validity indices.

    The 'db' and 'ch' columns are min-max scaled to [0, 1] and combined with
    'sil' into a single score: ``sil + ch_scale - db_scale``. The row with the
    highest score wins.

    Parameters
    ----------
    df_indicies : pandas.DataFrame
        One row per candidate k, with at least the columns 'k', 'db', 'ch'
        and 'sil'. The input is not modified.

    Returns
    -------
    tuple of (int, pandas.DataFrame)
        The value of 'k' in the best-scoring row, and a copy of the input
        with the added columns 'db_scale', 'ch_scale' and 'combined_index',
        sorted by 'combined_index' in descending order.

    Raises
    ------
    ValueError
        If ``df_indicies`` has no rows.
    """
    if len(df_indicies) == 0:
        raise ValueError("df_indicies is empty; cannot select a best k")

    # Work on a copy so the caller's dataframe does not silently gain columns
    # (and so no SettingWithCopyWarning is raised when a slice is passed in).
    ranked = df_indicies.copy()

    ranked['db_scale'] = minmax_scale(ranked['db'])
    ranked['ch_scale'] = minmax_scale(ranked['ch'])

    ranked['combined_index'] = ranked['sil'] + ranked['ch_scale'] - ranked['db_scale']

    ranked = ranked.sort_values(by='combined_index', ascending=False)

    # Look 'k' up by label: positional Series[int] access is deprecated in
    # pandas and would return the wrong value if 'k' were not the first column.
    return int(ranked.iloc[0]['k']), ranked

In [96]:
# For every question, rank all (model, k) candidates by a combined validity
# index and record the winner as [question index, best k, model label], where
# the label is 'gmm', 'k_means', or 'both' when the two top rows tie across models.
best_cluster_type_list = []

for i in range(amount_of_questions):

    metrics_gmm = pd.read_csv(f'./data/output/visualization/Q{i}_gmm_metrics.csv',index_col=0)
    metrics_kme = pd.read_csv(f'./data/output/visualization/Q{i}_kmeans_metrics.csv',index_col=0)

    # .assign returns a new frame, so the model marker is added without writing
    # into a column-subset slice (which triggers SettingWithCopyWarning).
    # type 0.0 marks GMM rows, 1.0 marks k-means rows.
    metrics_gmm = metrics_gmm[['k','ch','db','sil']].assign(type=0.0)
    metrics_kme = metrics_kme[['k','ch','db','sil']].assign(type=1.0)

    # Stack both candidate tables; the two frames never share a full row
    # (their 'type' differs), so this is a concatenation, not a join.
    metrics = pd.concat([metrics_gmm, metrics_kme], ignore_index=True)

    # Scale db and ch over the pooled candidates so both models are comparable.
    metrics['db_scale'] = minmax_scale(metrics['db'])
    metrics['ch_scale'] = minmax_scale(metrics['ch'])

    metrics['combined_index'] = metrics['sil'] + metrics['ch_scale'] - metrics['db_scale']

    # Stable sort: rows with equal scores keep their concatenation order, so
    # the outcome is reproducible between runs.
    metrics = metrics.sort_values(by='combined_index', ascending=False, kind='mergesort')

    first = metrics.iloc[0]
    # Guard against a single candidate row, where there is no runner-up to compare.
    second = metrics.iloc[1] if len(metrics) > 1 else None

    tied_across_models = (
        second is not None
        and first['combined_index'] == second['combined_index']
        and first['type'] != second['type']
    )

    if tied_across_models:
        best_model = 'both'
    elif first['type'] == 0:
        best_model = 'gmm'
    else:
        best_model = 'k_means'

    best_cluster_type_list.append([i, first['k'], best_model])

In [97]:
# Show the selected [question index, best k, model label] entry for every question.
best_cluster_type_list

[[0, 2.0, 'both'],
 [1, 2.0, 'k_means'],
 [2, 4.0, 'k_means'],
 [3, 2.0, 'k_means'],
 [4, 2.0, 'k_means'],
 [5, 4.0, 'k_means'],
 [6, 2.0, 'k_means'],
 [7, 2.0, 'k_means'],
 [8, 3.0, 'k_means'],
 [9, 2.0, 'k_means'],
 [10, 3.0, 'k_means'],
 [11, 2.0, 'gmm'],
 [12, 2.0, 'k_means'],
 [13, 2.0, 'k_means'],
 [14, 2.0, 'k_means'],
 [15, 2.0, 'both'],
 [16, 2.0, 'k_means'],
 [17, 2.0, 'k_means'],
 [18, 3.0, 'k_means'],
 [19, 3.0, 'k_means'],
 [20, 2.0, 'k_means'],
 [21, 2.0, 'k_means'],
 [22, 2.0, 'k_means'],
 [23, 2.0, 'k_means'],
 [24, 2.0, 'k_means'],
 [25, 10.0, 'k_means'],
 [26, 7.0, 'k_means'],
 [27, 5.0, 'k_means'],
 [28, 2.0, 'k_means']]

In [110]:
# Label table with one row per patient and one column per question,
# pre-filled with -1 until the per-question labels are loaded.
patient_classes_total = pd.DataFrame(
    data=-1,
    index=np.arange(amount_of_patients),
    columns=np.arange(amount_of_questions),
)

In [136]:
# Fill each question's column with the cluster labels of its selected model.
for i in range(amount_of_questions):

    # Only an explicit 'gmm' winner uses the GMM labels; every other label
    # (including 'both') falls back to the k-means labels.
    label_column = 'gmm' if best_cluster_type_list[i][2] == 'gmm' else 'k_means'

    question_labels = pd.read_csv(f'./data/output/questions/Q{i}_labels.tsv', sep='\t',index_col='Patient ID')
    patient_classes_total[i] = question_labels[label_column]


In [271]:
# Inspect the patient x question label table after loading the labels.
# NOTE(review): NaN cells presumably mark patients missing from that question's labels file — confirm.
patient_classes_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0,7.0,1.0,0.0,1.0,3.0,,,2.0,,...,,,1.0,,,,,,,1.0
1,0,6.0,3.0,0.0,1.0,,,,,,...,,,,,,,,,,
2,0,6.0,3.0,0.0,0.0,3.0,,,2.0,,...,,,1.0,,,,,,,
3,1,5.0,0.0,0.0,1.0,,,,,,...,,1.0,1.0,0.0,0.0,1.0,,,,0.0
4,0,0.0,0.0,0.0,1.0,,0.0,2.0,,,...,,,1.0,0.0,0.0,1.0,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2123,1,7.0,1.0,,1.0,,,0.0,,,...,2.0,,,,,,,,3.0,
2124,0,0.0,1.0,,1.0,,,,,,...,,,,,,,,,,
2125,1,4.0,0.0,,1.0,,,,,,...,,,,,,,,,,
2126,1,2.0,3.0,,1.0,,0.0,3.0,,,...,,,,,,,,,,


In [245]:
# Function to determine amount of patient clusters
def patient_class_find_cluster_loop(i, selected_data, prev_amount, prev_classes_list, min_amount, results=None):
    """Recursively split patients by their class in successive question columns.

    Starting at column ``i``, the patients in ``selected_data`` are grouped by
    their value in that column and each group is split again on column
    ``i + 1``. A branch stops, and is reported, as soon as splitting on the
    current column would produce a group smaller than ``min_amount``, or when
    there is no column ``i`` left to split on.

    Each reported branch is printed as ``<class path> <patient count>``.

    Parameters
    ----------
    i : int
        Label of the column to split on.
    selected_data : pandas.DataFrame
        Rows (patients) belonging to the current branch.
    prev_amount : int
        Number of patients in the current branch (printed when it is reported).
    prev_classes_list : list
        Class values chosen so far along this branch. It is appended to and
        popped from in place during the recursion and is left unchanged on return.
    min_amount : int
        Minimum group size required to keep splitting.
    results : list, optional
        When given, every reported branch is also appended to it as a
        ``(tuple_of_classes, patient_count)`` pair, so the result does not
        have to be recovered from the printed text.

    Returns
    -------
    None

    Notes
    -----
    Rows whose value in column ``i`` is NaN belong to no group and are not
    carried into deeper branches; a branch whose column ``i`` is entirely NaN
    therefore reports nothing.
    NOTE(review): confirm this is the intended treatment of unanswered questions.
    """
    # Without this check the recursion raised KeyError once every available
    # question column had been split successfully.
    no_column_left = i not in selected_data.columns

    if no_column_left or (selected_data[i].value_counts() < min_amount).any():
        print(prev_classes_list, prev_amount)
        if results is not None:
            results.append((tuple(prev_classes_list), prev_amount))
        return

    # dropna: NaN never equals itself, so it cannot select rows, and it makes
    # sorted() return an unreliable order. tolist: plain Python scalars print
    # as "0.0" rather than "np.float64(0.0)" on recent numpy versions.
    for cluster_num in sorted(selected_data[i].dropna().unique().tolist()):
        selection = selected_data[selected_data[i] == cluster_num]
        prev_classes_list.append(cluster_num)
        patient_class_find_cluster_loop(i + 1, selection, len(selection), prev_classes_list, min_amount, results)
        prev_classes_list.pop()

In [269]:
# Run the function to print out the class assignments and the amount of patients in that assignment.
# Arguments: start at question column 0 with the full label table, an initial patient count of 0,
# an empty class path, and a minimum of 50 patients per group for a split to continue.
patient_class_find_cluster_loop(0,patient_classes_total, 0, [], 50)

[0, 0.0] 204
[0, 1.0] 133
[0, 2.0] 168
[0, 3.0] 73
[0, 4.0] 151
[0, 5.0] 105
[0, 6.0] 214
[0, 7.0] 88
[1] 991


In [304]:
# Re-display the label table; the recursive search reads it but assigns nothing back to it.
patient_classes_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0,7.0,1.0,0.0,1.0,3.0,,,2.0,,...,,,1.0,,,,,,,1.0
1,0,6.0,3.0,0.0,1.0,,,,,,...,,,,,,,,,,
2,0,6.0,3.0,0.0,0.0,3.0,,,2.0,,...,,,1.0,,,,,,,
3,1,5.0,0.0,0.0,1.0,,,,,,...,,1.0,1.0,0.0,0.0,1.0,,,,0.0
4,0,0.0,0.0,0.0,1.0,,0.0,2.0,,,...,,,1.0,0.0,0.0,1.0,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2123,1,7.0,1.0,,1.0,,,0.0,,,...,2.0,,,,,,,,3.0,
2124,0,0.0,1.0,,1.0,,,,,,...,,,,,,,,,,
2125,1,4.0,0.0,,1.0,,,,,,...,,,,,,,,,,
2126,1,2.0,3.0,,1.0,,0.0,3.0,,,...,,,,,,,,,,


In [300]:
# Parse a text file whose lines look like "[0, 1.0, 2.0] 14": a bracketed class
# path followed by the number of patients on that path.
# NOTE(review): this is presumably saved output of patient_class_find_cluster_loop — confirm.
cluster_assignment_list = []      # one tuple of class values per line
amount_cluster_assignments = []   # matching patient count per line
with open('./data/input/found_clusters_min_2.txt') as input_file:
    for line in input_file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue

        # Split at the LAST closing bracket: list literal on the left, count on the right.
        list_text, closing_bracket, amount_text = line.rpartition(']')

        # Evaluate the whole "[...]" literal rather than only its interior, so
        # one-element paths ("[1]") stay sequences instead of turning into a bare
        # number, and an empty path ("[]") parses instead of raising.
        class_path = ast.literal_eval(list_text.strip() + closing_bracket)
        cluster_assignment_list.append(tuple(class_path))
        amount_cluster_assignments.append(int(amount_text))

In [301]:
# Show the patient counts parsed from the file, one entry per parsed line.
amount_cluster_assignments

[2,
 19,
 7,
 14,
 23,
 2,
 13,
 11,
 5,
 41,
 15,
 9,
 6,
 3,
 2,
 2,
 8,
 2,
 2,
 6,
 2,
 3,
 14,
 3,
 18,
 13,
 2,
 15,
 3,
 5,
 26,
 3,
 11,
 6,
 6,
 4,
 2,
 4,
 6,
 4,
 3,
 14,
 3,
 15,
 16,
 11,
 3,
 3,
 2,
 2,
 4,
 5,
 30,
 4,
 10,
 18,
 3,
 7,
 12,
 3,
 6,
 4,
 14,
 2,
 9,
 6,
 11,
 4,
 4,
 7,
 8,
 114,
 4,
 6,
 11,
 3,
 3,
 16,
 12,
 4,
 3,
 2,
 4,
 10,
 11,
 8,
 10,
 4,
 7,
 6,
 2,
 2,
 18,
 11,
 5,
 8,
 4,
 6,
 7,
 5,
 5,
 5,
 2,
 18,
 4,
 2,
 9,
 4,
 2,
 2,
 11,
 3,
 3,
 2,
 3,
 8,
 5,
 15,
 12,
 8,
 18,
 3,
 9,
 9,
 6,
 4,
 3,
 8,
 13,
 3,
 2,
 4,
 2,
 8,
 9,
 4,
 9,
 4,
 9,
 2,
 2,
 11,
 2,
 4,
 8,
 11,
 99,
 8,
 29,
 12,
 8,
 4,
 3,
 6,
 3,
 3,
 32,
 10,
 9,
 4,
 3,
 6,
 3,
 10,
 6,
 5,
 10,
 4,
 4]

In [292]:
# Show the class paths parsed from the file, in the same order as the counts.
cluster_assignment_list

[(0, 0.0, 0.0, 0.0, 0.0),
 (0, 0.0, 0.0, 0.0, 1.0),
 (0, 0.0, 0.0, 1.0, 1.0),
 (0, 0.0, 1.0, 0.0, 0.0),
 (0, 0.0, 1.0, 0.0, 1.0),
 (0, 0.0, 1.0, 1.0, 0.0),
 (0, 0.0, 1.0, 1.0, 1.0),
 (0, 0.0, 2.0, 0.0, 1.0),
 (0, 0.0, 2.0, 1.0, 1.0),
 (0, 0.0, 3.0, 0.0, 1.0),
 (0, 0.0, 3.0, 1.0),
 (0, 1.0, 0.0, 0.0),
 (0, 1.0, 0.0, 1.0),
 (0, 1.0, 1.0, 0.0, 1.0, 1.0),
 (0, 1.0, 1.0, 1.0, 0.0, 3.0),
 (0, 1.0, 1.0, 1.0, 1.0, 3.0),
 (0, 1.0, 2.0, 0.0),
 (0, 1.0, 2.0, 1.0, 0.0),
 (0, 1.0, 2.0, 1.0, 1.0),
 (0, 1.0, 3.0, 0.0, 0.0),
 (0, 1.0, 3.0, 0.0, 1.0, 1.0),
 (0, 1.0, 3.0, 1.0, 0.0),
 (0, 2.0, 0.0, 0.0),
 (0, 2.0, 1.0, 0.0, 0.0, 3.0, 0.0),
 (0, 2.0, 1.0, 0.0, 1.0),
 (0, 2.0, 1.0, 1.0, 1.0),
 (0, 2.0, 2.0, 0.0, 0.0),
 (0, 2.0, 2.0, 0.0, 1.0),
 (0, 2.0, 2.0, 1.0),
 (0, 2.0, 3.0, 0.0, 0.0),
 (0, 2.0, 3.0, 0.0, 1.0),
 (0, 2.0, 3.0, 1.0, 0.0),
 (0, 2.0, 3.0, 1.0, 1.0),
 (0, 3.0, 0.0, 0.0, 1.0),
 (0, 3.0, 0.0, 1.0, 1.0),
 (0, 3.0, 1.0, 0.0, 0.0),
 (0, 3.0, 1.0, 0.0, 1.0, 3.0),
 (0, 3.0, 1.0, 1.0),
 (0, 3.0, 2.

In [275]:
# NOTE(review): this call fails with the ValueError recorded below, because each line of the
# file starts with a bracketed list that np.loadtxt cannot convert to floats. The line-by-line
# parser that fills cluster_assignment_list reads this file instead; consider removing this cell.
np.loadtxt('./data/input/found_clusters_min_2.txt')

ValueError: could not convert string '[0,' to float64 at row 0, column 1.