### Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_directory = './data/input/dental_simcse/'

### Load the groups found by clustering

In [3]:
min_group_25 = pd.read_csv(data_directory+'min_group_25_clusters', sep='\t', index_col=0)
min_group_50 = pd.read_csv(data_directory+'min_group_50_clusters', sep='\t', index_col=0)

In [4]:
df_min_group = min_group_25.join(min_group_50).copy()
df_min_group.index.name = 'Patient ID'

In [5]:
df_min_group[:10]

Unnamed: 0_level_0,Min group of 25,Min group of 50
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,1,1
2,1,1
3,2,2
4,3,3
5,4,2
6,5,2
7,5,2
8,6,2
9,7,2


In [6]:
df_min_group.loc[200]

Min group of 25    17
Min group of 50     8
Name: 200, dtype: int64

### Load data for first visualization: Top 3 Questions

In [7]:
top3_labels = df_min_group.copy()

In [8]:
top3_text = pd.read_csv(data_directory+'patient_vis_labels.tsv', sep='\t',index_col='Patient ID')

In [9]:
top3_labels = top3_labels.join(top3_text)

In [10]:
top3_labels.to_csv(
    './models/dental_simcse/data/A. Top 3 questions/top3_metadata.tsv',
    sep="\t",header=True,index=True
)

## Load data for second visualization: All Sentences

In [11]:
all_sentence_labels = pd.read_csv(data_directory+'dental_labels_all.tsv', sep='\t', index_col=0)

In [12]:
all_sentence_labels[['Min group of 25', 'Min group of 50']] = np.ones((len(all_sentence_labels),2)) * -1

In [13]:
for i in range(len(all_sentence_labels)):
    
    pID = all_sentence_labels.iloc[i]['Patient ID']
    
    found_clusters = df_min_group.loc[pID].values
    
    all_sentence_labels.loc[i, 'Min group of 25'] = found_clusters[0]
    all_sentence_labels.loc[i, 'Min group of 50'] = found_clusters[1]

In [14]:
all_sentence_labels.to_csv(
    './models/dental_simcse/data/B. All Responses/all_sentences_metadata.tsv',
    sep="\t",header=True,index=True
)

In [15]:
df_min_group.loc[200].values

array([17,  8])

In [16]:
df_min_group[199:201]

Unnamed: 0_level_0,Min group of 25,Min group of 50
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1
199,13,6
200,17,8


In [17]:
all_sentence_labels[all_sentence_labels['Patient ID'] == 200]

Unnamed: 0,Patient ID,Text,Min group of 25,Min group of 50
1621,200,Verspannungen im Nacken und Kopf bereich,17.0,8.0
1622,200,"keine Verspannungen, keine Kopfschmerzen",17.0,8.0
1623,200,"Beim kauen, vor allem morgens beim aufstehen",17.0,8.0
1624,200,Klar denken,17.0,8.0
1625,200,Massage Einheiten,17.0,8.0
1626,200,"ja, ständiger beidseitiger Kopfschmerz",17.0,8.0
1627,200,nein,17.0,8.0
1628,200,Psychisch belastend und nicht aushaldbar,17.0,8.0


## Load data for per question visualizations

In [18]:
amount_of_questions = 29

In [19]:
question_directory = './models/dental_simcse/data/questions/'

In [29]:
for i in range(amount_of_questions):

    question_labels = pd.read_csv(question_directory + f'Q{i}_labels.tsv', sep='\t', index_col=0)

    question_labels = question_labels.join(df_min_group.loc[list(question_labels.index)])

    question_labels.to_csv(question_directory + f'Q{i}_labels.tsv', sep='\t', index=True, header=True)