In [None]:
# Turn off Jedi for the IPython tab-completer.
%config Completer.use_jedi = False

In [None]:
# python library
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
import os

# Tokenizer built on the regex \w+: every token is a run of word characters,
# so punctuation is dropped. Digit-only tokens still match this pattern; they
# are removed later with str.isnumeric() in preprocess / token_list.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
def count_select_code(df):
    '''
    Aim: count the documents of the five selected medical specialties
    Input:
        df --> dataframe with a 'nakyma' column holding the specialty code
    Output:
        num_list --> counts ordered as [KIR, LÄH, OPER, RTG, SÄD]
                     (the counts are also printed, each next to its own label)
    '''
    # count the codes straight from the column; a Counter yields 0 for a code
    # that never occurs, so a shard lacking one specialty does not raise KeyError
    code_counts = Counter(df['nakyma'].tolist())
    # based on the clinician's suggestion
    # LÄH; KIR; OPER; RTG; SÄD; --> significant to the clinicians' work
    num_KIR = code_counts['KIR']
    num_RTG = code_counts['RTG']
    num_LAH = code_counts['LÄH']
    num_SAD = code_counts['SÄD']
    num_OPER = code_counts['OPER']

    # return order is kept as before: KIR, LÄH, OPER, RTG, SÄD
    num_list = [num_KIR, num_LAH, num_OPER, num_RTG, num_SAD]
    # pass the named counts so every label is paired with the right number
    print('KIR:{}, RTG:{}, LAH:{}, SAD:{}, OPER:{}'.format(num_KIR, num_RTG, num_LAH, num_SAD, num_OPER))
    return num_list

In [None]:
def retrieve_samples(df, num, code_type):
    '''
    Aim: collect the texts of one medical specialty, truncated to a maximum length
    Input: 
        df --> dataframe with 'teksti' (text) and 'nakyma' (specialty code) columns
        num --> maximum number of texts to return
        code_type --> medical specialty code to select
    Output:
        text_collect --> list with at most num texts, in dataframe order
    '''
    num = int(num)
    # nothing requested --> nothing returned
    if num <= 0:
        return []
    # boolean mask instead of a row-by-row loop; head(num) keeps exactly the
    # first num matches (the old `count > num` test let num + 1 through)
    matching_text = df.loc[df['nakyma'] == code_type, 'teksti']
    text_collect = matching_text.head(num).tolist()
    return text_collect

In [None]:
def write_string_file(write_str, output_path):
    '''
    Aim: write string to file (UTF-8, an existing file is overwritten)
    Input: 
        write_str --> string to be written
        output_path --> path of the file to write
    Output:
        written file
    '''
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(write_str)

In [None]:
def keep_main_text(list_item):
    '''
    Aim: extract the body sentences of one clinical document
    Input:
        list_item --> text of one document, lines separated by newline characters
    Output:
        output_item --> list of stripped sentence fragments
    '''
    fragments = []
    for raw_line in list_item.split('\n'):
        # lines with at most 5 space-separated pieces are skipped
        if len(raw_line.split(' ')) <= 5:
            continue
        # longer lines are cut at every full stop
        fragments.extend(raw_line.split('.'))

    output_item = []
    for fragment in fragments:
        # pieces are counted before stripping, so a leading space counts as one
        if len(fragment.split(' ')) > 1:
            output_item.append(fragment.strip())
    return output_item

In [None]:
def preprocess(spec_list, processed_data_path, med_spec):
    '''
    Aim: preprocess the documents of one medical specialty (remove punctuations; numerical-only tokens; lower all tokens)
         and write the unique sentences to <processed_data_path>/<med_spec>/full_sample.txt
    Input: 
        spec_list --> the list of document texts of one medical specialty 
        processed_data_path --> the data path of processed_data
        med_spec --> medical specialty (used as the sub-folder name)
    Output:
        written file, one sentence per line, each ending with ' .'
    '''
    # os.path.join avoids a doubled slash when the base path already ends with '/';
    # makedirs also creates missing parent folders and tolerates an existing one
    med_folder_path = os.path.join(processed_data_path, med_spec)
    os.makedirs(med_folder_path, exist_ok=True)
    
    collect_list = []
    for doc in spec_list:
        collect_list += keep_main_text(doc)
    tokenized_txt_list = []
    for sent in tqdm(collect_list):
        tokens = [tok.lower() for tok in tokenizer.tokenize(sent) if not tok.isnumeric()]
        # keep only sentences with more than 10 tokens
        if len(tokens) > 10:
            tokenized_txt_list.append(' '.join(tokens) + ' .')
    output_path = os.path.join(med_folder_path, 'full_sample.txt')
    # drop duplicates but keep first-seen order, so the output file is the
    # same on every run (a set has no stable order for strings)
    unique_txt_list = list(dict.fromkeys(tokenized_txt_list))
    write_string_file('\n'.join(unique_txt_list), output_path)

In [None]:
def token_list(spec_list, processed_data_path, med_spec):
    '''
    Aim: preprocess the documents of one medical specialty (remove punctuations; numerical-only tokens; lower all tokens)
         and return the sentences instead of writing them to a file
    Input: 
        spec_list --> the list of document texts of one medical specialty 
        processed_data_path --> the data path of processed_data
        med_spec --> medical specialty (used as the sub-folder name)
    Output:
        tokenized_txt_list --> list of sentences (duplicates kept), each ending with '.'
    '''
    # the specialty folder is still created as before; makedirs also builds
    # missing parent folders and does not fail when the folder already exists
    med_folder_path = os.path.join(processed_data_path, med_spec)
    os.makedirs(med_folder_path, exist_ok=True)
    
    collect_list = []
    for doc in spec_list:
        collect_list += keep_main_text(doc)
    tokenized_txt_list = []
    for sent in collect_list:
        tokens = [tok.lower() for tok in tokenizer.tokenize(sent) if not tok.isnumeric()]
        # keep only sentences with more than 10 tokens
        if len(tokens) > 10:
            tokenized_txt_list.append(' '.join(tokens) + '.')
    return tokenized_txt_list

In [None]:
# Load the first parquet shard, collect KIR texts from it with
# retrieve_samples (num=30) and write the preprocessed sentences into the
# '1' sub-folder of the processed-data directory.
data_path1 = '../data/patient_record_text/patient_record/part-00000-tid-5042385561176658048-dad14b75-56bd-44a4-8a7c-b9b608ed88ea-28513-1-c000.snappy.parquet'
df1 = pd.read_parquet(data_path1)
KIR_list = retrieve_samples(df1, 30, 'KIR')
# NOTE(review): the trailing sentence count is identical to the one noted on
# the full KIR run further down -- confirm it really applies to this sample.
preprocess(KIR_list, '../data/processed_data/', '1') # num_sentence: 15206  


In [None]:
# Paths of the seven parquet shards of the patient-record export, in shard order.
(data_path1, data_path2, data_path3, data_path4,
 data_path5, data_path6, data_path7) = (
    '../data/patient_record_text/patient_record/part-00000-.snappy.parquet',
    '../data/patient_record_text/patient_record/part-00001-.snappy.parquet',
    '../data/patient_record_text/patient_record/part-00002-.snappy.parquet',
    '../data/patient_record_text/patient_record/part-00003-.snappy.parquet',
    '../data/patient_record_text/patient_record/part-00004-.snappy.parquet',
    '../data/patient_record_text/patient_record/part-00005-.snappy.parquet',
    '../data/patient_record_text/patient_record/part-00006-.snappy.parquet',
)

# Read every shard into its own dataframe (df1 .. df7), one after another.
df1, df2, df3, df4, df5, df6, df7 = [
    pd.read_parquet(shard_path)
    for shard_path in (data_path1, data_path2, data_path3, data_path4,
                       data_path5, data_path6, data_path7)
]
# Only the first three shards are combined here, with a fresh row index.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Concatenate all seven shards into one frame with a fresh row index.
df = pd.concat([df1, df2, df3, df4, df5, df6, df7], ignore_index=True)
# Number of rows per 'nakyma' code across the combined frame.
d = dict(Counter(df['nakyma'].tolist()))
# NOTE(review): KIR_list is not rebuilt in this cell, so it is whatever an
# earlier cell left behind (the df1-only sample when run top to bottom), not
# a selection from the combined df -- confirm this call is intended.
preprocess(KIR_list, '../data/processed_data/', 'KIR') # num_sentence: 15206  


In [None]:
# Display the code -> count mapping in insertion order.
d

In [None]:
# Same mapping, ordered from the most frequent code to the least frequent one.
dict(sorted(d.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Show the column names and the first five rows of the first shard (df1).
# The functions in this notebook read only the 'teksti' and 'nakyma' columns;
# 'nakyma' is used to separate clinical documents by medical specialty.
print(df1.columns)
df1[:5]

In [None]:
# Inline trial of the main-text extraction on a single document (row 4 of df1):
# keep lines with more than 5 space-separated pieces and cut them at full stops.
line_list = []
for line in df1.iloc[4]['teksti'].split('\n'):
    if len(line.split(' ')) > 5:
        line_list += line.split('.')
        
# Unlike keep_main_text, this filters on character length (> 1) and puts the
# full stop back at the end of every fragment.
[sent.strip() + '.' for sent in line_list if len(sent)>1]

In [None]:
# Print the counts of the five selected specialty codes for every shard.
# Only the last call's return value is shown as the cell output.
count_select_code(df1)
count_select_code(df2)
count_select_code(df3)
count_select_code(df4)
count_select_code(df5)
count_select_code(df6)
count_select_code(df7)

In [None]:
# Combine all seven shards into a single dataframe with a fresh row index.
df = pd.concat([df1, df2, df3, df4, df5, df6,df7], ignore_index=True)

In [None]:
# Share of each of the five numbers in their total.
# NOTE(review): the origin of these five figures is not shown in the notebook
# -- presumably per-specialty counts; confirm.
num_spe = np.array([18879, 10893, 8029, 21090, 9709])
num_spe/np.sum(num_spe)

In [None]:
# 887 is the KIR figure in the annotation counts listed in the next cell,
# so this is KIR's share of the five-specialty annotation total.
887/(887+1089+802+2109+970)

In [None]:
'''
1. concate all dataframe
the num of annotations for each medical specialty:
KIR: 887
RTG: 1089
LAH: 802
SAD: 2109
OPER: 970
'''
# Collect the texts of each selected specialty from the combined dataframe.
# The dataframe codes use Ä (LÄH, SÄD); the variable names use plain ASCII.
# NOTE(review): the limits passed here differ from the annotation counts
# listed in the string above -- confirm how they were chosen.
KIR_list = retrieve_samples(df, 3887, 'KIR')
RTG_list = retrieve_samples(df, 9089, 'RTG')
LAH_list = retrieve_samples(df, 4502, 'LÄH')
SAD_list = retrieve_samples(df, 3209, 'SÄD')
OPER_list = retrieve_samples(df, 1970, 'OPER')

In [None]:
# Write one full_sample.txt per specialty under ../data/processed_data/.
# Folder names are ASCII (LAH, SAD) although the dataframe codes are LÄH / SÄD.
# The trailing numbers are presumably the sentence counts observed on the
# author's run -- TODO confirm.
preprocess(KIR_list, '../data/processed_data/', 'KIR') # num_sentence: 15206  
preprocess(RTG_list, '../data/processed_data/', 'RTG') # num_sentence: 9687   
preprocess(LAH_list, '../data/processed_data/', 'LAH') # num_sentence: 12019   
preprocess(SAD_list, '../data/processed_data/', 'SAD') # num_sentence: 13499   
preprocess(OPER_list, '../data/processed_data/', 'OPER') # num_sentence: 22527   

In [None]:
# Count the sentences (lines) in the written file of one specialty.
sets = 'SAD'
# NOTE(review): this reads below '../../data/' while the preprocess calls
# write below '../data/' -- confirm which root is the right one.
# The file is written as UTF-8, so it is read back with the same encoding;
# the with-block closes the file, so no explicit close() is needed.
with open('../../data/processed_data/{}/full_sample.txt'.format(sets), 'r', encoding='utf-8') as f:
    print(len(f.readlines()))

In [None]:
# CoNLL 2002 --> 16.7%
# Share of each of the five numbers in their total.
# NOTE(review): presumably per-specialty sentence counts, but they differ from
# the counts noted beside the preprocess calls -- confirm their origin.
num_list = np.array([11289, 10112, 10113, 11100, 10149])
num_list/np.sum(num_list)

In [None]:
# Split a budget of 1000 samples across the five entries in proportion to
# num_list, rounded to whole numbers.
num_sample = np.round(1000*num_list/np.sum(num_list))

In [None]:
# Display the rounded per-entry sample sizes.
num_sample

In [None]:
# Print, entry by entry, the fraction of num_list that num_sample covers.
for sampled, total in zip(num_sample, num_list):
    print(sampled/total)

In [None]:
# the annotation rate is too low,
# so we need to select informative samples to build the test dataset

In [None]:
# Tokenised KIR sentences kept in memory as a list (nothing is written to a file here).
processed_KIR_list = token_list(KIR_list, '../data/processed_data/', 'KIR')

In [None]:
# Number of KIR sentences that passed the length filter.
len(processed_KIR_list)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the KIR sentences and
# transform them in the same step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_KIR_list)

In [None]:
# Replace X by the array returned from its toarray() method.
# NOTE(review): X is rebound to that result, so re-running this cell without
# re-running the vectorizer cell presumably fails -- confirm.
X = X.toarray()

In [None]:
# Fit a PCA with 100 components on X and keep the transformed data in res.
pca = PCA(n_components=100)
res = pca.fit_transform(X)

In [None]:
# Peek at the first five rows of the PCA output.
res[:5]

In [None]:
# Cluster the sentences into two groups with a fixed random seed.
# The imported class is KMeans and its keyword is n_clusters; the previous
# spelling (Kmeans / n_cluster) could not run.
# NOTE(review): this fits on X; the PCA output `res` is computed earlier in
# the notebook but not used here -- confirm which input was intended.
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

In [None]:
from transformers import AutoTokenizer