In [None]:
# Turn off Jedi-based tab completion for this IPython kernel
# (presumably because Jedi completion was slow or unreliable here -- TODO confirm).
%config Completer.use_jedi = False

In [None]:
# python library
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
import os
import random

# Tokenizer that extracts runs of word characters (regex \w+: letters, digits, underscore),
# so punctuation and whitespace never end up in a token.
# Numeric-only tokens are not removed here; preprocess() filters them out afterwards.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
def write_string_file(write_str, output_path):
    '''
    Aim: write a string to a UTF-8 encoded text file (an existing file is overwritten)
    Input: 
        write_str --> string to be written
        output_path --> path of the file to write
    Output:
        written file (the function itself returns None)
    '''
    # The context manager closes the file on exit, even if write() raises,
    # so no explicit close() call is needed.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(write_str)

In [None]:
def preprocess(spec_list, processed_data_path, sets):
    '''
    Aim: turn raw documents into a file of tokenized sentences
         (punctuation removed; numeric-only tokens removed; all tokens lower-cased)
    Input: 
        spec_list --> list of raw document strings
        processed_data_path --> directory the output file is written to (created if missing)
        sets --> name of the data split (e.g. 'train' or 'eval'); used in the output file name
    Output:
        writes <processed_data_path>/pretrain_text_<sets>.txt with one tokenized
        sentence per line (the function itself returns None)
    '''
    # makedirs also creates missing parent directories, and exist_ok=True avoids the
    # check-then-create race of "if not exists: mkdir".
    os.makedirs(processed_data_path, exist_ok=True)

    # Split every document into candidate sentences.
    collect_list = []
    for doc in spec_list:
        collect_list.extend(keep_main_text(doc))

    tokenized_txt_list = []
    for sent in tqdm(collect_list):
        tokens = [tok.lower() for tok in tokenizer.tokenize(sent) if not tok.isnumeric()]
        # Sentences left with zero or one token are dropped.
        if len(tokens) > 1:
            tokenized_txt_list.append(' '.join(tokens))

    # os.path.join avoids a doubled separator when the directory already ends with '/'.
    output_path = os.path.join(processed_data_path, 'pretrain_text_{}.txt'.format(sets))
    write_string_file('\n'.join(tokenized_txt_list), output_path)

In [None]:
def keep_main_text(list_item):
    '''
    Aim: keep the main text of clinical documents
    Input:
        list_item --> one document as a string; any non-string value
                      (e.g. None / NaN from a DataFrame column) yields an empty list
    Output:
        output_item --> list of stripped sentences: lines with more than 5 words are
                        split on '.', and only pieces with more than 1 word are kept
    '''
    # Missing values would otherwise crash on .split().
    if not isinstance(list_item, str):
        return []

    line_list = []
    for line in list_item.split('\n'):
        # split() without an argument counts real words; split(' ') also counts the
        # empty strings produced by indentation or repeated spaces.
        if len(line.split()) > 5:
            line_list.extend(line.split('.'))

    output_item = []
    for sent in line_list:
        # Strip before counting: the space that follows a period must not be
        # counted as an extra word.
        sent = sent.strip()
        if len(sent.split()) > 1:
            output_item.append(sent)
    return output_item

In [None]:
# Load the pretraining records from parquet into a DataFrame.
pretrain_records_path = '../../data/patient_record_pretrain'
df = pd.read_parquet(pretrain_records_path)

In [None]:
# Work on a random 20% subsample of the records.
# The fixed seed makes the subsample identical after every Restart & Run All.
df_sample = df.sample(frac=0.2, random_state=42)

In [None]:
# Number of rows in the subsample.
df_sample.shape[0]

In [None]:
# Shuffle the subsample with a fixed seed, then cut it 90% / 10% into train / eval.
# Plain positional slicing replaces np.split, which goes through the deprecated
# DataFrame.swapaxes when given a DataFrame.
df_shuffled = df_sample.sample(frac=1, random_state=42)
n_train = int(0.9 * len(df_sample))
df_train = df_shuffled.iloc[:n_train]
df_eval = df_shuffled.iloc[n_train:]

In [None]:
# Extract the text column ('teksti') of each split as a plain Python list.
# Column access + tolist() returns the values in row order without the per-row
# overhead of an iterrows() loop.
train_text_list = df_train['teksti'].tolist()
eval_text_list = df_eval['teksti'].tolist()

In [None]:
# Write the tokenized training sentences to the pretraining data folder.
preprocess(
    spec_list=train_text_list,
    processed_data_path='../../data/pretrain_data/',
    sets='train',
)

In [None]:
# Write the tokenized evaluation sentences to the pretraining data folder.
preprocess(
    spec_list=eval_text_list,
    processed_data_path='../../data/pretrain_data/',
    sets='eval',
)