# Import required libraries

In [None]:
import sys
import pprint
import pandas as pd
import numpy as np
from scipy.special import softmax

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid
import sklearn
import os

import shutil

from os import listdir
from os.path import isfile, join

import time

In [None]:
# Make the project's data-processing helper modules importable.
# The directory is added only once, and the membership check keeps sys.path
# from growing when this cell is re-run in the same kernel.
DATA_PROCESSING_DIR = "/home/jupyter/sonam/adhd_nlp/final_notebook_folder/data_processing"
if DATA_PROCESSING_DIR not in sys.path:
    sys.path.append(DATA_PROCESSING_DIR)

In [None]:
# Functions required for data processing

import final_process_text
import final_transform_textfiles

In [None]:
# Widen the pandas display limits so long note texts and wide frames are readable.
# Full option keys are used throughout: shorthand keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 100)

In [None]:
# Using the Simple Transformers library (simpletransformers)

from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [None]:
import warnings

# Silence every UserWarning for the rest of the session.
warnings.simplefilter("ignore", category=UserWarning)

# Import Data

In [None]:
# Name of the binary target column; it is later selected from the merged
# frame and renamed to 'label'.
label_of_interest = "BT_yn"

In [None]:
# Locations of the raw note texts and of their XMI annotation files.
text_dir = "/home/jupyter/data/cohort_2to6/Text files/combined_text"
xmi_dir = "/home/jupyter/data/cohort_2to6/XMI files/combined"

# Load both sources through the project's transform helpers.
originalTextData = final_transform_textfiles.extractOriginalText(text_dir)
annotatedXMIs = final_transform_textfiles.extractXMIAnnotation(xmi_dir)


In [None]:
# Show how many annotation rows exist per 'anon_id', most frequent first.
anon_id_counts = annotatedXMIs['anon_id'].value_counts(ascending=False)
print(anon_id_counts)

# Data Processing 

In [None]:
# Delete the last row from the 'originalTextData' and 'annotatedXMIs' dataframes, as the last row is just the checkpoint row mentioned above
# for anon_id 10040; deleting it results in the final cohort size of 432 needed for the new results.

In [None]:
# Drop the trailing row of both the text and the XMI dataframes.
# .copy() makes each result an independent frame rather than a slice of the
# original, so adding columns afterwards (e.g. 'BT_yn' on annotatedXMIs) does
# not trigger pandas' SettingWithCopyWarning.
originalTextData = originalTextData.iloc[:-1].copy()
annotatedXMIs = annotatedXMIs.iloc[:-1].copy()



In [None]:
# Derive the single binary "BT_yn" label: 1 when at least one of the four
# BT flag columns equals 1 for the row, otherwise 0.
bt_flag_columns = [
    'Counsel_Parent_BT',
    'Counsel_Handout_BT',
    'Refer_Parent_BT',
    'Refer_School_BT',
]
any_bt_flag = (annotatedXMIs[bt_flag_columns] == 1).any(axis=1)
annotatedXMIs['BT_yn'] = np.where(any_bt_flag, 1, 0)
annotatedXMIs['BT_yn'].value_counts()

In [None]:
# Join note texts onto the annotations by "file"; the right join keeps every
# annotation row even when no matching text row exists.
data = pd.merge(originalTextData, annotatedXMIs, how="right", on="file")

In [None]:
# Run the imported sectionize() helper on every note and keep element [1]
# of its result as the text to model.

def _second_section(note):
    """Return element [1] of final_process_text.sectionize(note)."""
    return final_process_text.sectionize(note)[1]


data['extractText'] = data['note_des'].apply(_second_section)

In [None]:
# Clean each extracted text with the imported clean_text() helper
# (the function is passed directly; no wrapper lambda is needed).
cleaned_text = data['extractText'].apply(final_process_text.clean_text)
data['extractText'] = cleaned_text

In [None]:
# Keep only the cleaned text and the target column, under the generic
# names 'text' and 'label'.
column_renames = {'extractText': 'text', label_of_interest: 'label'}
data = data[list(column_renames)].rename(columns=column_renames)

In [None]:
# Feature series (note text) and target series (binary label).
X, y = data['text'], data['label']

# Split the Data

In [None]:
# Both splits hold out 30% with the same seed and are stratified on the label.
split_options = {'test_size': 0.3, 'random_state': 117}

# First split: train (70%) vs. held-out test (30%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)
# Second split: carve a validation set out of the training portion.
X_val_train, X_val_test, y_val_train, y_val_test = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

In [None]:
def _as_frame(texts, labels):
    """Place a text series and its label series side by side in one DataFrame."""
    return pd.concat([texts, labels], axis=1)


# Two-column (text, label) frames for training, validation and testing.
val_train = _as_frame(X_val_train, y_val_train)
val_test = _as_frame(X_val_test, y_val_test)
test = _as_frame(X_test, y_test)

#test.head()

# Single Run

In [None]:
# Training configuration for the Simple Transformers classification model.

model_args = ClassificationArgs()
batch_size = 8
# Number of batches in one pass over val_train. np.ceil returns a float, so it
# is cast to int: the step settings below are counts, not fractions.
steps_per_epoch = int(np.ceil(val_train.shape[0] / batch_size))

# Core optimisation settings.
model_args.num_train_epochs = 30
model_args.train_batch_size = batch_size
model_args.eval_batch_size = batch_size
model_args.max_seq_length = 512
model_args.learning_rate = 8e-5
model_args.manual_seed = 117  # fixed seed, set once

# Evaluate on the eval set and log once per epoch.
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = steps_per_epoch
model_args.evaluate_during_training_verbose = True
model_args.logging_steps = steps_per_epoch

# Do not write intermediate checkpoints.
# NOTE(review): save_steps = -1 is understood to disable step-based
# checkpointing - confirm against the Simple Transformers docs.
model_args.save_model_every_epoch = False
model_args.save_eval_checkpoints = False
model_args.save_steps = -1

# Output locations; an existing output directory is overwritten.
model_args.overwrite_output_dir = True
model_args.best_model_dir = 'final_biobertbest_model_dir_new_cohort'
model_args.output_dir = 'final_biobert_output_dir_new_cohort'
model_args.tensorboard_dir = 'final_biobert_tensorboard_runs_new_cohort'

In [None]:
# Binary classifier built on the Bio_ClinicalBERT checkpoint, run on the GPU.
model = ClassificationModel(
    model_type="bert",
    model_name="emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2,
    args=model_args,
    use_cuda=True,
)

In [None]:
# Train on val_train, evaluating on val_test during training, and report how
# long the run took.
# time.perf_counter() is a monotonic clock, so the measured duration is not
# affected by system clock adjustments the way time.time() can be.
start = time.perf_counter()
model.train_model(val_train, f1 = f1_score, eval_df = val_test)
end = time.perf_counter()
print(f"Training time: {end - start:.1f} s")

# Evaluation of the trained model

#Evaluate on Validation set (89 samples)

In [None]:

# Extra metric callables handed to eval_model as keyword arguments; each
# keyword becomes the name of the metric in the call.
# NOTE(review): these callables are presumably applied to the hard predicted
# labels, in which case 'auc' is not a score-based ROC AUC - confirm against
# the Simple Transformers docs.
extra_metrics = {
    'f1': f1_score,
    'recall': sklearn.metrics.recall_score,
    'precision': sklearn.metrics.precision_score,
    'auc': sklearn.metrics.roc_auc_score,
    'accuracy': sklearn.metrics.accuracy_score,
}
result, model_outputs, wrong_predictions = model.eval_model(val_test, **extra_metrics)
result

In [None]:
# Number of entries in wrong_predictions, the third value returned by the
# preceding eval_model call (the misclassified samples).
len(wrong_predictions)

In [None]:
# Save the misclassified validation samples to a text file: one header line,
# then one line per sample, each written as the str() of a Python list
# [index, truth, error, tokenLength, text].
MISCLASSIFIED_VAL_PATH = '/home/jupyter/sonam/final_result_files/final_val_misclassification_new_cohort.txt'
with open(MISCLASSIFIED_VAL_PATH, mode='wt', encoding='utf-8') as myfile:
    header = ["index", "truth", "error", "tokenLength", "text"]
    myfile.write(str(header) + "\n")
    for row_number, example in enumerate(wrong_predictions, start=1):
        truth = example.label
        # Every sample here was predicted wrongly, so a true 1 is a false
        # negative and a true 0 is a false positive.
        if truth == 1:
            error_type = "fn"
        elif truth == 0:
            error_type = "fp"
        else:
            # Keep the row at five columns even for an unexpected label value.
            error_type = "unknown"
        token_count = len(model.tokenizer(example.text_a)['input_ids'])
        row = [row_number, truth, error_type, token_count, example.text_a]
        myfile.write(str(row) + "\n")
# The `with` block closes the file; no explicit close() is needed.

#Evaluate on Test set (127 samples)

In [None]:
# Evaluate on the held-out test set with the same extra metrics as for validation.
result, model_outputs, wrong_predictions = model.eval_model(test, f1 = f1_score,
                                                            recall = sklearn.metrics.recall_score,
                                                            precision = sklearn.metrics.precision_score,
                                                            auc = sklearn.metrics.roc_auc_score,
                                                            accuracy = sklearn.metrics.accuracy_score)

# Per-sample predicted labels and per-class model outputs for the test texts.
predictions, probabilities = model.predict(test['text'].tolist())
test['predictions'] = predictions
# Turn each row of per-class outputs into probabilities with one vectorised
# softmax and keep column 1 (the positive class), replacing the per-row loop.
# Assumes every sample yields one row of per-class outputs, i.e. a regular
# 2-D array - TODO confirm.
test['probabilities'] = softmax(np.asarray(probabilities), axis=1)[:, 1]
result

# Get precision, recall and thresholds for the test set and compute the F1 score at each threshold.

In [None]:
# Precision/recall at every decision threshold on the positive-class probability.
precision, recall, thresholds = sklearn.metrics.precision_recall_curve(test['label'], test['probabilities'])
# The curve returns one more precision/recall point than thresholds; drop the
# final point so all three arrays have the same length.
precision = precision[:-1]
recall = recall[:-1]
thresholdDF = pd.DataFrame(data = {'precision': precision, 'recall': recall, 'thresholds':thresholds})
# F1 at each threshold. Where precision + recall == 0 the ratio is 0/0 (NaN),
# so F1 is defined as 0 there to keep the column free of NaN.
pr_sum = thresholdDF['precision'] + thresholdDF['recall']
raw_f1 = 2 * (thresholdDF['precision'] * thresholdDF['recall']) / pr_sum
thresholdDF['f1_score'] = raw_f1.where(pr_sum > 0, 0.0)

In [None]:
# Best F1 over all thresholds. Series.max() skips NaN, whereas the builtin
# max() gives an order-dependent result when a NaN is present.
thresholdDF['f1_score'].max()