# Importing required libraries

In [1]:
import sys
import pprint
import pandas as pd
import numpy as np
from scipy.special import softmax

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid
import sklearn
import os

import shutil

from os import listdir
from os.path import isfile, join

import time

In [2]:
# Setting path for importing required functions for data processing

# Register the data-processing helper directory exactly once. The membership
# guard keeps sys.path free of duplicate entries when this cell is re-run.
DATA_PROCESSING_DIR = "/home/jupyter/sonam/adhd_nlp/final_notebook_folder/data_processing"
if DATA_PROCESSING_DIR not in sys.path:
    sys.path.append(DATA_PROCESSING_DIR)

In [3]:
# Functions required for data processing

import final_process_text
import final_transform_textfiles

In [4]:
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 300)
pd.set_option('display.max_columns', 100)

In [5]:
# using simpletransformer ai library

from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [6]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Import Data

In [7]:
label_of_interest = "BT_yn"

In [None]:
originalTextData = final_transform_textfiles.extractOriginalText("/home/jupyter/data/cohort_2to6/Text files/combined_text")

annotatedXMIs = final_transform_textfiles.extractXMIAnnotation("/home/jupyter/data/cohort_2to6/XMI files/combined")


In [14]:
print(annotatedXMIs['anon_id'].value_counts(ascending=False))

10040    2
58356    1
60206    1
32135    1
29371    1
        ..
11743    1
14870    1
69011    1
10235    1
71104    1
Name: anon_id, Length: 423, dtype: int64


# Data Processing 

In [15]:
# Deleting the last row from the 'originalTextData' and 'annotatedXMIs' dataframes, because the last row is just the
# checkpoint row for anon_id 10040 mentioned above; deleting it gives the final cohort size of 423 needed for the new results.

In [None]:
# Drop the trailing row of the text and XMI dataframes (described above as the
# checkpoint row for anon_id 10040).
# `.copy()` makes each result an independent DataFrame, so adding columns to it
# in later cells does not trigger pandas' SettingWithCopyWarning.
# NOTE: this cell is not idempotent - every re-run removes one more row, so
# re-load the data before running it again.
originalTextData = originalTextData.iloc[:-1].copy()
annotatedXMIs = annotatedXMIs.iloc[:-1].copy()


In [19]:
# creating single label_of_interest "BT_yn" column using other columns

# A note gets BT_yn = 1 when at least one of the four BT annotation columns
# equals 1, and BT_yn = 0 otherwise.
bt_source_columns = ['Counsel_Parent_BT', 'Counsel_Handout_BT', 'Refer_Parent_BT', 'Refer_School_BT']
any_bt_flag = annotatedXMIs[bt_source_columns].eq(1).any(axis=1)
annotatedXMIs['BT_yn'] = np.where(any_bt_flag, 1, 0)
annotatedXMIs['BT_yn'].value_counts()

0    303
1    120
Name: BT_yn, dtype: int64

In [22]:
# merging data from both files 
# Right join on "file": every row of annotatedXMIs is kept, and the columns of
# originalTextData are attached where the "file" value matches.
data = originalTextData.merge(annotatedXMIs, on = "file", how = "right")

In [23]:
# using imported function sectionize() for processing notes text data

# sectionize() is called once per note and only the element at index 1 of its
# return value is kept as the text to classify.
# NOTE(review): the meaning of the other elements is not visible here - confirm in final_process_text.
data['extractText'] = data['note_des'].apply(lambda x: final_process_text.sectionize(x)[1])

In [24]:
# using imported function clean_text() for processing notes text data

# Run every extracted note through clean_text(); the function is passed
# directly because it already takes the note as its only argument.
data['extractText'] = data['extractText'].apply(final_process_text.clean_text)

In [25]:
data = data.loc[:, ['extractText',label_of_interest]]\
       .rename(columns = {'extractText':'text',
                          label_of_interest: 'label'})

In [26]:
X = data.loc[:, 'text']
y = data.loc[:, 'label']

# Split the Data

In [27]:
# First split: hold out 30% of all notes as the final test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 117, stratify = y)
# Second split: carve 30% of the remaining notes off as the validation set (X_val_test / y_val_test);
# the rest (X_val_train / y_val_train) is the inner training set. The same seed is reused for both splits.
X_val_train, X_val_test, y_val_train, y_val_test = train_test_split(X_train, y_train, test_size = 0.3, random_state = 117, stratify = y_train)

In [28]:
# Put each split's text and label back side by side, giving one dataframe per
# split with the columns 'text' and 'label'.
val_train = pd.concat([X_val_train, y_val_train], axis = 1)
val_test = pd.concat([X_val_test, y_val_test], axis = 1)
test = pd.concat([X_test, y_test], axis = 1)

# test.head()

# Loading saved model and evaluating

In [32]:
# loading saved weights from the training  for defining the model
model = ClassificationModel("bert", "./final_biobert_output_dir_new_cohort",use_cuda = True, num_labels = 2)

#Evaluate on Validation set (89 samples)

In [33]:

result, model_outputs, wrong_predictions = model.eval_model(val_test, f1 = f1_score, 
                                                            recall = sklearn.metrics.recall_score,
                                                            precision = sklearn.metrics.precision_score,
                                                            auc = sklearn.metrics.roc_auc_score,
                                                            accuracy = sklearn.metrics.accuracy_score)
result

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=12.0, style=ProgressStyle(descri…




{'mcc': 0.52852868908695,
 'tp': 14,
 'tn': 59,
 'fp': 5,
 'fn': 11,
 'auroc': 0.76,
 'auprc': 0.7235928012436937,
 'f1': 0.6363636363636364,
 'recall': 0.56,
 'precision': 0.7368421052631579,
 'auc': 0.7409375,
 'accuracy': 0.8202247191011236,
 'eval_loss': 1.3299520648154914}

In [34]:
# model.predict returns the predicted labels plus a second per-note array, stored here as `probabilities`.
predictions, probabilities = model.predict(val_test['text'].tolist())
val_test['predictions'] = predictions
# Softmax each note's row of that array and keep the entry at index 1
# (presumably the probability of label 1 - TODO confirm against the model's label order).
val_test['probabilities'] = [x[1] for x in np.array([softmax(element) for element in probabilities])]

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




# Function to create the precision-recall-F1 score table

In [35]:
def precision_recall_metrics(true_label, pred_prob):
    """Tabulate precision, recall and F1 for every candidate decision threshold.

    Parameters
    ----------
    true_label : array-like
        Ground-truth binary labels.
    pred_prob : array-like
        Score of the positive class for each sample.

    Returns
    -------
    pandas.DataFrame
        One row per threshold reported by ``precision_recall_curve`` with the
        columns ``precision``, ``recall``, ``f1`` and ``thresholds``. The table
        is also printed, as before; end the calling cell with ``;`` to avoid
        seeing it twice in a notebook.
    """
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(true_label, pred_prob)

    # The curve has one more precision/recall point than thresholds; drop the
    # last point so the three arrays line up row by row.
    precision = precision[:-1]
    recall = recall[:-1]

    # F1 is the harmonic mean of precision and recall. Where both are 0 the
    # plain formula is 0/0 (NaN plus a RuntimeWarning), so report 0 there.
    denominator = precision + recall
    f1 = np.divide(2 * precision * recall,
                   denominator,
                   out=np.zeros_like(denominator, dtype=float),
                   where=denominator > 0)

    results_DF = pd.DataFrame(data = {'precision': precision,
                                      'recall': recall,
                                      'f1': f1,
                                      'thresholds': thresholds})
    print(results_DF)
    return results_DF


#Function to create a confusion matrix 

In [36]:
def confusion_matrix_thr(threshold_final, true_label, pred_prob):
    """Binarise scores at a threshold and report the resulting classification quality.

    Parameters
    ----------
    threshold_final : float
        Samples whose score is >= this value are labelled 1, all others 0.
    true_label : array-like
        Ground-truth binary labels (0 / 1).
    pred_prob : array-like
        Score of the positive class for each sample; a pandas Series, a list
        or a NumPy array are all accepted.

    Returns
    -------
    numpy.ndarray
        Integer array of the thresholded 0/1 predictions.

    Notes
    -----
    Prints the classification report followed by tn / tp / fn / fp. Because the
    confusion matrix is built with ``normalize='true'``, those four values are
    fractions of each true class (rates), not sample counts.
    """
    # np.asarray lets any array-like through; the previous `.values` call only
    # worked for pandas objects.
    pred_label = (np.asarray(pred_prob) >= threshold_final).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs, so
    # the four-way unpack below cannot fail on a degenerate split.
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        true_label, pred_label, labels=[0, 1], normalize='true').ravel()
    print(sklearn.metrics.classification_report(true_label, pred_label))

    print("tn:",tn)
    print("tp:",tp)
    print("fn:",fn)
    print("fp:",fp)

    return pred_label

In [38]:
print("The dataframe with all the metrics and threshold to set a threshold value which gives maximum precision with decent recall and f1 score.")

precision_recall_metrics(val_test['label'], val_test['probabilities'])

The dataframe with all the metrics and threshold to set a threshold value which gives maximum precision with decent recall and f1 score.
    precision  recall        f1  thresholds
0    0.280899    1.00  0.438596    0.000084
1    0.272727    0.96  0.424779    0.000084
2    0.289157    0.96  0.444444    0.000084
3    0.291139    0.92  0.442308    0.000085
4    0.319444    0.92  0.474227    0.000085
5    0.313433    0.84  0.456522    0.000085
6    0.318182    0.84  0.461538    0.000086
7    0.328125    0.84  0.471910    0.000086
8    0.333333    0.84  0.477273    0.000086
9    0.344262    0.84  0.488372    0.000087
10   0.333333    0.80  0.470588    0.000087
11   0.338983    0.80  0.476190    0.000088
12   0.350877    0.80  0.487805    0.000088
13   0.333333    0.72  0.455696    0.000089
14   0.346154    0.72  0.467532    0.000089
15   0.352941    0.72  0.473684    0.000089
16   0.382979    0.72  0.500000    0.000090
17   0.409091    0.72  0.521739    0.000090
18   0.428571    0.72  0.53

#selected row 38 with threshold of 0.001842 for further calculations

#Creating confusion matrix for Validation set(89 samples)

In [39]:
true_label_val= val_test['label']
pred_prob_val = val_test['probabilities']

In [40]:

confusion_matrix_thr(0.001842,true_label_val,pred_prob_val)

              precision    recall  f1-score   support

           0       0.87      0.92      0.89        64
           1       0.76      0.64      0.70        25

    accuracy                           0.84        89
   macro avg       0.81      0.78      0.79        89
weighted avg       0.84      0.84      0.84        89

tn: 0.921875
tp: 0.64
fn: 0.36
fp: 0.078125


array([1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1])

#Creating confusion matrix for Test set(127 samples)

In [42]:
# model.predict returns the predicted labels plus a second per-note array, stored here as `test_probabilities`.
test_predictions, test_probabilities = model.predict(test['text'].tolist())

test['predictions'] = test_predictions
# Softmax each note's row of that array and keep the entry at index 1
# (presumably the probability of label 1 - TODO confirm against the model's label order).
test['probabilities'] = [x[1] for x in np.array([softmax(element) for element in test_probabilities])]

HBox(children=(FloatProgress(value=0.0, max=127.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




In [43]:
test_true_label= test['label']
test_pred_prob = test['probabilities']

In [44]:
# Saving predictions obtained for test set using threshold value in arr
arr =confusion_matrix_thr(0.001842,test_true_label,test_pred_prob)

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        91
           1       0.84      0.72      0.78        36

    accuracy                           0.88       127
   macro avg       0.87      0.83      0.85       127
weighted avg       0.88      0.88      0.88       127

tn: 0.945054945054945
tp: 0.7222222222222222
fn: 0.2777777777777778
fp: 0.054945054945054944


In [46]:
# Updating predictions for test set with new predictions obtained according to threshold value

test['predictions']= arr

# Function to get the dataframe of misclassified samples

In [48]:
def get_WP(test):
    """Return the wrongly predicted rows of a results dataframe.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain the columns 'label' (truth) and 'predictions'.

    Returns
    -------
    pandas.DataFrame
        The rows of `test` whose index label belongs to at least one row where
        'label' and 'predictions' disagree, in their original order.
    """
    is_wrong = test['label'].ne(test['predictions'])
    wrong_index_labels = test.index[is_wrong].tolist()
    return test.loc[test.index.isin(wrong_index_labels)]

In [49]:
test_miss_df = get_WP(test)

wrong_predictions = test_miss_df.to_records(index=True)
len(wrong_predictions)

15

#saving the misclassified notes in file for test set

In [50]:
# Write one line per misclassified test note in the form
# [index, tokenLength, text, truth, error], preceded by a header line.
# The rows are built in local variables so the builtin `list` is no longer
# overwritten for the rest of the kernel session, and the `with` block closes
# the file by itself (the old trailing `myfile.close` never called close).
with open('/home/jupyter/sonam/final_result_files/final_threshold_test_misclassification_new_cohort.txt', mode='wt', encoding='utf-8') as myfile:
    header = ["index", "tokenLength", "text", "truth", "error"]
    myfile.write(str(header))
    myfile.write("\n")
    for record in wrong_predictions:
        # Record fields are read by position: 0 = dataframe index, 1 = note text, 2 = true label.
        note_index, note_text, truth = record[0], record[1], record[2]
        token_count = len(model.tokenizer(note_text)['input_ids'])
        row = [note_index, token_count, note_text, truth]
        # Every record here is a wrong prediction, so the true label decides
        # the error type: truth 1 is a false negative, truth 0 a false positive.
        if truth == 1:
            row.append("fn")
        elif truth == 0:
            row.append("fp")
        myfile.write(str(row))
        myfile.write("\n")

<function TextIOWrapper.close()>

#Checking saved model on Train Set(207 samples)

In [52]:
result, model_outputs, wrong_predictions = model.eval_model(val_train, f1 = f1_score,  
                                                            recall = sklearn.metrics.recall_score,
                                                            precision = sklearn.metrics.precision_score,
                                                            auc = sklearn.metrics.roc_auc_score,
                                                           accuracy = sklearn.metrics.accuracy_score)
result

HBox(children=(FloatProgress(value=0.0, max=207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=26.0, style=ProgressStyle(descri…




{'mcc': 1.0,
 'tp': 59,
 'tn': 148,
 'fp': 0,
 'fn': 0,
 'auroc': 1.0,
 'auprc': 1.0,
 'f1': 1.0,
 'recall': 1.0,
 'precision': 1.0,
 'auc': 1.0,
 'accuracy': 1.0,
 'eval_loss': 8.889194885761334e-05}

# Getting precision, recall, and thresholds for the test set (127 samples)

In [55]:
# NOTE(review): test['predictions'] was overwritten earlier with the thresholded 0/1 labels,
# so this curve is computed from hard predictions rather than from test['probabilities'] and
# only has thresholds at those distinct values. Confirm that this is intended.
precision, recall, thresholds = sklearn.metrics.precision_recall_curve(test['label'], test['predictions'])
# Drop the last precision/recall entry so both arrays have the same length as `thresholds`.
precision = precision[:-1]
recall = recall[:-1]
thresholdDF = pd.DataFrame(data = {'precision': precision, 'recall': recall, 'thresholds':thresholds})
# F1 is the harmonic mean of precision and recall; it is NaN for a row where both are 0.
thresholdDF['f1_score'] = 2*(thresholdDF['precision']*thresholdDF['recall'])/(thresholdDF['precision'] + thresholdDF['recall'])

In [56]:
max(thresholdDF['f1_score'])

0.7761194029850746