# Setup

In [None]:
import pandas as pd 
import numpy as np
import ast
import plotly.express as px

In [None]:
import os
from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Use the absolute path of the project folder.  The previous relative path
# ('drive/MyDrive/NLP') only resolves while the working directory is the parent
# of the mount point, so running this cell a second time raised
# FileNotFoundError.  With an absolute path the cell can be re-run safely.
os.chdir('/content/drive/MyDrive/NLP')
DATADIR = "data/"          # input CSV files, relative to the project folder
SAVEDIR = "data/working/"  # working directory with the per-model sub-folders

Mounted at /content/drive


# Loading Predictions


In [None]:
def _read_filled(path):
    """Read a CSV file and replace every missing value with an empty string."""
    return pd.read_csv(path).fillna('')


# Ground-truth labels.
df_truth = _read_filled(DATADIR + 'validation.csv')

# One prediction file per model, each stored in its own sub-folder of SAVEDIR
# (previously spelled out as the literal "data/working/").
pred_BioC = _read_filled(SAVEDIR + 'BioC/prediction.csv')
pred_BioD = _read_filled(SAVEDIR + 'bioD/prediction.csv')
pred_BioC_S = _read_filled(SAVEDIR + 'BioC_small/prediction.csv')
pred_Distil = _read_filled(SAVEDIR + 'distilbert/prediction.csv')

# Feature lookup table; loaded as-is (missing values are not filled).
feature = pd.read_csv(DATADIR + 'features.csv')

# Helpers for Metrics Calculation

In [None]:
def convert_to_lst(label, ground=True):
    """Expand a span annotation into the set of offsets it covers.

    Parameters
    ----------
    label : str
        Span description.  With ``ground=True`` it is the string form of a
        Python list of ``(begin, end)`` pairs, e.g. ``"[(0, 3), (5, 7)]"``.
        With ``ground=False`` it is a ``';'``-separated list of
        ``"begin end"`` pairs, e.g. ``"0 3;5 7"``.
    ground : bool, default True
        Selects which of the two formats ``label`` uses.

    Returns
    -------
    set of int
        Every offset in ``range(begin, end)`` (end exclusive) of every span.
        Empty when ``label`` is ``None``, NaN, empty or blank.

    Raises
    ------
    ValueError
        If a span does not yield exactly two integers.
    """
    # Missing cells show up as '' (after ``fillna('')``), None or float NaN.
    # ``ast.literal_eval('')`` raises SyntaxError and NaN has no ``.split``,
    # so all of them are treated as "no spans".
    # ``label != label`` is true only for NaN.
    if label is None or (isinstance(label, float) and label != label):
        return set()
    if isinstance(label, str) and not label.strip():
        return set()

    if ground:
        spans = ast.literal_eval(label)
    else:
        spans = []
        for token in label.split(';'):
            if not token.strip():
                # e.g. a trailing ';' -- nothing to parse
                continue
            # Keep only the purely numeric fields of the token.
            begin, end = [int(s) for s in token.split() if s.isdigit()]
            spans.append((begin, end))

    offsets = set()
    for span in spans:
        if span:  # falsy entries (e.g. an empty tuple) are skipped
            begin, end = span
            offsets.update(range(begin, end))
    return offsets

In [None]:
def calculate_metrics(truth, pred):
    TP = len(set.intersection(truth, pred))
    FP = len(pred.difference(truth))
    FN = len(truth.difference(pred))

    return (TP, FP, FN)

In [None]:
def calculate_f1(df_truth, df_pred, group=-1, feature=None):
    """Compute micro-averaged precision, recall and F1 over span offsets.

    Ground truth and predictions are joined on ``id``; every span string is
    expanded with ``convert_to_lst`` and compared with ``calculate_metrics``.
    TP/FP/FN are summed over all rows before the ratios are taken.

    Parameters
    ----------
    df_truth : pandas.DataFrame
        Ground truth.  ``case_num`` / ``feature_num`` are read when the
        corresponding filter is active.
    df_pred : pandas.DataFrame
        Predictions.  After the merge on ``id`` the frame must expose
        ``location_x`` (truth spans) and ``location_y`` (predicted spans),
        i.e. both inputs are expected to carry a ``location`` column.
    group : int, default -1
        ``case_num`` to restrict to; a negative value means "all cases".
    feature : int or None, default None
        ``feature_num`` to restrict to; ``None`` means "all features".

    Returns
    -------
    df_comb : pandas.DataFrame
        Merged rows with added ``truth``, ``pred``, ``metrics``, ``TP``,
        ``FP`` and ``FN`` columns.
    (m_pre, m_rec, m_f1) : tuple of float
        Precision, recall and F1.  A ratio whose denominator is zero is NaN.
    """
    # Build the row filter on the ground truth.  Each comparison is evaluated
    # on its own: ``a == g & b == f`` parses as a chained comparison (``&``
    # binds tighter than ``==``) and raised ValueError.  ``is not None`` is
    # used instead of truthiness so that feature number 0 is a valid filter.
    keep = np.ones(len(df_truth), dtype=bool)
    if group >= 0:
        keep &= (df_truth.case_num == group).to_numpy()
    if feature is not None:
        keep &= (df_truth.feature_num == feature).to_numpy()
    df_truth_sub = df_truth.loc[keep, :]

    # Join type kept as before: the all-cases run keeps only ids present on
    # both sides, the per-case run keeps every ground-truth row.
    how = 'inner' if group < 0 else 'left'
    df_comb = df_truth_sub.merge(df_pred, on=['id'], how=how)

    # A left merge leaves NaN where a row has no prediction; treat that as an
    # empty prediction so the row contributes false negatives only.
    df_comb['location_y'] = df_comb['location_y'].fillna('')

    df_comb['truth'] = df_comb['location_x'].apply(lambda x: convert_to_lst(x))
    df_comb['pred'] = df_comb['location_y'].apply(lambda x: convert_to_lst(x, False))
    counts = [calculate_metrics(t, p)
              for t, p in zip(df_comb['truth'], df_comb['pred'])]
    df_comb['metrics'] = pd.Series(counts, index=df_comb.index, dtype=object)
    df_comb[['TP', 'FP', 'FN']] = pd.DataFrame(
        counts, index=df_comb.index, columns=['TP', 'FP', 'FN'])

    tot_TP = df_comb.TP.sum()
    tot_FP = df_comb.FP.sum()
    tot_FN = df_comb.FN.sum()

    def _ratio(num, den):
        # An undefined ratio stays NaN (what the numpy division produced
        # before) but no longer warns or raises ZeroDivisionError.
        return num / den if den else float('nan')

    m_pre = _ratio(tot_TP, tot_TP + tot_FP)
    m_rec = _ratio(tot_TP, tot_TP + tot_FN)
    m_f1 = _ratio(2 * (m_pre * m_rec), m_pre + m_rec)

    return df_comb, (m_pre, m_rec, m_f1)

# Results

In [None]:
# Score every model's predictions against the same ground truth, in the
# order BioC, BioD, BioC_S, Distil.
(
    (df_BioC, metrics_BioC),
    (df_BioD, metrics_BioD),
    (df_BioC_S, metrics_BioC_S),
    (df_Distil, metrics_Distil),
) = [
    calculate_f1(df_truth, model_pred)
    for model_pred in (pred_BioC, pred_BioD, pred_BioC_S, pred_Distil)
]

In [None]:
# One (precision, recall, F1) tuple per model, in the same order as the
# 'Model' labels below.
metrics_lst = [metrics_BioC, metrics_BioD, metrics_BioC_S, metrics_Distil]

df_results = pd.DataFrame(
    metrics_lst,
    columns=['Val-Pre', 'Val-Rec', 'Val-F1'],
).assign(**{
    'Model': ['Bio-Clinic + LSTM', 'Bio-Discharge + LSTM', 'Bio-Clinic', 'Distil-Bert'],
    # Entered by hand -- these values are not computed in this notebook.
    'Test-F1': [0.797, 0.793, 0.789, 0.784],
})


In [None]:
# Round every numeric column to 4 decimals and show the table.  The name is
# rebound, so later cells that read df_results see the rounded values.
df_results = df_results.round(4)
df_results

Unnamed: 0,Val-Pre,Val-Rec,Val-F1,Model,Test-F1
0,0.9267,0.9291,0.9279,Bio-Clinic + LSTM,0.797
1,0.9257,0.9293,0.9275,Bio-Discharge + LSTM,0.793
2,0.9095,0.9125,0.911,Bio-Clinic,0.789
3,0.9159,0.914,0.915,Distil-Bert,0.784


In [None]:
# Reshape to long format: one row per (model, metric) pair, with the metric
# name in 'Metrics' and its value in 'Result'.
df_long = df_results.melt(
    id_vars=['Model'],
    value_vars=['Val-Pre', 'Val-Rec', 'Val-F1', 'Test-F1'],
    var_name='Metrics',
    value_name='Result',
)

In [None]:
# Grouped bar chart: one group per model, one bar per metric, with the value
# printed on each bar (text_auto).
fig = px.bar(df_long, x='Model', y='Result',
             color='Metrics', barmode='group',
             width=700,
             height=400,
             text_auto=True)
fig.update_layout(yaxis_title='Performance')
# Optional zoom of the y-axis (disabled):
#fig.update_yaxes(range=[0.6, 0.95])
fig.show()

# Bio-Clinical BERT

In [None]:
def calculate_case(df_truth, df_pred, var):
    """Score the predictions separately for every distinct value of ``var``.

    Parameters
    ----------
    df_truth : pandas.DataFrame
        Ground truth; must contain the column ``var``.
    df_pred : pandas.DataFrame
        Predictions, forwarded unchanged to ``calculate_f1``.
    var : str
        Column to break the scores down by.  ``'case_num'`` values are passed
        to ``calculate_f1`` as ``group``; values of any other column are
        passed as ``feature``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, with columns ``var``, ``m_pre``,
        ``m_rec`` and ``m_f1``, in order of first appearance in ``df_truth``.
    """
    by_case = var == 'case_num'
    rows = []
    for key in df_truth[var].unique():
        if by_case:
            _, (m_pre, m_rec, m_f1) = calculate_f1(df_truth, df_pred, group=key)
        else:
            _, (m_pre, m_rec, m_f1) = calculate_f1(df_truth, df_pred, feature=key)
        rows.append((key, m_pre, m_rec, m_f1))

    return pd.DataFrame(rows, columns=[var, 'm_pre', 'm_rec', 'm_f1'])


In [None]:
# Per-case scores.  Pass the raw prediction frame (pred_BioC), as the
# per-feature cell does.  The merged result frame df_BioC only worked here
# because its leftover location_x / location_y columns happened to survive
# the second merge.
result_case = calculate_case(df_truth, pred_BioC, 'case_num')
result_case.round(3)

Unnamed: 0,case_num,m_pre,m_rec,m_f1
0,0,0.851,0.919,0.884
1,1,0.916,0.914,0.915
2,2,0.937,0.935,0.936
3,3,0.937,0.945,0.941
4,4,0.919,0.948,0.933
5,5,0.922,0.834,0.876
6,6,0.932,0.974,0.952
7,7,0.926,0.969,0.947
8,8,0.963,0.941,0.952
9,9,0.93,0.954,0.942


In [None]:
# Per-feature breakdown for case 5, sorted so the weakest features come
# first; the feature table is joined in to show the feature text.
df_truth_sub = df_truth[df_truth['case_num'] == 5]
result_feat = calculate_case(df_truth_sub, pred_BioC, 'feature_num')
result_feat.merge(feature, on='feature_num').sort_values(by='m_f1')

Unnamed: 0,feature_num,m_pre,m_rec,m_f1,case_num,feature_text
8,508,0.866667,0.383966,0.532164,5,Associated-nausea
12,512,0.960452,0.609319,0.745614,5,Associated-throat-tightness
4,504,0.819588,0.75,0.783251,5,Episodes-of-heart-racing
9,509,0.916115,0.765683,0.834171,5,Increased-frequency-recently
0,500,0.869565,0.827586,0.848057,5,Onset-5-years-ago
3,503,0.869792,0.837093,0.853129,5,Associated-SOB-OR-Associated-shortness-of-breath
13,513,0.9,0.84375,0.870968,5,Feels-hot-OR-Feels-clammy
14,514,0.960396,0.832618,0.891954,5,Episode-of-hand-numbness-OR-Episode-of-finger-...
7,507,0.909091,0.902256,0.90566,5,No-illicit-drug-use
5,505,0.948895,0.906332,0.927126,5,Recent-visit-to-emergency-department-with-nega...


# Error Visualization

In [None]:
# Note numbers of the rows with the highest false-positive and false-negative
# counts.  idxmax() returns an index *label*, which is what .loc expects;
# argmax() returns a *position*, and indexing a Series with it is a label
# lookup that is only correct while the index is 0..n-1.
pn_max_FP = df_BioC.loc[df_BioC['FP'].idxmax(), 'pn_num']
pn_max_FN = df_BioC.loc[df_BioC['FN'].idxmax(), 'pn_num']
print(pn_max_FP)
print(pn_max_FN)

55601
20507


In [None]:
import spacy
import random 
def plot_annotation(df, pn_num, plot_pred):
    """Render one note with its annotated spans highlighted by displaCy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``pn_num``, ``pn_history``, ``feature_text``
        and the span columns ``location_x`` (string form of a list of
        ``(begin, end)`` pairs) and ``location_y`` (``';'``-separated
        ``"begin end"`` pairs).
    pn_num
        Value matched against ``df["pn_num"]``; the text of the first
        matching row is displayed.
    plot_pred : bool
        ``True`` highlights the spans in ``location_y``, ``False`` those in
        ``location_x``.

    Returns
    -------
    None
        The rendering is displayed in the notebook (``jupyter=True``).
    """
    df_text = df[df["pn_num"] == pn_num].reset_index(drop=True)
    text = df_text["pn_history"][0]

    span_col = "location_y" if plot_pred else "location_x"
    ents = []
    for spans, feature_text in df_text[[span_col, "feature_text"]].values:
        # Skip empty cells; NaN (a row without a prediction) is truthy and
        # previously crashed on ``.split``.
        if not isinstance(spans, str) or not spans:
            continue
        if plot_pred:
            pairs = [
                [int(s) for s in span.split() if s.isdigit()]
                for span in spans.split(';')
                if span.strip()
            ]
        else:
            pairs = ast.literal_eval(spans)
        for begin, end in pairs:
            ents.append({"start": begin, "end": end, "label": feature_text.upper()})

    doc = {"text": text, "ents": sorted(ents, key=lambda ent: ent["start"])}

    # A private generator seeded with 0 draws the same colours as the former
    # ``np.random.seed(0)`` without reseeding the global random state.
    rng = np.random.RandomState(0)
    colors = {}
    for ent in doc["ents"]:
        feat_txt = ent["label"]
        if feat_txt not in colors:
            # Convert to plain ints: formatting a tuple of numpy integers
            # gives "rgb(np.int64(..), ...)" on NumPy >= 2, which is not a
            # valid CSS colour.
            red, green, blue = rng.randint(100, 255, size=3).tolist()
            colors[feat_txt] = f"rgb({red}, {green}, {blue})"

    spacy.displacy.render(doc, style="ent", options={"colors": colors},
                          manual=True, jupyter=True)

In [None]:
# Ground-truth spans of the note that holds the row with the most false positives.
plot_annotation(df_BioC, pn_max_FP, plot_pred=False)

In [None]:
# Predicted spans of the note that holds the row with the most false positives.
plot_annotation(df_BioC, pn_max_FP, plot_pred=True)

In [None]:
# Predicted spans of the note that holds the row with the most false negatives.
plot_annotation(df_BioC, pn_max_FN, plot_pred=True)

In [None]:
# Ground-truth spans of the note that holds the row with the most false negatives.
plot_annotation(df_BioC, pn_max_FN, plot_pred=False)

In [None]:
# Raw text of the note with the most false negatives (first matching row).
df_BioC.loc[df_BioC['pn_num'] == pn_max_FN, 'pn_history'].iloc[0]


'HPI:\r\nDolores Montgomery is a 44 year old female with a history of hypertension who presents with 3 years of irregular menstrual periods. She relays that the interval between periods is irregular, and they last between 2-6 days, sometimes heavier with 6-7 pad changes/day, and sometimes lighter. She denies any pain or fullness in the pelvis. She has occassional breast tenderness before her periods. She denies any change in vaginal discharge or fevers. She also denies any headaches, visual changes, galactorhea, constipation, diarrhea, abdominal pain, fatigue, or cold/heat insensitivity. No significant change in weight or exercise habits.\r\n\r\nFH: HTN and obesity\r\n\r\nPMH: HTN- takes HCTZ\r\nno other meds or over the counter supplements.\r\n\r\nOB/GYN: menarche at 14 with regular periods until 3 years ago. G2P2 with 2 vaginal deliveries'

In [None]:
# Annotation text of the row at position 12 among that note's rows.
df_BioC.loc[df_BioC['pn_num'] == pn_max_FN, 'annotation'].iloc[12]

"['sometimes heavier sometimes lighter', 'sometimes with 6-7 pad changes/day', 'and sometimes lighter', 'interval is irregular', 'last between 2-6 days']"