In [None]:
!python --version

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install simpletransformers
!pip install transformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from collections import Counter
import re
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import re
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
import numpy as np

In [None]:
# Load the BART-large tokenizer and conditional-generation model used by generate_summary.
# NOTE(review): 'facebook/bart-large' appears to be the base pretrained checkpoint rather than
# one fine-tuned for summarisation (e.g. 'facebook/bart-large-cnn') — confirm this is intended.
tokenizer_bart_large = BartTokenizer.from_pretrained('facebook/bart-large')
model_bart_large = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [None]:
# Column names applied to the three tab-separated clinical-notes exports. Because `names=`
# is passed without `header=`, pandas reads the first line of each file as data, not as a header.
columns = ['MRN', 'ID', 'DATE', 'TYPE', 'TEXT']
# The three files share the same layout and are concatenated in a later cell.
notes_1 = pd.read_csv('/content/drive/MyDrive/RA_Medical_DATA/APC_NOTES/MRN_Clinical_Notes_File1.txt', sep='\t', names=columns)
notes_2 = pd.read_csv('/content/drive/MyDrive/RA_Medical_DATA/APC_NOTES/MRN_Clinical_Notes_File2.txt', sep='\t', names=columns)
notes_3 = pd.read_csv('/content/drive/MyDrive/RA_Medical_DATA/APC_NOTES/MRN_Clinical_Notes_File3.txt', sep='\t', names=columns)

In [None]:
# Read the updated diagnosis list and discard the 'Unnamed: 0' column in one chained expression.
icd_notes_updated = (
    pd.read_csv('/content/drive/MyDrive/RA_Medical_DATA/APC_NOTES/updated-diagnosis-list.csv')
    .drop(columns='Unnamed: 0')
)

In [None]:
icd_notes_updated.head(5)

In [None]:
# Collapse all diagnoses of a patient into one comma-separated string per MRN.
# Missing values are dropped and the rest cast to str first: ', '.join raises a TypeError
# as soon as a group contains a NaN (float) entry.
icd_notes_updated_merged = (
    icd_notes_updated.groupby('MRN')['problem']
    .agg(lambda x: ', '.join(x.dropna().astype(str)))
    .reset_index()
)

In [None]:
icd_notes_updated_merged.head(5)

In [None]:
label_classification = pd.read_excel('/content/drive/MyDrive/RA_Medical_DATA/APC_NOTES/classification.xlsx')

In [None]:
label_classification.head(5)

In [None]:
print(label_classification['outcome'].value_counts())

In [None]:
# Stack the three exports. ignore_index=True gives every row a unique label instead of
# repeating each file's own 0..n-1 index.
notes = pd.concat([notes_1, notes_2, notes_3], axis=0, ignore_index=True)
# Keep only History & Physical notes. The explicit copy makes `notes` an independent frame,
# so the column assignments in later cells do not operate on a slice of the concatenated
# frame (which triggers SettingWithCopyWarning).
notes = notes[notes['TYPE'].isin(['H&P'])].copy()

In [None]:
def clean_input(input_text):
    """Normalise the raw text of one clinical note.

    - The XML-escaped line feed ``&#x0A;`` is replaced by a single space. Deleting it
      outright would glue the last word of one line to the first word of the next.
    - Runs of three or more dashes (separator rules) are removed.
    - Leading and trailing whitespace is stripped.

    Args:
        input_text: The note text. ``None`` and NaN (a missing cell read by pandas)
            are treated as an empty note instead of raising a TypeError.

    Returns:
        The cleaned text as a ``str``.
    """
    # A missing cell arrives as None or float NaN; NaN is the only value unequal to itself.
    if input_text is None or (isinstance(input_text, float) and input_text != input_text):
        return ""

    cleaned_text = re.sub(r"&#x0A;", " ", str(input_text))
    cleaned_text = re.sub(r"-{3,}", "", cleaned_text)
    return cleaned_text.strip()

In [None]:
notes['TEXT'] = notes['TEXT'].apply(clean_input)

In [None]:
notes.head(3)

In [None]:
notes['DATE_NEW'] = pd.to_datetime(notes['DATE'])

In [None]:
notes.head(3)

In [None]:
notes['text_length'] = notes['TEXT'].apply(lambda x: len(x))

In [None]:
notes.head(3)

In [None]:
average_length = notes['text_length'].mean()

print(average_length)

In [None]:
# Quartiles of the note length and the inter-quartile range between them.
Q1, Q3 = notes['text_length'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Tukey fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] counts as an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep the rows whose length lies inside the fences (both ends inclusive).
within_fences = notes['text_length'].between(lower_fence, upper_fence)
filtered_notes_for_length = notes[within_fences]

# Mean note length once the outliers are excluded.
average_length_excluding_outliers = filtered_notes_for_length['text_length'].mean()

print("Average length of text (excluding outliers):", average_length_excluding_outliers)

In [None]:
notes_filtered = notes[notes['TEXT'].apply(lambda x: len(x) > 1800)]

In [None]:
notes_filtered = notes_filtered.sort_values(['MRN', 'DATE_NEW'], ascending=[True, False])

In [None]:
# Keep a single note per MRN: the first row of each group. With the preceding sort
# (MRN ascending, DATE_NEW descending) that first row is the most recent note that passed
# the length filter. ' '.join over the one-element head(1) simply yields that note's text.
df_notes = notes_filtered.groupby('MRN').agg({'TEXT': lambda x: ' '.join(x.head(1))}).reset_index()

In [None]:
def generate_summary(sentences, max_chunk_tokens=1024):
    """Generate text for one note with the module-level BART model.

    The note is tokenised once. If it exceeds ``max_chunk_tokens`` tokens it is split
    into consecutive chunks of at most that size, every chunk is passed through
    ``model_bart_large.generate`` separately, and the decoded outputs of ALL chunks
    are joined with a single space. (Previously only the first chunk's output was
    returned and the remaining generations were discarded.)

    Args:
        sentences: The note text to summarise.
        max_chunk_tokens: Maximum number of input tokens per generate call.
            Defaults to 1024, the chunk size this notebook has always used.

    Returns:
        The decoded generated text; for multi-chunk inputs, the per-chunk texts
        joined in document order.

    Note:
        Chunks are cut from the already-tokenised sequence, so only the first chunk
        starts with the tokenizer's start token and only the last one ends with its
        end token. A very short trailing chunk is still forced to ``min_length=100``.
    """
    inputs = tokenizer_bart_large(sentences, return_tensors='pt')
    print('Input shape:', inputs.input_ids.shape)

    # Tensor.split returns a single chunk when the sequence already fits, so short and
    # long inputs share one code path.
    input_id_chunks = inputs.input_ids[0].split(max_chunk_tokens)
    attention_mask_chunks = inputs.attention_mask[0].split(max_chunk_tokens)

    chunk_texts = []
    for chunk_ids, chunk_mask in zip(input_id_chunks, attention_mask_chunks):
        # unsqueeze(0) restores the batch dimension expected by generate().
        generated_ids_chunk = model_bart_large.generate(
            input_ids=chunk_ids.unsqueeze(0),
            attention_mask=chunk_mask.unsqueeze(0),
            num_beams=5,
            min_length=100,
            max_length=300
        )
        # One sequence per call (batch size 1): decode it and keep it.
        chunk_texts.append(
            tokenizer_bart_large.decode(generated_ids_chunk[0].tolist(), skip_special_tokens=True)
        )

    return ' '.join(chunk_texts)

In [None]:
df_notes['Summarised_Text'] = df_notes['TEXT'].apply(generate_summary)

In [None]:
# concatenated_data = pd.read_csv('/content/drive/MyDrive/RA_Medical_DATA/APC_NOTES/concatenated_data.csv')
# df_notes = concatenated_data

In [None]:
df_notes.head(3)

In [None]:
data_classification = pd.merge(df_notes, label_classification, on='MRN', how='inner').dropna()

In [None]:
def remove_na(input_text):
    """Remove stand-alone ``nan`` tokens from a string.

    Only the whole word ``nan`` is removed (case-sensitive). A plain substring
    replacement would also eat the letters out of ordinary words such as
    "pregnant" or "financial".

    Args:
        input_text: The text to clean.

    Returns:
        ``input_text`` with every whole-word ``nan`` deleted; surrounding
        whitespace and punctuation are left untouched.
    """
    return re.sub(r'\bnan\b', '', input_text)

In [None]:
data_classification = pd.merge(data_classification, icd_notes_updated_merged, on='MRN', how='inner').dropna()

In [None]:
data_classification.head(3)

In [None]:
# Target frame: the outcome column, exposed under the name 'labels'.
y_train_classification = data_classification[['outcome']].rename(columns={'outcome': 'labels'})

# Input frame: one 'text' column combining the diagnosis list and the generated summary.
X_train_classification = (
    'Problem: ' + data_classification['problem']
    + '.Summary Report: ' + data_classification['Summarised_Text']
).to_frame(name='text')

In [None]:
data_final_classification = pd.concat([X_train_classification, y_train_classification], axis=1)

In [None]:
train_data, test_data = train_test_split(data_final_classification, test_size=0.25, random_state=42)
train_data = train_data.reset_index(drop = True)
test_data = test_data.reset_index(drop = True)

In [None]:
minority_class = train_data[train_data['labels'] == 0]
majority_class = train_data[train_data['labels'] == 1]

print(len(minority_class), len(majority_class))

In [None]:
# Oversample the minority class (with replacement) up to the size of the majority class.
# The target is derived from the data rather than hard-coded, so the classes stay balanced
# when the input data or the train/test split changes.
oversampled_minority_class = minority_class.sample(n=len(majority_class), replace=True, random_state=42)
print(len(oversampled_minority_class))

In [None]:
balanced_df = pd.concat([majority_class, oversampled_minority_class])

In [None]:
train_data = balanced_df
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
train_data_no_sampling, test_data_no_sampling = train_test_split(data_final_classification, test_size=0.25, random_state=42)
train_data_no_sampling = train_data_no_sampling.reset_index(drop = True)
test_data_no_sampling = test_data_no_sampling.reset_index(drop = True)

In [None]:
def calculate_values(result, model):
  """Print classification metrics derived from a confusion-matrix result dict.

  Args:
    result: Mapping with the counts 'tp', 'tn', 'fp', 'fn' and the value 'auroc'.
    model: Label printed next to every metric.

  Returns:
    None. The metrics are printed.

  A ratio whose denominator is zero (e.g. precision when nothing was predicted
  positive) is reported as nan instead of raising ZeroDivisionError. F1 is 0.0 when
  there are no true positives but some positive labels or predictions exist, and nan
  only when tp, fp and fn are all zero.
  """
  tp, tn, fp, fn = result['tp'], result['tn'], result['fp'], result['fn']

  def _ratio(numerator, denominator):
    # An undefined ratio is reported as nan rather than crashing the cell.
    return numerator / denominator if denominator else float('nan')

  # Calculate Precision
  precision = _ratio(tp, tp + fp)

  # Calculate Recall (Sensitivity)
  recall = _ratio(tp, tp + fn)

  # Calculate F1 Score
  if 2 * tp + fp + fn == 0:
    f1 = float('nan')  # no positive labels and no positive predictions
  elif tp == 0:
    f1 = 0.0  # F1 = 2tp / (2tp + fp + fn) is exactly zero here
  else:
    f1 = 2 * (precision * recall) / (precision + recall)

  # Calculate Accuracy
  accuracy = _ratio(tp + tn, tp + tn + fp + fn)

  # Calculate Specificity
  specificity = _ratio(tn, tn + fp)

  # AUC-ROC is taken as-is from the evaluation result
  auc_roc = result['auroc']

  # Single operating point (false-positive rate, true-positive rate), not a full curve
  roc = {
      'fpr': _ratio(fp, fp + tn),
      'tpr': recall
  }

  # Print the calculated metrics
  print("Precision:",model, ": ", precision)
  print("Recall:",model, ": ", recall)
  print("F1 Score:", model, ": ", f1)
  print("Accuracy:", model, ": ", accuracy)
  print("Sensitivity:", model, ": ", recall)
  print("Specificity:", model, ": ", specificity)
  print("AUC-ROC:", model, ": ", auc_roc)
  print("ROC:", model, ": ", roc)

In [None]:
def draw_plots(predicted_probabilities, true_labels, model):
  """Show a ROC curve and a precision-recall curve for one model.

  Args:
    predicted_probabilities: Score for the positive class of every sample.
    true_labels: Ground-truth binary labels, aligned with the scores.
    model: Model name appended to both figure titles.

  Returns:
    None. Two matplotlib figures are shown.
  """
  auc_score = roc_auc_score(true_labels, predicted_probabilities)
  fpr, tpr, _ = roc_curve(true_labels, predicted_probabilities)

  plt.figure()
  plt.plot(fpr, tpr, label=f"auroc = {auc_score:.2f}")
  plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line representing random classification
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  # Title spelling fixed ("Rceiver" -> "Receiver").
  plt.title('Receiver Operating Characteristic Curve for ' + model)
  plt.legend(loc='lower right')
  plt.show()

  print(" ")

  average_precision = average_precision_score(true_labels, predicted_probabilities)
  precision, recall, _ = precision_recall_curve(true_labels, predicted_probabilities)

  plt.figure()
  plt.plot(recall, precision, label=f"auprc = {average_precision:.2f}")
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.title('Precision-Recall Curve for ' + model)
  plt.legend(loc='upper right')
  plt.show()

In [None]:
def calculate_f1(result):
  """Return the F1 score for a confusion-matrix result dict.

  Args:
    result: Mapping with the counts 'tp', 'fp' and 'fn'.

  Returns:
    The F1 score. It is 0.0 when there are no true positives but at least one
    false positive or false negative, and nan when tp, fp and fn are all zero
    (F1 is undefined). Neither case raises ZeroDivisionError any more.
  """
  tp, fp, fn = result['tp'], result['fp'], result['fn']

  if 2 * tp + fp + fn == 0:
    return float('nan')  # no positive labels and no positive predictions
  if tp == 0:
    return 0.0  # F1 = 2tp / (2tp + fp + fn) is exactly zero here

  # With tp > 0 both denominators below are non-zero.
  precision = tp / (tp + fp)
  recall = tp / (tp + fn)

  f1 = (2 * precision * recall)/(precision + recall)
  return f1

### BERT Base Uncased

In [None]:
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_bert_base_uncased = ClassificationModel(
    "bert", "bert-base-uncased", args=model_args)

In [None]:
# Five consecutive train_model calls on the same model object (model_args sets num_train_epochs=1).
# NOTE(review): presumably each call sets up a fresh optimizer / LR schedule, so this may not be
# equivalent to one 5-epoch run — confirm against the simpletransformers documentation.
for i in range(5):
  model_bert_base_uncased.train_model(train_data)

In [None]:
result, model_outputs, wrong_predictions = model_bert_base_uncased.eval_model(test_data)

In [None]:
result

In [None]:
calculate_values(result, 'bert-base-uncased')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Softmax them so the
# class-1 score used for the curves takes both logits into account; ranking on the class-1
# logit alone can disagree with the AUROC reported in `result`.
shifted_logits = model_outputs - np.max(model_outputs, axis=1, keepdims=True)  # numerical stability
class_probabilities = np.exp(shifted_logits) / np.exp(shifted_logits).sum(axis=1, keepdims=True)
predicted_probabilities_bert = class_probabilities[:, 1]
true_labels_bert = np.array(test_data['labels'].tolist())
draw_plots(predicted_probabilities_bert, true_labels_bert, 'bert-base')

### roberta-base

In [None]:
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_roberta = ClassificationModel(
    "roberta", "roberta-base", args=model_args)

In [None]:
# Five consecutive train_model calls on the same model object (model_args sets num_train_epochs=1).
# NOTE(review): presumably each call sets up a fresh optimizer / LR schedule, so this may not be
# equivalent to one 5-epoch run — confirm against the simpletransformers documentation.
for i in range(5):
  model_roberta.train_model(train_data)

In [None]:
result, model_outputs, wrong_predictions = model_roberta.eval_model(test_data)

In [None]:
result

In [None]:
calculate_values(result, 'roberta-base')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Softmax them so the
# class-1 score used for the curves takes both logits into account; ranking on the class-1
# logit alone can disagree with the AUROC reported in `result`.
shifted_logits = model_outputs - np.max(model_outputs, axis=1, keepdims=True)  # numerical stability
class_probabilities = np.exp(shifted_logits) / np.exp(shifted_logits).sum(axis=1, keepdims=True)
predicted_probabilities_roberta = class_probabilities[:, 1]
true_labels_roberta = np.array(test_data['labels'].tolist())
draw_plots(predicted_probabilities_roberta, true_labels_roberta, 'roberta-base')

### emilyalsentzer/Bio_ClinicalBERT

In [None]:
# model_args = ClassificationArgs(num_train_epochs=5, overwrite_output_dir = True, sliding_window=True)
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_Bio_ClinicalBERT= ClassificationModel(
    "bert", "emilyalsentzer/Bio_ClinicalBERT", args=model_args)

In [None]:
# Five consecutive train_model calls on the same model object (model_args sets num_train_epochs=1).
# NOTE(review): presumably each call sets up a fresh optimizer / LR schedule, so this may not be
# equivalent to one 5-epoch run — confirm against the simpletransformers documentation.
for i in range(5):
  model_Bio_ClinicalBERT.train_model(train_data)

In [None]:
result, model_outputs, wrong_predictions = model_Bio_ClinicalBERT.eval_model(test_data)

In [None]:
result

In [None]:
# Label corrected from the misspelt 'Bio-Bertt' so the printed metrics name the model
# that was evaluated (emilyalsentzer/Bio_ClinicalBERT), matching the plot label below.
calculate_values(result, 'Bio_ClinicalBERT')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Softmax them so the
# class-1 score used for the curves takes both logits into account; ranking on the class-1
# logit alone can disagree with the AUROC reported in `result`.
shifted_logits = model_outputs - np.max(model_outputs, axis=1, keepdims=True)  # numerical stability
class_probabilities = np.exp(shifted_logits) / np.exp(shifted_logits).sum(axis=1, keepdims=True)
predicted_probabilities_biobert = class_probabilities[:, 1]
true_labels_biobert = np.array(test_data['labels'].tolist())
draw_plots(predicted_probabilities_biobert, true_labels_biobert, 'Bio_ClinicalBERT')

### microsoft/BiomedNLP-PubMedBERT

In [None]:
# model_args = ClassificationArgs(num_train_epochs=5, overwrite_output_dir = True, sliding_window=True)
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_microsoft_bio_pubmed =  ClassificationModel(
    "bert", "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", args=model_args)

In [None]:
# Five consecutive train_model calls on the same model object (model_args sets num_train_epochs=1).
# NOTE(review): presumably each call sets up a fresh optimizer / LR schedule, so this may not be
# equivalent to one 5-epoch run — confirm against the simpletransformers documentation.
for i in range(5):
  model_microsoft_bio_pubmed.train_model(train_data)

In [None]:
result, model_outputs, wrong_predictions = model_microsoft_bio_pubmed.eval_model(test_data)

In [None]:
result

In [None]:
calculate_values(result, 'PubMedBert')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Softmax them so the
# class-1 score used for the curves takes both logits into account; ranking on the class-1
# logit alone can disagree with the AUROC reported in `result`.
shifted_logits = model_outputs - np.max(model_outputs, axis=1, keepdims=True)  # numerical stability
class_probabilities = np.exp(shifted_logits) / np.exp(shifted_logits).sum(axis=1, keepdims=True)
predicted_probabilities_pubmed = class_probabilities[:, 1]
true_labels_pubmed = np.array(test_data['labels'].tolist())
draw_plots(predicted_probabilities_pubmed, true_labels_pubmed, 'PubMedBERT')

### Graphs

In [None]:
plt.rcParams['figure.figsize'] = (8, 6)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score, precision_recall_curve
import matplotlib.pyplot as plt

def draw_roc_curve(predicted_probabilities, true_labels, model_name):
    """Add one model's ROC curve to the current matplotlib figure.

    The legend entry carries the model name and its area under the curve.
    The caller is responsible for creating and showing the figure.
    """
    area_under_curve = roc_auc_score(true_labels, predicted_probabilities)
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(
        true_labels, predicted_probabilities
    )
    legend_entry = f"{model_name} (AUC = {area_under_curve:.2f})"

    plt.plot(false_positive_rate, true_positive_rate, label=legend_entry)

def draw_precision_recall_curve(predicted_probabilities, true_labels, model_name):
    """Add one model's precision-recall curve to the current matplotlib figure.

    The legend entry carries the model name and its average precision.
    The caller is responsible for creating and showing the figure.
    """
    mean_precision = average_precision_score(true_labels, predicted_probabilities)
    precision_values, recall_values, _thresholds = precision_recall_curve(
        true_labels, predicted_probabilities
    )
    legend_entry = f"{model_name} (AP = {mean_precision:.2f})"

    plt.plot(recall_values, precision_values, label=legend_entry)

def draw_plots_multiple(predicted_probabilities_list, true_labels_list, model_names):
    """Overlay the ROC curves, then the precision-recall curves, of several models.

    The three arguments are parallel sequences: position ``k`` of each one holds the
    scores, the true labels and the display name of the same model. Two figures are
    shown: a combined ROC plot and a combined precision-recall plot.
    """
    def overlay(draw_curve):
        # Draw one curve per model onto the current figure.
        for position, scores in enumerate(predicted_probabilities_list):
            draw_curve(scores, true_labels_list[position], model_names[position])

    # Figure 1: ROC curves plus the chance diagonal.
    plt.figure()
    overlay(draw_roc_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    tick_positions = np.arange(0, 1.1, 0.1)
    plt.xticks(tick_positions)
    plt.yticks(tick_positions)
    plt.legend(loc='lower right')
    plt.show()

    # Figure 2: precision-recall curves.
    plt.figure()
    print(" ")
    overlay(draw_precision_recall_curve)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    plt.show()

In [None]:
# predicted_probabilities_list = [predicted_probabilities_roberta, predicted_probabilities_bert, predicted_probabilities_biobert, predicted_probabilities_pubmed, predicted_probabilities_longformer]
# true_labels_list = [true_labels_roberta, true_labels_bert, true_labels_biobert, true_labels_pubmed, true_labels_longformer]
# model_names = ["Roberta", "Bert-base-uncase", "Bio_ClinicalBERT", "PubMedBERT", "Longformer"]

# draw_plots_multiple(predicted_probabilities_list, true_labels_list, model_names)

# Parallel lists: position k of each list belongs to the same model.
predicted_probabilities_list = [predicted_probabilities_roberta, predicted_probabilities_bert, predicted_probabilities_biobert, predicted_probabilities_pubmed]
true_labels_list = [true_labels_roberta, true_labels_bert, true_labels_biobert, true_labels_pubmed]
# Legend label corrected: "Bert-base-uncase" -> "Bert-base-uncased".
model_names = ["Roberta", "Bert-base-uncased", "Bio_ClinicalBERT", "PubMedBERT"]

draw_plots_multiple(predicted_probabilities_list, true_labels_list, model_names)

In [None]:
def write_list_to_txt(lst, file_path):
    """Write every item of ``lst`` to ``file_path``, one item per line.

    Args:
        lst: Iterable of items; each is written via ``str()``.
        file_path: Destination path. An existing file is overwritten.

    Array-like items (anything with a ``tolist`` method and at least one dimension,
    e.g. numpy arrays) are converted to plain Python lists first. ``str()`` of a
    numpy array wraps long arrays over several lines and replaces the middle of
    large arrays with ``...``, which breaks the one-item-per-line format and
    silently loses data. Scalars and strings are written exactly as before.
    """
    with open(file_path, 'w') as file:
        for item in lst:
            if hasattr(item, 'tolist') and getattr(item, 'ndim', 0) > 0:
                item = item.tolist()
            file.write(str(item) + '\n')

In [None]:
write_list_to_txt(predicted_probabilities_list, "predicted_probabilities_list")
write_list_to_txt(true_labels_list, "true_labels_list")
write_list_to_txt(model_names, "model_names")

In [None]:
def read_txt_to_list(file_path):
    """Return the lines of ``file_path`` as a list of whitespace-stripped strings.

    Every line of the file yields one entry, so blank lines come back as
    empty strings. Nothing is parsed: all entries are ``str``.
    """
    with open(file_path, 'r') as handle:
        stripped_lines = []
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

In [None]:
predicted_probabilities_list_1 = read_txt_to_list("predicted_probabilities_list")
true_labels_list_1 = read_txt_to_list("true_labels_list")
model_names_1 = read_txt_to_list("model_names")