In [None]:
!pip install openai
!pip install tiktoken
!pip install transformers
!pip install simpletransformers

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import tiktoken
import openai
import xml.etree.ElementTree as ET
import re
import os
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
import random
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Seed Python's `random` module; it is the generator behind the later random.shuffle call.
seed_value = 42
random.seed(seed_value)

### OpenAI data generation

In [None]:
# Credentials come from the environment variables 'organisation' and 'Key';
# os.environ.get yields None for a variable that is not set.
openai.organization = os.environ.get('organisation')
openai.api_key = os.environ.get('Key')
# openai.Model.list()

In [None]:
# Prompt requesting 100 synthetic examples that describe homelessness.
# Each example is asked to start with 'Determinant Example'; extract_determinant_notes
# later removes that prefix by fixed character offsets.
command_homelessness = """I'm creating a dataset to train an NLP model which can identify 'Social determinants of Health'
when given a medical report of a patient. The determinant which I want to focus on is 'Homelessness'. Give me 100
examples which each at least have 1 to 2 sentences related to the determinant 'Homelessness'.   Limit the use of the
phrase 'Homelessness' when possible. Just give the text, don't include the patient name and other information.
Each example should start with keyword  'Determinant Example'.
"""

In [None]:
# Prompt requesting 100 synthetic examples that describe food insecurity.
# Each example is asked to start with 'Determinant Example'.
command_food_insecurity = """I'm creating a dataset to train an NLP model which can identify 'Social determinants of Health'
when given a medical report of a patient. The determinant which I want to focus on is 'Food Insecurity'. Give me 100
examples which each at least have 1 to 2 sentences related to the determinant 'Food Insecurity'.   Limit the use of the
phrase 'Food Insecurity' when possible. Just give the text, don't include the patient name and other information.
Each example should start with keyword  'Determinant Example'.
"""

In [None]:
# Prompt requesting 100 synthetic examples that describe domestic violence.
# Each example is asked to start with 'Determinant Example'.
command_domestic_violence = """I'm creating a dataset to train an NLP model which can identify 'Social determinants of Health'
when given a medical report of a patient. The determinant which I want to focus on is 'Domestic Violence'. Give me 100
examples which each at least have 1 to 2 sentences related to the determinant 'Domestic Violence'.   Limit the use of the
phrase 'Domestic Violence' when possible. Just give the text, don't include the patient name and other information.
Each example should start with keyword  'Determinant Example'.
"""

In [None]:
# Prompt requesting 100 synthetic examples of patients who do NOT have food-insecurity issues
# (the negated counterpart of command_food_insecurity).
command_not_food_insecurty = """I'm creating a dataset to train an NLP model which can identify 'Social determinants of
Health' when given a medical report of a patient. The determinant which I want to focus on is 'Food Insecurity'. Give me
100 examples which each at least have 1 to 2 sentences related to the patient not having issues with 'Food Insecurity'.
Limit the use of the phrase 'Food Insecurity' when possible. Just give the text, don't include the patient name and
other information. Each example should start with keyword  'Determinant Example'.
"""

In [None]:
# Prompt requesting 100 synthetic examples of patients who do NOT have homelessness issues
# (the negated counterpart of command_homelessness).
command_not_homelessness = """I'm creating a dataset to train an NLP model which can identify 'Social determinants of
Health' when given a medical report of a patient. The determinant which I want to focus on is 'Homelessness'. Give me
100 examples which each at least have 1 to 2 sentences related to the patient not having issues with 'Homelessness'.
Limit the use of the phrase 'Homelessness' when possible. Just give the text, don't include the patient name and
other information. Each example should start with keyword  'Determinant Example'.
"""

In [None]:
# Prompt requesting 100 synthetic examples of patients who do NOT have domestic-violence issues
# (the negated counterpart of command_domestic_violence).
command_not_domestic_violence = """I'm creating a dataset to train an NLP model which can identify 'Social determinants of
Health' when given a medical report of a patient. The determinant which I want to focus on is 'Domestic Violence'. Give me
100 examples which each at least have 1 to 2 sentences related to the patient not having issues with 'Domestic Violence'.
Limit the use of the phrase 'Domestic Violence' when possible. Just give the text, don't include the patient name and
other information. Each example should start with keyword  'Determinant Example'.
"""

In [None]:
def generate_data(command, keyword_determinant, num_files=1):
  """Request synthetic examples from the chat model and save each reply to a text file.

  Args:
    command: Prompt text; it is sent as a single message with role "system".
    keyword_determinant: File-name stem. Reply number ``i`` is written to
      ``<keyword_determinant><i>.txt`` in the current working directory.
    num_files: How many independent requests (and therefore files) to make.
      The default of 1 reproduces the previously hard-coded behaviour.

  Returns:
    None. Files that already exist under the same name are overwritten.
  """
  message = [{"role": "system", "content": command}]
  for i in range(num_files):
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=message)
    file_name = keyword_determinant + str(i) + '.txt'
    with open(file_name, "w") as file:
      # Only the text of the first returned choice is stored.
      file.write(response['choices'][0]['message']['content'])

In [None]:
# Write one reply file per prompt into /content/ (file names are '<second argument>0.txt').
# NOTE(review): the last call produces 'NOT Domestic Violence0.txt', whose name also contains
# 'Domestic Violence', while the extraction step later searches a different directory for
# 'Violence Absent' - confirm how the generated files are renamed/moved before extraction.
%cd /content/
generate_data(command_homelessness, 'Homelessness')
generate_data(command_food_insecurity, 'Food Insecurity')
generate_data(command_not_food_insecurty, 'Food Abundance')
generate_data(command_not_homelessness, 'Housing Avaialble')
generate_data(command_domestic_violence, 'Domestic Violence')
generate_data(command_not_domestic_violence, 'NOT Domestic Violence')

### Data pre-processing

In [None]:
def read_data(file_name):
  """Parse the XML file at `file_name` and return the text of the root's first child element."""
  document_root = ET.parse(file_name).getroot()
  first_child = document_root[0]
  return first_child.text

In [None]:
def pre_process_data(file_content):
  """Collapse a raw note into period-separated, whitespace-trimmed sentences.

  Blank lines are turned into sentence breaks, remaining newlines and
  apostrophes are removed, tabs become single spaces, and fragments that are
  empty after trimming are discarded before re-joining with '.'.
  """
  flattened = (
      file_content
      .replace('\n\n', '.')
      .replace('\n', '')
      .replace("\'", "")
      .replace("\t", " ")
  )
  sentences = []
  for fragment in flattened.split('.'):
    trimmed = fragment.strip()
    if trimmed:
      sentences.append(trimmed)
  return '.'.join(sentences)

In [None]:
# `data` maps the integer parsed from the start of each file name to the list of cleaned notes
# loaded for it (filled by the two loading cells below).
data = {}
# NOTE(review): `combined_data` is not referenced in the cells visible here - confirm it is still needed.
combined_data = {}

In [None]:
%cd /i2b/training-PHI-Gold-Set1

file_list = os.listdir()
for file_name in file_list:
  file_content = read_data(file_name)
  cleaned_data = pre_process_data(file_content)

  key = int(file_name.split('-')[0])
  if key in data:
    value = data[key]
    value.append(cleaned_data)
    data[key] = value
  else:
    data[key] = [cleaned_data]

In [None]:
%cd /i2b/training-PHI-Gold-Set2

file_list = os.listdir()
for file_name in file_list:
  file_content = read_data(file_name)
  cleaned_data = pre_process_data(file_content)

  key = int(file_name.split('-')[0])
  if key in data:
    value = data[key]
    value.append(cleaned_data)
    data[key] = value
  else:
    data[key] = [cleaned_data]

In [None]:
# Medical Notes

# Flatten the per-key note lists into one list, preserving the dictionary's key order
# and the order of notes within each key; `count` is the number of notes collected.
medical_notes = [note for notes in data.values() for note in notes]
count = len(medical_notes)

In [None]:
def extract_determinant_notes(keyword):
  """Collect example texts from every file in the current directory whose name contains `keyword`.

  Four specifically named files are split on single newlines; every other file
  is split on blank lines. A fixed-length prefix is then cut from each piece:
  21 characters for the 'Food Abundance' / 'Violence Absent' keywords, otherwise
  22 characters plus the digit count of the piece's 1-based position in its file.

  Returns:
    A flat list of the trimmed pieces, in directory-listing order.
  """
  single_newline_files = (
      'Food Abundance0.txt',
      'Food Insecurity updated0.txt',
      'Domestic Violence2.txt',
      'Violence Absent.txt',
  )
  unnumbered = keyword in ('Food Abundance', 'Violence Absent')

  determinant = []
  for file_name in os.listdir():
    if keyword not in file_name:
      continue
    with open(file_name, 'r') as file:
      content = file.read()
    separator = '\n' if file_name in single_newline_files else '\n\n'
    for position, example in enumerate(content.split(separator), start=1):
      prefix_length = 21 if unnumbered else 22 + len(str(position))
      determinant.append(example[prefix_length:])

  return determinant

In [None]:
# Read the generated example files from the training-data directory.
%cd /i2b/Training Data

# Determinants

# The keyword is matched as a substring of each file name, so the shortened
# 'Homelessnes' still selects files whose names contain 'Homelessness'.
food_insecurity_determinant = extract_determinant_notes('Food Insecurity')
homeless_determinant = extract_determinant_notes('Homelessnes')
not_food_insecurity_determinant = extract_determinant_notes('Food Abundance')
not_homeless_determinant = extract_determinant_notes('Housing Avaialble')
domestic_violence_determinant = extract_determinant_notes('Domestic Violence')
not_domestic_violence_determinant = extract_determinant_notes('Violence Absent')

# Return to the Colab working directory.
%cd /content/

In [None]:
# Stable reordering of the negated food-insecurity examples: those that still mention
# the phrase 'food insecurity' come first, the rest follow, each group in original order.
# (Presumably so that the later [:40] truncation favours the explicit mentions.)
ind = [
    i for i, text in enumerate(not_food_insecurity_determinant)
    if 'food insecurity' in text.lower()
]
mentioning = set(ind)
new_not_food_insecurity_determinant = [not_food_insecurity_determinant[i] for i in ind]
new_not_food_insecurity_determinant += [
    text for i, text in enumerate(not_food_insecurity_determinant)
    if i not in mentioning
]

# The reordered list must replace the food-insecurity negatives. It was previously
# assigned to not_homeless_determinant, which overwrote the homelessness negatives
# with food examples and left this list unordered.
not_food_insecurity_determinant = new_not_food_insecurity_determinant

In [None]:
# Stable reordering: examples containing 'homelessness' (case-insensitive) first,
# all remaining examples after them, both groups in their original order.
ind = [
    i for i in range(len(not_homeless_determinant))
    if 'homelessness' in not_homeless_determinant[i].lower()
]
new_not_homeless_determinant = [not_homeless_determinant[i] for i in ind]
new_not_homeless_determinant += [
    text for i, text in enumerate(not_homeless_determinant) if i not in ind
]

not_homeless_determinant = new_not_homeless_determinant

In [None]:
# Stable reordering: examples containing 'violence' (case-insensitive) first,
# all remaining examples after them, both groups in their original order.
ind = [
    i for i in range(len(not_domestic_violence_determinant))
    if 'violence' in not_domestic_violence_determinant[i].lower()
]
new_not_domestic_violence_determinant = [not_domestic_violence_determinant[i] for i in ind]
new_not_domestic_violence_determinant += [
    text for i, text in enumerate(not_domestic_violence_determinant) if i not in ind
]

not_domestic_violence_determinant = new_not_domestic_violence_determinant

In [None]:
# Keep at most the first 40 negated examples for each determinant.
# NOTE(review): 40 is a hard-coded cap - presumably chosen for class balance; confirm.
not_homeless_determinant = not_homeless_determinant[:40]
not_food_insecurity_determinant = not_food_insecurity_determinant[:40]
not_domestic_violence_determinant = not_domestic_violence_determinant[:40]

In [None]:
# Histogram of space-separated word counts for the homelessness examples.
index_list = [len(text.split(' ')) for text in homeless_determinant]

fig, ax = plt.subplots()
ax.hist(index_list, bins=10, color='blue', edgecolor='black')
ax.set_xlabel('Tokenised length')
ax.set_ylabel('Frequency')
ax.set_title('Histogram - Homelessness')
plt.show()

In [None]:
# Histogram of space-separated word counts for the food-insecurity examples.
index_list = [len(text.split(' ')) for text in food_insecurity_determinant]

fig, ax = plt.subplots()
ax.hist(index_list, bins=10, color='blue', edgecolor='black')
ax.set_xlabel('Tokenised length')
ax.set_ylabel('Frequency')
ax.set_title('Histogram - Food Insecurity')
plt.show()

In [None]:
# Histogram of space-separated word counts for the negated food-insecurity examples.
index_list = [len(text.split(' ')) for text in not_food_insecurity_determinant]

plt.hist(index_list, bins=10, color='blue', edgecolor='black')
plt.xlabel('Tokenised length')
plt.ylabel('Frequency')
# Title now names the negated set; it previously repeated the positive set's title.
plt.title('Histogram - Not Food Insecurity')
plt.show()

In [None]:
# Histogram of space-separated word counts for the negated homelessness examples.
index_list = [len(text.split(' ')) for text in not_homeless_determinant]

plt.hist(index_list, bins=10, color='blue', edgecolor='black')
plt.xlabel('Tokenised length')
plt.ylabel('Frequency')
# Title now matches the plotted data; it previously read 'Food Insecurity'.
plt.title('Histogram - Not Homelessness')
plt.show()

In [None]:
# Let's take the default length of the determinant text to be inserted as 27.5 words (whitespace-separated tokens).

In [None]:
# Re-seed right before shuffling so the permutation depends only on the seed
# and on the current order of medical_notes.
# NOTE(review): random.shuffle works in place, so re-running this cell shuffles the
# already-shuffled list and yields a different order than a single run.
seed_value = 42
random.seed(seed_value)

random.shuffle(medical_notes)

In [None]:
def _window_bounds(sentences, max_words):
  """Return (head_end, tail_start): where the leading window ends and the trailing one begins."""
  total = len(sentences)

  # Leading window: sentences up to and including the one on which the running
  # word count first exceeds max_words; the whole note if it never does.
  head_end = total
  running = 0
  for idx, sentence in enumerate(sentences):
    running += len(sentence.split(' '))
    if running > max_words:
      head_end = idx + 1
      break

  # Trailing window: same rule, walking backwards from the last sentence.
  tail_start = 0
  running = 0
  for idx in range(total - 1, -1, -1):
    running += len(sentences[idx].split(' '))
    if running > max_words:
      tail_start = idx
      break

  # Clamp so the trailing window never reaches back into the leading one;
  # otherwise mid-length notes would repeat their middle sentences.
  return head_end, max(tail_start, head_end)


def _insert_text_into_note(note, text, max_words):
  """Return `note` cut to leading/trailing windows with `text` (last character dropped) in between."""
  sentences = note.split('.')
  head_end, tail_start = _window_bounds(sentences, max_words)
  # text[:-1] removes the final character of the example (the join below re-adds a '.').
  pieces = sentences[:head_end] + [text[:-1]] + sentences[tail_start:]
  return '.'.join(pieces)


def medical_notes_and_combine_determinant(determinant, not_determinant, medical_notes, max_words=242):
  """Embed each determinant / negated-determinant text inside its own medical note.

  Example ``i`` of `determinant` is paired with ``medical_notes[i]``; example ``j``
  of `not_determinant` with ``medical_notes[len(determinant) + j]``. Each note is
  split on '.', reduced to a leading and a trailing window of roughly `max_words`
  space-separated words (each window ends on the sentence that crosses the limit),
  and the example text is placed between the two windows.

  Notes too short to fill both windows are kept without loss or repetition:
  the leading window takes what it can, the trailing window only what is left.
  Previously a note of at most `max_words` words was dropped entirely and a
  mid-length note had its middle sentences duplicated.

  Args:
    determinant: Texts describing the determinant.
    not_determinant: Texts describing its absence.
    medical_notes: Notes to embed the texts in; must hold at least
      ``len(determinant) + len(not_determinant)`` items (IndexError otherwise).
    max_words: Word budget per window. The default 242 is the original constant
      (presumably two windows plus a ~27.5-word insert fit a 512 limit - confirm).

  Returns:
    List of combined notes: determinant examples first, then negated ones.
  """
  texts = list(determinant) + list(not_determinant)
  return [
      _insert_text_into_note(medical_notes[position], text, max_words)
      for position, text in enumerate(texts)
  ]

In [None]:
# Embed the food-insecurity examples (then the negated ones) into the leading medical notes.
# The pairing starts at medical_notes[0] on every call, so all three determinants reuse the same notes.
combined_medical_notes_food_insecurity = medical_notes_and_combine_determinant(food_insecurity_determinant, not_food_insecurity_determinant, medical_notes)

In [None]:
# Embed the homelessness examples (then the negated ones) into the leading medical notes.
combined_medical_notes_homelessness = medical_notes_and_combine_determinant(homeless_determinant, not_homeless_determinant, medical_notes)

In [None]:
# Embed the domestic-violence examples (then the negated ones) into the leading medical notes.
combined_medical_notes_domestic_violence = medical_notes_and_combine_determinant(domestic_violence_determinant, not_domestic_violence_determinant, medical_notes)

In [None]:
def get_x_and_y(combined_notes, medical_notes, count_determinant, count_no_determinant):
  """Assemble texts and labels for one determinant.

  The first `count_determinant` combined notes get label 1, the next
  `count_no_determinant` combined notes get label 2, and every original
  medical note from that combined offset onwards gets label 0.

  Returns:
    (X, y): list of texts and the parallel list of integer labels.
  """
  boundary = count_determinant + count_no_determinant

  positives = [combined_notes[idx] for idx in range(count_determinant)]
  negated = [combined_notes[idx] for idx in range(count_determinant, boundary)]
  untouched = [medical_notes[idx] for idx in range(boundary, len(medical_notes))]

  X = positives + negated + untouched
  y = [1] * len(positives) + [2] * len(negated) + [0] * len(untouched)
  return X, y

In [None]:
# Build (texts, labels) per determinant: 1 = determinant inserted, 2 = negated text inserted,
# 0 = medical note left unchanged.
X_fi, y_fi = get_x_and_y(combined_medical_notes_food_insecurity, medical_notes, len(food_insecurity_determinant), len(not_food_insecurity_determinant))
X_h, y_h = get_x_and_y(combined_medical_notes_homelessness, medical_notes, len(homeless_determinant), len(not_homeless_determinant))
X_dv, y_dv = get_x_and_y(combined_medical_notes_domestic_violence, medical_notes, len(domestic_violence_determinant), len(not_domestic_violence_determinant))

In [None]:
# Sanity check: print how many examples carry each label in the domestic-violence set.
# Note that the loop variable `count` rebinds the module-level `count` set while collecting notes.
from collections import Counter
element_count = Counter(y_dv)
for element, count in element_count.items():
    print(f"Element {element} appears {count} times")

In [None]:
# 75/25 train/test split per determinant with a fixed random_state.
# Labels are still 0/1/2 at this point and no `stratify` argument is passed.
train_X_fi, test_x_fi, train_y_fi, test_y_fi = train_test_split(X_fi, y_fi, test_size=0.25, random_state=42)
train_X_h, test_x_h, train_y_h, test_y_h = train_test_split(X_h, y_h, test_size=0.25, random_state=42)
train_X_dv, test_x_dv, train_y_dv, test_y_dv = train_test_split(X_dv, y_dv, test_size=0.25, random_state=42)

In [None]:
# Food insecurity: wrap the splits in 'text'/'labels' frames, shuffle rows with a fixed
# seed, then fold label 2 (negated text inserted) into 0 so the task becomes binary.
train_data_fi = dict(text=train_X_fi, labels=train_y_fi)
test_data_fi = dict(text=test_x_fi, labels=test_y_fi)

df_train_data_fi = (
    pd.DataFrame(train_data_fi)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_test_data_fi = (
    pd.DataFrame(test_data_fi)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

for frame in (df_train_data_fi, df_test_data_fi):
  frame['labels'] = frame['labels'].replace(2, 0)

In [None]:
# Homelessness: wrap the splits in 'text'/'labels' frames, shuffle rows with a fixed
# seed, then fold label 2 (negated text inserted) into 0 so the task becomes binary.
train_data_h = dict(text=train_X_h, labels=train_y_h)
test_data_h = dict(text=test_x_h, labels=test_y_h)

df_train_data_h = (
    pd.DataFrame(train_data_h)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_test_data_h = (
    pd.DataFrame(test_data_h)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

for frame in (df_train_data_h, df_test_data_h):
  frame['labels'] = frame['labels'].replace(2, 0)

In [None]:
# Domestic violence: wrap the splits in 'text'/'labels' frames, shuffle rows with a fixed
# seed, then fold label 2 (negated text inserted) into 0 so the task becomes binary.
train_data_dv = dict(text=train_X_dv, labels=train_y_dv)
test_data_dv = dict(text=test_x_dv, labels=test_y_dv)

df_train_data_dv = (
    pd.DataFrame(train_data_dv)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_test_data_dv = (
    pd.DataFrame(test_data_dv)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

for frame in (df_train_data_dv, df_test_data_dv):
  frame['labels'] = frame['labels'].replace(2, 0)

## Modeling

In [None]:
def calculate_values(result, model):
  """Print binary-classification metrics derived from confusion-matrix counts.

  Args:
    result: Mapping with the keys 'tp', 'fp', 'tn', 'fn' (confusion counts) and
      'auroc' (printed unchanged).
    model: Label inserted into every printed line.

  Any ratio whose denominator is zero (for example precision when nothing was
  predicted positive) is reported as 0.0 rather than raising ZeroDivisionError
  or yielding nan.

  Returns:
    None; the metrics are only printed.
  """
  def _ratio(numerator, denominator):
    # Guarded division: undefined ratios are reported as 0.0.
    return numerator / denominator if denominator else 0.0

  tp, fp, tn, fn = result['tp'], result['fp'], result['tn'], result['fn']

  precision = _ratio(tp, tp + fp)
  # Recall is also printed as "Sensitivity" below.
  recall = _ratio(tp, tp + fn)
  f1 = _ratio(2 * (precision * recall), precision + recall)
  accuracy = _ratio(tp + tn, tp + tn + fp + fn)
  specificity = _ratio(tn, tn + fp)
  auc_roc = result['auroc']

  # Single operating point (false-positive rate, true-positive rate), not a full curve.
  roc = {
      'fpr': _ratio(fp, fp + tn),
      'tpr': recall
  }

  # Print the calculated metrics
  print("Precision:",model, ": ", precision)
  print("Recall:",model, ": ", recall)
  print("F1 Score:", model, ": ", f1)
  print("Accuracy:", model, ": ", accuracy)
  print("Sensitivity:", model, ": ", recall)
  print("Specificity:", model, ": ", specificity)
  print("AUC-ROC:", model, ": ", auc_roc)
  print("ROC:", model, ": ", roc)

In [None]:
def draw_plots(predicted_probabilities, true_labels, model):
  """Show the ROC curve and the precision-recall curve for one model.

  Args:
    predicted_probabilities: Score for the positive class, one per example.
    true_labels: Ground-truth labels, one per example.
    model: Model name appended to both plot titles.

  Each plot carries its summary score (AUROC / average precision) in the legend.
  """
  auc_score = roc_auc_score(true_labels, predicted_probabilities)
  fpr, tpr, _ = roc_curve(true_labels, predicted_probabilities)

  plt.figure()
  plt.plot(fpr, tpr, label=f"auroc = {auc_score:.2f}")
  plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line representing random classification
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  # Title spelling corrected ('Rceiver' -> 'Receiver').
  plt.title('Receiver Operating Characteristic Curve for ' + model)
  plt.legend(loc='lower right')
  plt.show()

  # Blank line separating the two figures in the cell output.
  print(" ")

  average_precision = average_precision_score(true_labels, predicted_probabilities)
  precision, recall, _ = precision_recall_curve(true_labels, predicted_probabilities)

  plt.figure()
  plt.plot(recall, precision, label=f"auprc = {average_precision:.2f}")
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.title('Precision-Recall Curve for ' + model)
  plt.legend(loc='upper right')
  plt.show()

### Bert-Base-Uncased: Food Insecurity

In [None]:
df_train_data_fi.shape

In [None]:
# Build a classifier from the pretrained 'bert-base-uncased' checkpoint
# (num_train_epochs=1, max_seq_length=512, output directory overwritten on each run).
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_bert_base_uncased = ClassificationModel(
    "bert", "bert-base-uncased", args=model_args)

In [None]:
# Five passes over the food-insecurity training set, one train_model call per pass
# (the model is configured with num_train_epochs=1).
# NOTE(review): each train_model call presumably restarts optimizer/scheduler state -
# confirm this is intended rather than a single call with num_train_epochs=5.
for i in range(5):
  print("Epoch: ", i)
  model_bert_base_uncased.train_model(df_train_data_fi)

In [None]:
# Evaluate on the held-out food-insecurity split; `result` feeds calculate_values and
# `model_outputs` is later indexed as a two-column array.
result, model_outputs, wrong_predictions = model_bert_base_uncased.eval_model(df_test_data_fi)

In [None]:
result

In [None]:
calculate_values(result, 'bert-base-uncased')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Convert them with a
# numerically stable softmax and keep the positive-class column, so the ROC/PR curves are
# ranked by the actual class-1 probability instead of the class-1 logit alone.
_logits = np.asarray(model_outputs, dtype=float)
_exp = np.exp(_logits - _logits.max(axis=1, keepdims=True))
predicted_probabilities_bert_i2b2_fi = (_exp / _exp.sum(axis=1, keepdims=True))[:, 1]
true_labels_bert_i2b2_fi = np.array(df_test_data_fi['labels'].tolist())

In [None]:
draw_plots(predicted_probabilities_bert_i2b2_fi, true_labels_bert_i2b2_fi, 'bert-base-uncased')

### Roberta: Food Insecurity

In [None]:
# Build a classifier from the pretrained 'roberta-base' checkpoint
# (num_train_epochs=1, max_seq_length=512, output directory overwritten on each run).
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_roberta = ClassificationModel(
    "roberta", "roberta-base", args=model_args)

In [None]:
# Five passes over the food-insecurity training set, one train_model call per pass.
# NOTE(review): each train_model call presumably restarts optimizer/scheduler state -
# confirm this is intended rather than a single call with num_train_epochs=5.
for i in range(5):
  print('Epoch: ', i)
  model_roberta.train_model(df_train_data_fi)

In [None]:
# Evaluate RoBERTa on the held-out food-insecurity split (overwrites `result` / `model_outputs`).
result, model_outputs, wrong_predictions = model_roberta.eval_model(df_test_data_fi)

In [None]:
result

In [None]:
calculate_values(result, 'roberta-base')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Convert them with a
# numerically stable softmax and keep the positive-class column, so the ROC/PR curves are
# ranked by the actual class-1 probability instead of the class-1 logit alone.
_logits = np.asarray(model_outputs, dtype=float)
_exp = np.exp(_logits - _logits.max(axis=1, keepdims=True))
predicted_probabilities_roberta_i2b2_fi = (_exp / _exp.sum(axis=1, keepdims=True))[:, 1]
true_labels_roberta_i2b2_fi = np.array(df_test_data_fi['labels'].tolist())
draw_plots(predicted_probabilities_roberta_i2b2_fi, true_labels_roberta_i2b2_fi, 'roberta-base')

### Bert-Base-Uncased: Homelessness

In [None]:
# Re-create the classifier from the pretrained 'bert-base-uncased' checkpoint for the
# homelessness task; this rebinds model_bert_base_uncased, replacing the earlier model object.
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_bert_base_uncased = ClassificationModel(
    "bert", "bert-base-uncased", args=model_args)

In [None]:
# Five passes over the homelessness training set, one train_model call per pass.
# NOTE(review): each train_model call presumably restarts optimizer/scheduler state -
# confirm this is intended rather than a single call with num_train_epochs=5.
for i in range(5):
  print('Epoch: ', i)
  model_bert_base_uncased.train_model(df_train_data_h)

In [None]:
# Evaluate BERT on the held-out homelessness split (overwrites `result` / `model_outputs`).
result, model_outputs, wrong_predictions = model_bert_base_uncased.eval_model(df_test_data_h)

In [None]:
result

In [None]:
calculate_values(result, 'bert-base-uncased')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Convert them with a
# numerically stable softmax and keep the positive-class column, so the ROC/PR curves are
# ranked by the actual class-1 probability instead of the class-1 logit alone.
_logits = np.asarray(model_outputs, dtype=float)
_exp = np.exp(_logits - _logits.max(axis=1, keepdims=True))
predicted_probabilities_bert_i2b2_h = (_exp / _exp.sum(axis=1, keepdims=True))[:, 1]
true_labels_bert_i2b2_h = np.array(df_test_data_h['labels'].tolist())
draw_plots(predicted_probabilities_bert_i2b2_h, true_labels_bert_i2b2_h, 'bert-base-uncased')

### Roberta: Homelessness

In [None]:
# Re-create the classifier from the pretrained 'roberta-base' checkpoint for the
# homelessness task; this rebinds model_roberta, replacing the earlier model object.
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_roberta = ClassificationModel(
    "roberta", "roberta-base", args=model_args)

In [None]:
# Five passes over the homelessness training set, one train_model call per pass.
# NOTE(review): each train_model call presumably restarts optimizer/scheduler state -
# confirm this is intended rather than a single call with num_train_epochs=5.
for i in range(5):
  print('Epoch: ', i)
  model_roberta.train_model(df_train_data_h)

In [None]:
# Evaluate RoBERTa on the held-out homelessness split (overwrites `result` / `model_outputs`).
result, model_outputs, wrong_predictions = model_roberta.eval_model(df_test_data_h)

In [None]:
result

In [None]:
calculate_values(result, 'roberta-base')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Convert them with a
# numerically stable softmax and keep the positive-class column, so the ROC/PR curves are
# ranked by the actual class-1 probability instead of the class-1 logit alone.
_logits = np.asarray(model_outputs, dtype=float)
_exp = np.exp(_logits - _logits.max(axis=1, keepdims=True))
predicted_probabilities_roberta_i2b2_h = (_exp / _exp.sum(axis=1, keepdims=True))[:, 1]
true_labels_roberta_i2b2_h = np.array(df_test_data_h['labels'].tolist())
draw_plots(predicted_probabilities_roberta_i2b2_h, true_labels_roberta_i2b2_h, 'roberta-base')

### Bert-Base-Uncased: Domestic Violence

In [None]:
# Re-create the classifier from the pretrained 'bert-base-uncased' checkpoint for the
# domestic-violence task; this rebinds model_bert_base_uncased, replacing the earlier model object.
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_bert_base_uncased = ClassificationModel(
    "bert", "bert-base-uncased", args=model_args)

In [None]:
# Five passes over the domestic-violence training set, one train_model call per pass.
# NOTE(review): each train_model call presumably restarts optimizer/scheduler state -
# confirm this is intended rather than a single call with num_train_epochs=5.
for i in range(5):
  print('Epoch: ', i)
  model_bert_base_uncased.train_model(df_train_data_dv)

In [None]:
# Evaluate BERT on the held-out domestic-violence split (overwrites `result` / `model_outputs`).
result, model_outputs, wrong_predictions = model_bert_base_uncased.eval_model(df_test_data_dv)

In [None]:
result

In [None]:
calculate_values(result, 'bert-base-uncased')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Convert them with a
# numerically stable softmax and keep the positive-class column, so the ROC/PR curves are
# ranked by the actual class-1 probability instead of the class-1 logit alone.
_logits = np.asarray(model_outputs, dtype=float)
_exp = np.exp(_logits - _logits.max(axis=1, keepdims=True))
predicted_probabilities_bert_i2b2_dv = (_exp / _exp.sum(axis=1, keepdims=True))[:, 1]
true_labels_bert_i2b2_dv = np.array(df_test_data_dv['labels'].tolist())
draw_plots(predicted_probabilities_bert_i2b2_dv, true_labels_bert_i2b2_dv, 'bert-base-uncased')

### Roberta: Domestic Violence

In [None]:
# Re-create the classifier from the pretrained 'roberta-base' checkpoint for the
# domestic-violence task; this rebinds model_roberta, replacing the earlier model object.
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir = True, max_seq_length=512)

model_roberta = ClassificationModel(
    "roberta", "roberta-base", args=model_args)

In [None]:
# Five passes over the domestic-violence training set, one train_model call per pass.
# NOTE(review): each train_model call presumably restarts optimizer/scheduler state -
# confirm this is intended rather than a single call with num_train_epochs=5.
for i in range(5):
  print('Epoch: ', i)
  model_roberta.train_model(df_train_data_dv)

In [None]:
# Evaluate RoBERTa on the held-out domestic-violence split (overwrites `result` / `model_outputs`).
result, model_outputs, wrong_predictions = model_roberta.eval_model(df_test_data_dv)

In [None]:
result

In [None]:
calculate_values(result, 'roberta-base')

In [None]:
# eval_model returns raw per-class scores (logits), not probabilities. Convert them with a
# numerically stable softmax and keep the positive-class column, so the ROC/PR curves are
# ranked by the actual class-1 probability instead of the class-1 logit alone.
_logits = np.asarray(model_outputs, dtype=float)
_exp = np.exp(_logits - _logits.max(axis=1, keepdims=True))
predicted_probabilities_roberta_i2b2_dv = (_exp / _exp.sum(axis=1, keepdims=True))[:, 1]
true_labels_roberta_i2b2_dv = np.array(df_test_data_dv['labels'].tolist())
draw_plots(predicted_probabilities_roberta_i2b2_dv, true_labels_roberta_i2b2_dv, 'roberta-base')