In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import os


In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so that files
# under /content/drive/MyDrive are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!unzip "/content/drive/MyDrive/NLP/model_output.zip" -d "/content/model_output" > /dev/null 2>&1
!unzip "/content/combined_text.zip" -d "/content/combined_text" > /dev/null 2>&1

In [None]:
def load_model_and_tokenizer(model_path):
    """Load a GPT-2 LM-head model and its tokenizer from the same location.

    Args:
        model_path: Value passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is loaded first.
    """
    return (
        GPT2LMHeadModel.from_pretrained(model_path),
        GPT2Tokenizer.from_pretrained(model_path),
    )

In [None]:
def generate_text(model, tokenizer, sequence, max_length=10):
    """Continue ``sequence`` with the model and return the decoded text.

    Args:
        model: Object exposing ``generate(ids, **kwargs)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        sequence: Prompt text to encode and feed to the model.
        max_length: Added to the prompt's token count to form the
            ``max_length`` limit handed to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    prompt_ids = tokenizer.encode(sequence, return_tensors='pt')

    # NOTE(review): top_k/top_p are passed without any sampling flag; confirm
    # whether sampling was intended here.
    generation_kwargs = dict(
        max_length=max_length + len(prompt_ids[0]),
        pad_token_id=tokenizer.eos_token_id,  # the EOS id doubles as the pad id
        top_k=10,
        top_p=1,
    )
    sequences = model.generate(prompt_ids, **generation_kwargs)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

In [None]:
def process_clinical_notes(txt_file_path, model, tokenizer, output_file_path):
    """Generate a prediction for every clinical note in a text file.

    The input file is read line by line. Notes are taken from the
    even-numbered lines (0, 2, 4, ...); the odd-numbered lines in between are
    ignored. Each note is stripped of surrounding whitespace and passed to
    ``generate_text`` with ``max_length=150``. Every result is written to the
    output file as ``"Generated Prediction: <text>"`` followed by an empty line.

    Blank note lines are skipped instead of being sent to the model as an
    empty prompt, and input lines are not echoed to stdout, so note text does
    not end up in the notebook output.

    Args:
        txt_file_path: Path of the UTF-8 input text file.
        model: Model object forwarded to ``generate_text``.
        tokenizer: Tokenizer object forwarded to ``generate_text``.
        output_file_path: Path of the UTF-8 output file; it is opened in write
            mode, so existing contents are replaced.

    Returns:
        None.
    """
    with open(txt_file_path, 'r', encoding='utf-8') as notes_file, \
         open(output_file_path, 'w', encoding='utf-8') as out_f:
        for line_number, line in enumerate(notes_file):
            # Only the even-numbered lines carry a note.
            if line_number % 2:
                continue

            note = line.strip()
            if not note:
                # Nothing to prompt the model with (e.g. a trailing blank line).
                continue

            generated_text = generate_text(model, tokenizer, note, max_length=150)
            out_f.write(f"Generated Prediction: {generated_text}\n\n")

In [None]:
# Make sure the directory that receives the generated predictions exists.
os.makedirs('/content/output_gpt', exist_ok=True)

# Fine-tuned checkpoint under the /content/model_output extraction directory
# (adjust if your archive unpacks to a different layout).
model_path = "/content/model_output/model_output/sdoh_extracotor_gpt"

# Test inputs, one text file per dataset (update these to point at your own
# test files if they live elsewhere).
sdoh_txt_path = "/content/combined_text/content/combined_text/test.txt"
mts_txt_path = "/content/combined_text/content/combined_text/mts_test.txt"

# Generated predictions, one text file per dataset.
sdoh_output_path = "/content/output_gpt/predicted_labels_sdoh.txt"
mts_output_path = "/content/output_gpt/predicted_labels_mts.txt"

In [None]:
# Instantiate the model and tokenizer once and reuse them for both datasets.
model, tokenizer = load_model_and_tokenizer(model_path)

# Write predictions for the SDOH test file first, then for the MTS test file.
for notes_path, predictions_path in (
    (sdoh_txt_path, sdoh_output_path),
    (mts_txt_path, mts_output_path),
):
    process_clinical_notes(notes_path, model, tokenizer, predictions_path)

In [None]:
# NOTE(review): this repeats the model/tokenizer load performed in the previous
# cell; presumably kept so this cell can be run on its own -- confirm whether
# it is still needed.
model, tokenizer = load_model_and_tokenizer(model_path)


In [None]:
# @title
# Runs generation for the MTS test file only. The output file at
# mts_output_path is opened in write mode, so any earlier contents are replaced.
process_clinical_notes(mts_txt_path, model, tokenizer, mts_output_path)

In [None]:
import pandas as pd
import re

def extract_info_and_create_csv(input_file_path, output_csv_path):
    """Parse generated prediction lines and save them as a CSV file.

    Each line of the input file is matched against the expected layout
    ``Generated Prediction: <prompt> <note> [SDOH_LABEL]: <label>
    [ADVERSE_LABEL]: <label>``. Matching lines contribute one row to a CSV
    with the columns ``Note``, ``SDOH_LABEL`` and ``ADVERSE_LABEL``.

    The note is matched lazily, so it ends at the first label pair. When a
    line carries more text after that pair (for example a repeated pair), the
    first pair is reported and the note does not absorb it.

    Lines that start with ``Generated Prediction:`` but do not fit the layout
    cannot be turned into a row. They are counted and reported with a warning,
    because dropping them changes the number of rows in the CSV.

    Args:
        input_file_path: Path of the UTF-8 text file holding the predictions.
        output_csv_path: Path of the CSV file to write (without an index column).

    Returns:
        None.
    """
    import warnings  # local import: only used for the unparsed-line notice

    pattern = re.compile(
        r'Generated Prediction: Get the SDOH_LABEL and ADVERSE_LABEL from the following clinical note: '
        r'(.+?) \[SDOH_LABEL\]: (\w+) \[ADVERSE_LABEL\]: (\w+)'
    )
    prediction_prefix = 'Generated Prediction:'

    rows = []       # one (note, sdoh_label, adverse_label) tuple per parsed line
    unparsed = 0    # prediction lines that did not fit the expected layout

    with open(input_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = pattern.match(line)
            if match:
                rows.append(match.groups())
            elif line.startswith(prediction_prefix):
                unparsed += 1

    if unparsed:
        warnings.warn(
            f"{unparsed} prediction line(s) in {input_file_path} could not be parsed "
            f"and were left out of {output_csv_path}",
            stacklevel=2,
        )

    # Build the table in one go and write it without the index column.
    df = pd.DataFrame(rows, columns=['Note', 'SDOH_LABEL', 'ADVERSE_LABEL'])
    df.to_csv(output_csv_path, index=False)

# Raw generation files and the CSV files derived from them, per dataset.
sdoh_input_file_path = "/content/output_gpt/predicted_labels_sdoh.txt"
mts_input_file_path = "/content/output_gpt/predicted_labels_mts.txt"

sdoh_output_csv_path = "/content/output_gpt/predictions_sdoh.csv"
mts_output_csv_path = "/content/output_gpt/predictions_mts.csv"

# Convert the SDOH generations first, then the MTS ones.
for generations_path, csv_path in (
    (sdoh_input_file_path, sdoh_output_csv_path),
    (mts_input_file_path, mts_output_csv_path),
):
    extract_info_and_create_csv(generations_path, csv_path)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Args:
        cm: Matrix of counts, indexed as ``cm[row, col]``; every entry must be
            formattable with the ``'d'`` format code.
        classes: Tick labels; ``len(classes)`` also sets how many rows and
            columns are annotated.
        title: Title placed above the image.
        cmap: Colormap passed to ``plt.imshow``.
    """
    n_classes = len(classes)
    positions = np.arange(n_classes)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Counts above half of the largest entry are written in white, the rest in black.
    half_max = cm.max() / 2.
    for row, col in np.ndindex(n_classes, n_classes):
        text_color = "white" if cm[row, col] > half_max else "black"
        plt.text(col, row, format(cm[row, col], 'd'),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Recreate the held-out split of the labelled SDOH data and load the predictions.
df = pd.read_csv('Iteration__1.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
training_file, true_labels_df = train_df, test_df
predicted_labels_df = pd.read_csv('/content/output_gpt/predictions_sdoh.csv')

# The predictions file has the columns ['Note', 'SDOH_LABEL', 'ADVERSE_LABEL'].
# Rows are compared by position, so both tables must describe the same notes in
# the same order -- NOTE(review): confirm the test text file was written in the
# order of this split.
if len(true_labels_df) != len(predicted_labels_df):
    raise ValueError(
        f"Row count mismatch: {len(true_labels_df)} true rows vs "
        f"{len(predicted_labels_df)} predicted rows"
    )

# Calculate accuracies
sdoh_accuracy = accuracy_score(true_labels_df['label'], predicted_labels_df['SDOH_LABEL'])
adverse_accuracy = accuracy_score(true_labels_df['adverse'], predicted_labels_df['ADVERSE_LABEL'])

print(f"SDOH Label Accuracy: {sdoh_accuracy * 100:.2f}%")
print(f"Adverse Label Accuracy: {adverse_accuracy * 100:.2f}%")

# Fix the class order explicitly and use the same list for the matrix and for
# the tick labels. The list covers every label seen in the true OR predicted
# column, so each matrix row/column has a matching, correctly placed tick.
unique_sdoh_labels = sorted(
    set(true_labels_df['label']) | set(predicted_labels_df['SDOH_LABEL'])
)
unique_adverse_labels = sorted(
    set(true_labels_df['adverse']) | set(predicted_labels_df['ADVERSE_LABEL'])
)

# Calculate confusion matrices
sdoh_cm = confusion_matrix(
    true_labels_df['label'], predicted_labels_df['SDOH_LABEL'],
    labels=unique_sdoh_labels,
)
adverse_cm = confusion_matrix(
    true_labels_df['adverse'], predicted_labels_df['ADVERSE_LABEL'],
    labels=unique_adverse_labels,
)

# Plot the SDOH_LABEL confusion matrix
plt.figure(figsize=(10, 7))
plot_confusion_matrix(sdoh_cm, classes=unique_sdoh_labels, title='SDOH Label Confusion Matrix')
plt.show()

# Plot the ADVERSE_LABEL confusion matrix
plt.figure(figsize=(10, 7))
plot_confusion_matrix(adverse_cm, classes=unique_adverse_labels, title='Adverse Label Confusion Matrix')
plt.show()


In [None]:
# Load the MTS ground truth and the predictions generated for the MTS test file.
true_labels_df = pd.read_csv('augmented_mts.csv')
# Read the MTS predictions CSV; the SDOH predictions CSV belongs to the other
# test set and must not be scored against the MTS ground truth.
predicted_labels_df = pd.read_csv('/content/output_gpt/predictions_mts.csv')

# The predictions file has the columns ['Note', 'SDOH_LABEL', 'ADVERSE_LABEL'].
# Rows are compared by position, so both tables must describe the same notes in
# the same order -- NOTE(review): confirm the MTS test text file follows the
# row order of augmented_mts.csv.
if len(true_labels_df) != len(predicted_labels_df):
    raise ValueError(
        f"Row count mismatch: {len(true_labels_df)} true rows vs "
        f"{len(predicted_labels_df)} predicted rows"
    )

# Calculate accuracies
sdoh_accuracy = accuracy_score(true_labels_df['SDOH label'], predicted_labels_df['SDOH_LABEL'])
adverse_accuracy = accuracy_score(true_labels_df['Adverse Category'], predicted_labels_df['ADVERSE_LABEL'])

print(f"SDOH Label Accuracy: {sdoh_accuracy * 100:.2f}%")
print(f"Adverse Label Accuracy: {adverse_accuracy * 100:.2f}%")

# Fix the class order explicitly and use the same list for the matrix and for
# the tick labels. The list covers every label seen in the true OR predicted
# column, so each matrix row/column has a matching, correctly placed tick.
unique_sdoh_labels = sorted(
    set(true_labels_df['SDOH label']) | set(predicted_labels_df['SDOH_LABEL'])
)
unique_adverse_labels = sorted(
    set(true_labels_df['Adverse Category']) | set(predicted_labels_df['ADVERSE_LABEL'])
)

# Calculate confusion matrices
sdoh_cm = confusion_matrix(
    true_labels_df['SDOH label'], predicted_labels_df['SDOH_LABEL'],
    labels=unique_sdoh_labels,
)
adverse_cm = confusion_matrix(
    true_labels_df['Adverse Category'], predicted_labels_df['ADVERSE_LABEL'],
    labels=unique_adverse_labels,
)

# Plot the SDOH_LABEL confusion matrix
plt.figure(figsize=(10, 7))
plot_confusion_matrix(sdoh_cm, classes=unique_sdoh_labels, title='SDOH Label Confusion Matrix')
plt.show()

# Plot the ADVERSE_LABEL confusion matrix
plt.figure(figsize=(10, 7))
plot_confusion_matrix(adverse_cm, classes=unique_adverse_labels, title='Adverse Label Confusion Matrix')
plt.show()
