In [1]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer


In [2]:
import re
import pandas as pd
import os
# Helper functions that load the model and the tokenizer
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``.

    Args:
        model_path: Location handed unchanged to
            ``GPT2LMHeadModel.from_pretrained``.

    Returns:
        Whatever ``GPT2LMHeadModel.from_pretrained`` produces for that path.
    """
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``.

    Args:
        tokenizer_path: Location handed unchanged to
            ``GPT2Tokenizer.from_pretrained``.

    Returns:
        Whatever ``GPT2Tokenizer.from_pretrained`` produces for that path.
    """
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

# Generate a continuation for one text and extract the SDoH label and adverse label from it
def generate_text(model, tokenizer, sequence, max_length):
    """Sample a continuation of ``sequence`` and report the labels found in it.

    The prompt is encoded with ``tokenizer.encode``, passed to
    ``model.generate`` with sampling enabled (top-k 50, top-p 0.95), and the
    first returned sequence is decoded back to text. Two regex searches then
    pull the labels out of the decoded text.

    Args:
        model: Object exposing ``generate(...)`` and ``config.eos_token_id``.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        sequence: Prompt text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        str: ``"Label: <label>, Adverse Label: <adverse label>"``. A label
        that cannot be located is replaced by ``"Label not found"`` or
        ``"Adverse label not found"`` respectively.
    """
    prompt_ids = tokenizer.encode(sequence, return_tensors='pt')
    sampling_options = {
        'do_sample': True,
        'max_length': max_length,
        # The end-of-sequence id doubles as the padding id.
        'pad_token_id': model.config.eos_token_id,
        'top_k': 50,
        'top_p': 0.95,
    }
    sampled = model.generate(prompt_ids, **sampling_options)
    decoded = tokenizer.decode(sampled[0], skip_special_tokens=True)

    # NOTE(review): decoding drops special tokens; if [LABEL] and
    # [ADVERSE_LABEL] were registered as special tokens in this tokenizer they
    # would be gone before the searches below run -- confirm.

    # SDoH label: shortest run of characters on one line between the markers.
    label = "Label not found"
    found = re.search(r'\[LABEL\] (.+?) \[ADVERSE_LABEL\]', decoded)
    if found:
        label = found.group(1)

    # Adverse label: the rest of the line after the marker ('.' stops at '\n').
    adverse_label = "Adverse label not found"
    found = re.search(r'\[ADVERSE_LABEL\] (.+)', decoded)
    if found:
        adverse_label = found.group(1)

    return f"Label: {label}, Adverse Label: {adverse_label}"

def read_section_texts(csv_file_path, text_column='section_text'):
    """Yield the stripped text of every non-missing cell in one CSV column.

    This is a generator: the CSV is read when iteration starts, and the
    values can be consumed only once.

    Args:
        csv_file_path: Path of the CSV file, read with ``pd.read_csv``.
        text_column: Name of the column holding the texts to process.
            Defaults to ``'section_text'``.

    Yields:
        str: The cell value with leading and trailing whitespace removed.
        Missing cells are skipped.

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV file (raised
            when iteration starts, not when the function is called).
    """
    df = pd.read_csv(csv_file_path)
    # pandas parses empty cells as float NaN, which has no .strip(); drop them
    # instead of failing partway through the file. astype(str) also covers a
    # column that pandas inferred as numeric.
    for text in df[text_column].dropna().astype(str):
        yield text.strip()


# Write SDoH and adverse labels generated by the model for given sentences
def write_labels(model, tokenizer, section_texts, output_file_path, additional_length=50):
    """Generate labels for each text and write the results to a text file.

    For every text, the ``max_length`` handed to ``generate_text`` is the
    number of tokens ``tokenizer.encode`` returns for that text plus
    ``additional_length``. Each result is written as::

        Input Text: <text>
        <string returned by generate_text>
        <blank line>

    Args:
        model: Passed through to ``generate_text``.
        tokenizer: Object exposing ``encode(text)``; also passed through to
            ``generate_text``.
        section_texts: Iterable of input strings (a generator is fine; it is
            iterated exactly once).
        output_file_path: Destination file. It is overwritten if it exists,
            and missing parent directories are created.
        additional_length: Token budget added on top of the prompt length.
    """
    # Create the destination folder up front so a fresh checkout does not
    # fail with FileNotFoundError before any prediction is written.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII characters present in the input texts.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for text in section_texts:
            text_length = len(tokenizer.encode(text))
            max_length = text_length + additional_length
            generated_text = generate_text(model, tokenizer, text, max_length)
            file.write(f"Input Text: {text}\n{generated_text}\n\n")




In [3]:
# Load the model and the tokenizer; both are read from the same directory.
model_path = "content/model_output/custom_q_and_a"  # Ensure this is the correct path where your model is saved
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)

# Read the section texts from the CSV and write the generated SDoH and adverse
# labels to a text file.
# Note: generate_text samples (do_sample=True) and no random seed is set in
# this cell, so the predictions can differ from one run to the next.
ehr_data_csv_path = "MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv"  # Update this path to your CSV file
output_file_path = "content/predicted_labels.txt"  # Output file where predictions will be written
# read_section_texts is a generator function: the texts are produced lazily
# and can be iterated only once, which write_labels does below.
section_texts = read_section_texts(ehr_data_csv_path)
write_labels(model, tokenizer, section_texts, output_file_path)