In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

# Fetch the medical symptom corpus via `datasets.load_dataset`
dataset = load_dataset("mohammad2928git/complete_medical_symptom_dataset")

# Materialise the 'train' split as a DataFrame, keeping only rows where both
# the symptoms and the free-text description are present
df = dataset['train'].to_pandas()
df = df.loc[:, ['symptoms', 'text']].dropna()

# Hold out 20% of the rows for testing, then carve 10% of what remains off
# for validation. The fixed random_state makes both splits repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, validation_df = train_test_split(train_df, random_state=42, test_size=0.1)

# Report how many rows landed in each split
for split_name, split_df in (("Train", train_df), ("Validation", validation_df), ("Test", test_df)):
    print(f"{split_name} size: {len(split_df)}")

# Load the pretrained "t5-base" checkpoint together with its tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Function to preprocess data
def preprocess_data(df):
    """Build T5 (input, target) string pairs from a symptoms/text DataFrame.

    Args:
        df: DataFrame with a 'symptoms' column and a 'text' column. Each
            'symptoms' value is normally an iterable of symptom strings; a
            single bare string is treated as one symptom, and a missing or
            non-iterable value (None, NaN) is treated as "no symptoms".

    Returns:
        A tuple ``(inputs, targets)`` of two equal-length lists of strings.
        Each input is the row's text prefixed with "extract symptoms: ".
        Each target is the row's symptoms, stripped of surrounding
        whitespace, with empty and non-string entries dropped, joined by ", ".
    """
    inputs = []
    targets = []
    # Walk the two columns in lockstep; this avoids iterrows(), which builds a
    # throwaway Series for every row.
    for symptoms, text in zip(df['symptoms'], df['text']):
        if isinstance(symptoms, str):
            # A bare string would otherwise be iterated character by
            # character and produce targets like "f, e, v, e, r".
            symptoms = [symptoms]
        elif not hasattr(symptoms, '__iter__'):
            # None / NaN: nothing to extract for this row.
            symptoms = []

        cleaned = [s.strip() for s in symptoms if isinstance(s, str) and s.strip()]

        inputs.append(f"extract symptoms: {text}")
        targets.append(', '.join(cleaned))

    return inputs, targets

# Preprocess the training, validation, and test data
train_inputs, train_targets = preprocess_data(train_df)
validation_inputs, validation_targets = preprocess_data(validation_df)
test_inputs, test_targets = preprocess_data(test_df)


def _tokenize_split(inputs, targets, max_length=512):
    """Tokenize one split's prompts and targets into PyTorch tensors.

    Args:
        inputs: list of prompt strings fed to the encoder.
        targets: list of target strings the decoder should produce.
        max_length: truncation limit (in tokens) applied to both sides.

    Returns:
        A tuple ``(encodings, labels)``: ``encodings`` is the tokenizer output
        for ``inputs`` (padded to the longest sequence in the split), and
        ``labels`` is the tensor of target token ids with every padding
        position replaced by -100.
    """
    encodings = tokenizer(inputs, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    labels = tokenizer(targets, padding=True, truncation=True, max_length=max_length, return_tensors="pt").input_ids
    # Positions labelled -100 are skipped by the model's cross-entropy loss.
    # Without this, the model is also trained (and evaluated) on predicting
    # padding, which skews the reported loss.
    labels[labels == tokenizer.pad_token_id] = -100
    return encodings, labels


# Tokenize inputs and targets.
# NOTE: the label tensors contain -100 at padded positions; map those back to
# tokenizer.pad_token_id before passing labels to tokenizer.decode().
train_encodings, train_labels = _tokenize_split(train_inputs, train_targets)
validation_encodings, validation_labels = _tokenize_split(validation_inputs, validation_targets)
test_encodings, test_labels = _tokenize_split(test_inputs, test_targets)

# Create PyTorch Dataset
class SymptomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with their label ids.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to an
            indexable batch of per-example values (tensor or nested list).
        labels: indexable batch of per-example label ids, aligned with
            ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call, so tensors are copied with ``clone().detach()`` instead;
        anything else (lists, scalars) still goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a 'labels' entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of the labels."""
        return len(self.labels)

# Wrap each tokenized split in a SymptomDataset.
# (test_dataset is built here but is not used by any cell visible in this notebook.)
train_dataset, validation_dataset, test_dataset = (
    SymptomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (validation_encodings, validation_labels),
        (test_encodings, test_labels),
    )
)

# Hyperparameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    output_dir='./results',             # where checkpoints are written
    evaluation_strategy="epoch",        # run evaluation once per epoch
    learning_rate=5e-5,                 # optimizer learning rate
    per_device_train_batch_size=4,      # training examples per device per step
    per_device_eval_batch_size=4,       # evaluation examples per device per step
    num_train_epochs=3,                 # passes over the training set
    weight_decay=0.01,                  # weight-decay coefficient
    logging_dir='./logs',               # where training logs are written
    logging_steps=10,                   # emit a log entry every 10 steps
    save_strategy="epoch",              # checkpoint once per epoch
    load_best_model_at_end=True,        # restore the best checkpoint after training
    metric_for_best_model="eval_loss",  # metric used to rank checkpoints
)

# Fine-tune on the training split, evaluating against the validation split
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=validation_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side
for artifact in (model, tokenizer):
    artifact.save_pretrained("trained_model")

In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Directory holding the fine-tuned checkpoint
# (presumably a mounted Google Drive folder in Colab -- adjust for other environments)
model_path = "/content/drive/MyDrive/t5/doctor GPT/trained_model/trained_model"

# Restore the fine-tuned model and its matching tokenizer from that directory
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Function to generate a response based on the input prompt
def generate_response(prompt, max_length=150, num_beams=5, max_input_length=512):
    """Generate text for ``prompt`` with the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: input string passed to the model (callers in this notebook
            prefix it with "extract symptoms: ").
        max_length: upper bound on the length of the generated sequence.
            Output longer than this is cut off, so raise it for long inputs.
        num_beams: number of beams used for beam search.
        max_input_length: prompts longer than this many tokens are truncated.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Tokenize the prompt and place it on the same device as the model so the
    # function also works when the model has been moved to a GPU.
    input_ids = tokenizer.encode(
        prompt, return_tensors="pt", truncation=True, max_length=max_input_length
    ).to(model.device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,  # stop beam search once enough finished candidates exist
        )

    # Decode the first generated sequence into text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Example usage: run a free-text patient description through the extractor
query = """Another symptom that's been troubling me is gastrointestinal discomfort. I've experienced frequent episodes of nausea and vomiting, often unrelated to food intake. My appetite has significantly decreased, and I've noticed a sudden weight loss of about 10 pounds in just two weeks. There have been bouts of diarrhea alternating with constipation, and I sometimes experience sharp, cramping abdominal pain that seems to move around my midsection.
In addition to these, I've had muscle aches and joint pain that seem to migrate from one area to another. The pain is most pronounced in my lower back and knees and tends to worsen in the evening. My energy levels have plummeted, leaving me feeling fatigued and lethargic throughout the day, regardless of how much sleep I get at night. Speaking of sleep, I've been struggling with insomnia, waking up frequently during the night and feeling unrested in the morning.
On top of everything else, I've noticed a strange rash developing on my arms and legs. The rash is red, itchy, and occasionally forms small blisters that ooze a clear fluid. This skin issue is new and seems to correlate with increased exposure to sunlight, leading me to suspect a photosensitivity reaction."""

# Prepend the task prefix before handing the text to the model
prompt = f"extract symptoms: {query}"
response = generate_response(prompt)
print("Extracted Symptoms:", response)

Extracted Symptoms: symptoms that's troubling me gastrointestinal discomfort, i've experienced frequent episodes nausea vomiting, unrelated food intake, appetite significantly decreased, i've noticed sudden weight loss 10 pounds just weeks, bouts diarrhea alternating cons
