In [30]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [31]:
# Pretrained checkpoint name shared by the tokenizer and the model.
model_name = "gpt2"

# Load the tokenizer first; the two `from_pretrained` calls are independent.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so that padded
# batches can be produced during preprocessing.
tokenizer.pad_token = tokenizer.eos_token

# Load the GPT-2 weights together with the language-modeling head.
model = GPT2LMHeadModel.from_pretrained(model_name)


In [32]:
# Fetch the training split of MedMCQA from the Hugging Face Hub and
# preview its first two rows (slicing returns a dict of column -> list).
dataset = load_dataset(path="openlifescienceai/medmcqa", split="train")
print(dataset[0:2])

{'id': ['e9ad821a-c438-4965-9f77-760819dfa155', 'e3d3c4e1-4fb2-45e7-9f88-247cc8f373b3'], 'question': ['Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma', 'Which vitamin is supplied from only animal source:'], 'opa': ['Hyperplasia', 'Vitamin C'], 'opb': ['Hyperophy', 'Vitamin B7'], 'opc': ['Atrophy', 'Vitamin B12'], 'opd': ['Dyplasia', 'Vitamin D'], 'cop': [2, 2], 'choice_type': ['single', 'single'], 'exp': ['Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950', "Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. P 640* Vitamin B12 (Cobalamin) is synthesized solely by microorganisms.* In humans, the

In [34]:
def preprocess_function(examples, max_length=512):
    """Turn a batch of MedMCQA rows into padded causal-LM training features.

    Each row is rendered as one prompt string holding the question, the four
    options labelled A-D and the correct answer, followed by the tokenizer's
    end-of-sequence token, and is then tokenized with the notebook-level
    ``tokenizer``.

    Args:
        examples: Batched column mapping, as passed by
            ``Dataset.map(batched=True)``, providing the ``question``,
            ``opa``, ``opb``, ``opc``, ``opd`` and ``cop`` columns.
        max_length: Number of tokens every sequence is truncated/padded to.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) with an
        added ``labels`` entry: a copy of ``input_ids`` in which padding
        positions are replaced by ``-100`` so the loss ignores them.
    """
    option_letters = "ABCD"

    def answer_label(cop):
        # `cop` is a 0-based index into the options: in the printed sample,
        # cop == 2 corresponds to `opc` ("Ans. (c) Vitamin B12"). Render it as
        # the letter used in the prompt instead of the raw index.
        if isinstance(cop, int) and 0 <= cop < len(option_letters):
            return option_letters[cop]
        # Out-of-range markers (e.g. a negative value for an unlabeled row)
        # are passed through unchanged rather than wrapping around to "D".
        return cop

    # The EOS token is appended explicitly so the model sees an end marker
    # after the answer (unless truncation cuts it off).
    input_texts = [
        f"Question: {q} Choices: A. {a} B. {b} C. {c} D. {d} "
        f"Correct Answer: {answer_label(correct)}{tokenizer.eos_token}"
        for q, a, b, c, d, correct in zip(
            examples['question'],
            examples['opa'],
            examples['opb'],
            examples['opc'],
            examples['opd'],
            examples['cop']
        )
    ]

    encodings = tokenizer(
        input_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # A causal LM only returns a loss when `labels` are supplied. The padding
    # token is the same id as EOS in this notebook, so padding is identified
    # through the attention mask rather than by token id; that keeps the real
    # EOS in the targets while ignoring the padded tail.
    encodings["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"])
    ]
    return encodings

# Tokenize the whole dataset; `batched=True` hands `preprocess_function`
# a dict of column lists per call instead of a single row.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


In [35]:
# Hold out 20% of the tokenized rows for evaluation. The fixed seed pins the
# shuffle so a fresh kernel reproduces the same split, and the two parts are
# read by key instead of relying on the ordering of `.values()`.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
test_dataset = dataset_splits["test"]

In [36]:
# `DataCollatorForSeq2Seq` stays imported only so that any other cell that
# refers to the name keeps working; it is no longer used to build batches.
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

# GPT-2 is a decoder-only (causal) language model. The seq2seq collator does
# not create `labels`, so the model would return no loss and training would
# fail. With `mlm=False` this collator builds `labels` as a copy of
# `input_ids` and sets padding positions to -100 so the loss ignores them.
# Because the pad token is the EOS token in this notebook, EOS positions are
# masked out of the labels as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:

# Hyperparameters and output locations for the fine-tuning run, collected in
# one mapping and unpacked into `TrainingArguments`.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_kwargs = dict(
    output_dir='./results',            # checkpoints are written here
    logging_dir='./logs',              # training logs are written here
    evaluation_strategy='epoch',       # evaluate at the end of every epoch
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_kwargs)

# Bundle the model, the run configuration, the two dataset splits and the
# batch collator into a single Trainer object.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)


In [None]:
# Fine-tune the model: run the training loop on the `trainer` built earlier.
# As the last expression of the cell, the value returned by `train()` is
# shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the dataset given to the Trainer as `eval_dataset` (the
# held-out split) and print the returned metrics.
results = trainer.evaluate()
print(f"Validation Results: {results}")

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory.
save_dir = "./fine_tuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)