#**Step 1: Install All the Required Packages**

In [1]:
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerBase
)
from typing import Any, Dict, List, Union
from torch.nn.utils.rnn import pad_sequence

In [2]:
# ------------------ Config ------------------ #
model_name = "EleutherAI/gpt-neo-125M"
data_path = "/content/fine_tune_data.jsonl"
output_dir = "./neo_outputs"
logging_dir = "./neo_logs"

In [3]:
# ------------------ Load Model & Tokenizer ------------------ #
print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

tokenizer.padding_side = "right"


Loading model: EleutherAI/gpt-neo-125M


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:

# ------------------ Load Dataset ------------------ #
abs_path = os.path.abspath(data_path)
print(f"Loading dataset from: {abs_path}")
df = pd.read_json(abs_path, lines=True)
dataset = Dataset.from_pandas(df)


Loading dataset from: /content/fine_tune_data.jsonl


In [5]:
# ------------------ Tokenization Function ------------------ #
def tokenize_function(example):
    text = f"### Question:\n{example['prompt']}\n\n### Answer:\n{example['response']}"
    tokens = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "labels": tokens["input_ids"]
    }


In [6]:
# ------------------ Tokenize ------------------ #
print("Tokenizing dataset...")

tokenized_dataset = dataset.map(tokenize_function, remove_columns=["prompt", "response"])


Tokenizing dataset...


Map:   0%|          | 0/19595 [00:00<?, ? examples/s]

In [7]:
# ------------------ Filter empty examples ------------------ #
tokenized_dataset = tokenized_dataset.filter(lambda example: len(example["input_ids"]) > 0)


Filter:   0%|          | 0/19595 [00:00<?, ? examples/s]

In [8]:
# ------------------ Train / Eval Split ------------------ #
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

In [9]:
# ------------------ Custom Data Collator ------------------ #
class CausalDataCollator:
    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        features = [f for f in features if len(f["input_ids"]) > 0]
        if len(features) == 0:
            raise ValueError("No valid sequences in batch.")

        input_ids = [torch.tensor(f["input_ids"]) for f in features]
        attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
        labels = [torch.tensor(f["labels"]) for f in features]

        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
        labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

data_collator = CausalDataCollator(tokenizer)


In [10]:
# ------------------ Training Arguments ------------------ #
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir=logging_dir,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [11]:
# ------------------ Trainer Setup ------------------ #
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [12]:
# ------------------ Check First Batch ------------------ #
print("Checking a sample batch from the data collator:")
try:
    for i, batch in enumerate(trainer.get_train_dataloader()):
        if i == 0:
            for k, v in batch.items():
                print(f"{k}: shape={v.shape}, dtype={v.dtype}")
            break
except Exception as e:
    print(f"Error during batch check: {e}")

Checking a sample batch from the data collator:
input_ids: shape=torch.Size([4, 512]), dtype=torch.int64
attention_mask: shape=torch.Size([4, 512]), dtype=torch.int64
labels: shape=torch.Size([4, 512]), dtype=torch.int64


In [13]:
# ------------------ Training ------------------ #
print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss
50,1.6668
100,0.8827
150,0.8622
200,0.8236
250,0.8731
300,0.8435
350,0.8703
400,0.8029
450,0.8175
500,0.7831


Step,Training Loss
50,1.6668
100,0.8827
150,0.8622
200,0.8236
250,0.8731
300,0.8435
350,0.8703
400,0.8029
450,0.8175
500,0.7831


TrainOutput(global_step=13227, training_loss=0.6726134493389376, metrics={'train_runtime': 5662.9148, 'train_samples_per_second': 9.342, 'train_steps_per_second': 2.336, 'total_flos': 1.381916148498432e+16, 'train_loss': 0.6726134493389376, 'epoch': 3.0})

In [14]:
# ------------------ Save Model ------------------ #
final_output_path = os.path.join(output_dir, "final")
print(f"Saving model to: {final_output_path}")
trainer.save_model(final_output_path)
tokenizer.save_pretrained(final_output_path)
print("Training complete.")

Saving model to: ./neo_outputs/final
Training complete.


In [15]:
while True:
    user_input = input("\n🧠 Ask your medical question (or type 'exit' to quit): ")
    if user_input.lower() == "exit":
        break

    prompt = f"### Question:\n{user_input}\n\n### Answer:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("### Answer:")[-1].strip()
    print("🗣️ Answer:", response)



🧠 Ask your medical question (or type 'exit' to quit): A 32-year-old woman presents to the office with oral lesions. On examination, you observe multiple flaccid bullae that spread laterally with gentle fingertip pressure (positive Nikolsky sign). Which of the following is the most likely target of the autoantibodies in this condition?
🗣️ Answer: correct answer autoantibodies in condition like condition like autoantibody positive Nikolsky sign

🧠 Ask your medical question (or type 'exit' to quit): exit


A 32-year-old woman presents to the office with oral lesions. On examination, you observe multiple flaccid bullae that spread laterally with gentle fingertip pressure (positive Nikolsky sign). Which of the following is the most likely target of the autoantibodies in this condition?