In [1]:
# Install required packages
!pip install transformers datasets peft accelerate bitsandbytes torch pandas numpy

print("✅ All packages installed successfully!")

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1
✅ All packages installed successfully!


In [5]:
import os

import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

# Instead of importing, we'll define the functions directly here
def load_and_preprocess_data():
    """Fetch the AlpaCare medical-instruction dataset and split it 90/5/5.

    Returns:
        dict: maps ``'train'``, ``'test'`` and ``'valid'`` to the
        corresponding dataset split.
    """
    print("Loading AlpaCare-MedInstruct-52k dataset...")

    # Pull the dataset from the Hugging Face Hub
    raw = load_dataset("lavita/AlpaCare-MedInstruct-52k")

    # Hold out 10% of the data, then cut that hold-out in half so the result
    # is 90% train / 5% test / 5% validation. Both splits use a fixed seed.
    holdout_split = raw['train'].train_test_split(test_size=0.1, seed=42)
    eval_split = holdout_split['test'].train_test_split(test_size=0.5, seed=42)

    dataset_splits = {
        'train': holdout_split['train'],
        'test': eval_split['test'],
        'valid': eval_split['train'],
    }

    # Report the size of each split
    for label, key in (("Training", 'train'), ("Testing", 'test'), ("Validation", 'valid')):
        print(f"{label} samples: {len(dataset_splits[key])}")

    return dataset_splits

def format_with_disclaimer(instruction, response):
    """Build one training text: instruction, response, then a medical disclaimer.

    Args:
        instruction: The instruction/prompt text.
        response: The reference answer text.

    Returns:
        str: The three sections joined by blank lines.
    """
    disclaimer = "Important: This is for educational purposes only. Always consult a qualified healthcare professional for medical advice."

    sections = (
        f"### Instruction: {instruction}",
        f"### Response: {response}",
        disclaimer,
    )
    return "\n\n".join(sections)

print("✅ All functions defined successfully!")

✅ All functions defined successfully!


In [7]:
# Build the train/test/valid splits
print("📊 Loading medical dataset...")
dataset_splits = load_and_preprocess_data()

# Preview the first training record
print("\n📝 Sample training example:")
sample = dataset_splits['train'][0]
print(
    f"Instruction: {sample['instruction']}",
    f"Response: {sample['output']}",
    sep="\n",
)

📊 Loading medical dataset...
Loading AlpaCare-MedInstruct-52k dataset...
Training samples: 46801
Testing samples: 2601
Validation samples: 2600

📝 Sample training example:
Instruction: Explain why widespread vaccination was key to eradicating polio in many regions of the world.
Response: Widespread vaccination was key to eradicating polio in many regions of the world because it helps to achieve herd immunity. Here are the steps explaining why:

1. Polio is a highly contagious viral disease that primarily affects young children. It can cause paralysis and, in severe cases, death.
2. The poliovirus spreads through direct contact with an infected person's feces or saliva. It can also spread through contaminated water and food.
3. Vaccines against polio were developed in the mid-20th century, which triggered a significant decline in polio cases.
4. By vaccinating a large portion of the population, especially children, we can create herd immunity. Herd immunity occurs when a sufficient prop

In [8]:
# Pretrained checkpoint that serves as the base for fine-tuning
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Batching needs a pad token; when the tokenizer defines none, reuse the
# end-of-sequence token for padding.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Model and tokenizer loaded successfully!")

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model and tokenizer loaded successfully!


In [11]:
# LoRA configuration for DialoGPT (GPT-2 style architecture)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,
    # GPT-2 style module names. NOTE(review): "c_proj" appears to match both
    # the attention and the MLP output projections -- confirm that is intended.
    target_modules=["c_attn", "c_proj"],
    # GPT-2 implements these projections as Conv1D layers, which store their
    # weights as (fan_in, fan_out); PEFT documents fan_in_fan_out=True for
    # that layout. Setting it explicitly avoids relying on PEFT's
    # auto-correction.
    fan_in_fan_out=True,
)

# Wrap the base model so that only the LoRA adapter weights are trainable.
# NOTE: this rebinds `model`; re-running the cell wraps an already-wrapped
# model, so reload the base model first if the cell must be executed again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✅ LoRA configuration applied!")



trainable params: 2,162,688 || all params: 356,985,856 || trainable%: 0.6058
✅ LoRA configuration applied!


In [21]:
def preprocess_function(examples):
    """Format and tokenize one batch of instruction/response examples.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``'instruction'`` and ``'output'``
            entries are parallel sequences of strings.

    Returns:
        The tokenizer output for the batch (``'input_ids'`` and
        ``'attention_mask'`` as Python lists), truncated to 512 tokens and
        left unpadded.
    """
    # Render every (instruction, output) pair into the training text format
    texts = [
        format_with_disclaimer(instruction, output)
        for instruction, output in zip(examples['instruction'], examples['output'])
    ]

    # Deliberately no padding here: padding inside a batched map pads every
    # example to the longest text of the whole map batch and stores those pad
    # tokens in the dataset. The data collator pads each training batch to its
    # own longest sequence instead, which is cheaper in memory and compute.
    return tokenizer(texts, truncation=True, max_length=512)

print("🔧 Tokenizing dataset...")

# Tokenize each split on its own, dropping the raw text columns so only the
# tokenizer's outputs remain.
tokenized_datasets = {}
for split_name in dataset_splits:
    dataset = dataset_splits[split_name]
    print(f"Tokenizing {split_name} split...")
    tokenized_datasets[split_name] = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names,
    )

print("✅ Dataset tokenized!")
print(
    f"Training samples: {len(tokenized_datasets['train'])}",
    f"Validation samples: {len(tokenized_datasets['valid'])}",
    f"Test samples: {len(tokenized_datasets['test'])}",
    sep="\n",
)

# Inspect which fields a tokenized record carries
print("\n📊 Sample tokenized data:")
first_train_row = tokenized_datasets['train'][0]
print(first_train_row.keys())

🔧 Tokenizing dataset...
Tokenizing train split...


Map:   0%|          | 0/46801 [00:00<?, ? examples/s]

Tokenizing test split...


Map:   0%|          | 0/2601 [00:00<?, ? examples/s]

Tokenizing valid split...


Map:   0%|          | 0/2600 [00:00<?, ? examples/s]

✅ Dataset tokenized!
Training samples: 46801
Validation samples: 2600
Test samples: 2601

📊 Sample tokenized data:
dict_keys(['input_ids', 'attention_mask'])


In [22]:
# Training arguments for the full fine-tuning run
training_args = TrainingArguments(
    output_dir="./alpacare-medical-model",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    # NOTE(review): no evaluation strategy is set here, so eval_steps looks
    # inert; enable a step-based evaluation strategy to evaluate periodically.
    eval_steps=100,
    learning_rate=2e-4,
    warmup_steps=100,
    # The string "none" disables logging integrations such as wandb; passing
    # None does not turn them off.
    report_to="none",
    remove_unused_columns=False,
)

# The tokenized datasets hold only input_ids/attention_mask. This collator pads
# each batch and derives the causal-LM `labels` from the input ids (mlm=False),
# which the Trainer needs in order to compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("✅ Training setup complete!")
print("Training configuration ready!")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ Training setup complete!
Training configuration ready!


  trainer = Trainer(


In [23]:
print("🚀 Starting QUICK training demo...")

# Small subsets keep the demo fast
small_train_dataset = tokenized_datasets["train"].select(range(200))
small_eval_dataset = tokenized_datasets["valid"].select(range(50))

# Minimal arguments: a fixed, short number of optimisation steps
simple_training_args = TrainingArguments(
    output_dir="./alpacare-medical-model",
    per_device_train_batch_size=2,
    max_steps=20,
    logging_steps=5,
    learning_rate=2e-4,
    report_to="none",
)

# Pads each batch and builds causal-LM labels from the input ids (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=simple_training_args,
    train_dataset=small_train_dataset,
    # Previously built but never used; attaching it lets `trainer.evaluate()`
    # run on the demo validation subset. It does not change the training loop.
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
)

print("Training on 200 samples for 20 steps...")
trainer.train()
print("✅ Quick training demo completed!")

🚀 Starting QUICK training demo...
Training on 200 samples for 20 steps...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,7.2355


Step,Training Loss
5,7.2355
10,6.8291
15,6.3324
20,5.6347


✅ Quick training demo completed!


In [24]:
# Write the demo model and its tokenizer into the same output directory
adapter_dir = "./alpacare-demo-adapter"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

print("✅ Demo adapter saved!")
print("📁 You can use this for testing now!")

✅ Demo adapter saved!
📁 You can use this for testing now!
