# Healthcare Chatbot Fine-tuning Pipeline

This notebook demonstrates how to fine-tune a transformer model for healthcare domain-specific conversations.


In [2]:
import json
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


ModuleNotFoundError: No module named 'pandas'

## Preprocessing and data loading


In [None]:
# Load the larger healthcare dataset from Hugging Face and convert every
# example into the single-string "Patient/Doctor" training format.
import re  # imported here so this cell runs on a fresh kernel

from datasets import load_dataset

# Matches everything except word characters, whitespace and ". ? !".
# Compiled once instead of being re-parsed for every message.
DISALLOWED_CHARS = re.compile(r'[^\w\s\.\?\!]')


def clean_message(text):
    """Normalize one chat message.

    Lowercases the text, collapses every run of whitespace into a single
    space (which also trims both ends), then removes every character matched
    by DISALLOWED_CHARS.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message string.
    """
    collapsed = ' '.join(text.lower().split())
    return DISALLOWED_CHARS.sub('', collapsed)


try:
    # Load healthcare chatbot dataset
    health_dataset = load_dataset("shaneperry0101/health-chatbot")
    print(f"Hugging Face dataset size: {len(health_dataset['train'])}")

    # Convert to training format: one {"text": ...} record per example
    hf_training_data = []
    for example in health_dataset['train']:
        user_msg = clean_message(example['input'])
        assistant_msg = clean_message(example['output'])

        formatted_text = f"<|startoftext|>Patient: {user_msg}<|endoftext|>Doctor: {assistant_msg}<|endoftext|>"
        hf_training_data.append({"text": formatted_text})

    print(f"Created {len(hf_training_data)} training examples from Hugging Face")

    # Use the larger dataset for training. Assigned only after the whole
    # conversion succeeded, so a failure never leaves a partial dataset behind.
    training_data = hf_training_data

except Exception as e:
    # Deliberately broad, best-effort fallback: any failure while downloading
    # or converting leaves `training_data` untouched.
    # NOTE(review): this presumably relies on an earlier cell having defined
    # `training_data` (the "custom dataset") - confirm, otherwise later cells
    # will fail with a NameError.
    print(f"Could not load Hugging Face dataset: {e}")
    print("Using custom dataset instead")


## Model configuration

In [None]:
# Model configuration - using GPT-2 for better healthcare performance
MODEL_NAME = "gpt2"  # or "gpt2-medium" for better quality
MAX_LENGTH = 512

# Load the tokenizer that matches the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register the conversation markers as special tokens so each one is kept
# as a single token instead of being split into sub-word pieces.
special_tokens = {
    "additional_special_tokens": ["<|startoftext|>", "<|endoftext|>", "Patient:", "Doctor:"]
}
tokenizer.add_special_tokens(special_tokens)

# Configure padding token: fall back to the end-of-sequence token when the
# checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {MODEL_NAME}")
# len(tokenizer) includes the tokens added above; tokenizer.vocab_size only
# reports the base vocabulary and would under-count here.
# NOTE: a model used with this tokenizer must have its embedding matrix
# resized to len(tokenizer), e.g. model.resize_token_embeddings(len(tokenizer)).
print(f"Vocabulary size: {len(tokenizer)}")
print(f"Special tokens added: {special_tokens['additional_special_tokens']}")

# Test tokenization: round-trip a sample prompt through encode/decode
sample_text = "<|startoftext|>Patient: I have a headache<|endoftext|>Doctor:"
tokens = tokenizer.encode(sample_text)
print(f"Sample tokenization: {tokens}")
print(f"Decoded: {tokenizer.decode(tokens)}")
