In [None]:
%%capture
!pip install --upgrade pip
!pip install torch torchvision torchaudio
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install evaluate==0.4.1
!pip install wandb

In [None]:
import torch
import platform
import os
from huggingface_hub import login
from dotenv import load_dotenv

# Merge key/value pairs from a local .env file into the process environment
load_dotenv()

# Look up the service credentials; each is None when its variable is not set
HF_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
WANDB_API_KEY = os.environ.get('WANDB_API_KEY')

In [None]:
# Report OS/architecture, interpreter version and PyTorch version in one call
print(
    f"System: {platform.system()} {platform.machine()}\n"
    f"Python version: {platform.python_version()}\n"
    f"PyTorch version: {torch.__version__}"
)

In [None]:
# Choose the compute device by priority: Apple MPS, then CUDA, then CPU.
# The CUDA probe only runs when MPS is unavailable (conditional short-circuit).
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Announce which backend was picked
if device.type == "mps":
    print("🚀 MPS (Metal Performance Shaders) available - using GPU acceleration")
elif device.type == "cuda":
    print("🔥 CUDA available - using NVIDIA GPU")
else:
    print("💻 Using CPU - training will be slower but still functional")

print(f"Selected device: {device}")

In [None]:
# Run-wide settings; later cells read them back through os.getenv()
experiment_config = dict(
    WANDB_PROJECT="Supervised-fine-tune-models",
    WANDB_NOTES="MacBook PHI-2 fine-tuning",
    WANDB_NAME="sft-phi2-dialogsum-macbook",
    MODEL_NAME="microsoft/phi-2",
    DATASET_NAME="neil-code/dialogsum-test",
)

# Export every setting as an environment variable in one step
os.environ.update(experiment_config)

In [None]:
from datasets import load_dataset

print("Loading dataset...")
original_dataset = load_dataset(os.getenv('DATASET_NAME'))
print("Dataset loaded successfully!")

# Rows kept from the start of every split (small on purpose: MacBook runtime)
SAMPLE_SIZE = 50

# Take the first SAMPLE_SIZE rows of each split, keyed by split name
dataset_splits = {
    split_name: original_dataset[split_name].select(range(SAMPLE_SIZE))
    for split_name in ('train', 'validation', 'test')
}

# Convenience aliases for the individual subsets
train_data = dataset_splits['train']
validation_data = dataset_splits['validation']
test_data = dataset_splits['test']

print(f"Dataset prepared with {SAMPLE_SIZE} samples each:")
print("\n".join(
    f"  {split_name}: {len(subset)} samples"
    for split_name, subset in dataset_splits.items()
))


In [None]:
from transformers import AutoTokenizer
# Background on the padding choice:
# https://github.com/huggingface/transformers/issues/18388
phi2_tokenizer = AutoTokenizer.from_pretrained(
    os.getenv('MODEL_NAME'),
    use_fast=False,
    padding_side='left',
    add_bos_token=True,
    add_eos_token=True,
)
# Reuse the end-of-sequence token for padding
phi2_tokenizer.pad_token = phi2_tokenizer.eos_token

# Same object under the short name as well
tokenizer = phi2_tokenizer
print(f"✅ Tokenizer initialized with vocab size: {len(phi2_tokenizer)}")

In [None]:
from functools import partial
from transformers import set_seed, DataCollatorForLanguageModeling

In [None]:
# One seed for the whole notebook: passed to set_seed() here and reused by
# preprocess_dataset() as the seed of its dataset shuffle.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

def format_prompt(sample):
    """Attach the instruction-style training text to a dialogue/summary record.

    Reads ``sample["dialogue"]`` and ``sample["summary"]``, writes the assembled
    prompt to ``sample["text"]`` and returns the same (mutated) mapping.
    The dialogue section is left out entirely when the dialogue is empty/falsy.
    """
    intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    instruction = "### Instruct: Summarize the below conversation."
    output_marker = "### Output:"
    end_marker = "### End"

    # Sections in order; the intro carries a leading newline
    segments = ["\n" + intro, instruction]

    dialogue = sample["dialogue"]
    if dialogue:
        segments.append(f"{dialogue}")

    segments.append(f"{output_marker}\n{sample['summary']}")
    segments.append(end_marker)

    # Sections are separated by one blank line
    sample["text"] = "\n\n".join(segments)
    return sample

def tokenize_function(examples, tokenizer, max_length):
    """Run `tokenizer` over ``examples["text"]``.

    Truncation is enabled with `max_length` as the limit and padding is
    disabled. Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": max_length,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

def preprocess_dataset(dataset, tokenizer, max_length=1024):
    """Format, tokenize, length-filter and shuffle one dataset split.

    Steps, in order:
      1. ``format_prompt`` is mapped over every row (adds the ``text`` column).
      2. ``tokenize_function`` is mapped in batches; the ``id``, ``topic``,
         ``dialogue`` and ``summary`` columns are removed.
      3. Rows whose ``input_ids`` length is not strictly below `max_length`
         are dropped.
      4. The result is shuffled with ``RANDOM_SEED``.

    Returns the shuffled dataset.
    """
    def _is_short_enough(row):
        return len(row["input_ids"]) < max_length

    print("Applying prompt formatting...")
    with_prompts = dataset.map(format_prompt)

    print("Tokenizing dataset...")
    encoded = with_prompts.map(
        partial(tokenize_function, tokenizer=tokenizer, max_length=max_length),
        batched=True,
        remove_columns=['id', 'topic', 'dialogue', 'summary'],
    )

    # Drop over-long rows, then shuffle reproducibly
    return encoded.filter(_is_short_enough).shuffle(seed=RANDOM_SEED)

# Set up data collator
# Built on the training tokenizer with mlm=False (i.e. not the masked-LM mode).
# NOTE(review): phi2_tokenizer.pad_token was set to its eos_token when the
# tokenizer was created — confirm how this collator treats pad positions when
# it builds the labels, since EOS and PAD share one id here.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=phi2_tokenizer,
    mlm=False,
)

In [None]:
from transformers import AutoModelForCausalLM

def load_model_for_macbook(model_name, device):
    """Load a causal-LM checkpoint with settings chosen from ``device.type``.

    * ``"mps"``  – float16, low CPU memory usage, then moved to `device` by hand
      (no ``device_map``).
    * ``"cuda"`` – float16 with ``device_map="auto"``.
    * otherwise  – float32, low CPU memory usage (CPU fallback).

    Returns the loaded model.
    """
    print(f"Loading model on {device}...")

    backend = device.type
    if backend == "mps":
        # Apple Silicon: no device_map="auto" for Phi models
        load_kwargs = dict(
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
    elif backend == "cuda":
        # NVIDIA GPU: let device_map place the weights
        load_kwargs = dict(
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        # CPU fallback: full precision
        load_kwargs = dict(
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    if backend == "mps":
        # Explicit transfer, since no device_map was used
        model = model.to(device)

    print(f"✅ Model loaded successfully on {model.device}")
    return model

# Load the base model
# MODEL_NAME was exported to the environment by the experiment_config cell.
base_model = load_model_for_macbook(os.getenv('MODEL_NAME'), device)

# Get max sequence length
# Falls back to 2048 when the model config has no `max_position_embeddings`
# attribute; the value is later passed to preprocess_dataset() as max_length.
max_seq_length = getattr(base_model.config, 'max_position_embeddings', 2048)
print(f"Max sequence length: {max_seq_length}")

In [None]:
print("Preprocessing datasets...")

# Run the full preprocessing pipeline on the training subset, then on the
# validation subset, using max_seq_length as the token limit for both.
train_dataset = preprocess_dataset(
    dataset=dataset_splits['train'],
    tokenizer=phi2_tokenizer,
    max_length=max_seq_length,
)
eval_dataset = preprocess_dataset(
    dataset=dataset_splits['validation'],
    tokenizer=phi2_tokenizer,
    max_length=max_seq_length,
)

# Row counts that survived the length filter
print(
    f"Training samples: {len(train_dataset)}",
    f"Evaluation samples: {len(eval_dataset)}",
    sep="\n",
)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# Enable gradient checkpointing for memory efficiency
# Done on the base model BEFORE it is wrapped by get_peft_model() below.
base_model.gradient_checkpointing_enable()

# LoRA configuration optimized for MacBook
lora_config = LoraConfig(
    r=8,  # Reduced rank for MacBook memory efficiency
    lora_alpha=16,
    # Names of the sub-modules that receive LoRA adapters.
    # NOTE(review): these must match the layer names of the loaded Phi-2
    # implementation — confirm against the model's module list.
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2'
    ],
    bias="none",
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA
# peft_model wraps base_model; the Trainer below is built on peft_model.
peft_model = get_peft_model(base_model, lora_config)
# Report how many parameters are trainable after wrapping
peft_model.print_trainable_parameters()

print("✅ LoRA configuration applied successfully!")

In [None]:
from transformers import TrainingArguments, Trainer
import time

def get_training_args_for_device(device):
    """Build the `TrainingArguments` for the given compute device.

    Only ``device.type`` is inspected: ``"mps"`` and ``"cuda"`` get their own
    profiles, anything else gets the CPU profile. Settings shared by all
    profiles are defined once and merged with the per-device overrides.

    Environment variables read:
        WANDB_NAME     – used as both ``output_dir`` and ``run_name``.
        WANDB_API_KEY  – Weights & Biases reporting is enabled only when set ...
        WANDB_DISABLED – ... and this is not the string ``'true'``.

    Args:
        device: a ``torch.device`` (or any object with a ``type`` attribute).

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    # Determine if wandb should be used
    use_wandb = os.getenv('WANDB_API_KEY') and os.getenv('WANDB_DISABLED') != 'true'
    # The *string* "none" switches every reporting integration off. Passing
    # None instead makes TrainingArguments fall back to its default, which in
    # the pinned transformers version means "all installed integrations" —
    # i.e. W&B would still be attached even without an API key.
    report_to = "wandb" if use_wandb else "none"

    run_name = os.getenv("WANDB_NAME")

    # Settings identical on every device
    shared_args = dict(
        output_dir=run_name,
        overwrite_output_dir=True,
        learning_rate=5e-5,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_strategy="steps",
        logging_steps=5,
        evaluation_strategy="steps",
        save_strategy="steps",
        report_to=report_to,
        run_name=run_name,
    )

    # Schedule shared by the two accelerator profiles (MPS and CUDA).
    # A positive max_steps takes precedence over num_train_epochs (see the
    # TrainingArguments documentation), so the run stops after 50 steps.
    accelerator_args = dict(
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        num_train_epochs=2,
        max_steps=50,
        warmup_steps=10,
        eval_steps=25,
        save_steps=50,  # multiple of eval_steps, needed by load_best_model_at_end
        save_total_limit=2,
        load_best_model_at_end=True,
    )

    if device.type == "mps":
        # Apple Silicon optimized settings
        device_args = dict(
            accelerator_args,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            fp16=False,  # MPS doesn't support fp16 well
            dataloader_pin_memory=False,
            dataloader_num_workers=0,  # Avoid multiprocessing issues on macOS
        )
    elif device.type == "cuda":
        # NVIDIA GPU settings
        # NOTE(review): load_model_for_macbook() also loads the weights in
        # float16 on CUDA — confirm that fp16 mixed-precision training works
        # with half-precision trainable weights in the pinned library versions.
        device_args = dict(
            accelerator_args,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            fp16=True,
        )
    else:
        # CPU settings: shorter schedule, no gradient checkpointing flag,
        # no best-model reload
        device_args = dict(
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=2,
            num_train_epochs=1,  # Reduced for CPU
            max_steps=20,
            warmup_steps=5,
            eval_steps=10,
            save_steps=20,
            save_total_limit=1,
            dataloader_num_workers=0,
        )

    return TrainingArguments(**shared_args, **device_args)

# Get training arguments
# Profile is chosen from the `device` selected at the top of the notebook.
training_args = get_training_args_for_device(device)

# Disable cache for training
peft_model.config.use_cache = False

# Initialize trainer
# Wires together the LoRA-wrapped model, the device-specific arguments, the
# preprocessed train/validation subsets and the language-modeling collator.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print(f"✅ Trainer configured for {device} with batch size {training_args.per_device_train_batch_size}")

In [None]:
import wandb

# Initialize Weights & Biases if API key is available
if os.getenv('WANDB_API_KEY'):
    wandb.login(key=os.getenv('WANDB_API_KEY'))
    wandb.init(
        project=os.getenv('WANDB_PROJECT'),
        name=os.getenv('WANDB_NAME'),
        notes=os.getenv('WANDB_NOTES'),
        config={
            "model_name": os.getenv('MODEL_NAME'),
            "dataset_name": os.getenv('DATASET_NAME'),
            "device": str(device),
            # Log the values the data and seed cells actually used, so the
            # recorded config cannot drift from the run when they are changed.
            "sample_size": SAMPLE_SIZE,
            "random_seed": RANDOM_SEED,
        }
    )
    print("✅ Weights & Biases initialized successfully!")
else:
    print("⚠️ WANDB_API_KEY not found - training without W&B logging")
    # Disable wandb for anything that consults this variable from here on.
    # NOTE(review): TrainingArguments and the Trainer were already constructed
    # in the previous cell — confirm whether they still honour a value set
    # this late.
    os.environ['WANDB_DISABLED'] = 'true'

In [None]:

# Announce the run configuration before the (potentially long) training call
print("🚀 Starting training...")
print(f"Device: {device}")
# Per-device micro-batch size multiplied by the gradient-accumulation steps
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

# Wall-clock timing around the whole trainer.train() call
start_time = time.time()
trainer.train()
end_time = time.time()

training_duration = end_time - start_time
print(f"✅ Training completed in {training_duration:.2f} seconds!")

In [None]:
# Save the trained model and the training tokenizer side by side in a
# directory named after the run (the same WANDB_NAME value that is used as
# TrainingArguments.output_dir).
model_save_path = f"./{os.getenv('WANDB_NAME')}"
trainer.save_model(model_save_path)
phi2_tokenizer.save_pretrained(model_save_path)
print(f"Model saved locally to: {model_save_path}")


In [None]:
from peft import PeftModel

def load_model_for_inference(base_model_name, peft_model_path, device):
    """Reload the base checkpoint and attach the saved LoRA adapter to it.

    Args:
        base_model_name: identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        peft_model_path: directory passed to ``PeftModel.from_pretrained``.
        device: only ``device.type`` is inspected, except on MPS where the
            base model is also moved to `device`.

    Returns:
        The ``PeftModel`` wrapping the freshly loaded base model.
    """
    backend = device.type
    load_kwargs = {"trust_remote_code": True}

    if backend == "mps":
        # For MPS, don't use device_map="auto"; half precision, moved manually below
        load_kwargs["torch_dtype"] = torch.float16
    else:
        # float32 on CPU, float16 otherwise; automatic placement only on CUDA
        load_kwargs["torch_dtype"] = torch.float32 if backend == "cpu" else torch.float16
        load_kwargs["device_map"] = "auto" if backend == "cuda" else None

    backbone = AutoModelForCausalLM.from_pretrained(base_model_name, **load_kwargs)
    if backend == "mps":
        backbone = backbone.to(device)

    # Load LoRA weights on top of the base model
    return PeftModel.from_pretrained(backbone, peft_model_path)

# Load tokenizer for inference - use slow tokenizer to avoid BOS token issue
# Read back from model_save_path, where the training tokenizer was saved.
inference_tokenizer = AutoTokenizer.from_pretrained(
    model_save_path,
    trust_remote_code=True,
    use_fast=False  # Use slow tokenizer to avoid BOS token error
)
# Same pad/eos sharing as the training tokenizer
inference_tokenizer.pad_token = inference_tokenizer.eos_token

# Load fine-tuned model
# This loads a second copy of the base checkpoint; the training objects
# (base_model / peft_model / trainer) are still referenced at this point.
inference_model = load_model_for_inference(
    os.getenv('MODEL_NAME'),
    model_save_path,
    device
)

print("Model loaded for inference!")

In [None]:
def generate_summary(model, tokenizer, prompt, max_new_tokens=150, device=device):
    """Generate text for `prompt` and return the decoded first sequence.

    The prompt is tokenized (truncated to 1024 tokens) and, unless `device` is
    the CPU, its tensors are moved to `device`. The model is switched to eval
    mode and generation runs without gradient tracking using greedy decoding.
    If generation raises a ``RuntimeError`` mentioning "probability tensor",
    one retry with 2-beam search is made; any other ``RuntimeError`` is
    re-raised.

    Returns:
        The decoded text of ``outputs[0]`` with special tokens skipped. The
        decoded sequence is returned as-is; callers in this notebook split it
        on the output marker to isolate the summary.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)

    if device.type != "cpu":
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()

    # Options common to the primary attempt and the fallback
    shared_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
        use_cache=True,
    )

    with torch.no_grad():
        try:
            # Greedy decoding, single returned sequence
            outputs = model.generate(
                **encoded,
                **shared_kwargs,
                temperature=1.0,
                num_return_sequences=1,
            )
        except RuntimeError as e:
            if "probability tensor" not in str(e):
                raise e
            # Retry once with beam search
            print("⚠️ Sampling failed, trying beam search...")
            outputs = model.generate(
                **encoded,
                **shared_kwargs,
                num_beams=2,
            )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
test_sample = dataset_splits['test'][0]
test_dialogue = test_sample['dialogue']
ground_truth = test_sample['summary']

# Create test prompt with the SAME template the model was fine-tuned on.
# format_prompt() is the single source of truth for that template: render it
# with an empty summary and cut right after the output marker, so the model is
# asked to continue exactly where the training targets started. The empty
# summary guarantees the last marker occurrence is the template's own.
output_marker = "### Output:\n"
training_text = format_prompt({"dialogue": test_dialogue, "summary": ""})["text"]
test_prompt = training_text.rsplit(output_marker, 1)[0] + output_marker

# Generate summary
print("Generating summary...")
generated_output = generate_summary(
    inference_model,
    inference_tokenizer,
    test_prompt,
    max_new_tokens=100
)

# Extract the summary part: text after the output marker, up to the end marker.
# str.partition never raises, so no blanket `except` is needed; when the marker
# is missing from the decoded text the raw output is shown instead.
_, found_marker, completion = generated_output.partition(output_marker)
if found_marker:
    generated_summary = completion.split("### End")[0].strip()
else:
    generated_summary = generated_output

# Display results
print("=" * 80)
print("INPUT DIALOGUE:")
print(test_dialogue)
print("\n" + "=" * 80)
print("GROUND TRUTH SUMMARY:")
print(ground_truth)
print("\n" + "=" * 80)
print("GENERATED SUMMARY:")
print(generated_summary)
print("=" * 80)

In [None]:
# Fix wandb socket issues by properly finishing any active runs
try:
    import wandb
    # Only call finish() when a run object is actually present
    if wandb.run is not None:
        print("🔄 Finishing active wandb run...")
        wandb.finish()
        print("✅ Wandb run finished")
    
    # Disable wandb to prevent socket errors
    os.environ['WANDB_DISABLED'] = 'true'
    os.environ['WANDB_MODE'] = 'disabled'
    print("🚫 Wandb disabled to prevent socket errors")
    
except Exception as e:
    # Best effort: report the problem, then still switch wandb off below
    print(f"⚠️ Error handling wandb: {e}")
    # Force disable wandb
    os.environ['WANDB_DISABLED'] = 'true'
    os.environ['WANDB_MODE'] = 'disabled'

# Run a garbage-collection pass. gc.collect() is generic Python garbage
# collection and has no wandb-specific effect.
import gc
gc.collect()

print("🔧 Wandb issues resolved - ready to continue testing")

In [None]:
_OUTPUT_MARKER = "### Output:\n"
_END_MARKER = "### End"


def _build_inference_prompt(dialogue):
    """Return the fine-tuning prompt for `dialogue`, cut right after the output marker."""
    # Reuse format_prompt() so inference sees exactly the training template;
    # the empty summary guarantees the LAST marker occurrence is the template's.
    templated = format_prompt({"dialogue": dialogue, "summary": ""})["text"]
    return templated.rsplit(_OUTPUT_MARKER, 1)[0] + _OUTPUT_MARKER


def _extract_summary(generated_output):
    """Return the text between the output marker and the end marker, stripped."""
    _, found_marker, completion = generated_output.partition(_OUTPUT_MARKER)
    summary = completion.split(_END_MARKER)[0].strip() if found_marker else ""
    # Fall back to the whole decoded text when nothing usable was extracted
    return summary or generated_output.strip()


def _mean(values):
    """Arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


def evaluate_model_performance(model, tokenizer, test_dataset, num_samples=10):
    """Generate summaries for the first rows of `test_dataset` and report simple stats.

    For each of the first ``min(num_samples, len(test_dataset))`` rows the
    dialogue is wrapped in the training prompt template, a summary is generated
    with ``generate_summary`` (100 new tokens max) and timed, and word-count
    based metrics are recorded. A row whose generation raises is reported and
    skipped; it does not abort the evaluation.

    Args:
        model: model passed through to ``generate_summary``.
        tokenizer: tokenizer passed through to ``generate_summary``.
        test_dataset: indexable collection of rows with ``dialogue`` and
            ``summary`` fields.
        num_samples: upper bound on the number of rows evaluated.

    Returns:
        A ``defaultdict(list)`` with the per-sample lists ``generation_time``,
        ``dialogue_length``, ``summary_length``, ``ground_truth_length`` and
        ``compression_ratio`` (one entry per successfully processed row).
    """
    print("📊 MODEL PERFORMANCE EVALUATION")
    print("=" * 80)

    import time
    from collections import defaultdict

    metrics = defaultdict(list)

    for i in range(min(num_samples, len(test_dataset))):
        test_sample = test_dataset[i]
        test_dialogue = test_sample['dialogue']
        ground_truth = test_sample['summary']

        try:
            test_prompt = _build_inference_prompt(test_dialogue)

            # Measure generation time
            start_time = time.time()
            generated_output = generate_summary(
                model,
                tokenizer,
                test_prompt,
                max_new_tokens=100
            )
            generation_time = time.time() - start_time

            generated_summary = _extract_summary(generated_output)

            # Calculate basic metrics (whitespace-separated word counts)
            dialogue_length = len(test_dialogue.split())
            summary_length = len(generated_summary.split())
            ground_truth_length = len(ground_truth.split())
            # 0 stands in for "undefined" when the summary is empty
            compression_ratio = dialogue_length / summary_length if summary_length > 0 else 0

            # Store metrics
            metrics['generation_time'].append(generation_time)
            metrics['dialogue_length'].append(dialogue_length)
            metrics['summary_length'].append(summary_length)
            metrics['ground_truth_length'].append(ground_truth_length)
            metrics['compression_ratio'].append(compression_ratio)

            print(f"Sample {i+1}: ⏱️{generation_time:.2f}s | 📝{dialogue_length}→{summary_length} words | 🗜️{compression_ratio:.1f}x")

        except Exception as e:
            # Deliberate best effort: one failing sample must not end the run
            print(f"❌ Sample {i+1} failed: {str(e)}")

    # Calculate and display averages
    if metrics['generation_time']:
        print("\n📈 PERFORMANCE METRICS")
        print("-" * 60)
        print(f"Average generation time: {_mean(metrics['generation_time']):.2f} seconds")
        print(f"Average dialogue length: {_mean(metrics['dialogue_length']):.1f} words")
        print(f"Average generated summary: {_mean(metrics['summary_length']):.1f} words")
        print(f"Average ground truth: {_mean(metrics['ground_truth_length']):.1f} words")
        print(f"Average compression ratio: {_mean(metrics['compression_ratio']):.1f}x")
        print(f"Total samples processed: {len(metrics['generation_time'])}")

        # Generation speed; skipped when the clock measured no elapsed time,
        # which would otherwise divide by zero
        total_generation_time = sum(metrics['generation_time'])
        if total_generation_time > 0:
            words_per_second = sum(metrics['summary_length']) / total_generation_time
            print(f"Generation speed: {words_per_second:.1f} words/second")

    return metrics

# Run performance evaluation
# Uses the reloaded fine-tuned model and the 50-row test subset; only the
# first 8 rows are evaluated.
performance_metrics = evaluate_model_performance(
    inference_model, 
    inference_tokenizer, 
    dataset_splits['test'], 
    num_samples=8
)