In [None]:
import torch
import platform
import os
from huggingface_hub import login
from dotenv import load_dotenv

# Read variables from a local .env file so they are visible via os.environ.
load_dotenv()

# Hugging Face token and WANDB API key; each is None when its variable is unset.
HF_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
WANDB_API_KEY = os.environ.get('WANDB_API_KEY')

In [None]:
# Banner printed for each backend the notebook knows how to use.
_DEVICE_BANNERS = {
    "mps": "🚀 MPS (Metal Performance Shaders) available - using GPU acceleration",
    "cuda": "🔥 CUDA available - using NVIDIA GPU",
    "cpu": "💻 Using CPU - training will be slower but still functional",
}

# Preference order: Apple MPS first, then CUDA, otherwise fall back to CPU.
if torch.backends.mps.is_available():
    _backend = "mps"
elif torch.cuda.is_available():
    _backend = "cuda"
else:
    _backend = "cpu"

device = torch.device(_backend)
print(_DEVICE_BANNERS[_backend])

print(f"Selected device: {device}")

In [None]:
from datasets import load_dataset

# The dataset identifier is read from the environment (.env). Fail early with a
# clear message instead of handing None to load_dataset.
DATASET_NAME = os.getenv('DATASET_NAME')
if not DATASET_NAME:
    raise ValueError(
        "DATASET_NAME is not set; define it in the environment or in the .env file"
    )

print("Loading dataset...")
original_dataset = load_dataset(DATASET_NAME)
print("Dataset loaded successfully!")

# Use smaller subsets for faster training on MacBook
SAMPLE_SIZE = 50  # Reduced for MacBook performance

# Take the first SAMPLE_SIZE rows of every split, clamped to the split length so
# a split with fewer rows than SAMPLE_SIZE does not make select() fail.
dataset_splits = {
    split_name: original_dataset[split_name].select(
        range(min(SAMPLE_SIZE, len(original_dataset[split_name])))
    )
    for split_name in ('train', 'validation', 'test')
}

# Individual handles kept for cells that refer to the splits by name.
train_data = dataset_splits['train']
validation_data = dataset_splits['validation']
test_data = dataset_splits['test']

print(f"Dataset prepared with up to {SAMPLE_SIZE} samples each:")
for split, data in dataset_splits.items():
    print(f"  {split}: {len(data)} samples")


In [None]:
from functools import partial
from transformers import set_seed, DataCollatorForLanguageModeling

In [None]:
from transformers import AutoModelForCausalLM

def load_model_for_macbook(model_name, device):
    """Load a causal language model with per-backend settings.

    The notebook targets Phi-2; loading options depend on ``device.type``:

    * ``"mps"``  - float16 weights, ``low_cpu_mem_usage``; the model is moved
      to ``device`` explicitly because ``device_map="auto"`` is not used here.
    * ``"cuda"`` - float16 weights with ``device_map="auto"``.
    * anything else - float32 weights with ``low_cpu_mem_usage``.

    Args:
        model_name: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        device: ``torch.device`` describing where the model should run.

    Returns:
        The loaded model.
    """
    print(f"Loading model on {device}...")

    backend = device.type
    move_manually = backend == "mps"

    # Option shared by every backend; the rest is filled in per backend below.
    load_kwargs = {"trust_remote_code": True}
    if move_manually:
        load_kwargs.update(torch_dtype=torch.float16, low_cpu_mem_usage=True)
    elif backend == "cuda":
        load_kwargs.update(torch_dtype=torch.float16, device_map="auto")
    else:
        # CPU fallback keeps full precision.
        load_kwargs.update(torch_dtype=torch.float32, low_cpu_mem_usage=True)

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    if move_manually:
        model = model.to(device)

    print(f"✅ Model loaded successfully on {model.device}")
    return model

# Load the base model. MODEL_NAME is read from the environment (.env); fail
# early with a clear message instead of passing None on to from_pretrained.
MODEL_NAME = os.getenv('MODEL_NAME')
if not MODEL_NAME:
    raise ValueError(
        "MODEL_NAME is not set; define it in the environment or in the .env file"
    )
base_model = load_model_for_macbook(MODEL_NAME, device)

# Get max sequence length from the model config, defaulting to 2048 when the
# config has no max_position_embeddings attribute.
max_seq_length = getattr(base_model.config, 'max_position_embeddings', 2048)
print(f"Max sequence length: {max_seq_length}")

In [None]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# Gradient checkpointing is switched on for memory efficiency.
base_model.gradient_checkpointing_enable()

# Names of the sub-modules that receive LoRA adapters.
_LORA_TARGET_MODULES = ['q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2']

# LoRA hyper-parameters, kept small (rank 8) for MacBook memory efficiency.
_lora_settings = dict(
    r=8,
    lora_alpha=16,
    target_modules=_LORA_TARGET_MODULES,
    bias="none",
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
)
lora_config = LoraConfig(**_lora_settings)

# Wrap the base model with the adapters and report the trainable parameters.
peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

print("✅ LoRA configuration applied successfully!")

In [None]:

# Announce the run configuration before training starts.
print("🚀 Starting training...")
print(f"Device: {device}")
_effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
print(f"Effective batch size: {_effective_batch_size}")

# Time the training call with wall-clock timestamps taken before and after it.
start_time = time.time()
trainer.train()
end_time = time.time()

training_duration = end_time - start_time
print(f"✅ Training completed in {training_duration:.2f} seconds!")

In [None]:
from peft import PeftModel

def load_model_for_inference(base_model_name, peft_model_path, device):
    """Load the base model and attach the fine-tuned LoRA weights.

    Loading options depend on ``device.type``:

    * ``"mps"`` - float16 weights, no ``device_map``; the model is moved to
      ``device`` explicitly after loading.
    * ``"cuda"`` - float16 weights with ``device_map="auto"``.
    * ``"cpu"`` - float32 weights with ``device_map=None``.
    * any other type - float16 weights with ``device_map=None``.

    Args:
        base_model_name: Identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.
        peft_model_path: Location of the saved adapter, passed to
            ``PeftModel.from_pretrained``.
        device: ``torch.device`` describing where the model should run.

    Returns:
        The object returned by ``PeftModel.from_pretrained``.
    """
    backend = device.type
    move_manually = backend == "mps"

    load_kwargs = {"trust_remote_code": True}
    if move_manually:
        # For MPS, device_map="auto" is not used; the move happens below.
        load_kwargs["torch_dtype"] = torch.float16
    else:
        load_kwargs["torch_dtype"] = (
            torch.float32 if backend == "cpu" else torch.float16
        )
        load_kwargs["device_map"] = "auto" if backend == "cuda" else None

    foundation = AutoModelForCausalLM.from_pretrained(base_model_name, **load_kwargs)
    if move_manually:
        foundation = foundation.to(device)

    # Attach the LoRA weights on top of the freshly loaded base model.
    return PeftModel.from_pretrained(foundation, peft_model_path)

# Resolve the base model identifier before loading anything. MODEL_NAME is read
# from the environment (.env); fail early with a clear message when it is unset
# instead of passing None on to from_pretrained.
MODEL_NAME = os.getenv('MODEL_NAME')
if not MODEL_NAME:
    raise ValueError(
        "MODEL_NAME is not set; define it in the environment or in the .env file"
    )

# Load tokenizer for inference - use slow tokenizer to avoid BOS token issue
inference_tokenizer = AutoTokenizer.from_pretrained(
    model_save_path,
    trust_remote_code=True,
    use_fast=False  # Use slow tokenizer to avoid BOS token error
)
# Reuse the end-of-sequence token for padding.
inference_tokenizer.pad_token = inference_tokenizer.eos_token

# Load fine-tuned model: base weights plus the adapter saved at model_save_path
inference_model = load_model_for_inference(
    MODEL_NAME,
    model_save_path,
    device
)

print("Model loaded for inference!")

In [None]:
# Take the first test example as a smoke test for the fine-tuned model.
test_sample = dataset_splits['test'][0]
test_dialogue = test_sample['dialogue']
ground_truth = test_sample['summary']

# Create test prompt
test_prompt = f"Instruct: Summarize the following conversation.\n{test_dialogue}\nOutput:\n"

# Generate summary
print("Generating summary...")
generated_output = generate_summary(
    inference_model, 
    inference_tokenizer, 
    test_prompt, 
    max_new_tokens=100
)

# Extract the summary part: the text after the first "Output:\n" marker, cut at
# "### End" when present. Only the missing-marker case (IndexError) falls back
# to the raw output; any other error, including KeyboardInterrupt, propagates
# instead of being silently swallowed.
try:
    generated_summary = generated_output.split("Output:\n")[1].split("### End")[0].strip()
except IndexError:
    generated_summary = generated_output

# Display results
print("=" * 80)
print("INPUT DIALOGUE:")
print(test_dialogue)
print("\n" + "=" * 80)
print("GROUND TRUTH SUMMARY:")
print(ground_truth)
print("\n" + "=" * 80)
print("GENERATED SUMMARY:")
print(generated_summary)
print("=" * 80)