# ⚠️ DEPRECATED NOTEBOOK

**This notebook is outdated.** Use the production-ready Python scripts instead:

```bash
# Multi-GPU training (recommended)
python scripts/train_multigpu.py

# Single GPU (memory optimized)  
python scripts/train_memory_diet.py
```

See the main [README.md](../README.md) for current setup instructions.

---

# Friday AI Assistant - LoRA Fine-tuning on SageMaker

This notebook demonstrates how to fine-tune Meta-Llama-3.1-8B-Instruct using LoRA on AWS SageMaker for your Friday AI assistant.

## Prerequisites
1. AWS SageMaker Studio or Notebook Instance
2. IAM permissions for SageMaker and S3
3. Your training data (`train.jsonl` and `valid.jsonl`) uploaded to S3 under `<S3_PREFIX>/data/` in your bucket

In [None]:
# Install required packages
!pip install -q sagemaker boto3 huggingface_hub transformers datasets peft

In [None]:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role
import time

# Configuration
BUCKET_NAME = "friday-ai-training"  # Change this to your S3 bucket
REGION = "us-east-1"  # Change to your region
S3_PREFIX = "friday-finetuning"

# SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# REGION is not passed to the session: the session resolves its own region from
# the ambient AWS configuration. Surface a mismatch early so the region printed
# here (and any console link built from REGION) is not silently wrong.
session_region = sagemaker_session.boto_region_name
if session_region != REGION:
    print(f"⚠️ REGION is set to {REGION!r} but the SageMaker session uses {session_region!r}")

print(f"SageMaker role: {role}")
print(f"Region: {REGION}")
print(f"S3 bucket: {BUCKET_NAME}")

## Step 1: Verify Data Upload

In [None]:
# Shared S3 client used for the existence checks on the training data
s3 = boto3.client(service_name="s3")

def check_s3_file(bucket: str, key: str) -> bool:
    """Report whether an object exists in S3, printing its size when it does.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key inside the bucket.

    Returns:
        True when the HEAD request succeeds; False when S3 answers with an
        error (object missing, access denied, ...). Failures that are not S3
        service errors (e.g. missing credentials, interrupts) propagate so
        they are not misreported as a missing file.
    """
    try:
        response = s3.head_object(Bucket=bucket, Key=key)
    except s3.exceptions.ClientError as err:
        code = str(err.response.get("Error", {}).get("Code", ""))
        if code in ("404", "NoSuchKey", "NotFound"):
            print(f"❌ Missing: s3://{bucket}/{key}")
        else:
            # e.g. 403: the object may exist but cannot be inspected by this role
            print(f"❌ Cannot access: s3://{bucket}/{key} (error code: {code or 'unknown'})")
        return False

    size = response['ContentLength']
    print(f"✅ Found: s3://{bucket}/{key} ({size:,} bytes)")
    return True

# Probe both dataset splits; every key is checked so each one gets its own status line
data_keys = [f"{S3_PREFIX}/data/train.jsonl", f"{S3_PREFIX}/data/valid.jsonl"]
train_exists, valid_exists = [check_s3_file(BUCKET_NAME, key) for key in data_keys]

if train_exists and valid_exists:
    print("\n✅ All training data found!")
else:
    print("\n❌ Please upload your training data first using the prepare_sagemaker_data.py script")

## Step 2: Create Training Script

In [None]:
# Create the training script.
# The text below is written verbatim to train.py and executed inside the
# training container, so it must be a complete, standalone Python program.
training_script = """
import os
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import argparse

def load_dataset(file_path):
    \"\"\"Load JSONL dataset\"\"\"
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data

def format_messages(example):
    \"\"\"Format messages into training text\"\"\"
    messages = example[\"messages\"]
    text = \"\"
    for msg in messages:
        if msg[\"role\"] == \"system\":
            text += f\"<|start_header_id|>system<|end_header_id|>\\n{msg['content']}<|eot_id|>\"
        elif msg[\"role\"] == \"user\":
            text += f\"<|start_header_id|>user<|end_header_id|>\\n{msg['content']}<|eot_id|>\"
        elif msg[\"role\"] == \"assistant\":
            text += f\"<|start_header_id|>assistant<|end_header_id|>\\n{msg['content']}<|eot_id|>\"
    return {\"text\": text}

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(\"--model-name\", type=str, default=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")
    parser.add_argument(\"--train-data\", type=str, default=\"/opt/ml/input/data/training/train.jsonl\")
    parser.add_argument(\"--valid-data\", type=str, default=\"/opt/ml/input/data/training/valid.jsonl\")
    parser.add_argument(\"--output-dir\", type=str, default=\"/opt/ml/model\")
    parser.add_argument(\"--epochs\", type=int, default=3)
    parser.add_argument(\"--batch-size\", type=int, default=2)
    parser.add_argument(\"--learning-rate\", type=float, default=2e-4)
    parser.add_argument(\"--lora-rank\", type=int, default=16)
    parser.add_argument(\"--lora-alpha\", type=int, default=32)

    args = parser.parse_args()

    print(f\"🚀 Starting training with {args.model_name}\")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        torch_dtype=torch.bfloat16,
        device_map=\"auto\",
        trust_remote_code=True,
        token=True
    )

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        lora_dropout=0.1,
        target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"]
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Load and process datasets
    print(\"📊 Loading datasets...\")
    train_data = load_dataset(args.train_data)
    valid_data = load_dataset(args.valid_data)

    print(f\"📈 Train examples: {len(train_data)}\")
    print(f\"📊 Valid examples: {len(valid_data)}\")

    train_dataset = Dataset.from_list(train_data).map(format_messages)
    valid_dataset = Dataset.from_list(valid_data).map(format_messages)

    # Tokenize datasets
    def tokenize_function(examples):
        # Produce plain Python lists: the sequences in a batch have different
        # lengths and are not padded here, so they cannot be stacked into one
        # tensor. Padding happens per batch in the data collator.
        return tokenizer(
            examples[\"text\"],
            truncation=True,
            padding=False,
            max_length=2048
        )

    print(\"🔧 Tokenizing datasets...\")
    # Drop the raw columns (messages/text) so only tokenizer outputs reach the
    # collator; the Trainer is told below not to prune columns itself.
    train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )
    valid_dataset = valid_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=valid_dataset.column_names
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=8,
        warmup_steps=100,
        learning_rate=args.learning_rate,
        bf16=True,
        logging_steps=10,
        eval_steps=50,
        save_steps=100,
        evaluation_strategy=\"steps\",
        save_strategy=\"steps\",
        load_best_model_at_end=True,
        metric_for_best_model=\"eval_loss\",
        greater_is_better=False,
        # The string \"none\" turns off external reporting integrations
        report_to=\"none\",
        remove_unused_columns=False,
        dataloader_pin_memory=False,
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
    )

    # Train
    print(\"🎯 Starting training...\")
    trainer.train()

    # Save model
    print(\"💾 Saving model...\")
    trainer.save_model()
    tokenizer.save_pretrained(args.output_dir)

    print(\"✅ Training completed!\")

if __name__ == \"__main__\":
    main()
"""

# Write the script to a file.
# The script text contains non-ASCII characters (emoji in its log messages),
# so pin the encoding instead of relying on the platform default.
with open("train.py", "w", encoding="utf-8") as f:
    f.write(training_script)

print("✅ Training script created: train.py")

## Step 3: Configure Training Job

In [None]:
# S3 locations: where the job reads its data from and where results are written
output_path = f"s3://{BUCKET_NAME}/{S3_PREFIX}/output"
training_input_path = f"s3://{BUCKET_NAME}/{S3_PREFIX}/data"

# Tunable settings; the keys mirror the argparse options declared in train.py
hyperparameters = {
    "epochs": 3, "batch-size": 2,
    "learning-rate": 2e-4,
    "lora-rank": 16, "lora-alpha": 32,
}

# Echo the resolved configuration for a quick visual check
print(f"📁 Training data: {training_input_path}")
print(f"📤 Output path: {output_path}")
print(f"🔧 Hyperparameters: {hyperparameters}")

## Step 4: Create HuggingFace Estimator

In [None]:
# Create HuggingFace estimator
# Single source of truth for values that are both configured and printed below,
# so the summary cannot drift from what is actually requested.
TRAINING_INSTANCE_TYPE = "ml.g5.2xlarge"  # GPU instance for faster training
MAX_RUNTIME_HOURS = 3

# NOTE(review): train.py asks the Hub for an authenticated download, but no Hub
# token is passed into the container environment here — confirm how credentials
# are supplied. Also confirm the pinned transformers version can load the model
# configured in train.py.
huggingface_estimator = HuggingFace(
    entry_point="train.py",
    source_dir=".",
    instance_type=TRAINING_INSTANCE_TYPE,
    instance_count=1,
    role=role,
    # Reuse the session created during setup instead of an implicit new one
    sagemaker_session=sagemaker_session,
    transformers_version="4.36.0",
    pytorch_version="2.1.0",
    py_version="py310",
    hyperparameters=hyperparameters,
    output_path=output_path,
    max_run=MAX_RUNTIME_HOURS * 60 * 60,  # seconds
    volume_size=100,  # GB
    environment={
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
        "HF_HOME": "/tmp/.cache",
    }
)

print("✅ HuggingFace estimator created")
# The hourly cost is a hard-coded estimate; update it if the instance type changes
print(f"💰 Instance type: {TRAINING_INSTANCE_TYPE} (estimated cost: ~$1.50/hour)")
print(f"⏱️  Max runtime: {MAX_RUNTIME_HOURS} hours")

## Step 5: Start Training

In [None]:
# Submit the training job under a unique, timestamp-based name
job_name = f"friday-lora-{int(time.time())}"
console_url = f"https://console.aws.amazon.com/sagemaker/home?region={REGION}#/jobs/{job_name}"

print(f"🚀 Starting training job: {job_name}")
print(f"📊 Monitor progress at: {console_url}")

# wait=False returns as soon as the job is submitted; set it to True to block until completion
channels = {"training": training_input_path}
huggingface_estimator.fit(channels, job_name=job_name, wait=False)

print("\n✅ Training job submitted!")
print(f"🔗 Job name: {job_name}")
print("📈 You can monitor the job in the SageMaker console")

## Step 6: Monitor Training (Optional)

In [None]:
# Monitor training job status
import time

def monitor_training(estimator, check_interval=60):
    """Poll the estimator's latest training job until it reaches a final state.

    Prints a timestamped status line on every poll, then a summary: the model
    artifact location on success, or the final status (plus the failure reason
    when the job description includes one) otherwise.

    Args:
        estimator: Estimator whose ``latest_training_job`` has been started.
        check_interval: Seconds to sleep between status checks.

    Raises:
        RuntimeError: If the estimator has no training job to monitor.
    """
    job = estimator.latest_training_job
    if job is None:
        raise RuntimeError("No training job to monitor; call fit() on the estimator first")

    terminal_states = ('Completed', 'Failed', 'Stopped')
    while True:
        # Keep the whole description so the failure reason is available afterwards
        description = job.describe()
        status = description['TrainingJobStatus']
        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Status: {status}")

        if status in terminal_states:
            break

        time.sleep(check_interval)

    if status == 'Completed':
        print("\n✅ Training completed successfully!")
        print(f"📤 Model artifacts: {estimator.model_data}")
    else:
        print(f"\n❌ Training {status.lower()}")
        reason = description.get('FailureReason')
        if reason:
            print(f"   Reason: {reason}")

# Uncomment to monitor the job
# monitor_training(huggingface_estimator)

## Step 7: Deploy Model (After Training Completes)

In [None]:
# Deploy the trained model (run this after training completes)
# predictor = huggingface_estimator.deploy(
#     initial_instance_count=1,
#     instance_type="ml.g5.xlarge",
#     endpoint_name=f"friday-endpoint-{int(time.time())}"
# )

# print(f"✅ Model deployed to endpoint: {predictor.endpoint_name}")

## Estimated Costs

**Training (ml.g5.2xlarge):**
- ~$1.50/hour
- Expected training time: 1-2 hours
- **Total training cost: ~$2-3**

**Inference (ml.g5.xlarge):**
- ~$0.75/hour
- Only when endpoint is running

**Storage (S3):**
- ~$0.02/GB/month
- Model artifacts ~10-20GB

## Next Steps

1. Run this notebook in SageMaker Studio
2. Monitor training in SageMaker console
3. Download model artifacts after training
4. Deploy for inference or use locally