LLM FINE-TUNING PROJECT TEMPLATE (HUGGINGFACE)
==============================================
Use Case: Domain-specific LLM, Instruction Tuning, Task-specific Fine-tuning

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Install Required Libraries

In [None]:
# !pip install transformers datasets accelerate peft bitsandbytes
# !pip install torch torchvision torchaudio
# !pip install wandb tensorboard
# !pip install sentencepiece protobuf

## 1.2 Import Libraries

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# HuggingFace
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict, load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)

# Evaluation
from evaluate import load as load_metric
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # Report the name and total memory of GPU 0
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 1.3 Configuration

In [None]:
# Settings are declared per concern and then merged, in this order, into the
# single flat CONFIG dict that the rest of the notebook reads.

# Which checkpoint to fine-tune and which model family it belongs to
_model_settings = {
    'base_model': 'meta-llama/Llama-2-7b-hf',  # or 'mistralai/Mistral-7B-v0.1', 'gpt2', etc.
    'model_type': 'causal',  # 'causal' or 'seq2seq'
}

# Where the data comes from and how long tokenized sequences may be
_data_settings = {
    'dataset_name': 'custom',  # or HF dataset like 'squad', 'alpaca', etc.
    'train_file': 'train.json',
    'val_file': 'val.json',
    'max_length': 512,
    'test_size': 0.1,
}

# Optimizer, schedule, logging and checkpoint cadence
_training_settings = {
    'output_dir': './finetuned_model',
    'num_epochs': 3,
    'batch_size': 4,
    'gradient_accumulation_steps': 4,
    'learning_rate': 2e-4,
    'weight_decay': 0.01,
    'warmup_steps': 100,
    'logging_steps': 10,
    'save_steps': 500,
    'eval_steps': 500,
    'max_grad_norm': 1.0,
    'fp16': True,  # Mixed precision training
}

# LoRA adapter hyperparameters
_lora_settings = {
    'use_lora': True,
    'lora_r': 8,
    'lora_alpha': 16,
    'lora_dropout': 0.05,
    'lora_target_modules': ['q_proj', 'v_proj'],
}

# Sampling parameters used at generation time
_generation_settings = {
    'max_new_tokens': 256,
    'temperature': 0.7,
    'top_p': 0.9,
    'top_k': 50,
}

CONFIG = {
    **_model_settings,
    **_data_settings,
    **_training_settings,
    **_lora_settings,
    **_generation_settings,
    'random_seed': 42,
}

# Seed PyTorch first, then NumPy, with the configured seed
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(CONFIG['random_seed'])

# 2. DATA LOADING & PREPARATION

## 2.1 Load Dataset

In [None]:
def load_custom_dataset(train_file, val_file=None):
    """Load training and validation records from JSON files.

    Args:
        train_file: Path to a JSON file holding the training records.
        val_file: Optional path to a JSON file holding the validation
            records. When falsy, the tail of the training records is held
            out instead, sized by ``CONFIG['test_size']``.

    Returns:
        A ``(train_data, val_data)`` tuple of the decoded JSON payloads.
    """
    # Read as UTF-8 explicitly: the platform default encoding is not UTF-8
    # everywhere, and JSON text corpora routinely contain non-ASCII characters.
    with open(train_file, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    if val_file:
        with open(val_file, 'r', encoding='utf-8') as f:
            val_data = json.load(f)
    else:
        # No validation file: hold out the last `test_size` fraction, in file order
        split_idx = int(len(train_data) * (1 - CONFIG['test_size']))
        val_data = train_data[split_idx:]
        train_data = train_data[:split_idx]

    return train_data, val_data

# Load data
if CONFIG['dataset_name'] == 'custom':
    train_data, val_data = load_custom_dataset(CONFIG['train_file'], CONFIG['val_file'])

    # Create HuggingFace Dataset
    dataset = DatasetDict({
        'train': Dataset.from_list(train_data),
        'validation': Dataset.from_list(val_data)
    })
else:
    # Load from HuggingFace Hub
    dataset = load_dataset(CONFIG['dataset_name'])

    # The rest of the notebook indexes dataset['validation']; some Hub
    # datasets ship without that split, so carve one out of 'train'.
    if 'validation' not in dataset:
        held_out = dataset['train'].train_test_split(
            test_size=CONFIG['test_size'],
            seed=CONFIG['random_seed']
        )
        dataset = DatasetDict({
            **dataset,  # keep any other splits (e.g. 'test') untouched
            'train': held_out['train'],
            'validation': held_out['test']
        })

print(f"Train size: {len(dataset['train'])}")
print(f"Validation size: {len(dataset['validation'])}")
print("\nSample data:")
print(dataset['train'][0])

## 2.2 Data Exploration

In [None]:
# Analyze text lengths
def analyze_text_lengths(dataset, text_field='text'):
    """Plot and print length statistics for the ``'train'`` split.

    Length is the number of whitespace-separated pieces in each sample's
    ``text_field`` (the plots and printout label these as "tokens").

    Args:
        dataset: Mapping with a ``'train'`` entry that yields samples
            indexable by ``text_field``.
        text_field: Name of the field holding the text to measure.
    """
    lengths = []
    for record in dataset['train']:
        lengths.append(len(record[text_field].split()))

    fig = plt.figure(figsize=(12, 5))

    # Left panel: histogram with mean and 95th-percentile markers
    hist_ax = plt.subplot(1, 2, 1)
    hist_ax.hist(lengths, bins=50, edgecolor='black')
    hist_ax.set_xlabel('Number of Tokens')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Text Lengths')
    hist_ax.axvline(
        x=np.mean(lengths),
        color='r',
        linestyle='--',
        label=f'Mean: {np.mean(lengths):.0f}',
    )
    hist_ax.axvline(
        x=np.percentile(lengths, 95),
        color='g',
        linestyle='--',
        label=f'95th percentile: {np.percentile(lengths, 95):.0f}',
    )
    hist_ax.legend()

    # Right panel: box plot of the same lengths
    box_ax = plt.subplot(1, 2, 2)
    box_ax.boxplot(lengths)
    box_ax.set_ylabel('Number of Tokens')
    box_ax.set_title('Box Plot of Text Lengths')

    plt.tight_layout()
    plt.show()

    summary_lines = (
        f"Mean length: {np.mean(lengths):.2f} tokens",
        f"Median length: {np.median(lengths):.2f} tokens",
        f"95th percentile: {np.percentile(lengths, 95):.2f} tokens",
    )
    for line in summary_lines:
        print(line)

# Uncomment based on your data structure
# analyze_text_lengths(dataset, text_field='text')

# 3. MODEL & TOKENIZER LOADING

## 3.1 Load Tokenizer

In [None]:
# Tokenizer options: pad on the right.
# NOTE(review): batched generation with decoder-only models usually wants
# left padding instead; confirm before reusing this tokenizer for that.
tokenizer_options = {
    'trust_remote_code': True,
    'padding_side': 'right',
}
tokenizer = AutoTokenizer.from_pretrained(CONFIG['base_model'], **tokenizer_options)

# Reuse EOS as the padding token when the checkpoint defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

for summary_line in (
    f"Tokenizer loaded: {CONFIG['base_model']}",
    f"Vocabulary size: {len(tokenizer)}",
    f"Special tokens: {tokenizer.special_tokens_map}",
):
    print(summary_line)

## 3.2 Load Base Model

In [None]:
# Load model with optimizations
# Imported locally so this cell does not depend on a later cell having run.
from transformers import BitsAndBytesConfig

if CONFIG['use_lora']:
    # Load the frozen base weights in 8-bit for efficient fine-tuning.
    # Quantization is requested through an explicit BitsAndBytesConfig
    # rather than the bare `load_in_8bit=True` keyword.
    model = AutoModelForCausalLM.from_pretrained(
        CONFIG['base_model'],
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map='auto',
        torch_dtype=torch.float16,
        trust_remote_code=True
    )

    # Prepare the quantized model for training
    model = prepare_model_for_kbit_training(model)
else:
    # Full fine-tuning: when fp16 mixed precision is enabled the trainable
    # weights must stay in fp32, because the gradient scaler refuses to
    # unscale fp16 gradients. Half-precision weights are only used when
    # mixed precision is switched off.
    full_finetune_dtype = torch.float32 if CONFIG['fp16'] else torch.float16
    model = AutoModelForCausalLM.from_pretrained(
        CONFIG['base_model'],
        torch_dtype=full_finetune_dtype,
        device_map='auto',
        trust_remote_code=True
    )

print(f"Model loaded: {CONFIG['base_model']}")
print(f"Parameters: {model.num_parameters() / 1e6:.2f}M")

## 3.3 Configure LoRA (Parameter-Efficient Fine-Tuning)

In [None]:
if CONFIG['use_lora']:
    # Collect the adapter hyperparameters from CONFIG
    lora_settings = dict(
        r=CONFIG['lora_r'],
        lora_alpha=CONFIG['lora_alpha'],
        lora_dropout=CONFIG['lora_dropout'],
        target_modules=CONFIG['lora_target_modules'],
        bias='none',
        task_type=TaskType.CAUSAL_LM,
    )
    peft_config = LoraConfig(**lora_settings)

    # Attach the LoRA adapters to the base model and report what is trainable
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

# 4. DATA PREPROCESSING & TOKENIZATION

## 4.1 Instruction Format (Alpaca-style)

In [None]:
def format_instruction(sample):
    """Render one sample as an Alpaca-style training prompt.

    Args:
        sample: Mapping for a single example. When it has both
            ``'instruction'`` and ``'output'`` keys, an instruction prompt is
            built; a non-empty ``'input'`` adds an "### Input:" section.

    Returns:
        The formatted prompt including the response text. Samples without
        both ``'instruction'`` and ``'output'`` return ``sample['text']``,
        or ``''`` when that key is absent.
    """
    is_instruction_sample = 'instruction' in sample and 'output' in sample
    if not is_instruction_sample:
        # For conversational data
        return sample.get('text', '')

    if 'input' in sample and sample['input']:
        sections = [
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.",
            f"### Instruction:\n{sample['instruction']}",
            f"### Input:\n{sample['input']}",
        ]
    else:
        sections = [
            "Below is an instruction that describes a task. Write a response "
            "that appropriately completes the request.",
            f"### Instruction:\n{sample['instruction']}",
        ]

    sections.append(f"### Response:\n{sample['output']}")

    # Sections are separated by one blank line
    return "\n\n".join(sections)

## 4.2 Tokenization Function

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples for causal-LM training.

    Args:
        examples: Either a batch as passed by ``Dataset.map(batched=True)``
            (a mapping from column name to a list of values) or a single row
            (a mapping from column name to one value). Rows with an
            ``'instruction'`` column are rendered through
            ``format_instruction``; otherwise the ``'text'`` column is
            tokenized as-is.

    Returns:
        The tokenizer output with an added ``'labels'`` entry that copies
        ``'input_ids'``.
    """
    if 'instruction' in examples:
        if isinstance(examples['instruction'], str):
            # Un-batched call: `examples` is already one row
            texts = format_instruction(examples)
        else:
            # A batch is column-oriented. Iterating it directly yields column
            # NAMES, not rows, so rebuild one dict per row before formatting.
            columns = list(examples.keys())
            texts = [
                format_instruction(dict(zip(columns, row_values)))
                for row_values in zip(*(examples[col] for col in columns))
            ]
    else:
        texts = examples['text']

    # Tokenize, truncating and padding every sequence to the configured length
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=CONFIG['max_length'],
        padding='max_length',
        return_tensors=None
    )

    # For causal LM, labels are the same as input_ids.
    # NOTE(review): padded positions are kept in the labels here; presumably
    # the data collator masks them out — confirm against the collator in use.
    tokenized['labels'] = tokenized['input_ids'].copy()

    return tokenized

In [None]:
# Apply tokenization
print("Tokenizing dataset...")

# The raw columns are dropped so that only the tokenizer outputs remain
raw_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_columns,
    desc="Tokenizing",
)

print("Tokenization complete!")
print("Sample tokenized data:")
print(tokenized_dataset['train'][0])

## 4.3 Data Collator

In [None]:
# Data collator for language modeling
# Batches tokenized examples for the Trainer using the tokenizer loaded above.
# NOTE(review): with mlm=False this collator presumably rebuilds labels from
# input_ids and ignores pad-token positions; since pad == eos may hold here,
# confirm that masking EOS in the labels is acceptable.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM (not masked LM)
)

# 5. TRAINING CONFIGURATION

## 5.1 Training Arguments

In [None]:
# Core optimization hyperparameters, taken from CONFIG
_hyperparameters = dict(
    num_train_epochs=CONFIG['num_epochs'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    learning_rate=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
    warmup_steps=CONFIG['warmup_steps'],
    max_grad_norm=CONFIG['max_grad_norm'],
)

# Precision, optimizer and learning-rate schedule
_optimization = dict(
    fp16=CONFIG['fp16'],
    optim='adamw_torch',
    lr_scheduler_type='cosine',
)

# Step-based logging, evaluation and checkpointing (at most 3 checkpoints kept)
_logging_and_eval = dict(
    logging_dir=f"{CONFIG['output_dir']}/logs",
    logging_steps=CONFIG['logging_steps'],
    eval_strategy='steps',
    eval_steps=CONFIG['eval_steps'],
    save_strategy='steps',
    save_steps=CONFIG['save_steps'],
    save_total_limit=3,
)

# Best-checkpoint selection on lowest eval loss, reporting and seeding
_selection_and_reporting = dict(
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    report_to='tensorboard',
    seed=CONFIG['random_seed'],
)

# Data loading behaviour
_performance = dict(
    dataloader_num_workers=4,
    remove_unused_columns=False,
)

training_args = TrainingArguments(
    output_dir=CONFIG['output_dir'],
    **_hyperparameters,
    **_optimization,
    **_logging_and_eval,
    **_selection_and_reporting,
    **_performance,
)

print("Training arguments configured")

## 5.2 Evaluation Metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    This is a simplified metric; generation quality is better measured with
    BLEU, ROUGE, etc.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            logits with one more axis than ``labels`` (argmax is taken over
            the last axis), already-decoded token ids with the same rank as
            ``labels``, or a tuple whose first element is one of those.
            ``labels`` uses ``-100`` for positions to ignore.

    Returns:
        ``{'accuracy': float}``: the fraction of non-ignored label positions
        predicted correctly, or ``0.0`` when every position is ignored.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the logits
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Reduce logits to token ids; leave already-reduced ids untouched
    if predictions.ndim == labels.ndim + 1:
        predictions = np.argmax(predictions, axis=-1)

    # In a causal LM the output at position t scores the token at t + 1, so
    # align prediction[t] with label[t + 1] before comparing.
    predictions = predictions[..., :-1]
    labels = labels[..., 1:]

    # Drop ignored (-100) positions such as padding
    mask = labels != -100
    if not mask.any():
        return {'accuracy': 0.0}

    accuracy = float(np.mean(predictions[mask] == labels[mask]))

    return {'accuracy': accuracy}

# 6. MODEL TRAINING

## 6.1 Initialize Trainer

In [None]:
# Stop early after 3 evaluations without improvement
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_dataset['train'],
    'eval_dataset': tokenized_dataset['validation'],
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
    'callbacks': [early_stopping],
}
trainer = Trainer(**trainer_kwargs)

print("Trainer initialized")

## 6.2 Train Model

In [None]:
# Run the fine-tuning loop
print("Starting training...")
train_result = trainer.train()

# Log the training metrics, then persist them
metrics = train_result.metrics
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("train", metrics)

print("\nTraining completed!")
runtime_seconds = metrics['train_runtime']
print(f"Training time: {runtime_seconds:.2f} seconds")
throughput = metrics['train_samples_per_second']
print(f"Samples per second: {throughput:.2f}")

## 6.3 Training Visualization

In [None]:
# Load training logs
import pandas as pd
from pathlib import Path

log_history = trainer.state.log_history


def _logged_series(key):
    """Return (steps, values) for every log entry that recorded `key`."""
    entries = [entry for entry in log_history if key in entry]
    return [entry['step'] for entry in entries], [entry[key] for entry in entries]


# Pull the three curves out of the trainer's log
steps_train, train_loss = _logged_series('loss')
steps_eval, eval_loss = _logged_series('eval_loss')
lr_steps, lr_values = _logged_series('learning_rate')

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
loss_ax, lr_ax = axes[0], axes[1]

# Left panel: training vs. validation loss
loss_ax.plot(steps_train, train_loss, label='Train Loss', linewidth=2)
loss_ax.plot(steps_eval, eval_loss, label='Validation Loss', linewidth=2)
loss_ax.set_xlabel('Steps')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Progress')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Right panel: learning rate schedule
lr_ax.plot(lr_steps, lr_values, linewidth=2, color='green')
lr_ax.set_xlabel('Steps')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate Schedule')
lr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 7. MODEL EVALUATION

## 7.1 Evaluate on Validation Set

In [None]:
print("Evaluating model...")
eval_results = trainer.evaluate()

# One "name: value" line per reported metric, four decimals each
print("\nEvaluation Results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

## 7.2 Generation Quality Metrics

In [None]:
# Install required packages for text generation metrics
# !pip install rouge-score bert-score

from rouge_score import rouge_scorer
# from bert_score import score as bert_score

def _build_eval_prompt(sample):
    """Return ``(prompt, expected_output)`` for one evaluation sample."""
    if 'instruction' not in sample:
        prompt = sample.get('prompt', sample.get('text', ''))
        expected_output = sample.get('completion', sample.get('output', ''))
        return prompt, expected_output

    if 'input' in sample and sample['input']:
        # Include the "### Input:" section when the sample carries an input
        prompt = (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n\n"
            f"### Instruction:\n{sample['instruction']}\n\n"
            f"### Input:\n{sample['input']}\n\n"
            "### Response:\n"
        )
    else:
        prompt = (
            "Below is an instruction that describes a task. Write a response "
            "that appropriately completes the request.\n\n"
            f"### Instruction:\n{sample['instruction']}\n\n"
            "### Response:\n"
        )
    return prompt, sample['output']


def evaluate_generation(model, tokenizer, test_samples, max_new_tokens=256):
    """Generate a response per sample and score it against the reference with ROUGE.

    Args:
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        test_samples: Iterable of mapping-like samples. Samples with an
            ``'instruction'`` key are rendered as Alpaca-style prompts
            (including the ``'input'`` section when it is non-empty);
            others use their ``'prompt'``/``'text'`` and
            ``'completion'``/``'output'`` fields.
        max_new_tokens: Upper bound on generated tokens per sample.

    Returns:
        ``(avg_scores, predictions, references)`` where ``avg_scores`` maps
        ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` to the mean F-measure
        (``0.0`` when there are no samples), and the two lists hold the
        generated and expected texts in sample order.
    """
    model.eval()

    predictions = []
    references = []

    for sample in test_samples:
        prompt, expected_output = _build_eval_prompt(sample)

        # Put the inputs on the model's own device rather than a global one
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        prompt_length = inputs['input_ids'].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=CONFIG['temperature'],
                top_p=CONFIG['top_p'],
                top_k=CONFIG['top_k'],
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode only the newly generated tokens. Splitting the full text on
        # "### Response:" would truncate any answer that repeats that marker.
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        ).strip()

        predictions.append(generated_text)
        references.append(expected_output)

    # Calculate ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)

    # Average scores; an empty sample set yields 0.0 instead of NaN
    avg_scores = {
        key: (np.mean(values) if values else 0.0)
        for key, values in rouge_scores.items()
    }

    return avg_scores, predictions, references

# Evaluate on a subset
# Score at most the first 10 validation samples
validation_split = dataset['validation']
subset_size = min(10, len(validation_split))
test_samples = validation_split.select(range(subset_size))

rouge_scores, predictions, references = evaluate_generation(model, tokenizer, test_samples)

print("\nROUGE Scores:")
for metric in rouge_scores:
    score = rouge_scores[metric]
    print(f"{metric}: {score:.4f}")

## 7.3 Sample Predictions

In [None]:
# Display sample predictions
banner = "=" * 80
print("\n" + banner)
print("SAMPLE PREDICTIONS")
print(banner)

# Show the first three prediction/reference pairs, 200 characters of each
first_pairs = zip(predictions[:3], references[:3])
for i, (pred, ref) in enumerate(first_pairs, start=1):
    expected_preview = ref[:200]
    generated_preview = pred[:200]
    print(f"\n--- Sample {i} ---")
    print(f"Expected: {expected_preview}...")
    print(f"\nGenerated: {generated_preview}...")
    print("-" * 80)

# 8. INFERENCE & GENERATION

## 8.1 Text Generation Pipeline

In [None]:
def generate_response(prompt, model, tokenizer, max_new_tokens=256):
    """Generate a sampled response to a single instruction.

    Args:
        prompt: The instruction text; it is wrapped in the Alpaca-style
            template before being sent to the model.
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated response text with surrounding whitespace stripped
        (the prompt is not included).
    """
    model.eval()

    # Format prompt
    formatted_prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{prompt}

### Response:
"""

    # Tokenize onto the model's own device rather than a global one
    inputs = tokenizer(formatted_prompt, return_tensors='pt').to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=CONFIG['temperature'],
            top_p=CONFIG['top_p'],
            top_k=CONFIG['top_k'],
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # text on "### Response:" would cut off any answer repeating that marker.
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True
    ).strip()

    return response

## 8.2 Interactive Testing

In [None]:
# Test with custom prompts
test_prompts = [
    "What is machine learning?",
    "Explain the concept of neural networks.",
    "How does fine-tuning work?",
]

banner = "=" * 80
print("\n" + banner)
print("INTERACTIVE GENERATION TESTS")
print(banner)

# Generate and print one response per prompt
for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    response = generate_response(prompt, model, tokenizer)
    print(f"Response: {response}")
    print("-" * 80)

## 8.3 Batch Generation

In [None]:
def batch_generate(prompts, model, tokenizer, max_new_tokens=256, batch_size=4):
    """Generate sampled responses for several instructions, in batches.

    Args:
        prompts: Sequence of instruction strings.
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs. Its
            ``padding_side`` is switched to ``'left'`` for the duration of
            the call and restored afterwards.
        max_new_tokens: Upper bound on generated tokens per prompt.
        batch_size: Number of prompts sent to the model at once.

    Returns:
        A list with one stripped response string per prompt, in input order
        (prompts are not echoed back).
    """
    model.eval()
    all_responses = []

    # Decoder-only generation continues from the last position of each row,
    # so padding must sit on the left; right padding makes the model
    # continue from pad tokens.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = 'left'
    try:
        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i:i + batch_size]

            # Use the same Alpaca-style template as single-prompt generation
            formatted_prompts = [
                "Below is an instruction that describes a task. Write a response "
                "that appropriately completes the request.\n\n"
                f"### Instruction:\n{p}\n\n### Response:\n"
                for p in batch_prompts
            ]

            # Tokenize onto the model's own device
            inputs = tokenizer(
                formatted_prompts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=CONFIG['max_length']
            ).to(model.device)
            prompt_length = inputs['input_ids'].shape[1]

            # Generate
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=CONFIG['temperature'],
                    top_p=CONFIG['top_p'],
                    top_k=CONFIG['top_k'],
                    do_sample=True,
                    pad_token_id=tokenizer.pad_token_id
                )

            # With left padding every row's new tokens start at the same column
            responses = tokenizer.batch_decode(
                outputs[:, prompt_length:],
                skip_special_tokens=True
            )
            all_responses.extend(text.strip() for text in responses)
    finally:
        # Leave the shared tokenizer as the caller configured it
        tokenizer.padding_side = original_padding_side

    return all_responses

# 9. MODEL SAVING & EXPORT

## 9.1 Save Fine-tuned Model

In [None]:
# Save the fine-tuned model
# Destination directory for the final artifacts, under the configured output_dir
output_path = f"{CONFIG['output_dir']}/final_model"
# Write the trainer's model and the tokenizer into the same directory
trainer.save_model(output_path)
tokenizer.save_pretrained(output_path)

print(f"Model saved to: {output_path}")

## 9.2 Save LoRA Adapters Only

In [None]:
# Only runs when LoRA fine-tuning is enabled in CONFIG
if CONFIG['use_lora']:
    # Save only LoRA adapters (much smaller)
    # The adapters go to their own directory under the configured output_dir
    lora_path = f"{CONFIG['output_dir']}/lora_adapters"
    model.save_pretrained(lora_path)
    print(f"LoRA adapters saved to: {lora_path}")

## 9.3 Merge LoRA with Base Model

In [None]:
if CONFIG['use_lora']:
    # Fold the LoRA weights into the base model.
    # NOTE(review): the base model here was loaded in 8-bit; merging into
    # quantized weights may be lossy or unsupported depending on the peft
    # version. Consider reloading the base model in fp16 first — confirm.
    merged_model = model.merge_and_unload()

    # The merged weights and the tokenizer are saved side by side
    merged_path = f"{CONFIG['output_dir']}/merged_model"
    for artifact in (merged_model, tokenizer):
        artifact.save_pretrained(merged_path)

    print(f"Merged model saved to: {merged_path}")

## 9.4 Export to GGUF (for llama.cpp)

In [None]:
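# For deployment with llama.cpp
# Setup: clone the llama.cpp repository and install its Python requirements (these include the gguf package)
# Then convert the merged model with the script shipped in that repository:
#   python convert_hf_to_gguf.py <model_path>   (named convert-hf-to-gguf.py in older checkouts)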

# 10. MODEL LOADING & DEPLOYMENT

## 10.1 Load Fine-tuned Model

In [None]:
def load_finetuned_model(model_path):
    """Load a saved causal-LM checkpoint and its tokenizer for inference.

    Args:
        model_path: Directory (or identifier) to load both the model and
            the tokenizer from.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is requested in float16
        with automatic device placement.
    """
    # NOTE(review): trust_remote_code=True lets the checkpoint run its own
    # Python code; only point this at checkpoints you trust.
    shared_options = {'trust_remote_code': True}

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map='auto',
        **shared_options,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, **shared_options)

    return model, tokenizer

# Load model
# loaded_model, loaded_tokenizer = load_finetuned_model(output_path)

## 10.2 API Deployment Class

In [None]:
class LLMInferenceAPI:
    """Thin inference wrapper around a fine-tuned model and its tokenizer."""

    def __init__(self, model_path):
        """Load the model and tokenizer from `model_path` and switch to eval mode."""
        loaded_model, loaded_tokenizer = load_finetuned_model(model_path)
        self.model = loaded_model
        self.tokenizer = loaded_tokenizer
        self.model.eval()

    def generate(self, prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
        """Sample a continuation for `prompt`.

        Args:
            prompt: Raw text passed to the tokenizer unchanged.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature.
            top_p: Nucleus-sampling threshold.

        Returns:
            The decoded first output sequence, with special tokens removed.
        """
        encoded = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
        sampling_options = {
            'max_new_tokens': max_new_tokens,
            'temperature': temperature,
            'top_p': top_p,
            'do_sample': True,
            'pad_token_id': self.tokenizer.pad_token_id,
        }

        # No gradients are needed for inference
        with torch.no_grad():
            sequences = self.model.generate(**encoded, **sampling_options)

        return self.tokenizer.decode(sequences[0], skip_special_tokens=True)

    def chat(self, messages, max_new_tokens=256):
        """Answer a multi-turn conversation.

        Args:
            messages: Iterable of dicts with ``'role'`` and ``'content'``
                keys, oldest first.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            Whatever ``generate`` returns for the flattened conversation.
        """
        # One "role: content" line per turn, then the cue for the next reply
        turns = [f"{msg['role']}: {msg['content']}\n" for msg in messages]
        conversation = "".join(turns) + "Assistant: "

        return self.generate(conversation, max_new_tokens)

# Initialize API
# api = LLMInferenceAPI(output_path)
# response = api.generate("Your prompt here")

# 11. ADVANCED TECHNIQUES

## 11.1 QLoRA (Quantized LoRA)

In [None]:
# For 4-bit quantization training
from transformers import BitsAndBytesConfig

# Quantization settings for the commented-out 4-bit example load below.
# Nothing else in this cell consumes bnb_config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load model with 4-bit quantization
# model_4bit = AutoModelForCausalLM.from_pretrained(
#     CONFIG['base_model'],
#     quantization_config=bnb_config,
#     device_map='auto'
# )

## 11.2 Instruction Tuning with RLHF

In [None]:
# For reinforcement learning from human feedback
# Install: pip install trl

# from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
# from trl.core import LengthSampler

# # Configure PPO
# ppo_config = PPOConfig(
#     model_name=CONFIG['base_model'],
#     learning_rate=1.41e-5,
#     batch_size=16,
#     mini_batch_size=4
# )

## 11.3 Multi-GPU Training

In [None]:
# For distributed training across multiple GPUs
# Use accelerate config and modify training_args:

# training_args = TrainingArguments(
#     ...
#     deepspeed='ds_config.json',  # DeepSpeed configuration
#     fsdp='full_shard auto_wrap',  # Fully Sharded Data Parallel
#     ...
# )

# 12. MONITORING & ANALYSIS

## 12.1 Model Size Analysis

In [None]:
def analyze_model_size(model):
    """Print parameter counts and the in-memory size of `model`.

    Args:
        model: Object exposing ``parameters()`` and ``buffers()`` iterators
            of tensors.

    Prints the total and trainable parameter counts, the trainable
    percentage, and the combined byte size of parameters and buffers in MB.
    """
    total_params = 0
    trainable_params = 0
    param_size = 0

    # One pass over the parameters collects counts and byte size together
    for param in model.parameters():
        element_count = param.numel()
        total_params += element_count
        if param.requires_grad:
            trainable_params += element_count
        param_size += element_count * param.element_size()

    trainable_pct = 100 * trainable_params / total_params
    print(f"Total parameters: {total_params:,} ({total_params/1e6:.2f}M)")
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params/1e6:.2f}M)")
    print(f"Trainable %: {trainable_pct:.2f}%")

    # Buffers (non-parameter tensors) also occupy memory
    buffer_size = sum(buf.nelement() * buf.element_size() for buf in model.buffers())

    size_mb = (param_size + buffer_size) / 1024**2
    print(f"Model size: {size_mb:.2f} MB")

# Print parameter counts and memory footprint for the model currently in memory
analyze_model_size(model)

## 12.2 Inference Speed Benchmark

In [None]:
import time

def benchmark_inference(model, tokenizer, num_samples=10, max_new_tokens=50):
    """Benchmark generation latency and throughput on a fixed prompt.

    Args:
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer used to encode the benchmark prompt.
        num_samples: Number of timed generation runs (after one warmup run).
        max_new_tokens: Upper bound on tokens generated per run.

    Prints the mean and standard deviation of the per-run time, and the
    throughput computed from the tokens actually generated.
    """
    model.eval()

    test_prompt = "What is artificial intelligence?"
    inputs = tokenizer(test_prompt, return_tensors='pt').to(model.device)
    prompt_length = inputs['input_ids'].shape[1]
    use_cuda = torch.cuda.is_available()

    def _timed_generate():
        """Run one generation; return (elapsed seconds, new token count)."""
        # CUDA kernels run asynchronously: synchronize so the timer covers
        # the whole generation rather than just the kernel launches.
        if use_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id
            )
        if use_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        return elapsed, output.shape[1] - prompt_length

    # Warmup
    _timed_generate()

    if num_samples < 1:
        print("No benchmark runs requested")
        return

    # Benchmark
    runs = [_timed_generate() for _ in range(num_samples)]
    times = [elapsed for elapsed, _ in runs]
    generated_tokens = sum(count for _, count in runs)

    avg_time = np.mean(times)
    std_time = np.std(times)

    print(f"Average inference time: {avg_time:.3f}s ± {std_time:.3f}s")
    # Generation may stop at EOS before max_new_tokens, so use real counts
    print(f"Tokens per second: {generated_tokens / sum(times):.2f}")

# Time generation for the in-memory model, using the function's default settings
benchmark_inference(model, tokenizer)

# 13. CONCLUSIONS & NEXT STEPS

## Summary:
- Base Model: {CONFIG['base_model']}
- Fine-tuning Method: {'LoRA' if CONFIG['use_lora'] else 'Full Fine-tuning'}
- Training Samples: {len(dataset['train'])}
- Final Training Loss: X.XX
- Final Validation Loss: X.XX

## Next Steps:
- [ ] Experiment with different LoRA ranks
- [ ] Try instruction tuning with more diverse data
- [ ] Implement RLHF for alignment
- [ ] Quantize model for deployment (GPTQ, AWQ)
- [ ] Deploy with vLLM or TensorRT-LLM
- [ ] Create a chatbot interface with Gradio/Streamlit
- [ ] Monitor and collect user feedback