# Llama Tuning Test

In [7]:
# https://huggingface.co/meta-llama/Llama-3.2-1B

import torch
import pandas as pd
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

In [None]:
# Load the tokenizer and the causal-LM weights from the same checkpoint id
MODEL_ID = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [None]:
# Quick-start checklist, emitted one step per line
setup_steps = (
    "1. Load your dataframe: df = pd.read_csv('your_file.csv')",
    "2. Load transcript data: train_data = load_transcript_data_from_dataframe(df)",
    "3. Create dataset: train_dataset = Dataset.from_list(train_data)",
    "4. Tokenize: tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])",
    "5. Uncomment the start_training() call",
    "6. Monitor training progress and adjust batch size if needed",
)
print(*setup_steps, sep="\n")

In [None]:
# Add padding token if not present (common with Llama models).
# The existing end-of-sequence token is reused as the pad token, so no new
# token is added to the tokenizer here.
# NOTE(review): with pad == eos, a collator that masks pad positions in the
# labels would also mask genuine EOS tokens — confirm this is acceptable.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Data loading functions
def load_transcript_data_from_excel(file_path, column_name='training', sheet_name=0):
    """Read one column of an Excel sheet into a list of ``{"text": ...}`` records.

    Args:
        file_path: Location of the Excel workbook, passed to ``pd.read_excel``.
        column_name: Column holding the transcript text.
        sheet_name: Sheet to read, passed to ``pd.read_excel``.

    Returns:
        A list of dicts, one per usable cell, each with a single ``"text"``
        key holding the cell value converted to ``str``. Null cells and cells
        that are blank after stripping whitespace are left out.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet_name)

    # Keep only cells that are non-null and non-blank once stringified
    records = [
        {"text": str(cell)}
        for cell in frame[column_name].values
        if pd.notna(cell) and str(cell).strip()
    ]

    print(f"Loaded {len(records)} transcripts from Excel file")
    return records

# Tokenization function
def tokenize_function(examples):
    """Run the module-level tokenizer over the ``"text"`` field of a batch.

    Args:
        examples: Mapping with a ``"text"`` entry to tokenize.

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    tokenizer_options = {
        "truncation": True,
        # No padding here; the original note says the data collator pads dynamically
        "padding": False,
        # Cap for short transcripts, adjust if needed
        "max_length": 512,
        "return_special_tokens_mask": True,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [None]:
# Load and prepare dataset
# Example usage:
# df = pd.read_csv('your_data.csv')  # or pd.read_pickle(), pd.read_parquet(), etc.
# train_data = load_transcript_data_from_dataframe(df)
# train_dataset = Dataset.from_list(train_data)
# tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Batch collator for language modeling, configured for the causal (non-masked) objective
collator_options = {
    "tokenizer": tokenizer,
    "mlm": False,  # Causal LM, not masked LM
    "pad_to_multiple_of": 8,  # Slight efficiency improvement
}
data_collator = DataCollatorForLanguageModeling(**collator_options)

# Training arguments optimized for CPU and small dataset.
# Evaluation-related options are switched on only when a variable named
# `eval_dataset` already exists at notebook/module level; the check is done
# once so all three dependent options always agree.
has_eval_dataset = "eval_dataset" in globals()

training_args = TrainingArguments(
    output_dir="./llama-finetuned",

    # Small dataset optimization
    num_train_epochs=3,  # More epochs for small dataset to ensure convergence
    per_device_train_batch_size=1,  # Small batch size for CPU training
    gradient_accumulation_steps=8,  # Simulate larger batch (effective batch size = 8)

    # Learning rate settings
    learning_rate=2e-5,  # Slightly higher for small dataset, but conservative for stability
    # Fixed 50-step warmup. NOTE: this is only ~10% of the run if the run has
    # ~500 optimizer steps; on a very small dataset lower it so the learning
    # rate actually reaches its peak.
    warmup_steps=50,
    lr_scheduler_type="cosine",  # Smooth learning rate decay

    # CPU-specific optimizations
    dataloader_num_workers=2,  # Moderate for CPU, adjust based on your cores
    fp16=False,  # CPU doesn't support fp16
    bf16=False,  # Most CPUs don't support bf16

    # Memory and efficiency
    max_grad_norm=1.0,  # Gradient clipping for stability
    remove_unused_columns=True,  # Memory optimization
    dataloader_pin_memory=False,  # CPU training doesn't benefit from this

    # Logging and evaluation
    logging_steps=10,  # Frequent logging for small dataset monitoring
    eval_strategy="steps" if has_eval_dataset else "no",
    eval_steps=50,  # Evaluate every 50 steps if eval data available
    save_steps=100,  # Save checkpoints periodically (kept a multiple of eval_steps)
    save_total_limit=2,  # Keep only 2 latest checkpoints to save space

    # Reproducibility
    seed=42,
    data_seed=42,

    # Output control
    # The string "none" is what disables wandb/tensorboard reporting;
    # passing None falls back to the library default instead.
    report_to="none",
    load_best_model_at_end=has_eval_dataset,
    metric_for_best_model="eval_loss" if has_eval_dataset else None,
)

# Initialize trainer.
# Datasets are looked up by name so this cell works both before and after the
# data has been prepared: `tokenized_dataset` / `eval_dataset` are passed when
# they exist at notebook/module level, otherwise None (the previous behavior).
# Passing `eval_dataset` here keeps the Trainer in step with the training
# arguments, which enable step-based evaluation whenever that name is defined.
# Re-run this cell after creating the datasets so the Trainer picks them up.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=globals().get("tokenized_dataset"),
    eval_dataset=globals().get("eval_dataset"),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:

# Training function
def start_training():
    """Run the fine-tuning loop and persist the resulting model and tokenizer.

    Uses the module-level ``model``, ``training_args``, ``trainer`` and
    ``tokenizer``. Prints a short summary, calls ``trainer.train()``, then
    saves the model and the tokenizer to ``./llama-finetuned-final``.
    """
    final_dir = "./llama-finetuned-final"

    print("Starting fine-tuning...")

    param_count = model.num_parameters()
    print(f"Model parameters: {param_count:,}")

    # Per-device batch size multiplied by the number of accumulation steps
    effective_batch = (
        training_args.per_device_train_batch_size
        * training_args.gradient_accumulation_steps
    )
    print(f"Effective batch size: {effective_batch}")

    trainer.train()

    # Model and tokenizer go to the same directory
    trainer.save_model(final_dir)
    tokenizer.save_pretrained(final_dir)

    print("Training completed!")

# Memory optimization for CPU training
def optimize_for_cpu():
    """Tune the module-level ``model`` and PyTorch for CPU training.

    Sets the ``use_memory_efficient_attention`` config flag when the config
    already has that attribute, limits PyTorch to a single thread, and turns
    on gradient checkpointing on the model.
    """
    config = model.config
    attention_flag = 'use_memory_efficient_attention'

    # Only touch the flag when the config already exposes it
    if hasattr(config, attention_flag):
        setattr(config, attention_flag, True)

    # One thread, intended for deterministic results on CPU
    torch.set_num_threads(1)

    # Gradient checkpointing, intended to reduce memory use
    model.gradient_checkpointing_enable()

# Apply optimizations.
# Runs as soon as this cell executes and changes the already-loaded global
# `model` in place (config flag and gradient checkpointing) as well as the
# PyTorch thread count.
optimize_for_cpu()

# Example data loading function - adjust based on your data format
def load_transcript_data_from_dataframe(df, column_name='training'):
    """Convert one dataframe column into a list of ``{"text": ...}`` records.

    Mirrors ``load_transcript_data_from_excel``: every value is converted with
    ``str()`` before the blank check, so non-string cells (e.g. numbers) are
    kept as text instead of raising ``AttributeError`` on ``.strip()``.

    Args:
        df: pandas DataFrame holding the transcripts.
        column_name: Column to read; defaults to ``'training'``.

    Returns:
        A list of dicts, one per usable cell, each with a single ``"text"``
        key. Null cells and cells that are blank after stripping whitespace
        are skipped.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    return [
        {"text": str(text)}
        for text in df[column_name].values
        # Skip empty/null values
        if pd.notna(text) and str(text).strip()
    ]

In [None]:
# Kick off fine-tuning and save the result.
# NOTE(review): assumes `trainer` was constructed with a training dataset —
# confirm before running this cell.
start_training()