# Model fine tuning test
#### Local test (CPU only)
(The model is only trained for 20 steps, so the LoRA model memorizes a few examples and nothing more; it does not generalize yet.)

In [None]:
%pip install pandas
%pip install torch
%pip install datasets
%pip install transformers
%pip install peft


In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig 
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Configuration
# Excel workbook holding the training examples; load_and_preprocess_data
# reads its 'input' and 'output' columns.
EXCEL_FILE = r"../../data/mccray/completion_df.xlsx"  # Replace with your file path
# Hugging Face model id of the base model that gets fine-tuned.
MODEL_NAME = "TinyLlama/TinyLlama_v1.1" 
# MODEL_NAME = "distilgpt2" 
# MODEL_NAME = "microsoft/DialoGPT-small" # Alternative for slightly better quality
# Directory where the trainer checkpoints, the LoRA adapter and the tokenizer
# are written. NOTE(review): the directory name still says distilGPT2 even
# though MODEL_NAME above is TinyLlama - rename if that is confusing.
OUTPUT_DIR = "./400-completion-tasks-distilGPT2"
# Lower-case alias used by the cells that reload the adapter for inference.
output_dir = OUTPUT_DIR 
# Token budget per example: longer texts are truncated, shorter ones padded.
MAX_LENGTH = 256         # Shorter for CPU efficiency
# NOTE(review): not referenced anywhere in the visible part of this file.
MIN_TRANSCRIPT_LENGTH = 20

In [None]:
def format_training_example(input_text, output_text):
    """Render one input/output pair in the prompt layout used for training.

    The layout is ``### INPUT:<input>`` on the first line followed by
    ``### RESPONSE: <output>`` on the second.
    """
    return f"### INPUT:{input_text}\n### RESPONSE: {output_text}"


def build_training_frame(df):
    """Turn a frame with 'input' and 'output' columns into a one-column 'text' frame.

    Args:
        df: DataFrame that must contain the columns 'input' and 'output'.

    Returns:
        A new DataFrame with a single 'text' column and a fresh 0..n-1 index.
        Rows where either value is missing are skipped, because formatting
        them would put the literal text "nan" into the training data.

    Raises:
        KeyError: if 'input' or 'output' is not a column of ``df``.
    """
    missing = [col for col in ("input", "output") if col not in df.columns]
    if missing:
        raise KeyError(f"Excel data is missing required column(s): {missing}")

    usable = df.dropna(subset=["input", "output"])
    skipped = len(df) - len(usable)
    if skipped:
        print(f"Skipping {skipped} row(s) with a missing input or output")

    # A plain comprehension (rather than DataFrame.apply) also works when no
    # rows are left, and the new frame gets a clean RangeIndex.
    texts = [
        format_training_example(input_text, output_text)
        for input_text, output_text in zip(usable["input"], usable["output"])
    ]
    return pd.DataFrame({"text": texts})


def load_and_preprocess_data(excel_file):
    """Load the Excel file and build the training text for every row.

    Args:
        excel_file: Path of the workbook; it must contain 'input' and
            'output' columns.

    Returns:
        DataFrame with a single 'text' column, one formatted example per row.

    Raises:
        KeyError: if a required column is missing from the workbook.
    """
    print("Loading Excel file...")
    df = pd.read_excel(excel_file)

    print(f"Dataset size: {len(df)}")

    return build_training_frame(df)


In [None]:
def create_dataset(df):
    """Wrap the given pandas DataFrame in a HuggingFace ``Dataset``.

    The frame is echoed to stdout first so the training text can be inspected.
    """
    print("Creating HuggingFace Dataset...")
    print(df)
    return Dataset.from_pandas(df)

In [None]:
def setup_model_and_tokenizer(model_name):
    """Fetch the tokenizer and the float32 causal-LM weights for ``model_name``.

    Returns:
        A ``(model, tokenizer)`` tuple. If the tokenizer ships without a pad
        token, its EOS token is reused for padding.
    """
    print(f"Loading model and tokenizer: {model_name}")

    # Tokenizer first; fall back to EOS when no pad token is defined.
    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # float32 weights for CPU execution, loaded with reduced peak memory.
    load_options = {"torch_dtype": torch.float32, "low_cpu_mem_usage": True}
    lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    return lm, tok

def infer_lora_target_modules(model):
    """Pick LoRA target layers that actually exist in ``model``.

    The layer names differ per architecture: GPT-2 style models expose
    ``c_attn``/``c_proj`` while Llama style models (such as TinyLlama) expose
    ``q_proj``/``k_proj``/``v_proj``/``o_proj``.

    Args:
        model: Any object offering ``named_modules()`` like ``torch.nn.Module``.

    Returns:
        The list of known layer names found in the model, or ``None`` when no
        known layout matches (PEFT then falls back to its own defaults).
    """
    # Only the last path component matters: "h.0.attn.c_attn" -> "c_attn".
    leaf_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
    known_layouts = (
        ["c_attn", "c_proj"],                      # GPT-2 family
        ["q_proj", "k_proj", "v_proj", "o_proj"],  # Llama family
    )
    for layout in known_layouts:
        present = [layer for layer in layout if layer in leaf_names]
        if present:
            return present
    return None


def setup_lora(model, target_modules=None):
    """Wrap ``model`` with trainable LoRA adapters.

    Args:
        model: The base causal language model.
        target_modules: Optional list of layer names to adapt. When omitted
            the names are inferred from the model, so the same code works for
            GPT-2 style and Llama style checkpoints instead of failing on a
            hard-coded GPT-2 layer list.

    Returns:
        The PEFT-wrapped model; only the LoRA parameters are trainable.
    """
    print("Setting up LoRA...")

    if target_modules is None:
        target_modules = infer_lora_target_modules(model)

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=4,  # Even lower rank for CPU training
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=target_modules,
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model

def tokenize_dataset(dataset, tokenizer, max_length=None):
    """Tokenize the dataset and add labels for causal-LM training.

    Args:
        dataset: Dataset with a 'text' column.
        tokenizer: Tokenizer used for encoding; its ``eos_token`` (if any) is
            appended to every example.
        max_length: Fixed sequence length to truncate/pad to. Defaults to the
            module-level ``MAX_LENGTH``.

    Returns:
        The mapped dataset with the tokenizer outputs plus a 'labels' column;
        the original columns are removed.
    """
    print("Tokenizing dataset...")

    if max_length is None:
        max_length = MAX_LENGTH
    eos_text = tokenizer.eos_token or ""

    def tokenize_function(examples):
        # End every example with EOS so the model learns where a response
        # stops, instead of picking that up by accident from the padding.
        texts = [text + eos_text for text in examples["text"]]
        tokens = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # Labels mirror input_ids, but padded positions are set to -100 so
        # the loss ignores them; otherwise padding dominates short examples.
        tokens["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
        return tokens

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    return tokenized_dataset

In [None]:

def train_model(model, tokenizer, train_dataset):
    """Fine-tune the model with the HuggingFace ``Trainer`` and save the result.

    Args:
        model: The (LoRA-wrapped) model to train.
        tokenizer: Tokenizer handed to the trainer and saved next to the model.
        train_dataset: Tokenized dataset that already contains 'labels'.

    Returns:
        The ``Trainer`` instance after training has finished.
    """
    print("Starting training...")

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=5,       # Ignored while max_steps is positive
        max_steps=1000,           # Takes precedence over num_train_epochs
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,  # Effective batch size of 2
        warmup_steps=5,
        learning_rate=1e-3,
        logging_steps=2,
        save_strategy="epoch",
        eval_strategy="no",
        remove_unused_columns=False,
        # Pinned host memory only helps host-to-GPU copies; on a CPU-only
        # machine it is useless and makes the DataLoader emit a warning.
        dataloader_pin_memory=torch.cuda.is_available(),
        dataloader_num_workers=0,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
    )
    
    trainer.train()
    
    # Save the model (adapter weights) and the tokenizer side by side
    print("Saving model...")
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)
    
    return trainer

In [None]:

def test_generation(model, tokenizer):
    """Test the fine-tuned model with a sample prompt"""
    print("\n" + "="*50)
    print("TESTING GENERATION")
    print("="*50)
    
    test_prompt = """### INPUT: Give me the entire transcript, including what is missing, from this ending portion of a McCray transcript: ... the expression of your views with respect to the very controversial Civil Rights Message of the President. Sincerely, Mr. H. McCray Columbia, South Carolina
### RESPONSE:"""
    
    print(f"Input prompt:\n{test_prompt}\n")
    
    # Tokenize input
    inputs = tokenizer(test_prompt, return_tensors="pt")
    
    # Generate response
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=50,  # Shorter for CPU
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Decode and print
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response_start = generated_text.find("### RESPONSE:") + len("### RESPONSE:")
    response = generated_text[response_start:].strip()
    
    print(f"Generated response:\n{response}")

## Model code ran here

In [None]:
def main():
    """Run the full pipeline: load data, train the LoRA adapter, test generation.

    Errors are reported on stdout together with the full traceback instead of
    being raised, so the cell itself never fails. The package-installation
    hint is only shown when the failure really is an import problem.
    """
    import traceback  # only needed for error reporting in this function

    print("Starting LoRA Fine-tuning Pipeline")
    print("="*50)
    
    try:
        # 1. Load and preprocess data
        df = load_and_preprocess_data(EXCEL_FILE)
        
        # 2. Create dataset
        dataset = create_dataset(df)
        
        # 3. Setup model and tokenizer
        model, tokenizer = setup_model_and_tokenizer(MODEL_NAME)
        
        # 4. Setup LoRA
        model = setup_lora(model)
        
        # 5. Tokenize dataset
        train_dataset = tokenize_dataset(dataset, tokenizer)
        
        # 6. Train model
        train_model(model, tokenizer, train_dataset)
        
        # 7. Test generation
        test_generation(model, tokenizer)
        
        print(f"\nTraining completed! Model saved to: {OUTPUT_DIR}")
        
    except Exception as e:
        print(f"Error occurred: {e}")
        # Keep the traceback visible; the message alone rarely tells where
        # in the pipeline things went wrong.
        traceback.print_exc()
        if isinstance(e, ImportError):
            print("Make sure you have the required packages installed:")
            print("pip install torch transformers datasets peft pandas openpyxl")

if __name__ == "__main__":
    main()

In [None]:
# Run the model
# Reloads the fine-tuned adapter from `output_dir` and leaves `model` and
# `tokenizer` defined at module level for the inference cell that follows.

# Load PEFT config
# (the adapter config records which base model the adapter belongs to)
peft_config = PeftConfig.from_pretrained(output_dir)

# Load base model and tokenizer
# The tokenizer comes from output_dir, where train_model saved it.
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Attach LoRA weights
model = PeftModel.from_pretrained(base_model, output_dir)

# Set model to eval mode
model.eval()


In [None]:
# Sample prompt in the same "### INPUT / ### RESPONSE" layout used for training.
test_prompt = """### INPUT: Based on the year, 1946, and title, "Memorandum to executive officers of the Progressive Democratic Party, Page 2", complete this transcript:
(Page Two) covered in 8 full report to ell executive end unit officers we hope to submit within the next ton days. But it writes "finish" to the several decades of unfortunate political operations which, principally among Southern States, have make 8 great mockery of the very word "Democracy"....
### RESPONSE:"""

inputs = tokenizer(test_prompt, return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(
        # Keyword arguments: some PEFT wrappers reject positional input ids,
        # and the attention mask must be passed explicitly because the pad
        # token is the EOS token, so it cannot be inferred reliably.
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Prints the prompt followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
"""
Models can be pushed to the Hugging Face Hub once we have a model that we want to test further.

from huggingface_hub import login
from peft import PeftModel

model.push_to_hub("your-username/your-model-name")
tokenizer.push_to_hub("your-username/your-model-name")

"""
