# Model fine tuning test
#### Local test (CPU only), simple casual llm model, code generated with claude
(Model only trained for 20 steps, so the LoRA model memorizes a few examples and nothing more, not generalize yet)

### Prompt: 
I have an Excel file containing historical document transcripts. Each row has:
* Title: the title of the document (some titles repeat),
* Transcript: the actual text (usually a paragraph or page from the document).
I want to fine-tune casual language model using LoRA in Python, just for a quick test demo, not for final production. Please give me a complete working script that does the following:
1. Loads the Excel file using pandas.
2. Drops any rows where Title or Transcript is missing or too short.
3. Creates a training dataset where:
   * The input prompt is: ### TITLE: {Title}\n### TRANSCRIPT:\n{Transcript}\n### RESPONSE:
   * The response can just be a copy of the transcript for now. This is a placeholder just to show fine-tuning works.
   * These are combined into one text field for casual language modeling.
4. Converts the dataset into a Hugging Face Dataset object.
5. Loads a model like tiiuae/falcon-7b-instruct or another instruct-tuned casual model.
6. Applies LoRA using Hugging Face peft with 4-bit quantization (bitsandbytes) to keep memory low and training fast.
7. Tokenizes the dataset using the model's tokenizer.
8. Fine-tunes the model using Trainer, training for just 1 epoch with a small batch size.
9. Saves the fine-tuned model.
10. Runs a single generation test from a sample prompt like: ### TITLE: Lincoln's Gettysburg Address
***###*** TRANSCRIPT:
Now we are engaged in a great civil war...
**###** RESPONSE:
and prints the generated output.

This is strictly a quick test to show proof-of-concept, so prioritize speed and simplicity. The goal is just to show the model responds to the fine-tuning data format.

In [18]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig 
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Configuration
EXCEL_FILE = r"../../data/mccray/changed_data/decade_subsets/McCray (1940s, messy_count=0).xlsx"  # Replace with your file path
MODEL_NAME = "distilgpt2"  # Very small model for CPU training
# Alternative for slightly better quality: "microsoft/DialoGPT-small"
OUTPUT_DIR = "./lora-finetuned-test-model-v2"
MAX_LENGTH = 256  # Shorter for CPU efficiency
MIN_TRANSCRIPT_LENGTH = 30

In [16]:
def load_and_preprocess_data(excel_file):
    """Load Excel file and preprocess the data"""
    print("Loading Excel file...")
    df = pd.read_excel(excel_file)
    
    print(f"Original dataset size: {len(df)}")
    
    # Drop rows with missing or short content
    df = df.dropna(subset=['Title', 'Transcript'])
    df = df[df['Title'].str.len() > 0]
    df = df[df['Transcript'].str.len() >= MIN_TRANSCRIPT_LENGTH]
    
    print(f"After filtering: {len(df)}")
    
    # Create training format
    def format_example(title, transcript):
        prompt = f"### TITLE: {title}\n### TRANSCRIPT:\n{transcript}\n### RESPONSE:"
        response = transcript  # Using transcript as response for demo
        return f"{prompt} {response}"
    
    df['text'] = df.apply(lambda row: format_example(row['Title'], row['Transcript']), axis=1)
    
    return df[['text']]

def create_dataset(df):
    """Convert DataFrame to HuggingFace Dataset"""
    print("Creating HuggingFace Dataset...")
    dataset = Dataset.from_pandas(df)
    return dataset

def setup_model_and_tokenizer(model_name):
    """Load model and tokenizer for CPU training"""
    print(f"Loading model and tokenizer: {model_name}")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Load model for CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        low_cpu_mem_usage=True
    )
    
    return model, tokenizer

def setup_lora(model):
    """Configure LoRA for the model"""
    print("Setting up LoRA...")
    
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=4,  # Even lower rank for CPU training
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["c_attn", "c_proj"],  # For GPT-2 based models
    )
    
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    return model

def tokenize_dataset(dataset, tokenizer):
    """Tokenize the dataset and add labels for Causal LM"""
    print("Tokenizing dataset...")

    def tokenize_function(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
        )
        # Add labels = input_ids for causal LM loss
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    return tokenized_dataset

def train_model(model, tokenizer, train_dataset):
    """Fine-tune the model using Trainer"""
    print("Starting training...")
    
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,  # Smaller for CPU
        warmup_steps=5,
        max_steps=20,  # Very limited for CPU demo
        learning_rate=1e-3,  # Higher learning rate for faster convergence
        logging_steps=2,
        save_strategy="epoch",
        eval_strategy="no",
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        no_cuda=True,  # Force CPU usage
        dataloader_num_workers=0,  # Avoid multiprocessing issues
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
    )
    
    trainer.train()
    
    # Save the model
    print("Saving model...")
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)
    
    return trainer

def test_generation(model, tokenizer):
    """Test the fine-tuned model with a sample prompt"""
    print("\n" + "="*50)
    print("TESTING GENERATION")
    print("="*50)
    
    test_prompt = """### TITLE: Lincoln's Gettysburg Address
### TRANSCRIPT:
Now we are engaged in a great civil war, testing whether that nation or any nation so conceived and so dedicated can long endure.
### RESPONSE:"""
    
    print(f"Input prompt:\n{test_prompt}\n")
    
    # Tokenize input
    inputs = tokenizer(test_prompt, return_tensors="pt")
    
    # Generate response
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=50,  # Shorter for CPU
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Decode and print
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response_start = generated_text.find("### RESPONSE:") + len("### RESPONSE:")
    response = generated_text[response_start:].strip()
    
    print(f"Generated response:\n{response}")

def main():
    """Main training pipeline"""
    print("Starting LoRA Fine-tuning Pipeline")
    print("="*50)
    
    try:
        # 1. Load and preprocess data
        df = load_and_preprocess_data(EXCEL_FILE)
        
        # 2. Create dataset
        dataset = create_dataset(df)
        
        # 3. Setup model and tokenizer
        model, tokenizer = setup_model_and_tokenizer(MODEL_NAME)
        
        # 4. Setup LoRA
        model = setup_lora(model)
        
        # 5. Tokenize dataset
        train_dataset = tokenize_dataset(dataset, tokenizer)
        
        # 6. Train model
        trainer = train_model(model, tokenizer, train_dataset)
        
        # 7. Test generation
        test_generation(model, tokenizer)
        
        print(f"\nTraining completed! Model saved to: {OUTPUT_DIR}")
        
    except Exception as e:
        print(f"Error occurred: {e}")
        print("Make sure you have the required packages installed:")
        print("pip install torch transformers datasets peft pandas openpyxl")

if __name__ == "__main__":
    main()

Starting LoRA Fine-tuning Pipeline
Loading Excel file...
Original dataset size: 746
After filtering: 725
Creating HuggingFace Dataset...
Loading model and tokenizer: distilgpt2
Setting up LoRA...
trainable params: 202,752 || all params: 82,115,328 || trainable%: 0.2469
Tokenizing dataset...


Map: 100%|██████████| 725/725 [00:00<00:00, 3030.98 examples/s]


Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
2,4.93
4,4.8744
6,6.9938
8,4.2041
10,3.4696
12,3.7058
14,3.2563
16,3.2474
18,3.074
20,3.3399


Saving model...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



TESTING GENERATION
Input prompt:
### TITLE: Lincoln's Gettysburg Address
### TRANSCRIPT:
Now we are engaged in a great civil war, testing whether that nation or any nation so conceived and so dedicated can long endure.
### RESPONSE:

Generated response:


Training completed! Model saved to: ./lora-finetuned-test-model


In [19]:
# Run the model

# Path to saved model
output_dir = "./lora-finetuned-test-model"

# Load PEFT config
peft_config = PeftConfig.from_pretrained(output_dir)

# Load base model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Attach LoRA weights
model = PeftModel.from_pretrained(base_model, output_dir)

# Set model to eval mode
model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
             

In [None]:
test_prompt = """### TITLE: In the Court of General Sessions, June 19, 1950
### TRANSCRIPT:
STATE OF SOUTH CAROLINA    COUNTY OF NEWBERRY    IN THE COURT OF GENERAL SESSIONS    On June 19, 1950, John McCray, editor of The Lighthouse  and Informer, appeared at the Court of General Sessions,  Newberry, South Carolina, and pled guilty to a charge against  him of criminal libel, that had been preferred by the Grant  Jury of Greenwood County, and was given the following    sentence
### RESPONSE:"""

inputs = tokenizer(test_prompt, return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


### TITLE: John Henry McCray, business card
### TRANSCRIPT:
Office   1507   HARDIN  ST. Hama:   2018 ...
### RESPONSE:


In [None]:
"""
Models can be pushed to huggingface when we have a model that we want to do further testing on

from huggingface_hub import login
from peft import PeftModel

model.push_to_hub("your-username/your-model-name")
tokenizer.push_to_hub("your-username/your-model-name")

"""
