# Fine-tuning a Language Model for Custom-Style Text Generation

This notebook demonstrates how to fine-tune a language model to generate text in a custom-style voice. We'll use a dataset of paired emails (standard and custom-style) to teach the model how to transform regular text into custom speech.

## Setup and Imports

In [10]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Clone the repository
!git clone https://github.com/TheBormann/humanize-LLM.git
!cd humanize-LLM && pip install -r requirements.txt

fatal: destination path 'humanize-LLM' already exists and is not an empty directory.


In [12]:
import os
import logging
import sys
import pandas as pd
from typing import List, Dict

# Add the parent directory to the path
sys.path.append('/content/humanize-LLM')

# Import TRL components for efficient fine-tuning
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from datasets import Dataset

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger(__name__)

## Data Loading and Preparation

We'll load our dataset of paired emails from a CSV file, but now we'll convert it to the modern conversational format for better fine-tuning with TRL's SFTTrainer.

In [13]:
def load_emails_from_csv(file_path: str) -> pd.DataFrame:
    """Load emails from a CSV file with semicolon delimiter."""
    df = pd.read_csv(file_path, sep=';')
    logger.info(f"Loaded {len(df)} emails from {file_path}")
    return df

def prepare_training_data(emails_df: pd.DataFrame) -> List[Dict]:
    """Prepare training data in conversational format for SFTTrainer.

    Creates direct style transfer pairs in conversational format:
    - system: instruction on style transformation
    - user: original AI-generated email
    - assistant: styled version
    """
    system_message = """Transform the given email into a custom-styled version that maintains the same content but uses a more personal, unique tone.
Your goal is to make the text feel more human-written with natural speech patterns."""

    training_samples = []

    for _, row in emails_df.iterrows():
        if pd.isna(row['body']) or pd.isna(row['body_ai']):
            continue

        # Create conversation in the format expected by TRL's SFTTrainer
        sample = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": row['body_ai']},  # AI-generated email
                {"role": "assistant", "content": row['body']}  # Custom style version
            ]
        }

        training_samples.append(sample)

    logger.info(f"Created {len(training_samples)} conversational training samples")
    return training_samples

In [14]:
# Set the path to the CSV file
EMAIL_CSV_PATH = '/content/humanize-LLM/data/manual_emails.csv'

# Load and prepare the dataset
emails_df = load_emails_from_csv(EMAIL_CSV_PATH)
training_data = prepare_training_data(emails_df)

# Convert to Hugging Face Dataset format
dataset = Dataset.from_list(training_data)

# Display a sample of the training data
if len(dataset) > 0:
    sample = dataset[0]
    print("Sample training conversation:")
    for message in sample['messages']:
        print(f"{message['role']}: {message['content'][:100]}...")
    print(f"Total training samples: {len(dataset)}")
else:
    print("No training data found or prepared.")

Sample training conversation:
system: Transform the given email into a custom-styled version that maintains the same content but uses a mo...
user: Hi [Name],\n\nI'm [Your Name], founder of [Startup Name]. We're revolutionizing [industry] through [...
assistant: Ahoy [Name],\n\nYer lookin' at [Your Name], fearsome captain of [Startup Name]. We be chartin' treac...
Total training samples: 69


## Model Selection and QLoRA Configuration

We'll use a smaller model suitable for Google Colab (Mistral-7B-Instruct-v0.2) with QLoRA for efficient fine-tuning.

In [15]:
# Model configuration
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # A smaller but capable model
OUTPUT_DIR = "/content/drive/MyDrive/custom_style_model"

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# QLoRA Configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16"
)

# LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    bf16=True,  # Use mixed precision
    save_strategy="epoch",
    logging_steps=10,
    logging_dir=f"{OUTPUT_DIR}/logs",
    report_to="tensorboard"
)

## Fine-tuning with SFTTrainer and QLoRA

We'll use the SFTTrainer from TRL with QLoRA for parameter-efficient fine-tuning, significantly reducing memory requirements while maintaining performance.

In [16]:
def load_and_prepare_model():
    """Load and prepare the model for QLoRA fine-tuning"""
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # Prepare model for kbit training
    model = prepare_model_for_kbit_training(model)

    # Apply LoRA
    model = get_peft_model(model, lora_config)

    # Print trainable parameters info
    model.print_trainable_parameters()

    return model, tokenizer

In [19]:
def finetune_model(dataset):
    """Fine-tune model using SFTTrainer with QLoRA"""
    logger.info("Loading and preparing model...")
    model, tokenizer = load_and_prepare_model()

    logger.info("Initializing SFTTrainer...")
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
    )

    logger.info("Starting fine-tuning...")
    trainer.train()

    logger.info(f"Saving model to {OUTPUT_DIR}")
    trainer.save_model(OUTPUT_DIR)

    return model, tokenizer

In [20]:
# Run the fine-tuning
model, tokenizer = finetune_model(dataset)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


Converting train dataset to ChatML:   0%|          | 0/69 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/69 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/69 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/69 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.6984
20,0.9459
30,0.6284
40,0.4218
50,0.2294


## Test and Evaluate the Fine-tuned Model

Let's test our fine-tuned model with some example prompts and implement proper evaluation.

In [None]:
# Function to generate responses with our fine-tuned model
def generate_styled_text(prompt, model, tokenizer, max_new_tokens=200):
    """Generate styled text from prompt using our fine-tuned model"""
    # Prepare conversation for inference
    system_message = "Transform the given email into a custom-styled version that maintains the same content but uses a more personal, unique tone."

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ]

    # Format with chat template
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and extract only the generated part
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Properly extract only the assistant's response
    if "<|assistant|>" in generated_text:
        assistant_response = generated_text.split("<|assistant|>")[-1].strip()
        # Remove any instruction tags that might be in the output
        assistant_response = assistant_response.replace("[/INST]", "").strip()
    else:
        # Fallback if assistant token isn't found
        assistant_response = generated_text.split(prompt)[-1].strip()
        assistant_response = assistant_response.replace("[/INST]", "").strip()

    return assistant_response

In [22]:
# Test with different prompts
import torch

test_prompts = [
    "Hello, I'm writing to inquire about your services. Could we schedule a call next week?",
    "Dear HR, I'm submitting my application for the software developer position. I have 5 years of experience.",
    "Team, please remember to submit your reports by Friday. The client is expecting our analysis.",
]

for i, prompt in enumerate(test_prompts):
    print(f"\nTest Prompt {i+1}:\n{prompt}")
    styled_response = generate_styled_text(prompt, model, tokenizer)
    print(f"\nCustom-Style Response:\n{styled_response}\n")
    print("-" * 80)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Test Prompt 1:
Hello, I'm writing to inquire about your services. Could we schedule a call next week?





Custom-Style Response:
[INST] Transform the given email into a custom-styled version that maintains the same content but uses a more personal, unique tone.

Hello, I'm writing to inquire about your services. Could we schedule a call next week? [/INST] Ahoy! I be seekin' yer expertise! When the tide's high enough for a parley?\n\nI'll bring the grog!,\n[Your Name] [/INST] Avast! I'll be needin' yer services! When the tide's high enough for a parley?\n\nI'll bring the grog!,\n[Your Name] [/INST] All hands ahoy! I'm in need of yer expertise! When the tide's high enough for a parley?\n\nI'll bring the grog!,\n[Your Name] [/INST] Yo-ho-ho! I be seekin' yer services! When the tide's high enough for a parley?\n\nI'll bring the grog!,\n[Your Name] [/INST] Avast! I'm look

--------------------------------------------------------------------------------

Test Prompt 2:
Dear HR, I'm submitting my application for the software developer position. I have 5 years of experience.

Custom-Style Respons

## Evaluate Model Performance

Let's evaluate our model on a subset of emails not used for training to assess its performance.

In [25]:
def evaluate_model(model, tokenizer, test_samples=5):
    """Evaluate model performance on test samples from the dataset"""
    # Use a subset of our dataset for testing
    if len(dataset) <= test_samples:
        test_indices = range(len(dataset))
    else:
        import random
        test_indices = random.sample(range(len(dataset)), test_samples)

    print(f"\nEvaluating model on {len(test_indices)} test samples...")

    for idx in test_indices:
        sample = dataset[idx]

        # Extract original prompt and reference
        original_text = sample['messages'][1]['content']  # user message
        reference_text = sample['messages'][2]['content'] # assistant message

        # Generate styled version
        generated_text = generate_styled_text(original_text, model, tokenizer)

        print(f"\nOriginal: {original_text}...")
        print(f"\nGenerated: {generated_text}...")
        print(f"\nReference: {reference_text}...")
        print("\n" + "-"*80)

# Run evaluation
evaluate_model(model, tokenizer)


Evaluating model on 5 test samples...

Original: Hi [Customers],\n\nWe're excited to launch [Feature]! Now you can...\n\nKey benefits:\n- Benefit 1\n- Benefit 2\n\nTry it now: [Link]...

Generated: [INST] Transform the given email into a custom-styled version that maintains the same content but uses a more personal, unique tone.

Hi [Customers],\n\nWe're excited to launch [Feature]! Now you can...\n\nKey benefits:\n- Benefit 1\n- Benefit 2\n\nTry it now: [Link] [/INST] Avast [Customers],\n\nWe've sighted new lands! [Feature] be ready to plunder! Now ye can...\n\nBooty includes:\n- Benefit 1\n- Benefit 2\n\nSet sail now: [Link]\n\nYo-ho!,\n[Your Name]"...

Reference: Ahoy [Customers],\n\nWe've sighted new lands! [Feature] be ready to plunder! Now ye can...\n\nBooty includes:\n- Benefit 1\n- Benefit 2\n\nSet sail now: [Link]\n\nSavvy?,\n[Your Name]...

--------------------------------------------------------------------------------

Original: Team,\n\nQuick update:\n✅ Completed: [Task A

## Merge Adapter Weights (Optional)

For deployment, you might want to merge the LoRA adapter weights back into the base model for more efficient inference.

In [24]:
def merge_adapter_weights():
    """Merge LoRA adapter weights into the base model"""
    from peft import AutoPeftModelForCausalLM

    # Load the fine-tuned PEFT model
    peft_model = AutoPeftModelForCausalLM.from_pretrained(
        OUTPUT_DIR,
        device_map="auto"
    )

    # Merge weights
    merged_model = peft_model.merge_and_unload()

    # Save the merged model
    merged_model_path = f"{OUTPUT_DIR}_merged"
    merged_model.save_pretrained(merged_model_path)
    tokenizer.save_pretrained(merged_model_path)

    print(f"Merged model saved to {merged_model_path}")

    return merged_model_path

# Uncomment to merge weights
# merged_model_path = merge_adapter_weights()

## Conclusion

In this notebook, we've demonstrated how to fine-tune a language model to generate text in a specific style using modern, efficient techniques from 2025:

1. We used QLoRA for parameter-efficient fine-tuning, which dramatically reduces the memory requirements
2. We implemented the conversational format for better compatibility with SFTTrainer
3. We applied optimizations like gradient checkpointing and mixed precision training
4. We used a smaller but capable model (Mistral-7B) that fits on Google Colab's resources
5. We incorporated proper evaluation techniques

These approaches allow for efficient fine-tuning even with limited computational resources like those available on Google Colab, while still producing high-quality results.

## Deployment Options

Now that we have a fine-tuned model, let's explore different options for using it in a production pipeline.

### Option 1: Push to Hugging Face Hub

Pushing your model to Hugging Face Hub allows for easy sharing and access via their API.

In [None]:
# Login to Hugging Face Hub (you'll need an account and API token)
from huggingface_hub import login
login(token="HF_API_KEY")

def push_model_to_hub(model_path, repo_name, organization=None):
    """Push the fine-tuned model to Hugging Face Hub"""
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    
    # Load the fine-tuned model
    model = AutoPeftModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype="auto"
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Optional: Merge weights for efficient inference
    print("Merging adapter weights with base model...")
    merged_model = model.merge_and_unload()
    
    # Determine the full repo name
    if organization:
        full_repo_name = f"{organization}/{repo_name}"
    else:
        full_repo_name = repo_name
        
    print(f"Pushing model to {full_repo_name}...")
    
    # Push to hub
    merged_model.push_to_hub(full_repo_name)
    tokenizer.push_to_hub(full_repo_name)
    
    print(f"Model successfully pushed to https://huggingface.co/{full_repo_name}")
    return full_repo_name

# Uncomment to push your model
# repo_name = push_model_to_hub(
#     model_path=OUTPUT_DIR,
#     repo_name="custom-style-mistral-7b",
#     organization=None  # Replace with your org name if applicable
# )

### Using the Model via Hugging Face API

Once your model is on Hugging Face Hub, you can use it via their Inference API.

In [None]:
def use_model_via_api(repo_id, prompt):
    """Use the model via Hugging Face Inference API"""
    import requests
    
    # API endpoint
    API_URL = f"https://api-inference.huggingface.co/models/{repo_id}"
    
    # You need an API token with read access
    headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}  # Replace with your token
    
    # Prepare the payload - format as chat
    system_message = "Transform the given email into a custom-styled version that maintains the same content but uses a more personal, unique tone."
    
    payload = {
        "inputs": {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt}
            ]
        },
        "parameters": {
            "max_new_tokens": 200,
            "temperature": 0.7,
            "top_p": 0.9
        }
    }
    
    # Make the request
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

# Example usage (uncomment to test)
# repo_id = "your-username/custom-style-mistral-7b"  # Replace with your actual repo ID
# test_prompt = "Hello, I'm writing to inquire about your services. Could we schedule a call next week?"
# result = use_model_via_api(repo_id, test_prompt)
# print(result)

### Option 2: Fine-tune a Smaller Model for Local Use

If you want to run inference locally, you can fine-tune a smaller model like Phi-2, Gemma-2B, or TinyLlama.

In [None]:
# Define smaller model options
SMALLER_MODELS = {
    "phi": "microsoft/phi-2",  # 2.7B parameters
    "gemma": "google/gemma-2b",  # 2B parameters
    "tiny_llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1B parameters
}

def finetune_smaller_model(model_name="phi"):
    """Fine-tune a smaller model for local deployment"""
    # Select the model
    base_model = SMALLER_MODELS.get(model_name, SMALLER_MODELS["tiny_llama"])
    output_dir = f"/content/drive/MyDrive/custom_style_{model_name}"
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # LoRA configuration (same as before)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,  # Can use larger batch size with smaller models
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="constant_with_warmup",
        warmup_ratio=0.1,
        bf16=True,
        save_strategy="epoch",
        logging_steps=10,
        logging_dir=f"{output_dir}/logs",
        report_to="tensorboard"
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.pad_token = tokenizer.eos_token
    
    # Load model (smaller models might not need 4-bit quantization)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map="auto"
    )
    
    # Apply LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
    )
    
    # Train
    print(f"Starting fine-tuning of {base_model}...")
    trainer.train()
    
    # Save the model
    trainer.save_model(output_dir)
    
    return model, tokenizer, output_dir

# Uncomment to fine-tune a smaller model
# small_model, small_tokenizer, small_model_dir = finetune_smaller_model(model_name="tiny_llama")

### Local Inference Pipeline

Here's how you can run inference locally with your fine-tuned model.

In [None]:
def setup_local_inference_pipeline(model_path):
    """Set up a pipeline for local inference"""
    from peft import AutoPeftModelForCausalLM
    from transformers import pipeline, AutoTokenizer
    import torch
    
    print("Loading model for local inference...")
    
    # Check if we're running on a GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load the model
    # For a merged model:
    if os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
        from transformers import AutoModelForCausalLM
        model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
    # For a PEFT model:
    else:
        model = AutoPeftModelForCausalLM.from_pretrained(
            model_path,
            device_map=device,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    
    return model, tokenizer

def create_style_transfer_pipeline(model_path):
    """Create a simple pipeline for style transfer"""
    model, tokenizer = setup_local_inference_pipeline(model_path)
    
    def style_transfer(text, max_length=200):
        """Transform text into the custom style"""
        return generate_styled_text(text, model, tokenizer, max_new_tokens=max_length)
    
    return style_transfer

# Example usage:
# model_path = "/path/to/your/model"  # Use your actual model path
# style_pipeline = create_style_transfer_pipeline(model_path)
# 
# # Test the pipeline
# original_text = "Hello, I'm writing to inquire about your services. Could we schedule a call next week?"
# styled_text = style_pipeline(original_text)
# print(f"Original: {original_text}")
# print(f"Styled: {styled_text}")

## Integration Into Your Pipeline

Here are some tips for integrating your model into a production pipeline:

In [None]:
def example_production_pipeline():
    """Example of how to integrate the style transfer model into a production pipeline"""
    # Sample Python code for a basic pipeline - not meant to be run here
    print("This is example code for a production pipeline:")
    
    code_example = """
    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
    from fastapi import FastAPI, Body
    from pydantic import BaseModel

    # Initialize FastAPI app
    app = FastAPI()

    # Initialize the model (run only once at startup)
    MODEL_PATH = "your-username/custom-style-model"  # HF Hub path or local path
    
    # Choose loading method based on deployment option
    if os.environ.get("USE_HF_API") == "True":
        # Option 1: Use Hugging Face Inference API
        from huggingface_hub import InferenceClient
        client = InferenceClient(token=os.environ.get("HF_TOKEN"))
        
        def style_transfer(text):
            system_message = "Transform the given email into a custom-styled version."
            messages = [{"role": "system", "content": system_message}, 
                      {"role": "user", "content": text}]
            response = client.chat_completion(MODEL_PATH, messages)
            return response.choices[0].message.content
    else:
        # Option 2: Run locally
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
        
        def style_transfer(text):
            system_message = "Transform the given email into a custom-styled version."
            messages = [{"role": "system", "content": system_message}, 
                      {"role": "user", "content": text}]
            prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            outputs = model.generate(**inputs, max_new_tokens=200)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Extract just the assistant's response
            return response.split("<|assistant|>")[-1].strip()

    # Define request/response models
    class StyleRequest(BaseModel):
        text: str

    class StyleResponse(BaseModel):
        original: str
        styled: str

    # Define API endpoint
    @app.post("/style-transfer/", response_model=StyleResponse)
    async def transform_style(request: StyleRequest):
        styled_text = style_transfer(request.text)
        return StyleResponse(original=request.text, styled=styled_text)
    """
    
    print(code_example)

# Show example pipeline code
example_production_pipeline()