## Joke Generator: SFT + LoRA Training

The code below loads the joke training dataset and produces the following:

1. Test results after training models with SFT and LoRA.
2. DPO training data: jokes generated by the fine-tuned models, used as the "incorrect" (losing) examples.

We run this notebook on Colab, using either a T4 or an L4 GPU.

The code is based on sample notebooks from the public Unsloth repository (https://github.com/unslothai/unsloth), modified to serve the purposes of our project.

### Install Imports

In [None]:
%%capture
# The `%%capture` cell magic above hides this cell's output (the pip logs are long).
import torch

# Install alt-profanity-check; the generation cell imports `predict` / `predict_prob`
# from `profanity_check` to screen generated jokes.
!pip install alt-profanity-check

# Install Unsloth from its GitHub repository with the `colab-new` extra, then the
# remaining training packages. `--no-deps` installs only the listed packages
# (xformers pinned below 0.0.26, trl, peft, accelerate, bitsandbytes) without
# pulling in their dependencies.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

### Connect with Google Drive

Run this cell to mount Google Drive; the dataset, model, and result paths used below all point to it.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/... paths
# used by the later cells resolve.
drive.mount('/content/drive')

### Data Preprocessing Function

This function extracts and preprocesses our joke training dataset with the tokenizer.

In [None]:
import pandas as pd
from datasets import Dataset

# Prompt template shared by training and generation.
# First slot: the prompt; second slot: the joke (left empty at generation time).
joke_prompt = """### Prompt: {} ### Joke: {}"""

def get_dataset(tokenizer, csv_path="/content/drive/MyDrive/JOKEGPT_FOLDER/Datasets/jokes.csv"):
    """Load the joke training CSV and add a formatted "text" column for SFT.

    Args:
        tokenizer: Tokenizer of the model being trained; only its `eos_token`
            attribute is read here.
        csv_path: Location of the training CSV. Columns named "Jokes" and
            "Prompts" are renamed to "joke" and "prompt". Defaults to the
            project's training set on Google Drive.

    Returns:
        A `datasets.Dataset` holding the CSV columns plus a "text" column in
        which every row is `joke_prompt` filled with the row's prompt and joke,
        followed by the tokenizer's EOS token.

    Raises:
        KeyError: If the CSV has no "prompt"/"joke" column after renaming.
    """
    # The EOS token must terminate every training example, otherwise the
    # fine-tuned model never learns where a joke ends and keeps generating.
    eos_token = tokenizer.eos_token

    def formatting_prompts_func(examples):
        # `examples` is a batch (column name -> list of values) because the
        # map call below uses batched=True.
        texts = [
            joke_prompt.format(prompt, joke) + eos_token
            for prompt, joke in zip(examples['prompt'], examples['joke'])
        ]
        return {"text": texts}

    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns={'Jokes': 'joke', "Prompts": 'prompt'})

    # Fail early with a readable message instead of deep inside Dataset.map.
    missing = [column for column in ('prompt', 'joke') if column not in frame.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    return Dataset.from_pandas(frame).map(formatting_prompts_func, batched=True)

### Variable Setups

Values that must be set before running the rest of the code.

In [None]:
from numba import cuda

# --- Options passed to FastLanguageModel.from_pretrained in the cells below ---
# Maximum sequence length (also handed to the SFT trainer).
max_seq_length = 2048
# None lets the loader pick the dtype; per the Unsloth sample notebook,
# use float16 on Tesla T4/V100 and bfloat16 on Ampere or newer.
dtype = None
# Load 4-bit quantized weights to reduce memory usage; may be set to False.
load_in_4bit = True

# Short name -> Hugging Face checkpoint for every model to run.
# More checkpoints are listed at https://huggingface.co/unsloth
models_dict = dict(
    mistral_instruct="unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    llama2="unsloth/llama-2-7b-bnb-4bit",
    llama3="unsloth/llama-3-8b-bnb-4bit",
    gemma_instruct="unsloth/gemma-7b-it-bnb-4bit",
)

# Fine-tuned models are saved under this directory, one sub-folder per short name.
save_to_dir = "/content/drive/MyDrive/JOKEGPT_FOLDER/Models"

# Mode switch for the generation cell:
#   True  -> generate the DPO training dataset
#   False -> generate SFT+LoRA-only test results
# The branch below selects the input CSV and the output folder accordingly.
run_dpo_dataset = False
if run_dpo_dataset:
    # DPO source data: CSV with a header row containing "Jokes" and "Prompts".
    testing_dataset_dir = "/content/drive/MyDrive/JOKEGPT_FOLDER/Datasets/DPOJokes.csv"
    test_dataset = pd.read_csv(testing_dataset_dir).rename(
        columns={'Jokes': 'joke', "Prompts": 'prompt'}
    )

    # Output folder for the generated DPO training data.
    save_result_dir = "/content/drive/MyDrive/JOKEGPT_FOLDER/Datasets/DPO_Dataset"
else:
    # Test data: header-less CSV, column 0 is the joke and column 1 the prompt.
    testing_dataset_dir = "/content/drive/MyDrive/JOKEGPT_FOLDER/Datasets/jokes_test_dataset.csv"
    test_dataset = pd.read_csv(testing_dataset_dir, header=None).rename(
        columns={0: 'joke', 1: 'prompt'}
    )

    # Output folder for the SFT+LoRA-only test results.
    save_result_dir = "/content/drive/MyDrive/JOKEGPT_FOLDER/Results/SFT_Only"

### Generate Pretrained Model Results

Runs each pretrained (not yet fine-tuned) model on the first 100 prompts of the test dataset and saves the generated jokes.

In [None]:
import pandas as pd
from numba import cuda
import torch
import numpy as np

from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
from peft import PeftConfig
import gc  # used to drop the previous model before the next one is loaded

# NOTE(review): `device` is not used anywhere else in the visible notebook; kept
# in case another cell relies on it.
device = cuda.get_current_device()

# Load the header-less test dataset (column 0 = joke, column 1 = prompt)
dataset = pd.read_csv("/content/drive/MyDrive/JOKEGPT_FOLDER/Datasets/jokes_test_dataset.csv", header=None)
dataset = dataset.rename(columns={0: 'joke', 1: 'prompt'})
prompts = dataset['prompt'].head(100).tolist() # Get first 100 joke prompts

# Prompt template; the joke slot is left empty so the model completes it.
# Defined once here because it does not change between models.
joke_prompt = """### Prompt: {} ### Joke: {}"""

# Loop through every model
for str_name, model_name in models_dict.items():
    # Release the model from the previous iteration (or a previous run of this
    # cell) BEFORE loading the next one; otherwise both models are referenced
    # at the same time, which can exhaust GPU memory on Colab. Best effort:
    # memory is only returned if nothing else still references the old model.
    pre_model = pre_tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()

    # Load the pretrained model and tokenizer, then switch to inference mode
    pre_model, pre_tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(pre_model)

    # Generate one joke per prompt with the pretrained model
    preTrainedJokes = []
    for count, prompt in enumerate(prompts, start=1):
        # Tokenize the filled-in template as a batch of one
        inputs = pre_tokenizer(
            [joke_prompt.format(prompt, "")],
            return_tensors = "pt",
        ).to("cuda")

        outputs = pre_model.generate(**inputs, max_new_tokens = 200, use_cache = True)
        res = pre_tokenizer.batch_decode(outputs)
        # The decoded text contains the prompt as well; keep what follows the
        # first "Joke:" marker (up to the next "Joke:" marker, if the model
        # emitted one).
        joke = res[0].split("Joke:")[1].strip()
        preTrainedJokes.append(joke)
        print(count, "PreTrained Model:",joke)

    # Save prompts and generated jokes as a header-less CSV, one file per model
    df = pd.DataFrame()
    df['Prompts'] = np.array(prompts)
    df['Jokes'] = np.array(preTrainedJokes)
    df.to_csv(f"/content/drive/MyDrive/JOKEGPT_FOLDER/Results/Pretrained/{str_name}_pretrained_jokes.csv", header=False, index=False)

### Main Training

This is the main code. For each entry in the model dictionary, it:
1. Loads the model and tokenizer
2. Loads and preprocesses the training data
3. Sets up the training arguments (each model has its own `TrainingArguments`)
4. Trains with the Supervised Fine-Tuning (SFT) trainer
5. Saves the model

In [None]:
from unsloth import FastLanguageModel
import torch

from trl import SFTTrainer
from transformers import TrainingArguments, Trainer

from peft import PeftConfig

import gc  # used to drop the previous model before the next one is loaded

# ---- Training hyperparameters -------------------------------------------
# Use bf16 when the GPU supports it, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

# Arguments that are the same for every model.
common_training_kwargs = dict(
    per_device_train_batch_size = 10,
    warmup_steps = 5,
    learning_rate = 2e-5,
    fp16 = not bf16_supported,
    bf16 = bf16_supported,
    logging_steps = 1,
    optim = "adamw_8bit",
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# llama2, llama3 and mistral_instruct train for 2 full epochs.
epoch_based_kwargs = dict(
    gradient_accumulation_steps = 20,
    num_train_epochs = 2,
    weight_decay = 0.01,
)

# Per-model arguments; a key of models_dict without an entry here is rejected.
# gemma_instruct is limited by step count instead of epochs.
per_model_training_kwargs = {
    "llama2": epoch_based_kwargs,
    "llama3": epoch_based_kwargs,
    "mistral_instruct": epoch_based_kwargs,
    "gemma_instruct": dict(
        gradient_accumulation_steps = 10,
        max_steps = 50,
        weight_decay = 0.05,
    ),
}

# Loop through every model
for str_name, model_name in models_dict.items():
    # Reject unknown models before spending time and GPU memory on loading them
    if str_name not in per_model_training_kwargs:
        raise Exception("Model not found!")

    # Release the model/trainer of the previous iteration (or of an earlier
    # cell) BEFORE loading the next model, so two models are not held at once.
    # Best effort: memory is only returned if nothing else references them.
    model = trainer = None
    gc.collect()
    torch.cuda.empty_cache()
    # Restart peak tracking so the memory stats printed below describe this
    # model only, not the peak reached by a previously trained model.
    torch.cuda.reset_peak_memory_stats()

    # Generate model and tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    # Add LoRA adapters on the attention and MLP projection layers
    model = FastLanguageModel.get_peft_model(
        model,
        r = 32, # LoRA rank; Unsloth suggests 8, 16, 32, 64 or 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Unsloth: any value is supported, 0 is optimized
        bias = "none",    # Unsloth: any value is supported, "none" is optimized
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # Rank-stabilized LoRA disabled
        loftq_config = None, # LoftQ disabled
    )

    # Load the training data formatted with this model's tokenizer (EOS token)
    dataset = get_dataset(tokenizer)

    # Combine the shared and the model-specific fine-tuning arguments
    training_args = TrainingArguments(
        **common_training_kwargs,
        **per_model_training_kwargs[str_name],
    )

    # Set up Supervised Fine-Tuning on the "text" column of the dataset
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Per Unsloth: packing can speed up training on short sequences
        args = training_args,
    )

    # Show memory stats before training (model already loaded)
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    # Train model
    trainer_stats = trainer.train()

    # Save model and tokenizer
    model.save_pretrained(save_to_dir + "/" + str_name)
    tokenizer.save_pretrained(save_to_dir + "/" + str_name)

    # Show final memory and time stats; the "for training" figures are the
    # peak after training minus the peak measured right before training.
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory /max_memory*100, 3)
    lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    # Empty cache
    torch.cuda.empty_cache()

<a name="LoRA-Only Joke Test Result Generation / DPO Training Set Generation"></a>
### LoRA-Only Joke Result Test Generation / DPO Joke Training Set Generation
Run each fine-tuned model to generate a set of test jokes and see how well SFT + LoRA alone performs.

Set the "run_dpo_dataset" boolean flag to choose between running on the DPO dataset (to generate the DPO training dataset) and running on the test dataset (to measure LoRA-only performance). The flag is defined in the "Variable Setups" section above.

Note: Due to GPU RAM limitations on Colab, we may only be able to run 1-2 models at a time. If so, comment out entries of the "models_dict" variable in the "Variable Setups" section above so that only 1-2 models remain.

In [None]:
from unsloth import FastLanguageModel
from profanity_check import predict, predict_prob

import pandas as pd
from datasets import Dataset

from datetime import datetime
from tqdm import tqdm

import torch

import gc  # used to drop the previous model before the next one is loaded

# Extract testing dataset (either DPO or Test).
# Convert only while `test_dataset` is still the DataFrame created in the setup
# cell, so that re-running this cell does not call from_pandas on a Dataset.
if isinstance(test_dataset, pd.DataFrame):
    test_dataset = Dataset.from_pandas(test_dataset)
testing_prompts = list(test_dataset['prompt'])
print("Number of Test Prompts:", len(testing_prompts))

# Reference jokes from the dataset, exported next to the generated ones
orig_joke_ans = list(test_dataset['joke'])

# Model short name -> list of generated jokes (one per test prompt)
generated_jokes = {}

# Loop through every model
for str_name, model_name in models_dict.items():
    # Release the model of the previous iteration (or of an earlier cell)
    # BEFORE loading the next one, so two models are not held at once.
    # Best effort: memory is only returned if nothing else references it.
    model = tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()

    # Load the fine-tuned model and tokenizer saved by the training cell
    model, tokenizer = FastLanguageModel.from_pretrained(
        save_to_dir + "/" + str_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    FastLanguageModel.for_inference(model) # Switch Unsloth to its inference mode

    # Loop through the test prompts and generate one joke per prompt
    joke_responses = []
    for prompt in tqdm(testing_prompts):
        # Tokenize the template with the joke slot left blank for generation
        inputs = tokenizer(
            [joke_prompt.format(prompt, "")],
            return_tensors = "pt",
        ).to("cuda")

        attempt_counter = 0
        while True:
            # Sample a completion (all models use the same generation settings)
            outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, do_sample=True)

            # Get response
            res = tokenizer.batch_decode(outputs)
            if not res:
                print("res is empty: trying again")
                continue

            # Score the decoded text (prompt + joke); predict_prob returns one
            # probability per input string, and there is exactly one here.
            profane_score = predict_prob(res)
            if profane_score[0] < 0.9 or attempt_counter >= 2:
                # Accept the answer if it is clean enough, or unconditionally
                # on the third attempt so that a prompt cannot loop forever.
                # Keep only the text after the "### Joke:" marker.
                joke_responses.append(res[0].split("### Joke:")[1])
                break

            print(f"Too profane! Trying again.\nScore: {profane_score}, Response: {res}")
            attempt_counter += 1

    # Add to dictionary
    generated_jokes[str_name] = joke_responses

    # Empty cache
    torch.cuda.empty_cache()


# Export one CSV per model
for str_name in models_dict:
    if run_dpo_dataset:
        # DPO pairs: the dataset joke is the winner, the generated joke the loser
        export_df = {
            "prompt": testing_prompts,
            "winning_joke": orig_joke_ans,
            "losing_joke": generated_jokes[str_name]
        }
        file_prefix = "results_dpo"
    else:
        export_df = {
            "prompt": testing_prompts,
            "original_joke": orig_joke_ans,
            "generated_joke": generated_jokes[str_name]
        }
        file_prefix = "results_test"
    export_df = pd.DataFrame(export_df)

    # The current time (hours-minutes-seconds) is part of the file name
    c = datetime.now().strftime('%H-%M-%S')
    export_result_dir = f"{save_result_dir}/{file_prefix}_{str_name}_{c}.csv"
    export_df.to_csv(export_result_dir, index=False)