### Fine-Tuning LLM Notebook

This notebook fine-tunes a large language model (LLM) **(Gemma 2B IT)** to predict AI-generated code scores.

- **Optional**: This fine-tuned model is already available in my Hugging Face Hub (username: `wasabibish`), so running this notebook is not mandatory.
- **Usage**: If you wish to run or modify the model:
  - **You must use a GPU** for efficient training (this was fine-tuned using Colab GPU).
  - **Hugging Face Token**: You need a Hugging Face access token that has been granted access to the Gemma 2B model; provide it through the `HF_token` variable before the login cell runs.


In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install pyarrow==14.0.1

In [None]:
import os
import re
import warnings

import bitsandbytes as bnb
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from huggingface_hub import login
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

warnings.filterwarnings('ignore')

## Data preparation

In [4]:
# Load the raw dataset; later cells read its 'question', 'human_content' and 'plagiarism_score' columns.
data = pd.read_csv('data.csv')

In [5]:
# Assemble the fine-tuning table in a single step. Every row pairs the coding
# question and its answer (input) with the target score (output) and the
# task instruction shared by all rows.
dataset_training_llm = pd.DataFrame({
    # input: Question + Answer
    "input": 'Question :\n' + data['question'] + '\nAnswer :\n' + data['human_content'],
    # output: the score the model has to learn to produce
    "output": data['plagiarism_score'],
    # instruction: identical text broadcast to every row
    "instruction": 'Is this code AI-generated? Provide a score between 0 and 1, where 0 means not AI-generated and 1 means fully AI-generated.',
})

In [6]:
def generate_prompt(data_point):
    """
    Generate the training prompt for the model based on the data point.

    The text is made of two turns: a user turn holding the task description
    (plus the question/answer input when there is one) and a model turn
    holding the expected score.

    Parameters:
    -----------
    data_point : dict
        The data point containing the input, output, and instruction.

    Returns:
    --------
    text : str
        The text to be fed into the model.
    """
    # NOTE(review): this prefix talks about plagiarism while the instruction column
    # asks about AI generation, and the inference prompt in get_completion does not
    # include the prefix at all — confirm which wording is intended.
    prefix_text = "Please check the following code snippet for plagiarism and provide only a plagiarism score between 0 and 1, where 0 means no plagiarism and 1 means fully plagiarized. The code snippet is as follows:"

    # The model turn is the same in both cases. Building it once guarantees the
    # " :\n" separator after the marker is always present (previously the score was
    # glued directly to "model" whenever an input was given).
    model_turn = f"""<start_of_turn>model :\n{data_point["output"]} <end_of_turn>"""

    if data_point['input']:  # If there's additional input
        user_turn = f"""<start_of_turn>user :  {prefix_text} {data_point["instruction"]} \nhere are the inputs : \n {data_point["input"]} <end_of_turn>"""
    else:  # If there is no additional input
        user_turn = f"""<start_of_turn>user :  {prefix_text} {data_point["instruction"]} <end_of_turn>"""

    return f"{user_turn}\n{model_turn}"


In [7]:
# Render one training prompt per row, then attach the result as the "prompt" column
text_column = dataset_training_llm.apply(generate_prompt, axis=1)
dataset_training_llm = dataset_training_llm.assign(prompt=text_column)

In [8]:
# Convert the pandas frame into a `datasets.Dataset`; the tokenisation and
# train/test split cells below operate on this object
dataset = Dataset.from_pandas(dataset_training_llm)

# Training

## Training config

In [9]:
# Hugging Face access token, passed to login() below so the notebook can reach the Hub.
# It is read from the HF_TOKEN environment variable so the secret never has to be
# written into the notebook; the placeholder remains as a fallback for manual editing.
HF_token = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN")

In [None]:
# Authenticate this session against the Hugging Face Hub with the token defined above
login(token=HF_token)

In [11]:
# define the quantization config: load weights in 4-bit NF4 with double
# quantization enabled, and use bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [12]:
# Hub identifier of the base checkpoint that is loaded and fine-tuned below
model_id = "google/gemma-2b-it"

In [None]:
# load the model and tokenizer

# The model is loaded with the quantization config defined above; device_map={"":0} maps the whole model to device 0
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# NOTE(review): add_eos_token=True presumably makes the tokenizer append EOS to every encoded text,
# including the inference prompts built later — confirm this is intended
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [None]:
# tokenize the dataset: run the tokenizer over the "prompt" column in batches and
# add its outputs as new columns
# NOTE(review): the SFTTrainer below is also given dataset_text_field="prompt" —
# confirm whether this pre-tokenisation is still needed
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [16]:
# split the dataset into train (80%) and test (20%) subsets;
# the fixed seed keeps the split identical across kernel restarts
dataset_dict = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_dict['train']
test_dataset = dataset_dict['test']

In [17]:
def find_all_linear_names(model, linear_cls=None):
    """
    Find all the linear layer names in the model.

    Only the last component of each dotted module path is kept (for example
    "a.b.q_proj" contributes "q_proj"), so layers repeated across blocks
    collapse into a single entry.

    Parameters:
    -----------
    model : torch.nn.Module
        The model to be searched.
    linear_cls : type or tuple of types, optional
        Module class(es) to look for. Defaults to ``bnb.nn.Linear4bit``.

    Returns:
    --------
    list
        A sorted list of all the linear layer names in the model, without
        'lm_head'.
    """
    if linear_cls is None:
        # Resolved lazily so that a caller supplying its own class does not need bitsandbytes
        linear_cls = bnb.nn.Linear4bit

    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is excluded (needed for 16-bit); done once after the scan
    lora_module_names.discard('lm_head')

    # Sorting makes the result reproducible between runs (set order is not)
    return sorted(lora_module_names)

In [None]:
# Collect the names of the model's 4-bit linear layers; they become the LoRA target_modules below
modules = find_all_linear_names(model)

In [None]:
# enable gradient checkpointing on the model before it is prepared for training
model.gradient_checkpointing_enable()
# prepare the model for kbit training; `model` is rebound to the object returned by peft's helper
model = prepare_model_for_kbit_training(model)

In [24]:
# define the LoRA adapter config used on top of the quantized model:
# rank 64, alpha 32, 5% dropout, no bias terms, applied to the linear layers found above
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# add padding details: reuse the EOS token as the padding token and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='right'

In [None]:
# empty PyTorch's CUDA cache to free GPU memory before training starts
torch.cuda.empty_cache()

In [None]:
# define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 sequence x 4 accumulation steps per optimizer update
    # 0.03 is a fraction of the run, so it belongs in warmup_ratio;
    # warmup_steps expects an absolute number of steps
    warmup_ratio=0.03,
    max_steps=250,
    learning_rate=2e-4,
    logging_steps=50,
    output_dir="outputs",
    optim="paged_adamw_8bit",
    save_strategy="epoch",
)

In [None]:
# define the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=2500,
    # Hand over the tokenizer configured above (EOS handling, pad token, padding side)
    # so the trainer tokenises with it and not with a freshly loaded default one
    tokenizer=tokenizer,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

## Train

In [None]:
# train the model: runs the fine-tuning loop with the arguments configured above (max_steps=250)
trainer.train()

# Inference

In [38]:
# NOTE(review): presumably the cache was off during training with gradient checkpointing — confirm
model.config.use_cache = True  # Enable cache for faster decoding

In [43]:
def get_completion(question, answer, model, tokenizer, max_new_tokens=1000, do_sample=True):
    """
    Generate a completion for the given question and answer.

    Only the newly generated text is returned; the prompt is not echoed back.
    This matters for the score parser, because the prompt itself contains
    numbers ("between 0 and 1", and whatever appears in the code snippet).

    Parameters:
    -----------
    question : str
        The question to be asked. (coding problem)
    answer : str
        The answer to the question. (code snippet)
    model : transformers.PreTrainedModel
        The model to be used for generation.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer to be used for tokenization.
    max_new_tokens : int, optional
        Upper bound on the number of generated tokens (default 1000).
    do_sample : bool, optional
        Whether to sample during generation (default True). Pass False for
        repeatable scores.

    Returns:
    --------
    str
        The generated completion.
    """

    # Define prompt template
    # NOTE(review): the wording and layout differ from the training prompt built
    # by generate_prompt (no prefix text, different "Question"/"Answer" layout) —
    # confirm the mismatch is intended.
    prompt_template = """
    <start_of_turn>user :
      Is this code AI-generated? Provide a score between 0 and 1, where 0 means not AI-generated and 1 means fully AI-generated.

      Question: {question}
      Answer: {answer}

    <end_of_turn>\n<start_of_turn>model :
    """

    # format prompt with query
    prompt = prompt_template.format(question=question, answer=answer)

    # tokenize prompt
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    # Move inputs to device - ensure model inputs are on the same device as the model
    model_inputs = encodeds.to(model.device)

    # Remember how many tokens belong to the prompt so they can be cut off afterwards
    prompt_length = model_inputs["input_ids"].shape[1]

    # Generate response
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; keep only what follows
    completion_ids = generated_ids[0][prompt_length:]

    # decode response to human readable text
    decoded = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return decoded

In [None]:
# Generate a completion for one example: the question and answer stored at index 58 of `data`
result = get_completion(question=data['question'][58], answer=data['human_content'][58], model=model, tokenizer=tokenizer)

In [None]:
def post_process_output(output):
    """
    Extract the numeric score from the model's generated text.

    Parameters:
    -----------
    output : str
        Text produced by the model. If it still contains the prompt, only the
        part after the last "model :" marker is inspected, so numbers in the
        instruction or in the code snippet are not mistaken for the score.

    Returns:
    --------
    float
        The first number found in the inspected text.

    Raises:
    -------
    ValueError
        If the inspected text contains no number.
    """
    # rpartition returns the whole string as the tail when the marker is absent
    candidate = output.rpartition("model :")[2]

    # An integer ("1"), a decimal ("0.85") or a bare fraction (".5").
    # The previous pattern needed at least two characters and stopped after one
    # decimal digit, so "0"/"1" never matched and "0.85" was read as 0.8.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", candidate)
    if match is None:
        raise ValueError(f"No numeric score found in model output: {candidate!r}")

    return float(match.group())

In [None]:
def get_plagiarism_score(question, answer, model, tokenizer):
    """
    Score one question/answer pair with the model.

    Parameters:
    -----------
    question : str
        The coding problem.
    answer : str
        The code snippet to assess.
    model : transformers.PreTrainedModel
        The model used for generation.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer matching the model.

    Returns:
    --------
    float
        The score parsed from the model's completion.
    """
    # Generation and parsing are delegated to the two helpers, in that order
    return post_process_output(
        get_completion(question=question, answer=answer, model=model, tokenizer=tokenizer)
    )

In [None]:
# Upload the model to the Hugging Face Hub under the repository name below
# NOTE(review): this pushes `model`, not `trainer.model` — confirm the uploaded artefact contains the trained adapter weights

model.push_to_hub("wasabibish/gemma-2b-it-code-ai-generated")
