# SpeakWrite Text-Edit LLM Fine-Tuning Notebook

Fine-tuning a pretrained LLM to specialize in predetermined text-editing commands.

In [2]:
!pip install datasets transformers peft evaluate torch numpy

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Co

In [2]:
# IMPORTS

# preprocessing
from datasets import load_dataset, DatasetDict, Dataset

# models
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

# fine-tuning
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate

# math
import torch
import numpy as np

### Load the Dataset

We can either load the entire dataset from Hugging Face, or just parse our local CSV file and split.

In [2]:
# OPTION 1: Load from HF

# load a dataset from Hugging Face
dataset_path = ""  # TODO: insert our HF dataset path
dataset = load_dataset(dataset_path)

# display the dataset
# the keys should be the splits, like "train", "validation", etc.
dataset

IndexError: list index out of range

In [4]:
# OPTION 2: Parse CSV

# get the csv as an HF DatasetDict object
csv_path = "datasets/gpt4_dataset.csv"
dataset = load_dataset("csv", data_files=csv_path)

dataset

DatasetDict({
    train: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output'],
        num_rows: 154
    })
})

In [5]:
# split csv dataset into train and temp (80% train, 20% temp)
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Further split temp into validation and test (50% each → 10% of total dataset each)
valid_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Create the final DatasetDict
dataset = DatasetDict({
    "train": train_test_split["train"],
    "valid": valid_test_split["train"],
    "test": valid_test_split["test"]
})

# Verify the splits
dataset

DatasetDict({
    train: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output'],
        num_rows: 123
    })
    valid: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output'],
        num_rows: 15
    })
    test: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output'],
        num_rows: 16
    })
})

### Fine-tuning the Model

Select an LLM to fine-tune on our dataset.

In [6]:
# select optimal device
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
    torch.backends.mps.allow_tf32 = True
else:
    device = "cpu"

print(f"Device: {device}")

Device: mps


In [7]:
# select pretrained model to fine-tune
#model_checkpoint = 'openai-community/gpt2'  # HF remote checkpoint
model_checkpoint = 'models/openai-community/gpt2_v0'  # locally stored model

# generate classification model from model_checkpoint
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# load model to device
model.to(device)

# view model architecture
# (layers, dims, hyperparams, etc.)
model

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): lora.Linear(
            (base_layer): Conv1D(nf=768, nx=768)
            (

### Tokenize dataset

We map the tokenizer to the dataset for training. During inference, we would just tokenize that individual input.

In [8]:
# observe a single entry in the training dataset
dataset["train"][0]

{'chat_history': 'Lost track of time gaming again. Whoops.',
 'current_prompt': "Mmm, add 'but totally worth it' at the end.",
 'expected_output': 'Lost track of time gaming again. Whoops, but totally worth it.'}

In [10]:
# set tokenizer_checkpoint
if model_checkpoint.startswith('models/'):
    tokenizer_checkpoint = model_checkpoint.replace('models/', 'tokenizers/')
else:
    tokenizer_checkpoint = model

# load tokenizer for model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """
    Apply tokenizer to the set of examples.

    Merge the chat_history and current_prompt entries to form the entire prompt.
    """

    # merge chat_history and current_prompt
    # we use the specialized unique token "<|endoftext|>" to explicitly identify
    # indices the LLM should identify as breaks between the conversation.
    raw_inputs = [
        f"<|endoftext|> Chat history: {ch} <|endoftext|> User: {cp} <|endoftext|>"
        for ch, cp in zip(examples["chat_history"], examples["current_prompt"])
    ]
    responses = examples["expected_output"]

    # tokenize inputs and labels
    inputs = tokenizer(raw_inputs, truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(responses, truncation=True, padding="max_length", max_length=512)

    inputs["labels"] = labels["input_ids"]  # ensure labels exist
    return inputs

num_proc = 4  # use multiple cpu proc
tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=num_proc)
tokenized_dataset

Map (num_proc=4):   0%|          | 0/123 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/15 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 123
    })
    valid: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15
    })
    test: Dataset({
        features: ['chat_history', 'current_prompt', 'expected_output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16
    })
})

### Try Untrained Model on our Training Dataset

Before we begin training, let's take a moment to benchmark how well the untrained model performs on our dataset.

Knowing this allows us to assess whether our approach to training the model actually made a difference or not.

When I (Rayyan) ran this code, the results were mainly unrelated statements about the US senate and NY Times.

In [11]:
# define list of examples
text_list = [
    "<|endoftext|> Chat history: Lost track of time gaming again. Whoops <|endoftext|> User: Erase whoops <|endoftext|>",
    "<|endoftext|> Chat history: This movie is not worth watching even once. <|endoftext|> User: Actually, just say it sucks <|endoftext|>",
    "<|endoftext|> Chat history: The weather today is surprisingly nice. <|endoftext|> User: No actually, say it's serene. <|endoftext|>"
]

print(f"Untrained {model_checkpoint} results:")
print("----------------------------")

for text in text_list:
    # tokenize input text and move to the correct device
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # generate model predictions
    with torch.no_grad():  # Disable gradients to save memory
        output = model.generate(**inputs, max_length=100)

    # decode generated tokens into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    print("----------------------------")
    print(f"Input: {text}")
    print(f"Output: {generated_text}\n\n\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Untrained models/openai-community/gpt2_v0 results:
----------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------
Input: <|endoftext|> Chat history: Lost track of time gaming again. Whoops <|endoftext|> User: Erase whoops <|endoftext|>
Output:  Chat history: Lost track of time gaming again. Whoops  User: Erase whoops The following is a list of the most popular and popular games in the world.

The following is a list of the most popular and popular games in the world.

The following is a list of the most popular and popular games in the world.

The following is a list of the most popular and popular games in the world.

The following is a list





Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------
Input: <|endoftext|> Chat history: This movie is not worth watching even once. <|endoftext|> User: Actually, just say it sucks <|endoftext|>
Output:  Chat history: This movie is not worth watching even once.  User: Actually, just say it sucks The U.S. Department of Justice has issued a subpoena to the company that owns the National Security Agency's (NSA) Tailored Access Operations (TAO) program, which collects and stores data on millions of Americans.

The subpoena, issued by the U.S. Department of Justice on Friday, seeks records related to the NSA's Tailored Access Operations (



----------------------------
Input: <|endoftext|> Chat history: The weather today is surprisingly nice. <|endoftext|> User: No actually, say it's serene. <|endoftext|>
Output:  Chat history: The weather today is surprisingly nice.  User: No actually, say it's serene. The first time I saw the new "The Walking Dead" trailer, I was so excited. I was so excited to see the first tra

### Define Training Arguments

In particular, we will specify:
- Hyperparameters
- Training + Validation datasets
- Evaluation metrics

In [12]:
# hyperparameters
lr = 1e-3
batch_size = 16
num_epochs = 1
v_num = 0  # version number (if model_checkpoint_v0...vk exist, then v_num = k+1 for the next model)

training_args = TrainingArguments(
    output_dir=f"models/{model_checkpoint}_v{v_num}",  # give the resultant model a name
    evaluation_strategy="steps",
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # set to "wandb" if using Weights & Biases web app for model analysis
    save_total_limit=2,
    fp16=device.startswith("cuda"),  # use mixed precision if on CUDA
    push_to_hub=False
)



### Training the Model

We fine-tune using the LoRA config offered by Hugging Face's PEFT (parameter-efficient fine-tuning) framework.

In [13]:
# set LoRA config
lora_config = LoraConfig(
    r=8,  # rank
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"],  # GPT-2 specific attention layers
    task_type="CAUSAL_LM",
)

# verify config
lora_config

LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'c_attn', 'c_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [14]:
# wrap model with LoRA config
model = get_peft_model(model, lora_config)

# print trainable parameters (should be much lower than full fine-tuning)
model.print_trainable_parameters()


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


In [15]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
)

# train model
trainer.train()

RuntimeError: MPS backend out of memory (MPS allocated: 8.78 GB, other allocations: 218.84 MB, max allowed: 9.07 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

### Generate prediction

In [14]:
# define list of examples (formatted to match our prompt template)
text_list = [
    "<|endoftext|> Chat history: Lost track of time gaming again. Whoops <|endoftext|> User: Erase whoops <|endoftext|>",
    "<|endoftext|> Chat history: This movie is not worth watching even once. <|endoftext|> User: Actually, just say it sucks <|endoftext|>",
    "<|endoftext|> Chat history: The weather today is surprisingly nice. <|endoftext|> User: No actually, say it's serene. <|endoftext|>"
]

print(f"Trained {model_checkpoint}_v{v_num} results:")
print("----------------------------")

for text in text_list:
    # tokenize input text and move to the correct device
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # generate model predictions
    with torch.no_grad():  # Disable gradients to save memory
        output = model.generate(**inputs, max_length=100)

    # decode generated tokens into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    print("----------------------------")
    print(f"Input: {text}")
    print(f"Output: {generated_text}\n\n\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Trained openai-community/gpt2_v0 results:
----------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------
Input: <|endoftext|> Chat history: Lost track of time gaming again. Whoops <|endoftext|> User: Erase whoops <|endoftext|>
Output:  Chat history: Lost track of time gaming again. Whoops  User: Erase whoops The following is a list of the most popular and popular games in the world.

The following is a list of the most popular and popular games in the world.

The following is a list of the most popular and popular games in the world.

The following is a list of the most popular and popular games in the world.

The following is a list





Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------
Input: <|endoftext|> Chat history: This movie is not worth watching even once. <|endoftext|> User: Actually, just say it sucks <|endoftext|>
Output:  Chat history: This movie is not worth watching even once.  User: Actually, just say it sucks The U.S. Department of Justice has issued a subpoena to the company that owns the National Security Agency's (NSA) Tailored Access Operations (TAO) program, which collects and stores data on millions of Americans.

The subpoena, issued by the U.S. Department of Justice on Friday, seeks records related to the NSA's Tailored Access Operations (



----------------------------
Input: <|endoftext|> Chat history: The weather today is surprisingly nice. <|endoftext|> User: No actually, say it's serene. <|endoftext|>
Output:  Chat history: The weather today is surprisingly nice.  User: No actually, say it's serene. The first time I saw the new "The Walking Dead" trailer, I was so excited. I was so excited to see the first tra

### Save the model locally

Save the model and its tokenizer to the respectively named folders, both with identical names.

In [17]:
# name to be saved with
model_name = model_checkpoint + "_v" + str(v_num)

In [19]:
from google.colab import drive
drive.mount('/content/drive')

# save the model to Google Drive
model.save_pretrained("/content/drive/MyDrive/SpeakWrite/models/" + model_name)
tokenizer.save_pretrained("/content/drive/MyDrive/SpeakWrite/tokenizers/" + model_name)

Mounted at /content/drive


('/content/drive/MyDrive/SpeakWrite/tokenizers/openai-community/gpt2_v0/tokenizer_config.json',
 '/content/drive/MyDrive/SpeakWrite/tokenizers/openai-community/gpt2_v0/special_tokens_map.json',
 '/content/drive/MyDrive/SpeakWrite/tokenizers/openai-community/gpt2_v0/vocab.json',
 '/content/drive/MyDrive/SpeakWrite/tokenizers/openai-community/gpt2_v0/merges.txt',
 '/content/drive/MyDrive/SpeakWrite/tokenizers/openai-community/gpt2_v0/added_tokens.json',
 '/content/drive/MyDrive/SpeakWrite/tokenizers/openai-community/gpt2_v0/tokenizer.json')

### Optional: push model to hub

doesnt work for me yet idk

In [None]:
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
hf_name = 'rayyanaamir' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-" + model_name # you can name the model whatever you want

In [None]:
model.push_to_hub(model_id) # save model

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67b28545-2310f30c04b03cea101696b6;df00ea28-0fe8-4776-86bc-f14b08139618)

Invalid username or password.

In [None]:
trainer.push_to_hub(model_id) # save trainer

### Optional: load peft model

In [None]:
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)