In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch \
    torchdata  --quiet

%pip install \
    transformers==4.41.0 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m161.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m197.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
%pip install -U datasets fsspec gcsfs --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [gcsfs]
[1A[2K

In [4]:
!pip install -q bitsandbytes accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m166.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
"""Importing dependencies and downloading pre-trained bloom model
"""

import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

#loading model
model = AutoModelForCausalLM.from_pretrained(
    # "bigscience/bloom-3b",
    # "bigscience/bloom-1b1",
    "bigscience/bloom-560m",
    torch_dtype=torch.float16,
    device_map='auto',
)

#loading tokenizer for this model (which turns text into an input for the model)
tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [6]:
"""Setting up LoRA using parameter efficient fine tuning
"""

from peft import LoraConfig, get_peft_model

#defining how LoRA will work in this particular example
config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

#this actually overwrites the model in memory, so
#the rename is only for ledgibility.
peft_model = get_peft_model(model, config)

In [7]:
"""Comparing parameters before and after LoRA
"""

trainable_params = 0
all_param = 0

#iterating over all parameters
for _, param in peft_model.named_parameters():
    #adding parameters to total
    all_param += param.numel()
    #adding parameters to trainable if they require a graident
    if param.requires_grad:
        trainable_params += param.numel()

#printing results
print(f"trainable params: {trainable_params}")
print(f"all params: {all_param}")
print(f"trainable: {100 * trainable_params / all_param:.2f}%")

trainable params: 786432
all params: 560001024
trainable: 0.14%


In [8]:
"""Loading SQUAD dataset
"""

from datasets import load_dataset
qa_dataset = load_dataset("squad_v2")

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [9]:
"""Reformatting SQUAD to respect our defined structure
"""

#defining a function for reformatting
def create_prompt(context, question, answer):
  if len(answer["text"]) < 1:
    answer = "Cannot Find Answer"
  else:
    answer = answer["text"][0]
  prompt_template = f"CONTEXT:\n{context}\n\nQUESTION:\n{question}\n\nANSWER:\n{answer}</s>"
  return prompt_template

#applying the reformatting function to the entire dataset
mapped_qa_dataset = qa_dataset.map(lambda samples: tokenizer(create_prompt(samples['context'], samples['question'], samples['answers'])))

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [10]:
"""Fine Tuning
This code is largly co-opted. In the absence of a rigid validation
procedure, the best practice is to just copy a successful tutorial or,
better yet, directly from the documentation.
"""

import transformers

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=mapped_qa_dataset["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
        report_to='none'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
peft_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,3.2714
2,3.3963
3,3.4433
4,3.3658
5,3.3758
6,3.4279
7,3.5432
8,3.4148
9,3.3635
10,3.3786


TrainOutput(global_step=100, training_loss=3.0706682777404786, metrics={'train_runtime': 42.802, 'train_samples_per_second': 37.381, 'train_steps_per_second': 2.336, 'total_flos': 764917466529792.0, 'train_loss': 3.0706682777404786, 'epoch': 0.012277470841006752})

In [11]:
"""Saving the LoRA fine tuning locally
"""
model_id = "BLOOM-560m-LoRA"
peft_model.save_pretrained(model_id)

In [12]:
!ls -lh {model_id}

total 3.1M
-rw-r--r-- 1 root root  336 Jun 22 01:30 adapter_config.json
-rw-r--r-- 1 root root 3.1M Jun 22 01:30 adapter_model.bin


In [13]:
"""Helper Function for Comparing Results
"""

from IPython.display import display, Markdown

def make_inference(context, question):

    #turn the input into tokens
    batch = tokenizer(f"**CONTEXT:**\n{context}\n\n**QUESTION:**\n{question}\n\n**ANSWER:**\n", return_tensors='pt', return_token_type_ids=False)
    #move the tokens onto the GPU, for inference
    batch = batch.to(device='cuda')

    #make an inference with both the fine tuned model and the raw model
    with torch.cuda.amp.autocast():
        #I think inference time would be faster if these were applied,
        #but the fact that LoRA is not applied allows me to experiment
        #with before and after fine tuning simultaniously

        #raw model
        peft_model.disable_adapter_layers()
        output_tokens_raw = model.generate(**batch, max_new_tokens=200)

        #LoRA model
        peft_model.enable_adapter_layers()
        output_tokens_qa = peft_model.generate(**batch, max_new_tokens=200)

    #display results
    display(Markdown("# Raw Model\n"))
    display(Markdown((tokenizer.decode(output_tokens_raw[0], skip_special_tokens=True))))
    display(Markdown("\n# QA Model\n"))
    display(Markdown((tokenizer.decode(output_tokens_qa[0], skip_special_tokens=True))))

In [14]:
context = "You are a monster, and you eat yellow legos."
question = "What is the best food?"

make_inference(context, question)

  with torch.cuda.amp.autocast():


# Raw Model


**CONTEXT:**
You are a monster, and you eat yellow legos.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is the best food?

**ANSWER:**
The best food is the one that is not poisonous.

**QUESTION:**
What is


# QA Model


**CONTEXT:**
You are a monster, and you eat yellow legos.

**QUESTION:**
What is the best food?

**ANSWER:**
yellow legos

In [20]:
context_1 = "you are an expert in embeddings system"
question = "what is IoT?"

make_inference(context_1, question)

  with torch.cuda.amp.autocast():


# Raw Model


**CONTEXT:**
you are an expert in embeddings system

**QUESTION:**
what is IoT?

**ANSWER:**
IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors. IoT is a term that is used to describe a network of devices that communicate with each other through a network of sensors.


# QA Model


**CONTEXT:**
you are an expert in embeddings system

**QUESTION:**
what is IoT?

**ANSWER:**
network of devices