<a href="https://colab.research.google.com/github/XindaLi304/LLM_from_Scratch/blob/main/LLM_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch lightning matplotlib pandas torchmetrics watermark transformers datasets -U

Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting lightning
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting matplotlib
  Downloading matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)
Collecting watermark
  Downloading watermark-2.4.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)

In [None]:
# pip install torch lightning matplotlib pandas torchmetrics watermark transformers datasets -U

import os
import os.path as op
import time

from datasets import load_dataset
from lightning import Fabric
import torch
from torch.utils.data import DataLoader
import torchmetrics
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from transformers import BloomTokenizerFast, BloomForCausalLM#use another lib beacuse autotokenizer not support train

from watermark import watermark

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset


def tokenize_text(batch):
    return tokenizer(batch["text"], truncation=True, padding=True, max_length=1024)


def train(num_epochs, model, optimizer, train_loader, val_loader, fabric,accumulation_step):

    for epoch in range(num_epochs):
        train_acc = torchmetrics.Accuracy(
            task="multiclass", num_classes=2).to(fabric.device)

        for batch_idx, batch in enumerate(train_loader):
            model.train()

            ### FORWARD AND BACK PROP
            outputs = model(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["label"]
            )
            outputs["loss"]=outputs["loss"]/accumulation_step
            fabric.backward(outputs["loss"])

            ### UPDATE MODEL PARAMETERS
            if batch_idx % accumulation_step == 0:
              optimizer.step()
              optimizer.zero_grad()

            ### LOGGING
            if not batch_idx % 300:
                print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} "
                      f"| Batch {batch_idx:04d}/{len(train_loader):04d} "
                      f"| Loss: {outputs['loss']:.4f}")

            model.eval()
            with torch.no_grad():
                predicted_labels = torch.argmax(outputs["logits"], 1)
                train_acc.update(predicted_labels, batch["label"])

        ### MORE LOGGING
        model.eval()
        with torch.no_grad():
            val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
            for batch in val_loader:
                outputs = model(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["label"]
                )
                predicted_labels = torch.argmax(outputs["logits"], 1)
                val_acc.update(predicted_labels, batch["label"])

            print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} "
                  f"| Train acc.: {train_acc.compute()*100:.2f}% "
                  f"| Val acc.: {val_acc.compute()*100:.2f}%"
                  )
            train_acc.reset(), val_acc.reset()


if __name__ == "__main__":

    print(watermark(packages="torch,lightning,transformers", python=True))
    print("Torch CUDA available?", torch.cuda.is_available())
    device = "cuda" if torch.cuda.is_available() else "cpu"

    torch.manual_seed(123)
    # torch.use_deterministic_algorithms(True)

    ##########################
    ### 1 Loading the Dataset:Large Movie Review Dataset
    ##########################

    download_dataset()
    df = load_dataset_into_to_dataframe()
    if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
        partition_dataset(df)

    imdb_dataset = load_dataset(
        "csv",
        data_files={
            "train": "train.csv",
            "validation": "val.csv",
            "test": "test.csv",
        },
    )

    #########################################
    ### 2 Tokenization and Numericalization
    #########################################
    # utilizing bloom-560m as llm for latter fine tuing and inferring
    #tokenizer is bound with certain llm
    tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", max_length=1024)
    print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
    print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
    print("Example text: I like LLMs!","tokenized into",tokenizer("I like LLMs!"), flush=True)
    tokens=tokenizer("I like LLMs")
    print("tokens in text form",tokenizer.tokenize("I Like LLMs!"))
    #note that its tokenized into 'I', 'ĠLike', 'ĠLL', 'Ms', '!', where Ġ represents blank space
    #also note that this tokenizer adopts byte pair encoding(alternative to wordpiece encoding)which break out of vacabulary word into common words like llm to ll and m.
    #to have "LLMs" wholy tokenized, we need to add llm into tokenizer's vocabulary.
    decoded_text=tokenizer.decode(tokens["input_ids"])
    print("Decoded text:",decoded_text, flush=True)

    #vocabulary inspect
    vocab=tokenizer.get_vocab()
    print("I",vocab.get("I"))
    print("like",vocab.get("like"))
    print("LLMs",vocab.get("LLMs"))
    print("!",vocab.get("!"))
    print("Ġ",vocab.get("Ġ"))
    print("LL",vocab.get("LL"))
    print("Ġlike",vocab.get("Ġlike"))
    # to update tokenizer, we need to add new token into vocab and resize corresponding model's embedding size
    #api approach:we can use customized text file to train a tokenizer with BPE/wordpiece encoding Trainer.
    print("adding LLM into vocab of tokenizer" )

    tokenizer.add_tokens(['LLMs'])
    vocab=tokenizer.get_vocab()
    print("LLMs",vocab.get("LLMs"))
    print("current tokenizer size",len(tokenizer))


    print("Tokenizing dataset...", flush=True)
    imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
    del imdb_dataset
    imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    #########################################
    ### 3 Set Up DataLoaders
    #########################################

    train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
    val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
    test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=30,
        shuffle=True,
        num_workers=4,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=30,
        num_workers=4,
        drop_last=True,
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=30,
        num_workers=2,
        drop_last=True,
    )

    #########################################
    ### 4 Initializing the Model
    #########################################

    fabric = Fabric(accelerator="cuda", devices=1, precision="16-mixed")
    fabric.launch()

    model = AutoModelForSequenceClassification.from_pretrained(
        "bigscience/bloom-560m", num_labels=2)
    model.resize_token_embeddings(len(tokenizer))
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    model, optimizer = fabric.setup(model, optimizer)

    model=model.compile
    train_loader, val_loader, test_loader = fabric.setup_dataloaders(
        train_loader, val_loader, test_loader)

    #########################################
    ### 5 Finetuning
    #########################################

    start = time.time()
    train(
        num_epochs=1,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        fabric=fabric,
        accumulation_step=10
    )

    end = time.time()
    elapsed = end-start
    print(f"Time elapsed {elapsed/60:.2f} min")

    with torch.no_grad():
        model.eval()
        test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
        for batch in test_loader:
            outputs = model(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["label"]
            )
            predicted_labels = torch.argmax(outputs["logits"], 1)
            test_acc.update(predicted_labels, batch["label"])

    print(f"Test accuracy {test_acc.compute()*100:.2f}%")

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

torch       : 2.4.1
lightning   : 2.4.0
transformers: 4.44.2

Torch CUDA available? True
100% | 80.23 MB | 1.61 MB/s | 49.68 sec elapsed

100%|██████████| 50000/50000 [00:56<00:00, 892.34it/s]


Class distribution:


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Tokenizer input max length: 1000000000000000019884624838656
Tokenizer vocabulary size: 250680
Example text: I like LLMs! tokenized into {'input_ids': [44, 3269, 67149, 23099, 4], 'attention_mask': [1, 1, 1, 1, 1]}
tokens in text form ['I', 'ĠLike', 'ĠLL', 'Ms', '!']
Decoded text: I like LLMs
I 44
like 29726
LLMs None
! 4
Ġ 210
LL 17368
Ġlike 3269
adding LLM into vocab of tokenizer
LLMs 250680
current tokenizer size 250681
Tokenizing dataset...


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

INFO: Using 16-bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16-bit Automatic Mixed Precision (AMP)
INFO: You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:lightning.pytorch.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


Epoch: 0001/0001 | Batch 0000/35000 | Loss: 45.6562
Epoch: 0001/0001 | Batch 0300/35000 | Loss: 0.4683
Epoch: 0001/0001 | Batch 0600/35000 | Loss: 0.0026
Epoch: 0001/0001 | Batch 0900/35000 | Loss: 1.6504
Epoch: 0001/0001 | Batch 1200/35000 | Loss: 0.2108
Epoch: 0001/0001 | Batch 1500/35000 | Loss: 0.0461
Epoch: 0001/0001 | Batch 1800/35000 | Loss: 0.1351
Epoch: 0001/0001 | Batch 2100/35000 | Loss: 0.5659
Epoch: 0001/0001 | Batch 2400/35000 | Loss: 0.0000
Epoch: 0001/0001 | Batch 2700/35000 | Loss: 0.0516
Epoch: 0001/0001 | Batch 3000/35000 | Loss: 0.2139
Epoch: 0001/0001 | Batch 3300/35000 | Loss: 0.6309
Epoch: 0001/0001 | Batch 3600/35000 | Loss: 0.0555
Epoch: 0001/0001 | Batch 3900/35000 | Loss: 0.0036
Epoch: 0001/0001 | Batch 4200/35000 | Loss: 0.1830
Epoch: 0001/0001 | Batch 4500/35000 | Loss: 0.0015
Epoch: 0001/0001 | Batch 4800/35000 | Loss: 0.0784
Epoch: 0001/0001 | Batch 5100/35000 | Loss: 1.6660
Epoch: 0001/0001 | Batch 5400/35000 | Loss: 1.6445
Epoch: 0001/0001 | Batch 5700/

Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch: 0001/0001 | Train acc.: 79.44% | Val acc.: 88.46%
Time elapsed 78.69 min
Test accuracy 88.31%


# finetuing a LLM

 1.   unknow tokens
*   update pretrained tokenizer
*   update corresponding pretained model's embedding size as len(current tokenizer)




---



2.   new text dataset




---

3.  fine tuning model
*   mixed-percision
*   distributed:
    tensor-shadering


*   maximize use of single gpu:
    gradient-accumulation: avoid unstable training caused by small batch due to limited gpu, which is
    i.e. use fabric.backward to accumulate scaled gradients through multiple steps, only update weights when steps reach multiple step with optimizer


*   following instruct with human feedback:\
f






















In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
model.eval()
label_mapping = {0: "NEGATIVE", 1: "POSITIVE"}
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer,device="cuda")

# Example usage
user_input = "this movie is about LLMs"
sentiment = sentiment_analysis(user_input)
print(sentiment)  # Outputs: [{'label': 'POSITIVE', 'score': 0.99}]


The model '_FabricModule' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification', 'Gemma2ForSequenceCl

[{'label': 'LABEL_0', 'score': 0.9801293015480042}]


In [None]:
while True:
    review = input("Enter a movie review (or type 'exit' to stop): ")
    if review.lower() == 'exit':
        break

    # Tokenize the input review
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs=inputs.to(device)
    # Perform sentiment classification
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=1).item()

    sentiment = label_mapping[predicted_class]
    print(f"Sentiment: {sentiment} (Confidence: {probabilities[0][predicted_class].item():.2f})")

Sentiment: POSITIVE (Confidence: 1.00)
