In [1]:
!pip install -q \
  transformers==4.38.2 \
  peft==0.8.2 \
  trl==0.8.6 \
  bitsandbytes==0.46.0 \
  accelerate==0.28.0 \
  datasets==2.19.0

In [2]:
import os

# Switch off the Weights & Biases integration for this notebook run.
os.environ["WANDB_DISABLED"] = "true"


In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm
W0626 20:05:02.795000 14380 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [4]:
from pynvml import *

def print_gpu_utilization():
    """Print the memory currently in use on GPU index 0, as reported by NVML.

    The byte count returned by NVML is floor-divided by 1024**2 before printing.
    """
    nvmlInit()
    gpu0 = nvmlDeviceGetHandleByIndex(0)
    used_bytes = nvmlDeviceGetMemoryInfo(gpu0).used
    print(f"GPU memory occupied: {used_bytes//1024**2} MB.")



In [5]:
import pandas as pd

# Load the raw training CSV into a pandas DataFrame for a quick visual check.
# (`dataset` is rebound to a Hugging Face dataset further down, before it is used again.)
dataset = pd.read_csv("train.csv")

# Preview the first few rows (rendered as the cell output)
dataset.head()


Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance


In [6]:
# Quantization settings used when loading the base model for training:
# 4-bit NF4 weights, float16 compute, no double quantization.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# The empty-string key maps the whole model onto device index 0.
device_map = {"": 0}

In [7]:
!pip show trl
!pip show transformers
!pip show bitsandbytes

Name: trl
Version: 0.8.6
Summary: Train transformer language models with reinforcement learning.
Home-page: https://github.com/huggingface/trl
Author: Leandro von Werra
Author-email: leandro.vonwerra@gmail.com
License: Apache 2.0
Location: C:\Users\asus\AppData\Local\Programs\Python\Python312\Lib\site-packages
Requires: accelerate, datasets, numpy, torch, transformers, tyro
Required-by: 
Name: transformers
Version: 4.38.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\asus\AppData\Local\Programs\Python\Python312\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, trl
Name: b

In [8]:
import torch
# Sanity check that PyTorch can see a CUDA device before loading the quantized model.
print(torch.cuda.is_available())  # Should return True if bitsandbytes will work

True


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = 'microsoft/phi-2'

# Load the base model with the quantization config and device map defined earlier.
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.08s/it]


In [10]:
# Padding-side discussion for decoder-only fine-tuning:
# https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Separate tokenizer instance used only for generation; unlike the training
# tokenizer it is created without padding_side / add_eos_token overrides.
eval_tokenizer = AutoTokenizer.from_pretrained(model_name, add_bos_token=True, trust_remote_code=True, use_fast=False)
# Reuse the EOS token as the padding token.
eval_tokenizer.pad_token = eval_tokenizer.eos_token

def gen(model,p, maxlen=100, sample=True):
    """Generate text for prompt ``p`` with ``model`` and return the decoded result.

    :param model: causal language model exposing ``generate`` and ``device``
    :param p: prompt text, tokenized with the module-level ``eval_tokenizer``
    :param maxlen: maximum number of new tokens to generate
    :param sample: if True, sample with temperature 0.1 and top_p 0.95;
                   if False, the sampling-only options are not passed at all
    :return: list of decoded strings with special tokens removed
    """
    # Put the inputs on the same device as the model instead of assuming "cuda".
    toks = eval_tokenizer(p, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": maxlen,
        "do_sample": sample,
        "num_return_sequences": 1,
        "num_beams": 1,
        # Passed explicitly so generate() does not have to fall back to the EOS id with a warning.
        "pad_token_id": eval_tokenizer.pad_token_id,
    }
    if sample:
        # temperature / top_p are only meaningful when sampling is enabled.
        generation_kwargs.update(temperature=0.1, top_p=0.95)

    res = model.generate(**toks, **generation_kwargs).to('cpu')
    return eval_tokenizer.batch_decode(res,skip_special_tokens=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
from datasets import load_dataset
# Load train.csv through the `datasets` CSV loader. This rebinds `dataset`
# (previously the pandas DataFrame); the zero-shot cell reads it as dataset['train'].
dataset = load_dataset("csv", data_files="train.csv")


Generating train split: 1999 examples [00:00, 14985.82 examples/s]


In [19]:
%%time
from transformers import set_seed

# gen() samples by default, so fix the seed first.
set_seed(42)

index = 10

# Dialogue to summarize and its human-written reference summary
prompt = dataset['train'][index]['dialogue']
summary = dataset['train'][index]['summary']

# Wrap the dialogue in the "Instruct: ... Output:" prompt
formatted_prompt = (
    "Instruct: Summarize the following conversation.\n"
    f"{prompt}\n"
    "Output:"
)

# Zero-shot generation with the base model
res = gen(original_model, formatted_prompt)

# The decoded text contains the prompt as well; keep what follows the first 'Output:'
try:
    model_output = res[0].split("Output:")[1].strip()
except IndexError:
    model_output = "[Error: Model output not formatted correctly.]"

# Side-by-side report: prompt, human reference, model output
dash_line = "-" * 100
print(
    dash_line,
    "INPUT PROMPT:",
    formatted_prompt,
    dash_line,
    "BASELINE HUMAN SUMMARY:",
    summary,
    dash_line,
    "MODEL GENERATION - ZERO SHOT:",
    model_output,
    sep="\n",
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: Could you do me a favor?
#Person2#: Sure. What is it?
#Person1#: Could you run over to the store? We need a few things.
#Person2#: All right. What do you want me to get?
#Person1#: Well, could you pick up some sugar?
#Person2#: Okay. How much?
#Person1#: A small bag. I guess we also need a few oranges.
#Person2#: How many?
#Person1#: Oh, let's see. . . About six.
#Person2#: Anything else?
#Person1#: Yes. We're out of milk.
#Person2#: Okay. How much do you want me to get? A gallon?
#Person1#: No. I think a half gallon will be enough.
#Person2#: Is that all?
#Person1#: I think so. Have you got all that?
#Person2#: Yes. That's small bag of sugar, four oranges, and a half gallon of milk.
#Person1#: Do you have enough money?
#Person2#: I think so.
#Person1#: Thanks very much. I appreciate it.
Output:
-------------------

In [13]:
def create_prompt_formats(sample):
    """
    Build the fine-tuning prompt for one record and store it under sample["text"].

    The prompt consists of an intro blurb, the instruction line, the dialogue
    (left out when it is empty), the "### Output:" header followed by the summary,
    and a closing "### End" marker, joined by blank lines.
    :param sample: dictionary with 'dialogue' and 'summary' entries; modified in place
    :return: the same dictionary, now carrying the 'text' entry
    """
    intro = "\nBelow is an instruction that describes a task. Write a response that appropriately completes the request."
    instruction = "### Instruct: Summarize the below conversation."

    dialogue = sample["dialogue"]
    context = f"{dialogue}" if dialogue else ""
    answer = f"### Output:\n{sample['summary']}"

    segments = [intro, instruction]
    if context:
        segments.append(context)
    segments.extend([answer, "### End"])

    sample["text"] = "\n\n".join(segments)
    return sample


In [14]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """
    Return the maximum sequence length found in the model config.

    The config attributes "n_positions", "max_position_embeddings" and
    "seq_length" are tried in that order and the first truthy value wins.
    When none is set, 1024 is used. The choice is printed either way.
    :param model: object with a ``config`` attribute
    :return: the maximum length
    """
    conf = model.config
    candidates = ("n_positions", "max_position_embeddings", "seq_length")
    # Lazy lookup: stops at the first attribute that holds a truthy value.
    found = next(
        (value for value in (getattr(conf, name, None) for name in candidates) if value),
        None,
    )
    if found:
        print(f"Found max lenth: {found}")
        return found

    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entries of a batch with truncation enabled.

    :param batch: mapping that holds the texts under the "text" key
    :param tokenizer: callable tokenizer accepting ``max_length`` and ``truncation``
    :param max_length: length limit handed to the tokenizer
    :return: whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded

In [15]:
from functools import partial

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """
    Turn raw dialogue/summary records into shuffled, tokenized training samples.

    Steps: build the prompt text, tokenize it in batches (dropping the raw
    columns that are present), keep only samples shorter than ``max_length``
    tokens, then shuffle.
    :param tokenizer: tokenizer used to encode the 'text' field
    :param max_length: truncation limit; samples reaching it are filtered out
    :param seed: random seed for shuffling
    :param dataset: dataset providing ``map``, ``filter``, ``shuffle`` and ``column_names``
    :return: the processed dataset
    """

    print("Preprocessing dataset...")

    # Add the 'text' column holding the formatted prompt
    with_text = dataset.map(create_prompt_formats)

    # Tokenize in batches; raw columns are dropped only if they exist
    raw_columns = ('id', 'topic', 'dialogue', 'summary')
    droppable = [name for name in raw_columns if name in with_text.column_names]
    tokenized = with_text.map(
        partial(preprocess_batch, tokenizer=tokenizer, max_length=max_length),
        batched=True,
        remove_columns=droppable,
    )

    # Discard samples whose token count reached max_length
    short_enough = tokenized.filter(lambda row: len(row["input_ids"]) < max_length)

    # Shuffle with the given seed
    return short_enough.shuffle(seed=seed)


In [17]:
# ## Pre-process dataset
seed = 42

# Sequence-length limit read from the model config (get_max_length falls back to 1024)
max_length = get_max_length(original_model)
print(f"Max length for model: {max_length}")

# Load train-validation split from CSV
from datasets import load_dataset

# First 90% of the rows of train.csv become the training split, the last 10% the validation split.
# NOTE: this rebinds `dataset` from the earlier object indexed as dataset['train'] to a single
# split, which is why the final inference cell indexes it directly as dataset[index].
dataset = load_dataset("csv", data_files={"train": "train.csv"}, split="train[:90%]")
validation_dataset = load_dataset("csv", data_files={"train": "train.csv"}, split="train[90%:]")

# Tokenizer already defined above; now apply preprocessing
# (both splits go through the same format -> tokenize -> filter -> shuffle pipeline)
train_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, validation_dataset)

Found max lenth: 2048
Max length for model: 2048
Preprocessing dataset...


Map: 100%|██████████| 1799/1799 [00:00<00:00, 8924.41 examples/s]
Map: 100%|██████████| 1799/1799 [00:02<00:00, 669.95 examples/s]
Filter: 100%|██████████| 1799/1799 [00:00<00:00, 3044.72 examples/s]


Preprocessing dataset...


Map: 100%|██████████| 200/200 [00:00<00:00, 5015.10 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 552.70 examples/s]
Filter: 100%|██████████| 200/200 [00:00<00:00, 2629.79 examples/s]


In [18]:
# Number of samples left in each split after preprocessing, one per line
print(len(train_dataset), len(eval_dataset), sep="\n")

1799
200


In [19]:
# (rows, columns) of each processed split, followed by the training split's summary
print("Shapes of the datasets:")
print(f"Training: {train_dataset.shape}", f"Validation: {eval_dataset.shape}", sep="\n")
print(train_dataset)

Shapes of the datasets:
Training: (1799, 3)
Validation: (200, 3)
Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 1799
})


In [20]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite its name the function does not print; it returns the report so the
    caller can print or log it.

    :param model: object whose ``named_parameters()`` yields ``(name, param)`` pairs,
                  each param providing ``numel()`` and ``requires_grad``
    :return: three-line string with the trainable count, the total count and the
             trainable percentage (reported as 0.00% when the model has no parameters)
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard against a parameter-less model, which would otherwise divide by zero.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {percentage:.2f}%"

# Parameter report for the loaded base model, before the LoRA adapters are attached.
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 262364160
all model parameters: 1521392640
percentage of trainable model parameters: 17.24%


In [21]:
# Show the module tree of the loaded base model.
print(original_model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layern

In [22]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA settings: rank-64 adapters on the four attention projection layers.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=['q_proj', 'k_proj', 'v_proj', 'dense'],
)

# Step 1: turn on gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

# Step 2: run PEFT's k-bit preparation on the quantized model (rebinds `original_model`)
original_model = prepare_model_for_kbit_training(original_model)

# Step 3: wrap the prepared model with the LoRA adapters
peft_model = get_peft_model(original_model, config)

In [23]:
# Parameter report for the PEFT-wrapped model.
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 41943040
all model parameters: 1563335680
percentage of trainable model parameters: 2.68%


In [24]:
# Print the PEFT-wrapped model to see the LoRA adapters added to the targeted layers:
print(peft_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4

In [25]:
output_dir = './peft-dialogue-summary-training/final-checkpoint'
import transformers
# DataLoaderConfiguration is used at the bottom of this cell but was never imported,
# which raises NameError on a fresh kernel.
from accelerate.utils import DataLoaderConfiguration

# Training hyper-parameters. With a per-device batch of 4 and 4 accumulation steps
# each optimizer step covers 16 samples per device. Training is bounded by
# max_steps, and evaluation, logging and checkpointing all happen every 25 steps.
peft_training_args = TrainingArguments(
    output_dir=output_dir,  # reuse the variable instead of repeating the literal
    warmup_steps=100,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=1000,
    learning_rate = 2e-4,
    lr_scheduler_type = 'linear',
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir=True,
    group_by_length=True,
)

# The generation cache is disabled on the model config before training.
peft_model.config.use_cache = False

# Causal-LM collator (mlm=False) over the tokenized splits.
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# NOTE(review): this object is created after the Trainer and is not passed to it,
# so within this cell it has no effect on training — confirm whether it is still needed.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
# Display the device the training arguments resolved to.
peft_training_args.device

device(type='cuda', index=0)

In [51]:
# Run fine-tuning with the configured trainer (max_steps=1000, evaluation and
# checkpoint every 25 steps). The saved run reports a train_runtime of about 16,969 s.
peft_trainer.train()

Step,Training Loss,Validation Loss
25,1.9595,1.816778
50,1.5868,1.404992
75,1.3561,1.356202
100,1.3515,1.3441
125,1.323,1.339977
150,1.3007,1.329777
175,1.2782,1.329439
200,1.2938,1.326749
225,1.2618,1.323288
250,1.3099,1.319444


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

TrainOutput(global_step=1000, training_loss=1.257026954650879, metrics={'train_runtime': 16968.5469, 'train_samples_per_second': 0.943, 'train_steps_per_second': 0.059, 'total_flos': 7.607104824643584e+16, 'train_loss': 1.257026954650879, 'epoch': 8.89})

In [52]:
# Save the fine-tuned PEFT model to the directory that the inference cell
# later loads as `adapter_path`.
peft_model.save_pretrained("peft_adapter_checkpoint")



In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit quantization config. Rebinds `bnb_config`, replacing the 4-bit training config.
# The fp32 CPU-offload switch on BitsAndBytesConfig is named
# `llm_int8_enable_fp32_cpu_offload`; `load_in_8bit_fp32_cpu_offload` is not a
# config option, so the intended setting was not being applied.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,  # permits fp32 CPU offload of modules on low-VRAM GPUs
)

base_model_id = "microsoft/phi-2"

# With device_map={"": 0} every module is mapped to GPU 0, so nothing is
# offloaded unless the device map is changed.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map={"": 0},  # Replace with 'auto' if you're unsure
    quantization_config=bnb_config,
    trust_remote_code=True)


Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.95s/it]


In [54]:
# Re-create the evaluation tokenizer from the base model id.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False)
# Reuse the EOS token as the padding token.
eval_tokenizer.pad_token = eval_tokenizer.eos_token


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [55]:
import torch
# Report which CUDA device the kernel sees and what it supports.
print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())
print(torch.cuda.is_bf16_supported())  # for bf16
print(torch.cuda.get_device_capability())  # e.g. (7, 5) on a GTX 1650; the saved output shows (6, 0) on a Tesla P100


Tesla P100-PCIE-16GB
True
True
(6, 0)


In [56]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# STEP 1: Define model and adapter path
base_model_id = "microsoft/phi-2"
adapter_path = "./peft_adapter_checkpoint"

# STEP 2: Configure quantization (4-bit NF4, double quantization, float16 compute;
# no CPU offload option is set here)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# STEP 3: Load tokenizer
# (rebinds `tokenizer`; unlike the training tokenizer no pad token is assigned here)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# STEP 4: Load quantized base model on GPU
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="cuda",                      # Explicit CUDA placement ("auto" would be the automatic option)
    quantization_config=bnb_config,        # 4-bit config
    trust_remote_code=True
)

# STEP 5: Load your LoRA adapter
ft_model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    is_trainable=False
)

# STEP 6 (optional): Merge LoRA weights into the base model and drop the PEFT wrapper
# NOTE(review): the base weights are 4-bit quantized, so the merged model may not
# reproduce the un-merged adapter outputs exactly — confirm this is acceptable.
ft_model = ft_model.merge_and_unload()

# ✅ Done: Model is on GPU in 4-bit


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [90]:
%%time

from transformers import set_seed
# Seed the random number generators before generating.
set_seed(42)

def gen(model, prompt, max_new_tokens=60):
    """Generate a one-line summary for ``prompt`` with ``model``.

    Replaces the earlier ``gen`` helper: it uses the module-level ``tokenizer``
    and returns a single string instead of a list.

    :param model: causal language model exposing ``generate``
    :param prompt: full prompt text, expected to end with "### Summary:\\n"
    :param max_new_tokens: maximum number of new tokens to generate
    :return: first line of the text after the last "### Summary:\\n" marker, stripped
    """
    encoded = tokenizer(prompt, return_tensors="pt").to("cuda")
    eos_id = tokenizer.eos_token_id
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            eos_token_id=eos_id,
            pad_token_id=eos_id,
        )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Keep what follows the last summary marker (the whole text when the marker is absent)...
    after_marker = decoded.rpartition("### Summary:\n")[2].strip()
    # ...and of that only the first line.
    first_line = after_marker.partition("\n")[0]
    return first_line.strip()

# Select example
# `dataset` is indexed directly here because the preprocessing cell rebound it
# to the train[:90%] split.
index = 10
dialogue = dataset[index]['dialogue']
summary = dataset[index]['summary']

# Instruction prompt
# NOTE(review): this template ("### Instruction:" / "### Conversation:" / "### Summary:")
# differs from the one create_prompt_formats built for fine-tuning
# ("### Instruct: ..." / "### Output:" / "### End") — confirm the mismatch is intended.
# NOTE(review): the "Use format like" sentence lists the same items as the dialogue at
# index 10 (sugar, six oranges, half gallon of milk), so the prompt contains the expected
# answer for this example — the output below is not an unbiased quality check.
prompt = (
    "### Instruction:\n"
    "Summarize the conversation in one sentence with exact items and quantities.\n"
    "Only output one sentence with all confirmed items and correct quantities.\n"
    "Include all confirmed items, quantities, and descriptors.\n"
    "Do not include any extra sentences, questions, or explanations.\n"
    "Do NOT include questions or opinions. Use format like:\n"
    "#Person1# asked #Person2# to buy sugar, six oranges, and a half gallon of milk.\n\n"
    f"### Conversation:\n{dialogue}\n\n"
    "### Summary:\n"
)


# Generate
# (gen returns only the first line after the last "### Summary:\n" marker)
peft_model_output = gen(ft_model, prompt)

# Display
dash_line = '-' * 100
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'PhiSUM MODEL:\n{peft_model_output}')


----------------------------------------------------------------------------------------------------
INPUT PROMPT:
### Instruction:
Summarize the conversation in one sentence with exact items and quantities.
Only output one sentence with all confirmed items and correct quantities.
Include all confirmed items, quantities, and descriptors.
Do not include any extra sentences, questions, or explanations.
Do NOT include questions or opinions. Use format like:
#Person1# asked #Person2# to buy sugar, six oranges, and a half gallon of milk.

### Conversation:
#Person1#: Could you do me a favor?
#Person2#: Sure. What is it?
#Person1#: Could you run over to the store? We need a few things.
#Person2#: All right. What do you want me to get?
#Person1#: Well, could you pick up some sugar?
#Person2#: Okay. How much?
#Person1#: A small bag. I guess we also need a few oranges.
#Person2#: How many?
#Person1#: Oh, let's see. . . About six.
#Person2#: Anything else?
#Person1#: Yes. We're out of milk.
#Per