In [None]:
!pip install transformers accelerate peft bitsandbytes datasets evaluate pandas tqdm torch sentencepiece

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, evaluate
Successfully installed bitsandbytes-0.47.0 evaluate-0.4.5


In [None]:
import pandas as pd, torch, gc
from datasets import Dataset
from transformers import AutoProcessor, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


**Bascially Quantizing the model in 4bit can also be done using Bitsand Bytes but I took out the already quantized model from HuggingFace. The reason is that the original model LLaVa-7B is more than 15 GB in size and if I load it in Colab then it is definitely gonna crash so i took out its pre-quantized version from HF by Unsloth which is 6.8 GB in size.**


In [None]:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )

# processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# model = LlavaNextForConditionalGeneration.from_pretrained(
#     "llava-hf/llava-1.5-7b-hf",
#     quantization_config=quantization_config,
#     device_map="auto",
#     torch_dtype=torch.float16
# )

Pre-Quantized Model taken from HF



In [None]:
model_id = "unsloth/llava-1.5-7b-hf-bnb-4bit"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    load_in_4bit=True,
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/4.04G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

trainable params: 9,961,472 || all params: 7,073,388,544 || trainable%: 0.1408


**DailyDialog is a multi-turn dialogue dataset reflecting daily conversations on various topics and it also containes the emotions of the dialogs spoken which is required for our product to be finetuned on. It is publicly available in Kaggle you can download it from there.**


In [None]:
from google.colab import drive
drive.mount('/content/drive')

train_path = "/content/drive/MyDrive/train.csv"
val_path   = "/content/drive/MyDrive/validation.csv"

model_id = "unsloth/llava-1.5-7b-hf-bnb-4bit"
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

train_df = pd.read_csv(train_path)
val_df   = pd.read_csv(val_path)

print("Train sample:\n", train_df.head())

train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)

def tokenize_function(example):
    dialog  = example["dialog"]
    act     = example["act"]
    emotion = example["emotion"]

    dialog_clean = dialog.strip("[]").replace("'", "").replace('"', "")
    dialog_clean = dialog_clean.replace("\n", " ").strip()

    target = f"Acts: {act} | Emotions: {emotion}"
    text = f"USER: {dialog_clean}\nASSISTANT: {target}"

    return tokenizer(
        text,
        truncation=True,
        max_length=1024,
        padding="max_length"
    )
train_tokenized = train_dataset.map(tokenize_function, batched=False, remove_columns=train_dataset.column_names)
val_tokenized   = val_dataset.map(tokenize_function, batched=False, remove_columns=val_dataset.column_names)

print("Tokenized sample:\n", train_tokenized[0])

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

gc.collect()
torch.cuda.empty_cache()

model.gradient_checkpointing_enable()
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/llava_lora_finetune",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4, # Increased gradient accumulation steps
    num_train_epochs=3, # Increased number of epochs
    max_steps=100, # Increased max steps
    learning_rate=5e-5,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=10,
    eval_strategy="no",
    optim="paged_adamw_8bit",
    fp16=True,
    bf16=False,
    report_to=[]
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized.shuffle(seed=42).select(range(100)),
    eval_dataset=val_tokenized.shuffle(seed=42).select(range(20)),
    data_collator=data_collator
)

Mounted at /content/drive


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Train sample:
                                               dialog                    act  \
0  ['Say , Jim , how about going for a few beers ...  [3 4 2 2 2 3 4 1 3 4]   
1  ['Can you do push-ups ? '\n " Of course I can ...          [2 1 2 2 1 1]   
2  ['Can you study with the radio on ? '\n ' No ,...            [2 1 2 1 1]   
3  ['Are you all right ? '\n ' I will be all righ...              [2 1 1 1]   
4  ['Hey John , nice skates . Are they new ? '\n ...    [2 1 2 1 1 2 1 3 4]   

                 emotion  
0  [0 0 0 0 0 0 4 4 4 4]  
1          [0 0 6 0 0 0]  
2            [0 0 0 0 0]  
3              [0 0 0 0]  
4    [0 0 0 0 0 6 0 6 0]  


Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenized sample:
 {'input_ids': [32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 32001, 

In [None]:
gc.collect()
torch.cuda.empty_cache()


**As you can see i have set the bare minimum parameters and i am also going to run it for very less epochs due to GPU constraints like it gets Out-of-memory when i normalize the parameters. Since it is being done for demo purpose i have kept it like that you can set the parametrs according to your constraints.**

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


TrainOutput(global_step=100, training_loss=2.1786044311523436, metrics={'train_runtime': 1456.7567, 'train_samples_per_second': 0.275, 'train_steps_per_second': 0.069, 'total_flos': 1.70593408253952e+16, 'train_loss': 2.1786044311523436, 'epoch': 4.0})