In [None]:
%%capture
!pip install -U transformers accelerate peft bitsandbytes datasets


In [None]:
%%capture
!unzip "/content/Dataset.zip"

In [None]:
import os
from getpass import getpass

import huggingface_hub

# Never store an access token in the notebook itself: it ends up in version
# control and in shared copies. Read it from the HF_TOKEN environment variable
# and fall back to an interactive, non-echoing prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
huggingface_hub.login(token=hf_token)

In [None]:
import os

import pandas as pd
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from PIL import Image
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    CLIPVisionModel, CLIPImageProcessor,
    LlamaForCausalLM, LlamaTokenizer,
    AutoProcessor, TrainingArguments,
    Trainer, AutoTokenizer,
    BitsAndBytesConfig,
)
from transformers.utils import logging

logging.set_verbosity_error()  # Suppress too much logging


In [None]:
# ---- CONFIG ----
# Checkpoints for the frozen vision encoder and the language model.
image_encoder_name = "openai/clip-vit-base-patch32"
llm_name = "meta-llama/Llama-2-7b-chat-hf"
image_root = "/content/Dataset/"        # image folder
# Annotation CSV inside the dataset folder. This is the file the dataset cell
# actually reads; the previous value pointed at a path no cell uses.
csv_path = "/content/Dataset/image_sentiment_dataset.csv"
max_length = 128   # target sentences are padded/truncated to this many tokens
batch_size = 4
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# ------- Load Vision Encoder ------- #
# The encoder is only used as a frozen feature extractor: float32 weights,
# eval mode, and gradients disabled for every parameter.
vision_encoder = CLIPVisionModel.from_pretrained(image_encoder_name)
vision_encoder = vision_encoder.to(device).float().eval()
vision_encoder.requires_grad_(False)
# Load the preprocessing config from the same checkpoint as the encoder so the
# two cannot drift apart when `image_encoder_name` is changed.
vision_processor = CLIPImageProcessor.from_pretrained(image_encoder_name)


# ---- TOKENIZER ----
# NOTE(review): the tokenizer is loaded from `llm_name` (the chat checkpoint)
# while the weights below come from "meta-llama/Llama-2-7b-hf" (the base
# checkpoint). Confirm which checkpoint is intended and load both from it.
tokenizer = AutoTokenizer.from_pretrained(llm_name)
# Reuse EOS as the padding token so fixed-length padding is possible.
tokenizer.pad_token = tokenizer.eos_token


# ------- Load LLaMA Language Model ------- #
# 4-bit loading is requested through BitsAndBytesConfig; passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
language_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    quantization_config=quantization_config,
)
language_model.gradient_checkpointing_enable()
language_model = prepare_model_for_kbit_training(language_model)

# LoRA adapters on the attention query/value projections; the quantized base
# weights stay frozen.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
language_model = get_peft_model(language_model, lora_config)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
class VisionProjector(nn.Module):
    """Linear map from the vision encoder's hidden size to the LLM embedding size.

    Args:
        vision_dim: Width of the incoming vision features.
        llama_dim: Width of the language model's token embeddings.
    """

    def __init__(self, vision_dim, llama_dim):
        super().__init__()
        self.proj = nn.Linear(vision_dim, llama_dim)

    def forward(self, x):
        """Project features whose last dimension is ``vision_dim``."""
        return self.proj(x)


# Read both widths from the model configs. This avoids a dummy forward pass
# (the CPU dummy tensor did not match the encoder's device) and avoids reaching
# into `language_model.model.embed_tokens`, which does not exist on the
# PEFT-wrapped model and raised AttributeError.
vision_dim = vision_encoder.config.hidden_size
llama_dim = language_model.config.hidden_size

# The projector is trained directly. Wrapping a freshly initialised layer in
# LoRA would freeze its random base weight and train only a low-rank
# correction on top of it.
vision_proj = VisionProjector(vision_dim, llama_dim).to(language_model.device)

AttributeError: 'LlamaForCausalLM' object has no attribute 'embed_tokens'

In [None]:
class ImageTextDataset(Dataset):
    """Pairs every image listed in a CSV file with its tokenized target sentence.

    The CSV is read with pandas and must contain the columns ``image_path``,
    ``emotion`` and ``feedback``. The target sentence of a row is
    ``you look <emotion>. <feedback>``.

    Args:
        csv_path: Path of the annotation CSV.
        image_processor: Callable invoked as
            ``image_processor(images=..., return_tensors="pt")``; the result is
            indexed with ``"pixel_values"``.
        tokenizer: Callable invoked on the target sentence; the result is
            indexed with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every target sentence is padded/truncated to.
        image_root: Directory the ``image_path`` values are relative to.

    Raises:
        ValueError: If the CSV lacks one of the required columns.
    """

    REQUIRED_COLUMNS = frozenset({"image_path", "emotion", "feedback"})

    def __init__(self, csv_path, image_processor, tokenizer, max_length=128,
                 image_root="/content/Dataset/"):
        self.data = pd.read_csv(csv_path)
        # Fail at construction time with a readable message instead of a
        # KeyError in the middle of the first epoch.
        missing = self.REQUIRED_COLUMNS - set(self.data.columns)
        if missing:
            raise ValueError(
                f"{csv_path} is missing required column(s): {sorted(missing)}"
            )
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_root = image_root

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``pixel_values``, ``input_ids`` and ``attention_mask`` for row ``idx``."""
        row = self.data.iloc[idx]

        # The context manager closes the file handle once the pixels have been
        # converted, so long runs do not accumulate open files.
        image_file = os.path.join(self.image_root, row["image_path"])
        with Image.open(image_file) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = self.image_processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)

        # Construct the full label string here
        full_label_text = f'you look {row["emotion"]}. {row["feedback"]}'

        # Pass the constructed string directly to the tokenizer
        label = self.tokenizer(
            full_label_text, padding='max_length', max_length=self.max_length, truncation=True, return_tensors='pt'
        )
        # squeeze(0) drops the leading dimension added by return_tensors='pt'.
        return {
            "pixel_values": image_tensor,
            "input_ids": label["input_ids"].squeeze(0),
            "attention_mask": label["attention_mask"].squeeze(0),
        }

def custom_data_collator(features):
    """Stack per-sample tensors into a single batch dictionary.

    Args:
        features: Sequence of dicts, each holding the tensors ``pixel_values``,
            ``input_ids`` and ``attention_mask``.

    Returns:
        Dict with the same three keys; each value is the ``torch.stack`` of the
        corresponding per-sample tensors along a new leading dimension.
    """
    batch_keys = ("pixel_values", "input_ids", "attention_mask")
    return {
        key: torch.stack([sample[key] for sample in features])
        for key in batch_keys
    }

# # --------- Dataset Loading ----------
# tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# tokenizer.pad_token = tokenizer.eos_token

# The CSV must provide the columns image_path, emotion and feedback.
dataset = ImageTextDataset(
    "/content/Dataset/image_sentiment_dataset.csv",
    vision_processor,
    tokenizer,
    max_length,
)

# Shuffled mini-batches, assembled by the custom collator defined above.
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    collate_fn=custom_data_collator,
    shuffle=True,
)


In [None]:
# Widths of the two embedding spaces the projector bridges.
vision_dim = vision_encoder.config.hidden_size
text_dim = language_model.config.hidden_size

# One linear layer mapping vision features to the LLM's hidden size, kept in
# float32 on the training device.
projector = nn.Linear(vision_dim, text_dim).to(device).float()


In [None]:
# Hand the optimizer only the tensors that actually receive gradients (the
# LoRA adapters and the projector). The frozen, quantized base weights never
# get a gradient, so they are left out rather than tracked for nothing.
trainable_params = [
    param
    for param in list(language_model.parameters()) + list(projector.parameters())
    if param.requires_grad
]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-5)


In [None]:
# ---- TRAIN LOOP ----
# Each sample is fed to the LLM as [image prefix, text tokens]. The labels are
# built with the same layout so that the loss at the prefix position targets
# the first text token, the loss at text token i targets text token i + 1, and
# padding is ignored.
language_model.train()
# Resolve the embedding layer through the public accessor instead of the
# wrapper-specific `language_model.model.model.embed_tokens` path.
embed_tokens = language_model.get_input_embeddings()

for epoch in range(num_epochs):
    running_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in pbar:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # The vision encoder is frozen, so no graph is needed for it.
        with torch.no_grad():
            vision_outputs = vision_encoder(pixel_values=pixel_values)
            image_embeds = vision_outputs.last_hidden_state.mean(dim=1)

        text_embeds = embed_tokens(input_ids)
        # One prefix position per sample, cast to the dtype of the text embeddings.
        prefix_embeds = projector(image_embeds).unsqueeze(1).to(text_embeds.dtype)
        inputs_embeds = torch.cat([prefix_embeds, text_embeds], dim=1)

        # `n_samples` is a local name on purpose: unpacking into `batch_size`
        # here used to overwrite the notebook-level config value.
        n_samples = input_ids.size(0)
        prefix_attention = torch.ones(
            (n_samples, 1), dtype=attention_mask.dtype, device=device
        )
        combined_attention_mask = torch.cat([prefix_attention, attention_mask], dim=1)

        # -100 is the ignore index of the loss: the prefix position has no
        # target of its own, and padded positions must not be scored.
        text_labels = input_ids.masked_fill(attention_mask == 0, -100)
        prefix_labels = torch.full(
            (n_samples, 1), -100, dtype=text_labels.dtype, device=device
        )
        labels = torch.cat([prefix_labels, text_labels], dim=1)

        # Embeddings, mask and labels all have length max_length + 1; nothing
        # is truncated, so the three stay aligned.
        outputs = language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=combined_attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        pbar.set_postfix(loss=loss.item())

    avg_loss = running_loss / len(train_loader)
    print(f"✅ Epoch {epoch+1} complete | Avg Loss: {avg_loss:.4f}")


Epoch 1/3:   0%|          | 0/5 [00:00<?, ?it/s]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 1/3:  20%|██        | 1/5 [00:06<00:25,  6.27s/it, loss=25.9]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 1/3:  40%|████      | 2/5 [00:12<00:18,  6.19s/it, loss=22.8]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 1/3:  60%|██████    | 3/5 [00:18<00:12,  6.21s/it, loss=24.1]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 1/3:  80%|████████  | 4/5 [00:25<00:06,  6.28s/it, loss=23.4]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 1/3: 100%|██████████| 5/5 [00:31<00:00,  6.31s/it, loss=20.4]


✅ Epoch 1 complete | Avg Loss: 23.2980


Epoch 2/3:   0%|          | 0/5 [00:00<?, ?it/s]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 2/3:  20%|██        | 1/5 [00:06<00:26,  6.66s/it, loss=19.5]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 2/3:  40%|████      | 2/5 [00:13<00:20,  6.74s/it, loss=19.8]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 2/3:  60%|██████    | 3/5 [00:20<00:13,  6.77s/it, loss=20]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 2/3:  80%|████████  | 4/5 [00:26<00:06,  6.72s/it, loss=18.5]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 2/3: 100%|██████████| 5/5 [00:33<00:00,  6.69s/it, loss=17.4]


✅ Epoch 2 complete | Avg Loss: 19.0475


Epoch 3/3:   0%|          | 0/5 [00:00<?, ?it/s]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 3/3:  20%|██        | 1/5 [00:06<00:25,  6.37s/it, loss=17.4]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 3/3:  40%|████      | 2/5 [00:12<00:19,  6.34s/it, loss=13.1]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 3/3:  60%|██████    | 3/5 [00:18<00:12,  6.31s/it, loss=10.9]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 3/3:  80%|████████  | 4/5 [00:25<00:06,  6.28s/it, loss=10.2]

inputs_embeds shape: torch.Size([4, 129, 4096])
labels shape: torch.Size([4, 128])
attention_mask shape: torch.Size([4, 129])


Epoch 3/3: 100%|██████████| 5/5 [00:31<00:00,  6.29s/it, loss=9.66]

✅ Epoch 3 complete | Avg Loss: 12.2557





In [None]:
# import torch
# from transformers import CLIPProcessor, CLIPModel, LlamaTokenizer, LlamaForCausalLM
# from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load models and tokenizer (adjust paths/names accordingly)
# vision_encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").vision_model.eval().to(device)
# tokenizer = LlamaTokenizer.from_pretrained("your-llama-model")
# language_model = language_model.to(device)
# If using LoRA with PEFT:
# from peft import PeftModel
# language_model = PeftModel.from_pretrained(language_model, "path-to-lora-checkpoint").to(device)

# Load processor for images (CLIPProcessor or your image_processor)
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

import torch.nn as nn

# Embedding widths, read from the loaded models. The previous literals had the
# two values swapped (4096 for the vision side, 768 for the LLaMA side).
vision_embed_dim = vision_encoder.config.hidden_size
llama_embed_dim = language_model.config.hidden_size

# Point at the projector trained above. A freshly constructed nn.Linear here
# would carry random weights and produce meaningless prefixes.
image_projection = projector

def generate_text_from_image(image_path, max_new_tokens=50, prompt="Describe the image:"):
    """Generate text for one image using the projected image prefix.

    The image is encoded by the frozen vision encoder, mean-pooled over its
    sequence dimension, projected by ``projector`` and prepended as a single
    embedding position to the embedded ``prompt`` before sampling.

    Args:
        image_path: Path of the image file to describe.
        max_new_tokens: Maximum number of tokens to sample.
        prompt: Text placed after the image prefix.
            NOTE(review): the training cells feed no prompt at all (image
            prefix followed directly by the target sentence); confirm whether
            a prompt is wanted at inference time.

    Returns:
        The decoded generation with special tokens removed.
    """
    # Load and process the image; the context manager closes the file handle.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = vision_processor(images=image, return_tensors="pt").to(device)
    pixel_values = inputs["pixel_values"]

    # Generate in eval mode so the LoRA dropout is inactive, then restore
    # whatever mode the model was in so a later training cell is unaffected.
    was_training = language_model.training
    language_model.eval()
    try:
        with torch.no_grad():
            vision_outputs = vision_encoder(pixel_values=pixel_values)
            # (1, vision hidden size) after pooling over the sequence dimension
            image_embeds = vision_outputs.last_hidden_state.mean(dim=1)

            # Prepare prompt
            inputs_tokenized = tokenizer(prompt, return_tensors="pt").to(device)
            input_ids = inputs_tokenized.input_ids
            attention_mask = inputs_tokenized.attention_mask

            # (1, seq_len, LLM hidden size)
            prompt_embeds = language_model.get_input_embeddings()(input_ids)

            # Project into the LLM embedding space and add a sequence
            # dimension: (1, 1, LLM hidden size), in the dtype of the prompt.
            projected_image_embeds = (
                projector(image_embeds).unsqueeze(1).to(prompt_embeds.dtype)
            )

            # Concatenate along sequence length dim
            inputs_embeds = torch.cat([projected_image_embeds, prompt_embeds], dim=1)

            # Extend the mask by one attended position for the image prefix,
            # keeping the tokenizer's integer dtype.
            prefix_mask = torch.ones(
                (attention_mask.size(0), 1), dtype=attention_mask.dtype, device=device
            )
            extended_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

            # Generate text
            generated_ids = language_model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=extended_attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=0.9,
                temperature=0.8,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        language_model.train(was_training)

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text


# Example usage: run the generator on one image from the dataset folder.
image_path = "/content/Dataset/images/116.jpg"
result = generate_text_from_image(image_path=image_path)
print("Generated text:", result)


Generated text: 
persuaded the persuaded the persun. to make a man. to make a man. to make a man of men. to make a man of men. to make a man of men. to make a man of men. to


In [None]:
def generate_feedback(image_path, max_new_tokens=30):
    """Sample a feedback sentence for one image.

    The image is encoded by the frozen vision encoder, mean-pooled, projected
    by ``projector`` and used as a one-position prefix in front of a single
    BOS token; the language model then continues from there.

    Args:
        image_path: Path of the image file.
        max_new_tokens: Maximum number of tokens to sample.

    Returns:
        The decoded generation with special tokens removed.
    """
    # The context manager closes the file handle once the pixels are converted.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    pixel_values = vision_processor(images=image, return_tensors="pt")["pixel_values"].to(device)

    # Sample in eval mode so the LoRA dropout is inactive, then restore the
    # previous mode so a later training cell is unaffected.
    was_training = language_model.training
    language_model.eval()
    try:
        with torch.no_grad():
            vision_feat = vision_encoder(pixel_values=pixel_values).last_hidden_state.mean(dim=1)

            # Start from exactly one BOS token. Tokenizing the literal "<s>"
            # presumably yields BOS twice, because the tokenizer already
            # prepends BOS itself — TODO confirm `add_bos_token` for this tokenizer.
            bos_ids = torch.tensor([[tokenizer.bos_token_id]], device=device)
            token_embed = language_model.get_input_embeddings()(bos_ids)

            prefix_embed = projector(vision_feat).unsqueeze(1).to(token_embed.dtype)
            input_embed = torch.cat([prefix_embed, token_embed], dim=1)

            # Every position (prefix + BOS) is real input, so attend to all of
            # them explicitly instead of leaving the mask to be inferred.
            attention_mask = torch.ones(
                input_embed.shape[:2], dtype=torch.long, device=device
            )

            output_ids = language_model.generate(
                inputs_embeds=input_embed,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=0.9,
                temperature=1.0,
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        language_model.train(was_training)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# --- Example Usage ---
# Generates feedback for a single image and prints it under a heading.
if __name__ == "__main__":
    test_image = "/content/Dataset/images/116.jpg"  # Replace with your image path
    result = generate_feedback(test_image)
    print("\n🧠 Generated Feedback:", result, sep="\n")


🧠 Generated Feedback:
Gestation 400 is the Latin alphabet, derived from Old Latin a 300 400 30 30


In [None]:
#🧠 Generated Feedback:
#Gestation 400 is the Latin alphabet, derived from Old Latin a 300 400 30 30


In [None]:
# ---- CONFIG ----
# Checkpoints
image_encoder_name = "openai/clip-vit-base-patch32"
llm_name = "meta-llama/Llama-2-7b-chat-hf"

# Data locations
csv_path = "/content/Dataset/image_sentiment_dataset.csv"
image_root = "/content/Dataset/"

# Training hyper-parameters
max_length, batch_size, num_epochs = 128, 4, 3
lr = 2e-5

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---- Load Vision Encoder ----
# Image preprocessing and encoder weights come from the same checkpoint.
vision_processor = CLIPImageProcessor.from_pretrained(image_encoder_name)
vision_encoder = CLIPVisionModel.from_pretrained(image_encoder_name)
vision_encoder = vision_encoder.to(device)
# Frozen feature extractor: eval mode and no gradients on any parameter.
vision_encoder.eval()
vision_encoder.requires_grad_(False)

# ---- Load Tokenizer ----
# NOTE(review): the tokenizer is loaded from `llm_name` (the chat checkpoint)
# while the weights below come from "meta-llama/Llama-2-7b-hf" (the base
# checkpoint). Confirm which checkpoint is intended and load both from it.
tokenizer = AutoTokenizer.from_pretrained(llm_name)
# Reuse EOS as the padding token so fixed-length padding is possible.
tokenizer.pad_token = tokenizer.eos_token

# ---- Load LLaMA Language Model with LoRA + 4-bit ----
# 4-bit loading is requested through BitsAndBytesConfig; passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
language_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    quantization_config=quantization_config,
)
language_model.gradient_checkpointing_enable()
language_model = prepare_model_for_kbit_training(language_model)

# LoRA adapters on the attention query/value projections; the quantized base
# weights stay frozen.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
language_model = get_peft_model(language_model, lora_config)

# ---- Dataset ----
class ImageTextDataset(Dataset):
    """Pairs every image listed in a CSV file with its tokenized target sentence.

    Rows are read with pandas and must provide the columns ``image_path``,
    ``emotion`` and ``feedback``. Image paths are resolved against the
    notebook-level ``image_root``.
    """

    def __init__(self, csv_path, image_processor, tokenizer, max_length=128):
        """Read the CSV and keep the processor, tokenizer and target length."""
        self.data = pd.read_csv(csv_path)
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    def __len__(self):
        """Number of rows in the CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``pixel_values``, ``input_ids`` and ``attention_mask`` for row ``idx``."""
        record = self.data.iloc[idx]

        # Image branch: open, force RGB, run the image processor.
        rgb_image = Image.open(os.path.join(image_root, record['image_path'])).convert("RGB")
        processed = self.image_processor(images=rgb_image, return_tensors="pt")

        # Text branch: build the target sentence and tokenize to a fixed length.
        target_text = f'you look {record["emotion"]}. {record["feedback"]}'
        encoded = self.tokenizer(
            target_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # squeeze(0) drops the leading dimension added by return_tensors="pt".
        sample = {"pixel_values": processed["pixel_values"].squeeze(0)}
        for field in ("input_ids", "attention_mask"):
            sample[field] = encoded[field].squeeze(0)
        return sample

def custom_collate(features):
    """Stack the per-sample tensors of ``features`` into one batch dict.

    Each element of ``features`` must hold ``pixel_values``, ``input_ids`` and
    ``attention_mask`` tensors; the result has the same keys, each stacked
    along a new leading dimension.
    """
    batch = {}
    for key in ("pixel_values", "input_ids", "attention_mask"):
        batch[key] = torch.stack([sample[key] for sample in features])
    return batch

# ---- Load Data ----
# The CSV must provide the columns image_path, emotion and feedback.
dataset = ImageTextDataset(
    csv_path=csv_path,
    image_processor=vision_processor,
    tokenizer=tokenizer,
    max_length=max_length,
)
# Shuffled mini-batches, assembled by the custom collate function.
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    collate_fn=custom_collate,
    shuffle=True,
)

# ---- Projector ----
# Linear map from the vision encoder's hidden size to the LLM's hidden size,
# followed by a tanh.
projector = nn.Sequential(
    nn.Linear(
        in_features=vision_encoder.config.hidden_size,
        out_features=language_model.config.hidden_size,
    ),
    nn.Tanh(),
)
projector = projector.to(device)

# Hand the optimizer only the tensors that actually receive gradients (the
# LoRA adapters and the projector). The frozen, quantized base weights never
# get a gradient, so they are left out rather than tracked for nothing.
trainable_params = [
    param
    for param in list(language_model.parameters()) + list(projector.parameters())
    if param.requires_grad
]
optimizer = torch.optim.AdamW(trainable_params, lr=lr)

# ---- Training Loop ----
# Each sample is fed to the LLM as [image prefix, text tokens]. The labels use
# the same layout, with -100 (the loss's ignore index) at the prefix position
# and at every padded position.
language_model.train()
# Resolve the embedding layer through the public accessor instead of the
# wrapper-specific `language_model.model.model.embed_tokens` path.
embed_tokens = language_model.get_input_embeddings()

for epoch in range(num_epochs):
    running_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in pbar:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # The vision encoder is frozen, so no graph is needed for it.
        with torch.no_grad():
            vision_outputs = vision_encoder(pixel_values=pixel_values)
            image_embeds = vision_outputs.last_hidden_state.mean(dim=1)

        text_embeds = embed_tokens(input_ids)
        # One prefix position per sample, cast to the dtype of the text embeddings.
        prefix_embeds = projector(image_embeds).unsqueeze(1).to(text_embeds.dtype)
        inputs_embeds = torch.cat([prefix_embeds, text_embeds], dim=1)

        # Adjust attention and labels
        n_samples = input_ids.size(0)
        prefix_attention = torch.ones(
            (n_samples, 1), dtype=attention_mask.dtype, device=device
        )
        combined_attention_mask = torch.cat([prefix_attention, attention_mask], dim=1)

        # Padding shares its id with EOS here, so it is masked through the
        # attention mask; otherwise the long runs of padding dominate the loss.
        text_labels = input_ids.masked_fill(attention_mask == 0, -100)
        prefix_labels = torch.full(
            (n_samples, 1), -100, dtype=torch.long, device=device
        )
        labels = torch.cat([prefix_labels, text_labels], dim=1)

        # All three tensors have length max_length + 1. Nothing is truncated,
        # so the final text token is no longer dropped.
        outputs = language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=combined_attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        pbar.set_postfix(loss=loss.item())

    print(f"✅ Epoch {epoch+1} complete | Avg Loss: {running_loss / len(train_loader):.4f}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return fn(*args, **kwargs)
Epoch 1/3: 100%|██████████| 5/5 [00:34<00:00,  6.99s/it, loss=6.86]


✅ Epoch 1 complete | Avg Loss: 12.3480


Epoch 2/3: 100%|██████████| 5/5 [00:31<00:00,  6.30s/it, loss=7.16]


✅ Epoch 2 complete | Avg Loss: 7.5954


Epoch 3/3: 100%|██████████| 5/5 [00:32<00:00,  6.47s/it, loss=5.88]

✅ Epoch 3 complete | Avg Loss: 6.2044





In [None]:
# Persist the two trained pieces separately: the LoRA-adapted LLaMA through
# its own save_pretrained, and the projector as a plain state dict.
language_model.save_pretrained(save_directory="/content/saved_model/vlm_lora_llama2")

projector_state = projector.state_dict()
torch.save(projector_state, "/content/vlm_projector.pt")


In [None]:
from peft import PeftModel

# Load LLaMA base in 4-bit and apply LoRA weights.
# 4-bit loading is requested through BitsAndBytesConfig; passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated.
base_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto"
)
# This must be the directory the adapter was saved to. The save cell writes to
# /content/saved_model/vlm_lora_llama2, not /content/vlm_lora_llama2.
model = PeftModel.from_pretrained(base_model, "/content/saved_model/vlm_lora_llama2").eval()

# Load projector: same architecture as the one that was saved (Linear + Tanh),
# put in eval mode for inference.
projector = nn.Sequential(
    nn.Linear(vision_encoder.config.hidden_size, model.config.hidden_size),
    nn.Tanh()
).to(device).eval()
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict contains.
projector.load_state_dict(
    torch.load("/content/vlm_projector.pt", map_location=device, weights_only=True)
)
