In [None]:
import pandas as pd
# Product catalogue that the captioning loop further down iterates over.
products_csv = "selected_products.csv"
df_products = pd.read_csv(products_csv)

# Show the first three rows as the cell output.
df_products.head(3)

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,category_id,isBestSeller,boughtInLastMonth,id,category_name
0,B08ZDQX51W,Original Replacement Dell 130W Laptop Charger ...,https://m.media-amazon.com/images/I/61sADwl+YW...,https://www.amazon.com/dp/B08ZDQX51W,4.5,0,24.98,65,False,0,65,Laptop Accessories
1,B01BPCTXHC,Griffin Elevator Stand for Laptops - Lift Your...,https://m.media-amazon.com/images/I/710N2S69Nv...,https://www.amazon.com/dp/B01BPCTXHC,4.6,0,35.0,65,False,0,65,Laptop Accessories
2,B0B2RKB1HP,"Rolling Backpack, 17.3 inch Laptop Backpack wi...",https://m.media-amazon.com/images/I/61EWjAcjla...,https://www.amazon.com/dp/B0B2RKB1HP,4.6,0,89.99,65,False,0,65,Laptop Accessories


In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

# One checkpoint id shared by the model weights and the matching processor.
llava_checkpoint = "llava-hf/llava-1.5-13b-hf"

# 4-bit weight loading with float16 compute.
# Possible speed-up that was left disabled here: use_flash_attention_2=True
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Quantized model first, then the processor from the same checkpoint.
model = LlavaForConditionalGeneration.from_pretrained(
    llava_checkpoint,
    quantization_config=quantization_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(llava_checkpoint)

In [None]:
from PIL import Image
import requests
import re

def get_image_caption(image_url):
    """Download an image and ask the loaded LLaVA model what product it shows.

    Args:
        image_url: URL of the image to download and caption.

    Returns:
        The list of strings produced by ``processor.batch_decode`` (each one
        contains the prompt followed by the model's answer), or ``""`` when
        the image could not be downloaded or decoded.
    """
    try:
        # The timeout stops one stalled download from hanging the whole batch;
        # the context manager releases the connection once the image is read.
        with requests.get(image_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            # convert() forces the pixel data to be read while the stream is open.
            image = Image.open(response.raw).convert("RGB")
    except Exception as exc:
        # Best effort: a bad URL or broken image skips this row instead of
        # aborting the run. Unlike a bare ``except``, this still lets
        # KeyboardInterrupt / SystemExit stop the loop.
        print(f"Could not load image {image_url}: {exc}")
        return ""

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "What product is shown in this image? Describe the product only"},
            ],
        },
    ]
    # Prompt variants tried earlier:
    #   "Use 3rd person perspective to describe it."
    #   "Describe the image as a product."
    inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, torch.float16)

    # Generate, then decode the whole sequence (prompt + answer) back to text.
    generate_ids = model.generate(**inputs, max_new_tokens=300)
    result = processor.batch_decode(generate_ids, skip_special_tokens=True)
    return result

def process_text(text):
    """Pull the caption out of the stringified model output.

    The expected input is ``str()`` of the list returned by
    ``get_image_caption``, i.e. something shaped like
    ``"['USER: ... ASSISTANT: The image shows <caption>']"``.

    Args:
        text: Stringified model output.

    Returns:
        The caption with its first character upper-cased, or ``""`` when the
        ``"ASSISTANT: The image shows "`` lead-in (followed by a closing
        ``]``) is not found or the caption is empty.
    """
    # Greedy + DOTALL: capture up to the *last* "]" (the list's closing
    # bracket), so captions that contain "]" or newlines are not cut short.
    match = re.search(r"ASSISTANT: The image shows (.*)\]", text, flags=re.DOTALL)
    if not match:
        return ""

    extracted_text = match.group(1).strip()

    # str(list) wraps the element in quotes; drop only that closing quote
    # rather than blindly discarding the final character.
    if extracted_text[-1:] in ("'", '"'):
        extracted_text = extracted_text[:-1]

    # Nothing left to capitalize (e.g. the model produced an empty answer).
    if not extracted_text:
        return ""

    return extracted_text[0].upper() + extracted_text[1:]

In [None]:
import csv

# Row position to start from. Raise it to resume an interrupted run without
# re-captioning rows that are already in captions.csv.
start_index = 0

# Append mode keeps rows written by earlier runs. The file is opened once for
# the whole loop (explicit UTF-8 so captions are encoded the same on every
# platform) and flushed after each row so progress survives a crash.
with open("captions.csv", mode="a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)

    for index, row in enumerate(df_products.itertuples()):
        if index < start_index:
            continue

        print(f"Index No: {index}\nID: {row.asin}\n Image URL: {row.imgUrl}")

        # get_image_caption returns a list on success and "" on failure;
        # process_text expects the stringified form either way.
        caption = get_image_caption(row.imgUrl)
        caption = process_text(str(caption))

        print(f"Generated Caption: {caption}\n\n")

        writer.writerow([row.asin, row.imgUrl, caption])
        f.flush()

In [None]:
import re

# Sample model output, written in the same str(list) form the captioning loop
# passes to process_text.
text = "['USER:  \nWhat product is shown in this image? Describe the image as a product. ASSISTANT: The image shows a black power bank, which is a portable charger, sitting on a white background. The power bank is connected to a USB cable, which is plugged into a device. This product is designed to provide power to electronic devices when they are not connected to a traditional power source.']"

# The caption runs from the fixed lead-in up to the first closing bracket.
caption_pattern = re.compile(r"ASSISTANT: The image shows (.*?)]")
match = caption_pattern.search(text)

if match is None:
    print("ERROR: No match found")
else:
    extracted_text = match.group(1).strip()
    # Upper-case the first character; the [1:-1] slice also drops the final
    # character, which for this sample is the list element's closing quote.
    first_char, remainder = extracted_text[0], extracted_text[1:-1]
    formatted_text = first_char.upper() + remainder
    print(formatted_text)


A black power bank, which is a portable charger, sitting on a white background. The power bank is connected to a USB cable, which is plugged into a device. This product is designed to provide power to electronic devices when they are not connected to a traditional power source.


In [None]:
# Compare one new product description against four reference descriptions
# using text embeddings and cosine similarity.
#
# NOTE(review): this cell calls `cosine_similarity`, which is only imported in
# a later cell (`from torch.nn.functional import cosine_similarity`), and it
# calls `model.get_text_features`, so `model` / `processor` are presumably the
# CLIP objects loaded in that later cell rather than the LLaVA ones loaded
# above. As ordered, it will not run top-to-bottom on a fresh kernel --
# confirm and move it below the CLIP loading cell.

# Reference descriptions to compare against
texts = [
    "A sleek and modern smartphone with 128GB storage and a powerful camera.",
    "Wireless over-ear headphones with noise cancellation and 30-hour battery life.",
    "Ergonomic office chair with lumbar support and adjustable height.",
    "Gaming laptop with RTX 4060 GPU, 16GB RAM, and 1TB SSD storage."
]

# Tokenize and get embeddings for the reference texts
inputs = processor(text=texts, return_tensors="pt", padding=True)
with torch.no_grad():
    input_embeddings = model.get_text_features(**inputs)

# **New input text to compare**
new_text = ["A modern and sleek laptop with 128GB storage and a camera."]

# Tokenize and get embedding for the new text
new_input = processor(text=new_text, return_tensors="pt", padding=True)
with torch.no_grad():
    new_embedding = model.get_text_features(**new_input)

# Compute cosine similarity between the new text and the reference texts
# (the printed result below has one value per reference text)
similarities = cosine_similarity(new_embedding, input_embeddings)

# Convert to numpy for better readability
similarities = similarities.numpy()

print(similarities)


[0.8598733  0.78638345 0.7114111  0.8778672 ]


In [8]:
# Inspect the shape of the embedding computed for the single new text.
print(new_embedding.shape)

torch.Size([1, 512])


In [None]:
[0.8598733  0.78638345 0.7114111  0.8778672 ]
torch.Size([1, 512])


In [11]:
from transformers import CLIPProcessor, CLIPModel
import torch
from torch.nn.functional import cosine_similarity
from PIL import Image
import requests

# Load the CLIP model and its processor from the same checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [14]:
# Compare (a) a new text against four reference product titles and (b) four
# product images against those same titles, using CLIP embeddings.

# **Reference text descriptions**
texts = [
    "Clover Chibi with Jumbo darning Needle Set, 6.2'' Height x 2.1'' Length x 0.8'' Width, Multicolor",
    "Clear Plastic Ornaments, Fillable for DIY Arts and Crafts (6.3 Inch, 6 Pack)",
    "Design Works Crafts Friendship, 5 x 7 Counted Cross Stitch Kit White",
    "DMC Stranded Cotton Number 3712"
]

# Tokenize and get text embeddings
inputs = processor(text=texts, return_tensors="pt", padding=True)
with torch.no_grad():
    text_embeddings = model.get_text_features(**inputs)

# **New input text to compare**
new_text = ["A modern and sleek laptop with 128GB storage and a camera."]
new_input = processor(text=new_text, return_tensors="pt", padding=True)
with torch.no_grad():
    new_text_embedding = model.get_text_features(**new_input)

# **Load multiple images**
# NOTE(review): presumably image i belongs to the product described by
# texts[i] -- confirm, since the image/text comparison below pairs them by position.
image_urls = [
    "https://m.media-amazon.com/images/I/61cbPKvKXdL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/61y3xtQVfYL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/91BmFm22ZoL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/51nJzzYy8ZL._AC_UL320_.jpg"]

# Process multiple images, one request and one forward pass per URL
image_embeddings = []
for url in image_urls:
    image = Image.open(requests.get(url, stream=True).raw)  # Load image from URL
    image_inputs = processor(images=image, return_tensors="pt")  # Preprocess image
    with torch.no_grad():
        img_embedding = model.get_image_features(**image_inputs)  # Get image features
    image_embeddings.append(img_embedding)

# Stack all image embeddings into a tensor
image_embeddings = torch.vstack(image_embeddings)  # Shape: (num_images, embedding_dim)

# **Compute similarities**
# New text vs. each reference text: the printed result has one value per reference text.
text_similarities = cosine_similarity(new_text_embedding, text_embeddings).numpy()
# Row-wise comparison: image i vs. text i. The printed result has one value per
# image -- it is NOT a (num_images, num_texts) matrix.
image_similarities = cosine_similarity(image_embeddings, text_embeddings).numpy()  # One value per (image i, text i) pair

# **Print results**
print(text_similarities)

print(image_similarities)


[0.4037253  0.41358453 0.24517404 0.6188889 ]
[0.27472916 0.31950188 0.30867618 0.237894  ]


In [8]:
from transformers import CLIPProcessor, CLIPModel
import torch
from torch.nn.functional import cosine_similarity
from PIL import Image
import requests

# Load the CLIP model onto the GPU, plus its processor from the same checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint).to("cuda")
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [10]:
def _embed_image_from_url(url):
    """Download the image at ``url`` and return the model's image features for it."""
    image = Image.open(requests.get(url, stream=True).raw)
    image_inputs = processor(images=image, return_tensors="pt").to("cuda")
    with torch.no_grad():
        return model.get_image_features(**image_inputs)


# Reference images (the dataset to search in)
reference_image_urls = [
    "https://m.media-amazon.com/images/I/91IlO3j4kPL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/71pjRDo52OL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/71OueK6amcL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/51UzaUUSLQL._AC_UL320_.jpg",
    "https://m.media-amazon.com/images/I/81SMwAAGp6L._AC_UL320_.jpg"
]

# Encode every reference image and stack the results, one row per image.
reference_embeddings = torch.vstack(
    [_embed_image_from_url(reference_url) for reference_url in reference_image_urls]
)

# Query image (the one to search for) -- change this to your own query image.
query_image_url = "https://m.media-amazon.com/images/I/919miJcpi1L.jpg"
query_embedding = _embed_image_from_url(query_image_url)

# Cosine similarity of the query against each reference image, moved to the
# CPU as a NumPy array for printing.
image_similarities = cosine_similarity(query_embedding, reference_embeddings).cpu().numpy()

print(image_similarities)


[0.65080106 0.66882294 0.7217649  0.70273376 0.68470734]


In [3]:
import torch
# FULL MODEL
# GPU memory snapshot for device 0, reported in MB (bytes / 1e6).
bytes_per_mb = 1e6

if not torch.cuda.is_available():
    print("No GPU detected.")
else:
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_mb
    allocated_memory = torch.cuda.memory_allocated() / bytes_per_mb
    reserved_memory = torch.cuda.memory_reserved() / bytes_per_mb
    # Estimate only: total minus reserved.
    free_memory = total_memory - reserved_memory

    print(f"Total GPU Memory: {total_memory:.2f} MB")
    print(f"Allocated GPU Memory: {allocated_memory:.2f} MB")
    print(f"Reserved GPU Memory: {reserved_memory:.2f} MB")
    print(f"Free GPU Memory: {free_memory:.2f} MB")


Total GPU Memory: 16883.91 MB
Allocated GPU Memory: 14397.07 MB
Reserved GPU Memory: 15292.43 MB
Free GPU Memory: 1591.48 MB


In [29]:
import torch

# 4 BIT QUANTIZATION
# Same GPU memory snapshot as for the full model, for comparison (values in MB).
if not torch.cuda.is_available():
    print("No GPU detected.")
else:
    to_mb = 1e6  # bytes per MB
    total_memory = torch.cuda.get_device_properties(0).total_memory / to_mb
    allocated_memory = torch.cuda.memory_allocated() / to_mb
    reserved_memory = torch.cuda.memory_reserved() / to_mb
    free_memory = total_memory - reserved_memory  # estimate: total minus reserved

    report_lines = [
        f"Total GPU Memory: {total_memory:.2f} MB",
        f"Allocated GPU Memory: {allocated_memory:.2f} MB",
        f"Reserved GPU Memory: {reserved_memory:.2f} MB",
        f"Free GPU Memory: {free_memory:.2f} MB",
    ]
    print("\n".join(report_lines))


Total GPU Memory: 16883.91 MB
Allocated GPU Memory: 8032.87 MB
Reserved GPU Memory: 9193.91 MB
Free GPU Memory: 7689.99 MB


In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image

# Load the model in half precision, with the processor from the same checkpoint.
llava_checkpoint = "llava-hf/llava-1.5-13b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    llava_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(llava_checkpoint)

In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

# One checkpoint id shared by the model weights and the matching processor.
llava_checkpoint = "llava-hf/llava-1.5-13b-hf"

# 4-bit weight loading with float16 compute.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Quantized model first, then the processor from the same checkpoint.
model = LlavaForConditionalGeneration.from_pretrained(
    llava_checkpoint,
    quantization_config=quantization_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(llava_checkpoint)

In [None]:
# Directory that receives both the model and the processor files.
save_path = "/workspace/llava-1.5-13b-4bit"  # Best location for large models

# Model first, then processor, both written to the same directory.
for artifact in (model, processor):
    artifact.save_pretrained(save_path)

print(f"Model saved to {save_path}")

In [None]:
# Reload the model and processor from the directory written by the save cell.
quantized_checkpoint_dir = "/workspace/llava-1.5-13b-4bit"

model = LlavaForConditionalGeneration.from_pretrained(
    quantized_checkpoint_dir, quantization_config=quantization_config, device_map="auto"
)
processor = AutoProcessor.from_pretrained(quantized_checkpoint_dir)

print("Quantized model loaded successfully!")

In [28]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image

# Load a local image and force RGB mode.
image_path = "B0BB9CXN7J.jpg"  # Replace with your local image path
image = Image.open(image_path).convert("RGB")

# Single-turn user message: the image followed by the question.
# (Prompt variant tried earlier: "Use 3rd person perspective to describe it.")
user_content = [
    {"type": "image", "image": image},
    {"type": "text", "text": "What product is shown in this image?"},
]
conversation = [{"role": "user", "content": user_content}]

# Build the model inputs from the chat template, then move them to the
# model's device.
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device, torch.float16)

# Generate and show the decoded text (prompt + answer) as the cell output.
generate_ids = model.generate(**inputs, max_new_tokens=200)
processor.batch_decode(generate_ids, skip_special_tokens=True)

['USER:  \nWhat product is shown in this image? ASSISTANT: The image shows a collection of various animal stickers, which are likely to be used for decoration or as a part of a scrapbooking project.']