In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer
from PIL import Image
import os
import faiss
import numpy as np

def load_model():
    """Load the three pretrained components used by this notebook.

    Returns:
        A ``(processor, model, embedder)`` tuple: the BLIP processor and the
        BLIP conditional-generation model, both loaded from
        ``Salesforce/blip-image-captioning-base``, and a
        ``SentenceTransformer`` loaded from ``all-MiniLM-L6-v2``.
    """
    blip_checkpoint = "Salesforce/blip-image-captioning-base"
    return (
        BlipProcessor.from_pretrained(blip_checkpoint),
        BlipForConditionalGeneration.from_pretrained(blip_checkpoint),
        SentenceTransformer("all-MiniLM-L6-v2"),
    )

def generate_caption(image_path, processor, model):
    """Generate a text caption for the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to open with PIL.
        processor: Object that is called on the RGB image (with
            ``return_tensors="pt"``) to build the model inputs and that
            provides ``decode`` for the generated ids.
        model: Object whose ``generate`` method is called with those inputs.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Open inside a context manager so the file handle is released
    # deterministically, even if conversion raises. ``convert`` returns a new
    # image, so the result stays usable after the source image is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = processor(image, return_tensors="pt")
    with torch.no_grad():  # inference only: no gradient tracking needed
        caption_ids = model.generate(**inputs)
    return processor.decode(caption_ids[0], skip_special_tokens=True)

def get_text_embedding(text, embedder):
    """Encode ``text`` with ``embedder`` and return the result as a NumPy array.

    Args:
        text: Input passed straight to ``embedder.encode``.
        embedder: Object whose ``encode`` accepts ``convert_to_tensor=True``.

    Returns:
        The encoded tensor moved to the CPU and converted with ``.numpy()``.
    """
    encoded = embedder.encode(text, convert_to_tensor=True)
    cpu_tensor = encoded.cpu()
    return cpu_tensor.numpy()

def store_in_vector_db(embeddings, image_names):
    """Persist embeddings and their image names to disk.

    A ``faiss.IndexFlatL2`` sized to ``embeddings.shape[1]`` is filled with
    ``embeddings`` and written to ``vector_db.index``. The names are written
    to ``image_names.txt``, one per line.

    Args:
        embeddings: Array of vectors; its second dimension sets the index
            dimension.
        image_names: Iterable of strings, written in iteration order.
    """
    vector_index = faiss.IndexFlatL2(embeddings.shape[1])
    vector_index.add(embeddings)
    faiss.write_index(vector_index, "vector_db.index")

    with open("image_names.txt", "w") as names_file:
        names_file.writelines(name + "\n" for name in image_names)

def load_vector_db():
    """Read the saved FAISS index and image names back from disk.

    Returns:
        ``(index, image_names)`` where ``index`` is read from
        ``vector_db.index`` and ``image_names`` holds every line of
        ``image_names.txt`` with surrounding whitespace stripped.
    """
    vector_index = faiss.read_index("vector_db.index")

    image_names = []
    with open("image_names.txt", "r") as names_file:
        for raw_line in names_file:
            image_names.append(raw_line.strip())
    return vector_index, image_names

def process_images(image_folder):
    """Caption, embed and index every image file in ``image_folder``.

    Each ``.png``/``.jpg``/``.jpeg`` file (case-insensitive) is captioned with
    ``generate_caption`` and the caption is embedded with
    ``get_text_embedding``. The stacked embeddings and the file names are then
    handed to ``store_in_vector_db``.

    Args:
        image_folder: Directory to scan (not recursive).

    Returns:
        Dict mapping each processed file name to
        ``{"caption": ..., "embedding": ...}``.

    Raises:
        ValueError: If the folder contains no matching image files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the index
    # layout reproducible between runs. Directories that merely look like
    # images by name are skipped.
    image_files = sorted(
        name
        for name in os.listdir(image_folder)
        if name.lower().endswith((".png", ".jpg", ".jpeg"))
        and os.path.isfile(os.path.join(image_folder, name))
    )
    # Fail fast with a clear message, before the models are loaded, instead
    # of letting np.vstack fail on an empty list later.
    if not image_files:
        raise ValueError(f"No .png/.jpg/.jpeg images found in {image_folder!r}")

    processor, model, embedder = load_model()
    results = {}
    embeddings_list = []

    for filename in image_files:
        image_path = os.path.join(image_folder, filename)
        caption = generate_caption(image_path, processor, model)
        embedding = get_text_embedding(caption, embedder)
        results[filename] = {"caption": caption, "embedding": embedding}
        embeddings_list.append(embedding)

    # One row per image, in the same order as image_files.
    store_in_vector_db(np.vstack(embeddings_list), image_files)
    return results

def rank_images(user_prompt, embedder=None):
    """Print the indexed images ranked by closeness to ``user_prompt``.

    Args:
        user_prompt: Text query to embed and search with.
        embedder: Optional already-loaded sentence embedder. When omitted, one
            is obtained from ``load_model()``, which also loads the BLIP
            captioning objects that this function does not use.

    The printed score is the distance returned by the index search; lower
    means a closer match.
    """
    # Read the index first so a missing or empty index is reported before any
    # model is loaded.
    index, image_names = load_vector_db()

    # Never request more neighbours than the index holds or than there are
    # names for: FAISS pads missing results with id -1, which would silently
    # select the last name.
    k = min(len(image_names), index.ntotal)
    if k == 0:
        print("No images have been indexed yet.")
        return

    if embedder is None:
        _, _, embedder = load_model()  # only the embedder is needed here

    query_embedding = get_text_embedding(user_prompt, embedder)
    distances, indices = index.search(np.array([query_embedding]), k=k)
    ranked_results = [
        (image_names[i], distance)
        for i, distance in zip(indices[0], distances[0])
    ]
    ranked_results.sort(key=lambda pair: pair[1])  # lower distance first

    print("Ranking of images based on user prompt:")
    for rank, (image, score) in enumerate(ranked_results, 1):
        print(f"{rank}. {image} (Score: {score:.4f})")

# Example usage: caption, embed and index every image in the folder.
# process_images also writes vector_db.index and image_names.txt (via
# store_in_vector_db), which rank_images reads back below.
image_folder = "/content/sample_data"  # Change this to your image folder path
results = process_images(image_folder)

# Example query: print the indexed images ranked against a free-text prompt.
user_prompt = "A big horse is running in the greenland"
rank_images(user_prompt)

Ranking of images based on user prompt:
1. Horse.jpg (Score: 0.8127)
2. Peacock.jpg (Score: 1.5186)


# Main version: the same caption-based image ranking pipeline, plus a `query_groq_api` function

In [4]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer
from PIL import Image
import os
import faiss
import numpy as np
import requests
import json

def load_model():
    """Instantiate the captioning and embedding models.

    Returns:
        ``(processor, model, embedder)``: the BLIP processor and BLIP
        conditional-generation model from
        ``Salesforce/blip-image-captioning-base``, followed by a
        ``SentenceTransformer`` built from ``all-MiniLM-L6-v2``.
    """
    captioning_checkpoint = "Salesforce/blip-image-captioning-base"
    blip_processor = BlipProcessor.from_pretrained(captioning_checkpoint)
    blip_model = BlipForConditionalGeneration.from_pretrained(captioning_checkpoint)
    sentence_embedder = SentenceTransformer("all-MiniLM-L6-v2")
    return blip_processor, blip_model, sentence_embedder

def generate_caption(image_path, processor, model):
    """Produce a caption for the image file at ``image_path``.

    Args:
        image_path: Path of the image file to open with PIL.
        processor: Object that is called on the RGB image (with
            ``return_tensors="pt"``) to build the model inputs and that
            provides ``decode`` for the generated ids.
        model: Object whose ``generate`` method is called with those inputs.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # The context manager closes the underlying file deterministically, even
    # on error. ``convert`` returns a new image, so it remains valid after the
    # source image has been closed.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    inputs = processor(rgb_image, return_tensors="pt")
    with torch.no_grad():  # inference only: no gradient tracking needed
        caption_ids = model.generate(**inputs)
    return processor.decode(caption_ids[0], skip_special_tokens=True)

def get_text_embedding(text, embedder):
    """Return the embedding of ``text`` as a NumPy array.

    Args:
        text: Input passed straight to ``embedder.encode``.
        embedder: Object whose ``encode`` accepts ``convert_to_tensor=True``.

    Returns:
        The tensor produced by ``encode``, moved to the CPU and converted
        with ``.numpy()``.
    """
    return embedder.encode(text, convert_to_tensor=True).cpu().numpy()

def store_in_vector_db(embeddings, image_names):
    """Write the embeddings and matching image names to disk.

    The embeddings go into a fresh ``faiss.IndexFlatL2`` (dimension taken from
    ``embeddings.shape[1]``) that is saved as ``vector_db.index``. The names
    are saved to ``image_names.txt``, one per line.

    Args:
        embeddings: Array of vectors; its second dimension sets the index
            dimension.
        image_names: Iterable of strings, written in iteration order.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    faiss.write_index(flat_index, "vector_db.index")

    with open("image_names.txt", "w") as out_file:
        out_file.writelines(f"{name}\n" for name in image_names)

def load_vector_db():
    """Load the persisted FAISS index together with its image names.

    Returns:
        ``(index, image_names)`` where ``index`` is read from
        ``vector_db.index`` and ``image_names`` holds every line of
        ``image_names.txt`` with surrounding whitespace stripped.
    """
    stored_index = faiss.read_index("vector_db.index")

    with open("image_names.txt", "r") as names_file:
        stored_names = list(map(str.strip, names_file))
    return stored_index, stored_names

def query_groq_api(user_prompt, image_captions, model="llama-3.3-70b-versatile", timeout=30):
    """Ask a Groq-hosted chat model to rank image captions against a prompt.

    The API key is read from the ``GROQ_API_KEY`` environment variable and is
    never stored in the notebook. SECURITY: the key that used to be
    hard-coded here has been exposed and should be revoked.

    Args:
        user_prompt: The user's text prompt.
        image_captions: JSON-serialisable captions; sent via ``json.dumps``.
        model: Groq model id to use. NOTE(review): confirm the default against
            Groq's current model list; the previous value ``"gpt-4"`` is an
            OpenAI model name.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the response.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is not set.
        requests.HTTPError: If the API answers with an error status.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY environment variable is not set; "
            "export it instead of hard-coding the key in the notebook."
        )

    # Groq exposes its OpenAI-compatible API under the /openai/v1 prefix.
    api_url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    messages = [
        {
            "role": "system",
            "content": "Rank the following image captions based on how well they match the user prompt.",
        },
        {
            "role": "user",
            "content": f"User Prompt: {user_prompt}\nCaptions: {json.dumps(image_captions)}",
        },
    ]
    payload = {"model": model, "messages": messages, "temperature": 0.5}

    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP errors explicitly rather than failing later with a
    # KeyError on a JSON error body that has no "choices".
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

def process_images(image_folder):
    """Caption, embed and index every image file in ``image_folder``.

    Each ``.png``/``.jpg``/``.jpeg`` file (case-insensitive) is captioned with
    ``generate_caption`` and the caption is embedded with
    ``get_text_embedding``. The stacked embeddings and the file names are then
    handed to ``store_in_vector_db``.

    Args:
        image_folder: Directory to scan (not recursive).

    Returns:
        Dict mapping each processed file name to
        ``{"caption": ..., "embedding": ...}``.

    Raises:
        ValueError: If the folder contains no matching image files.
    """
    extensions = (".png", ".jpg", ".jpeg")
    # Sorted for a reproducible index order (os.listdir order is arbitrary);
    # directories whose names merely end in an image extension are skipped.
    image_files = sorted(
        entry
        for entry in os.listdir(image_folder)
        if entry.lower().endswith(extensions)
        and os.path.isfile(os.path.join(image_folder, entry))
    )
    # Report an empty folder clearly and before the models are loaded, rather
    # than failing inside np.vstack afterwards.
    if not image_files:
        raise ValueError(f"No .png/.jpg/.jpeg images found in {image_folder!r}")

    processor, model, embedder = load_model()
    results = {}
    embeddings_list = []

    for filename in image_files:
        caption = generate_caption(os.path.join(image_folder, filename), processor, model)
        embedding = get_text_embedding(caption, embedder)
        results[filename] = {"caption": caption, "embedding": embedding}
        embeddings_list.append(embedding)

    # One row per image, in the same order as image_files.
    store_in_vector_db(np.vstack(embeddings_list), image_files)
    return results

def rank_images(user_prompt, embedder=None):
    """Print the indexed images ranked by closeness to ``user_prompt``.

    Args:
        user_prompt: Text query to embed and search with.
        embedder: Optional already-loaded sentence embedder. When omitted, one
            is obtained from ``load_model()``, which also loads the BLIP
            captioning objects that this function does not use.

    The printed score is the distance returned by the index search; lower
    means a closer match.
    """
    # Read the index first so a missing or empty index is reported before any
    # model is loaded.
    index, image_names = load_vector_db()

    # Clamp k to what both the index and the names file can supply: FAISS
    # pads missing results with id -1, which would silently select the last
    # name.
    k = min(len(image_names), index.ntotal)
    if k == 0:
        print("No images have been indexed yet.")
        return

    if embedder is None:
        _, _, embedder = load_model()  # only the embedder is needed here

    query_embedding = get_text_embedding(user_prompt, embedder)
    distances, indices = index.search(np.array([query_embedding]), k=k)
    ranked_results = sorted(
        ((image_names[i], distance) for i, distance in zip(indices[0], distances[0])),
        key=lambda pair: pair[1],  # lower distance first
    )

    print("Ranking of images based on user prompt:")
    for rank, (image, score) in enumerate(ranked_results, 1):
        print(f"{rank}. {image} (Score: {score:.4f})")

# Example usage: caption, embed and index every image in the folder.
# process_images also writes vector_db.index and image_names.txt (via
# store_in_vector_db), which rank_images reads back below.
image_folder = "/content/sample_data"  # Change this to your image folder path
results = process_images(image_folder)

# Example query: print the indexed images ranked against a free-text prompt.
user_prompt = "A yellow snake"
rank_images(user_prompt)

Ranking of images based on user prompt:
1. image_snake_yellow.jpg (Score: 0.2222)
2. image_snkae_green.jpg (Score: 0.6762)
3. Peacock.jpg (Score: 1.4351)
4. Horse.jpg (Score: 1.6648)
5. image_bd.jpg (Score: 1.7384)
