<a href="https://colab.research.google.com/github/Pradeep2535/Colab-Notebooks-for-GenAI/blob/main/Multimodal_RAG_for_E_Commerce_Products.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output
clear_output()

In [None]:
!pip install tranformers langchain langchain-community pinecone langchain-pinecone pymongo
clear_output()

# Dataset Preprocessing

In [None]:
dataset_path = '/content/drive/MyDrive/Colab Notebooks/E-Commerce Search/amazon.csv'

In [None]:
import pandas as pd

In [None]:
# Load the product catalogue from the CSV at `dataset_path`.
dataset_csv = pd.read_csv(dataset_path)
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrap looks
# redundant; it is kept because later cells mutate `dataset_df` in place and
# `dataset_csv` stays a separate object.
dataset_df = pd.DataFrame(dataset_csv)
# Displayed as the cell output: (rows, columns).
dataset_df.shape

In [None]:
dataset_df.isnull().sum()

In [None]:
dataset_df.head()

In [None]:
dataset_df.columns

In [None]:
# Columns that the rest of the notebook never reads (IDs, user/review
# bookkeeping, category); removing them keeps the frame small.
unused_columns = [
    'user_id',
    'category',
    'product_id',
    'rating_count',
    'user_name',
    'review_id',
    'review_title',
]
dataset_df.drop(columns=unused_columns, inplace=True)
print(dataset_df.shape)
dataset_df.head()

In [None]:
!pip install pymongo
clear_output()

# MongoDB Setup

In [None]:
from pymongo import MongoClient
from pymongo.server_api import ServerApi

# Read the connection string from Colab's secret store instead of embedding a
# username/password in the notebook. Add a secret named MONGODB_URI (the full
# "mongodb+srv://<user>:<password>@<host>/..." string) in the Colab Secrets
# panel. Any credential that was ever committed in a notebook should be rotated.
from google.colab import userdata

uri = userdata.get("MONGODB_URI")

client = MongoClient(uri, server_api=ServerApi('1'))

# Ping once so a bad URI or an unreachable cluster is reported here rather
# than in the middle of ingestion. Failure is printed, not raised, so the
# notebook keeps running.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
db = client['ragimagesearch']

In [None]:
collection = db['image_links']

In [None]:
dataset_df['img_link'][99]

In [None]:
import requests
valid_rows = []

# Keep only the rows whose image URL can actually be fetched, so the
# embedding step later does not try to decode an error page as an image.
for row_label, row in dataset_df.iterrows():
    link = row["img_link"]
    try:
        # A timeout stops one unresponsive host from stalling the whole scan.
        response = requests.get(link, timeout=10)
    except requests.exceptions.RequestException as e:
        # Network failures and malformed URLs are reported and the row dropped.
        print(f"Error with link {link}: {e}")
        continue

    # response.ok is False for every 4xx/5xx status, not just 400, so
    # missing (404) or forbidden (403) images are filtered out as well.
    if response.ok:
        valid_rows.append(row)


In [None]:
data = pd.DataFrame(valid_rows)


In [None]:
data.reset_index(inplace=True)

In [None]:
data.head()

In [None]:

data.drop(columns=['index'],inplace=True)
data.head()

In [None]:
data['img_link'][99]

In [None]:
data.shape

# CLIP Model - embeddings

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import requests
import io

# The model weights and the matching preprocessing pipeline are loaded from
# the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

def generate_clip_embeddings(image_path=None):
    """
    Generate an L2-normalised CLIP embedding for the image at a URL.

    Only images are supported; there is no text branch.

    Args:
        image_path: URL of the image to download and embed.

    Returns:
        A NumPy array with the normalised image features. Callers in this
        notebook take element ``[0]`` to get the vector for the single image.

    Raises:
        ValueError: if ``image_path`` is missing or empty.
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with an HTTP error status.
    """
    if not image_path:
        # Without this guard the processor would be called with no inputs.
        raise ValueError("image_path is required to generate a CLIP embedding")

    response = requests.get(image_path, timeout=30)
    # Fail on 4xx/5xx here instead of handing an error page to the decoder.
    response.raise_for_status()
    image = Image.open(io.BytesIO(response.content)).convert("RGB")

    processed_inputs = clip_processor(images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        image_embeddings = clip_model.get_image_features(**processed_inputs)
        # Scale to unit length along the feature dimension.
        image_embeddings = image_embeddings / image_embeddings.norm(p=2, dim=-1, keepdim=True)
    return image_embeddings.numpy()



In [None]:
from google.colab import userdata

PINECONE_API_KEY = userdata.get("PINECONE_API_KEY")

In [None]:
!pip install pinecone
clear_output()

# Pinecone Setup

In [None]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("ragimagesearch")

# Ingestion

In [None]:
# For every product: embed its image, store the descriptive metadata in
# MongoDB, and store the vector in Pinecone under the MongoDB document id.
for _, row in data.iterrows():
    # Embed first: if the image cannot be downloaded or decoded, the loop
    # stops before anything is written, so MongoDB never holds a document
    # that has no matching vector.
    image_embedding = generate_clip_embeddings(image_path=row['img_link'])[0]
    vector = [float(x) for x in image_embedding]

    meta_data = {
        'name': row['product_name'],
        'description': row['about_product'],
        'link': row['product_link'],
    }

    # MongoDB insertion
    inserted = collection.insert_one(meta_data)
    document_id = inserted.inserted_id  # avoids shadowing the builtin `id`

    # Pinecone insertion: the vector id is the stringified MongoDB _id, which
    # is how a search hit is mapped back to its document.
    index.upsert([(str(document_id), vector)])



In [None]:
data.columns

In [None]:
len(data['product_link'].unique())

In [None]:
# Second pass: attach price, rating, review and image-link details to the
# MongoDB documents, matching them by product link.
for _, row in data.iterrows():
    product_link = row['product_link']

    additional_meta_data = {
        'discounted_price': row['discounted_price'],
        'actual_price': row['actual_price'],
        'discounted_percentage': row['discount_percentage'],
        'rating': row['rating'],
        'review': row['review_content'],
        'img_link': row['img_link'],
    }

    # Filter on the link directly (one round trip instead of find + update).
    # update_many is used because product links are not guaranteed to be
    # unique: with a find_one/update_one pair only the first matching document
    # would ever receive metadata, leaving the others without it.
    result = collection.update_many(
        {'link': product_link},
        {'$set': {'additional_meta_data': additional_meta_data}},
    )

    # Report, rather than crash on, a row with no ingested document.
    if result.matched_count == 0:
        print(f"No document found for product link {product_link}")



In [None]:
data['img_link'][99]

In [None]:
# Embed a sample product image and convert it to a plain list of Python
# floats, the same form used for the vectors stored during ingestion.
query_image_path = "https://m.media-amazon.com/images/I/41LwSJdthGL._SX300_SY300_QL70_FMwebp_.jpg"
query_embedding = generate_clip_embeddings(image_path=query_image_path)[0]
query_vector = list(map(float, query_embedding))

# Retrieval

In [None]:
from bson import ObjectId

In [None]:
def results(query_vector, top_k=3):
    """
    Return the ids of the Pinecone vectors closest to ``query_vector``.

    Args:
        query_vector: Embedding to search with, as a list of floats.
        top_k: Maximum number of matches to request; defaults to 3.

    Returns:
        A list of match ids as strings, in the order Pinecone returned them.
        In this notebook the ids are the stringified MongoDB ``_id`` values
        written during ingestion.
    """
    response = index.query(vector=query_vector, top_k=top_k)
    return [str(match.id) for match in response.matches]
