In [None]:
print("Jai Bajrang Bali")

In [None]:
import pandas as pd
from pymongo import MongoClient
from IPython.display import display

client = MongoClient("your mongo url")
db = client["Amazon"]
collection = db["products"]

# Fetch products
products = list(collection.find({}))

# Prepare flat list for DataFrame
flattened_products = []

for p in products:
    flat = {
        "productId": p.get("productId"),
        "name": p.get("name"),
        "description": p.get("description"),
        "isOrganic": p.get("isOrganic", False),
        "tags": ", ".join(p.get("tags", [])),
        "images": ", ".join(p.get("images", [])),
        "basePrice": p.get("basePrice"),
        "sustainableScore": p.get("sustainableScore", 0),
        "energyUsed": p.get("energyUsed", 0),
        "emissions": p.get("emissions", 0),
        "greenPoints": p.get("greenPoints", 0),
        "waterSaved": p.get("waterSaved", 0),
        "plasticAvoided": p.get("plasticAvoided", 0),
        "createdAt": p.get("createdAt")
    }
    flattened_products.append(flat)

 # here we are not adding like details and variety beacuse it is not usefull;


pd.set_option("display.max_columns", None)    
pd.set_option("display.width", 1000)            
pd.set_option("display.max_colwidth", None)  

# Create DataFrame
df = pd.DataFrame(flattened_products)
# df.to_csv("products_clean.csv", index=False)

# x=df.to_excel("products_clean_Excel.xlsx", index=False)



In [None]:
# Now train the model
import pandas as pd
from IPython.display import display
df = pd.read_csv("products_clean.csv")

print(display(df.head()))

# Combine important fields into a single string for embedding
df["text_for_embedding"] = (
    df["name"].fillna("") + " " +
    df["description"].fillna("") + " " +
    df["tags"].fillna("") + " " +
    df["isOrganic"].apply(lambda x: "organic product" if x else "non-organic product") + " " +
    df["sustainableScore"].apply(lambda x: "high sustainability score" if x > 75 else "low sustainability score")
)

print(df["text_for_embedding"])

In [10]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")
product_embeddings = model.encode(df["text_for_embedding"].tolist(), convert_to_tensor=True)
def search_products(query, top_k=20):
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, product_embeddings)[0]

    # Attach scores to DataFrame
    df["similarity"] = cosine_scores.cpu().numpy()
    
    # Split based on eco filters
    eco = df[(df["isOrganic"] == True) & (df["sustainableScore"] >= 75)]
    rest = df[~df.index.isin(eco.index)]

    # Sort each group by similarity
    eco_sorted = eco.sort_values(by="similarity", ascending=False)
    rest_sorted = rest.sort_values(by="similarity", ascending=False)

    # Concatenate top-k results
    final_df = pd.concat([eco_sorted.head(10), rest_sorted.head(top_k - 10)])

    # Return simplified output
    return final_df[["name", "description", "tags", "isOrganic", "sustainableScore", "similarity"]]

results = search_products("cleaning agent", top_k=20)
print(results.to_string(index=False))


                                                                                   name                                                                                                                                                                                                                                                                                                                                                                                                      description                                                                                               tags  isOrganic  sustainableScore  similarity
                         Herbal Strategi Bathroom Cleaner – 500 mL Spray (Herbal Lemon)                                                                                                                                                                                                                                  Completely herbal bathroom cleaner with fragrant lemon oil

In [11]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all product texts once
product_embeddings = model.encode(df["text_for_embedding"].tolist(), convert_to_tensor=True)

def search_products(query, top_k=20):
    # Step 1: Encode the user query
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, product_embeddings)[0]
    
    # Step 2: Attach similarity scores to DataFrame
    df["similarity"] = cosine_scores.cpu().numpy()
    
    # Step 3: Prioritize eco-friendly products
    eco = df[(df["isOrganic"] == True) & (df["sustainableScore"] >= 75)]
    rest = df[~df.index.isin(eco.index)]

    # Step 4: Sort by similarity
    eco_sorted = eco.sort_values(by="similarity", ascending=False)
    rest_sorted = rest.sort_values(by="similarity", ascending=False)

    # Step 5: Combine results
    final_df = pd.concat([eco_sorted.head(10), rest_sorted.head(top_k - 10)])

    # ✅ Return full product info sorted by similarity
    return final_df.sort_values(by="similarity", ascending=False)

# Example usage
def display_full_product_details(results_df):
    for idx, row in results_df.iterrows():
        print(f"\n{'='*100}")
        print(f"🔢 Product #{idx + 1}")
        print(f"🆔 Product ID       : {row.get('productId', 'N/A')}")
        print(f"🛒 Name             : {row.get('name', 'N/A')}")
        print(f"🖼️  Image URL       : {row.get('images', '').split(',')[0] if row.get('images') else 'No image'}")
        print(f"📄 Description      : {row.get('description', '')[:100]}{'...' if len(row.get('description', '')) > 100 else ''}")
        print(f"🏷️  Tags            : {row.get('tags', 'None')}")
        print(f"🌱 Organic          : {'✅ Yes' if row.get('isOrganic') else '❌ No'}")
        print(f"📦 Base Price       : ₹{row.get('basePrice', 'N/A')}")
        print(f"📊 Sustainable Score: {row.get('sustainableScore', 0)}/100")
        print(f"🟩 Green Points     : {row.get('greenPoints', 0)}")
        print(f"🌍 Emissions        : {row.get('emissions', 0)} kg CO₂")
        print(f"💧 Water Saved      : {row.get('waterSaved', 0)} L")
        print(f"🧪 Energy Used      : {row.get('energyUsed', 0)} kWh")
        print(f"🚯 Plastic Avoided  : {row.get('plasticAvoided', 0)} g")
        print(f"🕒 Created At       : {row.get('createdAt', 'N/A')}")
        print(f"📈 Similarity Score : {row.get('similarity', 0):.4f}")
        print(f"{'='*100}")

# Run and display
results = search_products("mens tshirt", top_k=20)
display_full_product_details(results)


🔢 Product #40
🆔 Product ID       : herbal-strategi-bathroom-cleaner-500ml
🛒 Name             : Herbal Strategi Bathroom Cleaner – 500 mL Spray (Herbal Lemon)
🖼️  Image URL       : https://res.cloudinary.com/ddr2iwcho/image/upload/v1750101494/products/nbijs5mlm4yxle2zqdg9.jpg
📄 Description      : Completely herbal bathroom cleaner with fragrant lemon oil that removes hard-water stains and kills ...
🏷️  Tags            : bathroom cleaner, herbal cleaner, antibacterial spray, eco-friendly cleaner
🌱 Organic          : ✅ Yes
📦 Base Price       : ₹299.0
📊 Sustainable Score: 85.0/100
🟩 Green Points     : 78.0
🌍 Emissions        : 13.0 kg CO₂
💧 Water Saved      : 29.0 L
🧪 Energy Used      : 20.0 kWh
🚯 Plastic Avoided  : 25.0 g
🕒 Created At       : 2025-06-16 19:18:17.055
📈 Similarity Score : 0.4451

🔢 Product #37
🆔 Product ID       : 1f0dce59-c6a0-45fc-a3eb-a21ab5ae2128
🛒 Name             : Nimyle Herbal Eco-Friendly Floor Cleaner – Neem-Infused 2 L Liquid
🖼️  Image URL       : https://res.cl

In [12]:
import torch
torch.save(product_embeddings, "embeddings.pt")

In [19]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all product texts once
product_embeddings = torch.load("embeddings.pt")

import random

def search_products(query, top_k=50):
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, product_embeddings)[0]
    df["similarity"] = cosine_scores.cpu().numpy()

    # Step 1: Get top 5 high-scoring eco-friendly products
    eco_sorted = df[(df["isOrganic"] == True) & (df["sustainableScore"] >= 85)].sort_values(by="similarity", ascending=False)
    top_eco = eco_sorted.head(min(5, len(eco_sorted)))

    # Step 2: Prepare the remaining pool
    rest_df = df[~df.index.isin(top_eco.index)]
    rest_eco = rest_df[(rest_df["isOrganic"] == True) & (rest_df["sustainableScore"] >= 75)]
    rest_non_eco = rest_df[~rest_df.index.isin(rest_eco.index)]

    # Step 3: Calculate 3:2 ratio
    remaining_needed = top_k - len(top_eco)
    non_eco_count = int((3/5) * remaining_needed)
    eco_count = remaining_needed - non_eco_count  # remaining 2/5

    # Step 4: Select top results and shuffle
    rest_eco_top = rest_eco.sort_values(by="similarity", ascending=False).head(eco_count)
    rest_non_eco_top = rest_non_eco.sort_values(by="similarity", ascending=False).head(non_eco_count)
    mixed_rest = pd.concat([rest_eco_top, rest_non_eco_top]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Step 5: Final result
    final_df = pd.concat([top_eco, mixed_rest], ignore_index=True)
    return final_df.head(top_k)
    
# Example usage
def display_full_product_details(results_df):
    for idx, row in results_df.iterrows():
        print(f"\n{'='*100}")
        print(f"🔢 Product #{idx + 1}")
        print(f"🆔 Product ID       : {row.get('productId', 'N/A')}")
        print(f"🛒 Name             : {row.get('name', 'N/A')}")
        print(f"🖼️  Image URL       : {row.get('images', '').split(',')[0] if row.get('images') else 'No image'}")
        print(f"📄 Description      : {row.get('description', '')[:100]}{'...' if len(row.get('description', '')) > 100 else ''}")
        print(f"🏷️  Tags            : {row.get('tags', 'None')}")
        print(f"🌱 Organic          : {'✅ Yes' if row.get('isOrganic') else '❌ No'}")
        print(f"📦 Base Price       : ₹{row.get('basePrice', 'N/A')}")
        print(f"📊 Sustainable Score: {row.get('sustainableScore', 0)}/100")
        print(f"🟩 Green Points     : {row.get('greenPoints', 0)}")
        print(f"🌍 Emissions        : {row.get('emissions', 0)} kg CO₂")
        print(f"💧 Water Saved      : {row.get('waterSaved', 0)} L")
        print(f"🧪 Energy Used      : {row.get('energyUsed', 0)} kWh")
        print(f"🚯 Plastic Avoided  : {row.get('plasticAvoided', 0)} g")
        print(f"🕒 Created At       : {row.get('createdAt', 'N/A')}")
        print(f"📈 Similarity Score : {row.get('similarity', 0):.4f}")
        print(f"{'='*100}")

# Run and display
results = search_products("mens tshirt", top_k=50)
display_full_product_details(results)


🔢 Product #1
🆔 Product ID       : 1829fa85-7037-4525-b067-8ce3aaafd789
🛒 Name             : Eco-Friendly Mens T Shirt - Short Sleeve Crew Neck Soft Fitted Tees S - 4XL Fresh Classic Tshirt
🖼️  Image URL       : https://m.media-amazon.com/images/I/71NZl070rtL._AC_UL320_.jpg
📄 Description      : This product is made with sustainable, organic, and eco-friendly materials. Mens T Shirt - Short Sle...
🏷️  Tags            : eco-friendly, sustainable, green, organic
🌱 Organic          : ✅ Yes
📦 Base Price       : ₹53.0
📊 Sustainable Score: 87.0/100
🟩 Green Points     : 70.0
🌍 Emissions        : 15.0 kg CO₂
💧 Water Saved      : 9.0 L
🧪 Energy Used      : 28.0 kWh
🚯 Plastic Avoided  : 19.0 g
🕒 Created At       : 2025-06-17 10:47:08.918
📈 Similarity Score : 0.5978

🔢 Product #2
🆔 Product ID       : 44930aa5-e6b4-40b4-a7d1-05a93f4ec4aa
🛒 Name             : Eco-Friendly INTO The AM Mens T Shirt - Short Sleeve Crew Neck Soft Fitted Tees S - 4XL Fresh Classic Basic Essential Tshirts
🖼️  Image URL   