In [1]:
!pip install -q transformers sentence-transformers datasets pandas tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [17]:
import pandas as pd
import torch
import time
import requests
from tqdm import tqdm
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset

class ProductRecommender:
    def __init__(self, dataframe, max_dataset_size=10000, chunk_size=1000, top_n=10):
        self.df = dataframe
        self.device = 0 if torch.cuda.is_available() else -1
        print(f"📦 Device: {'GPU' if self.device == 0 else 'CPU'}")

        self.chunk_size = chunk_size
        self.top_n = top_n
        self.max_dataset_size = max_dataset_size

        # Load models
        self.absa_pipe = pipeline(
            "text-classification",
            model="yangheng/deberta-v3-large-absa-v1.1",
            tokenizer="yangheng/deberta-v3-large-absa-v1.1",
            device=self.device
        )
        self.sbert = SentenceTransformer('all-MiniLM-L6-v2', device=self.device)

        # Preprocess
        self.df = self._prepare_data(self.df)

    def _prepare_data(self, df):
        df = df.sample(n=min(self.max_dataset_size, len(df)), random_state=42)
        df = df[df['reviewText'].str.len() > 15].reset_index(drop=True)

        dataset = Dataset.from_pandas(df)

        def extract_aspects(batch):
            results = self.absa_pipe(batch['reviewText'], batch_size=2)
            aspects = [r['label'].split("#[SEP]")[0].strip() for r in results]
            sentiments = [r['label'].split("#[SEP]")[-1].strip() for r in results]
            return {"aspect": aspects, "aspect_sentiment": sentiments}

        print("🧠 Running ABSA...")
        start = time.time()
        dataset = dataset.map(extract_aspects, batched=True, batch_size=2)
        print(f"✅ ABSA done in {time.time() - start:.2f} sec")

        return dataset.to_pandas()

    def _infer_product_category(self, user_input):
        query_embedding = self.sbert.encode(user_input, convert_to_tensor=True)
        titles = self.df['itemName'].unique().tolist()

        top_title = None
        max_score = -1
        for i in range(0, len(titles), self.chunk_size):
            chunk_titles = titles[i:i + self.chunk_size]
            embeddings = self.sbert.encode(chunk_titles, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(query_embedding, embeddings).squeeze()
            score, idx = torch.max(scores, dim=0)
            if score > max_score:
                max_score = score
                top_title = chunk_titles[idx]

        if top_title:
            tokens = top_title.lower().split()
            return tokens[:3]  # Up to 3 category tokens
        else:
            return ["product"]

    def _generate_complementary_products(self, user_review):
        prompt = (
          f'Given the following product review: "{user_input}"\n\n'
          f"Analyze the user's needs, problem, or sentiment, and recommend 3 concise product types "
          f"that would be relevant or helpful in this context. Keep each product type to 1–2 words max. "
          f"Return the product types only, in a comma-separated format."
        )

        url = "https://openrouter.ai/api/v1/chat/completions"
        headers = {
            "Authorization": "Bearer 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",  # Replace with your real key
            "Content-Type": "application/json",
            "HTTP-Referer": "https://yourdomain.com",
            "X-Title": "Product-Recommender"
        }
        data = {
            "model": "deepseek/deepseek-r1:free",
            "messages": [{"role": "user", "content": prompt}]
        }

        try:
            response = requests.post(url, headers=headers, json=data, timeout=15)
            response.raise_for_status()
            result = response.json()
            reply = result['choices'][0]['message']['content']
            print("🔁 DeepSeek reply:", reply)
            return [item.strip() for item in reply.split(",") if item.strip()]
        except Exception as e:
            print(f"⚠️ DeepSeek API error: {e}. Using fallback product types.")
            return ["accessory", "charger", "case"]

    def recommend(self, user_review):
        absa_result = self.absa_pipe(user_review)[0]['label']
        parts = absa_result.split("#[SEP]")
        aspect = parts[0].strip().lower() if len(parts) == 2 else "quality"
        sentiment = parts[1].strip().lower() if len(parts) == 2 else "positive"
        target_sentiment = "positive"

        print(f"🔍 Aspect: {aspect}, Sentiment: {sentiment} ➜ Target: {target_sentiment}")

        # Step 1: Infer category tokens
        category_tokens = self._infer_product_category(user_review)
        print(f"🔍 Inferred category tokens: {category_tokens}")

        # Step 2: Generate complementary product types
        complements = self._generate_complementary_products(user_review)
        print(f"🔗 Complementary product types: {complements}")

        # Step 3: Find candidate products by OR-matching on category tokens
        product_titles = self.df['itemName'].unique().tolist()
        matches = []
        query_embed = self.sbert.encode(" ".join(complements), convert_to_tensor=True)

        for i in range(0, len(product_titles), self.chunk_size):
            chunk_titles = product_titles[i:i + self.chunk_size]
            filtered_titles = [
                title for title in chunk_titles
                if any(token in title.lower() for token in category_tokens)
            ]
            if not filtered_titles:
                continue
            embeddings = self.sbert.encode(filtered_titles, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(query_embed, embeddings).squeeze()
            top_ids = torch.topk(scores, k=min(25, len(filtered_titles))).indices.tolist()
            if isinstance(top_ids, int):  # handle single result
              matches.append(filtered_titles[top_ids])
            else:
              matches.extend([filtered_titles[j] for j in top_ids])

        # Step 4: Filter dataset with positive sentiment and matched items
        df_filtered = self.df[
            (self.df['itemName'].isin(matches)) &
            (self.df['aspect_sentiment'].str.lower() == target_sentiment)
        ].copy()

        if df_filtered.empty:
            print("⚠️ No matching products with same sentiment. Showing generic positives.")
            df_filtered = self.df[self.df['aspect_sentiment'].str.lower() == "positive"]

        df_filtered['combo_text'] = df_filtered['itemName'] + ": " + df_filtered['reviewText']

        # Step 5: Final similarity with user's review
        user_embed = self.sbert.encode(user_review, convert_to_tensor=True)
        scores = []

        for i in range(0, len(df_filtered), self.chunk_size):
            chunk = df_filtered['combo_text'].iloc[i:i + self.chunk_size].tolist()
            chunk_embed = self.sbert.encode(chunk, convert_to_tensor=True)
            sims = util.pytorch_cos_sim(user_embed, chunk_embed).squeeze()
            scores.extend(sims.tolist())

        df_filtered['similarity'] = scores

        final = (
            df_filtered.sort_values(by='similarity', ascending=False)
            .drop_duplicates(subset='itemName')
            .head(self.top_n)
        )

        return final[['itemName', 'reviewText', 'aspect', 'aspect_sentiment', 'similarity']]

In [18]:
df = pd.read_csv("/content/drive/MyDrive/fixed_image_urls.csv")
recommender = ProductRecommender(df)

user_input = "battery life is not good"
results = recommender.recommend(user_input)

print("\n✅ Final Recommendations:")
print(results)


📦 Device: GPU


Device set to use cuda:0


🧠 Running ABSA...


Map:   0%|          | 0/9922 [00:00<?, ? examples/s]

✅ ABSA done in 371.15 sec
🔍 Aspect: quality, Sentiment: positive ➜ Target: positive
🔍 Inferred category tokens: ['lg', 'v10', 'h962']
🔁 DeepSeek reply: portable charger, replacement battery, battery case
🔗 Complementary product types: ['portable charger', 'replacement battery', 'battery case']

✅ Final Recommendations:
                                               itemName  \
8501  LG K8 Phoenix 2 K371 AT&amp;T GSM Unlocked 4G ...   
9264  LG Flip Phone Senior Unlocked GSM Unlocked Int...   
8409  LG G5 Friends Cam Plus CBG-700 Comfortable Sho...   
611   LG K7 4G K330 LTE, Android, 8GB, No-Contract T...   
5123     LG G3 Battery Charger, with USB 5V Output Port   
5657  Spigen Slim Armor LG G4 Case with Air Cushion ...   
2387  iClever BoostCube 18W Turbo Quick Charge 2.0 U...   
876   QiStone+ the Completely Wireless Portable Qi C...   
7140  Anker Quick Charge 3.0 and USB Type-C 24W USB ...   
3685  Anker USB Type C Cable, Powerline USB C to USB...   

                             

In [22]:
user_input = "recently i purchased a engine oil can for my motor bike.it was good  and had good results."
results = recommender.recommend(user_input)

print("\n✅ Final Recommendations:")
print(results)

🔍 Aspect: quality, Sentiment: positive ➜ Target: positive
🔍 Inferred category tokens: ['drainzit', 'hon1010', '10mm']
🔁 DeepSeek reply: oil filter, chain lubricant, air filter
🔗 Complementary product types: ['oil filter', 'chain lubricant', 'air filter']

✅ Final Recommendations:
                                               itemName  \
455   Drainzit HON1010 10mm Oil Changing Aid for Hon...   
9761  VALYRIA 500pcs Silver Tone Stainless Steel Ope...   
2336  24pc Clear Acrylic Bead Tubes with Container -...   
8014  PH PandaHall 1745 Pcs Iron Plated Open Jump Ri...   
9918  PandaHall Elite About 430 Pcs Tiny Satin Luste...   
6925  (3 Packages) Cadet 100-Count Munchy Chicken St...   
8047  RUBYCA Pave Czech Crystal Round Disco Ball Cla...   
5056  YOYOSTORE 50 Black and 50 Silver Tone Metal Pr...   
8574  Housweety 50 Mixed Rhinestone Love Heart Charm...   
6874  Knitter's Pride Waves Aluminium Crochet Hook, ...   

                                             reviewText    aspect  \


In [23]:
user_input = "paper quality is low"
results = recommender.recommend(user_input)

print("\n✅ Final Recommendations:")
print(results)


🔍 Aspect: quality, Sentiment: positive ➜ Target: positive
🔍 Inferred category tokens: ['advanced', 'photo', 'paper,']
🔁 DeepSeek reply: high-quality paper, cardstock, stationery sets
🔗 Complementary product types: ['high-quality paper', 'cardstock', 'stationery sets']

✅ Final Recommendations:
                                               itemName  \
3558   Ruby Paulina 13x19 Copy Paper, 100 sheet package   
3627  HP Printer Paper, Premium24, 8.5 x 11 Paper, L...   
3757  HP Printer Paper, Premium32, 8.5 x 11 Paper, L...   
2858  Hammermill Paper, Great White 30% recycled cop...   
3659  Hammermill Paper, Tidal Copy Paper, 8.5 x 14 P...   
32    HP Printer Paper, BrightWhite24, 8.5 x 11, Let...   
8843  Hammermill Paper, Tidal Copy Paper, 11 x 17 Pa...   
195   Southworth 25% Cotton Business Paper, 8.5&quot...   
830   4&quot; X 6&quot; 100 Sheets Premium Luster In...   
5926  Southworth 25% Cotton Business Paper, 8.5&rdqu...   

                                             reviewText