In [2]:
import pandas as pd
import ast

# 📂 Load your raw dataset
dataset_path = '/content/drive/MyDrive/amazon_reviews.csv'  # Replace with your actual path
df = pd.read_csv(dataset_path, dtype=str, low_memory=False)
print(f"📦 Original dataset size: {len(df)} rows")

# 🧹 Drop rows missing essential fields
df = df.dropna(subset=['itemName', 'reviewText'])

# 🧼 Clean list-like columns: description & feature only
def parse_list_column(col):
    return col.apply(lambda x: ast.literal_eval(x) if pd.notnull(x) and x.strip().startswith("[") else [])

df['description'] = parse_list_column(df['description'])
df['feature'] = parse_list_column(df['feature'])

# 📷 Process image column — keep only the first link if multiple
def process_image_column(x):
    try:
        links = ast.literal_eval(x)
        if isinstance(links, list) and links:
            return links[0]  # Keep only the first image
        else:
            return ''
    except:
        return ''

df['image'] = df['image'].apply(process_image_column)

# 🧽 Strip whitespace and clean text
df['itemName'] = df['itemName'].str.strip()
df['reviewText'] = df['reviewText'].str.strip()

# ✂️ Remove very short reviews
df = df[df['reviewText'].str.len() >= 15]

# 🛍 Keep only up to 3 reviews per unique itemName
df = df.groupby('itemName').apply(lambda x: x.head(3)).reset_index(drop=True)

# 🧾 Final stats
print(f"✅ Cleaned dataset size: {len(df)} rows")
print(f"🛒 Unique products retained: {df['itemName'].nunique()}")

# 💾 Save the cleaned data
df.to_csv('/content/drive/MyDrive/cleaned_ecommerce_dataset.csv', index=False)
print("💾 Cleaned dataset saved to: /content/cleaned_ecommerce_dataset.csv")


📦 Original dataset size: 585505 rows


  df = df.groupby('itemName').apply(lambda x: x.head(3)).reset_index(drop=True)


✅ Cleaned dataset size: 199829 rows
🛒 Unique products retained: 100697
💾 Cleaned dataset saved to: /content/cleaned_ecommerce_dataset.csv


In [None]:
!pip install -q transformers sentence-transformers datasets pandas tqdm

In [3]:
import torch
import time
import requests
from tqdm import tqdm
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from datasets import Dataset

class ProductRecommender:
    def __init__(self, dataframe, max_dataset_size=10000, chunk_size=1000, top_n=10):
        self.df = dataframe
        self.device = 0 if torch.cuda.is_available() else -1
        print(f"📦 Device: {'GPU' if self.device == 0 else 'CPU'}")

        self.chunk_size = chunk_size
        self.top_n = top_n
        self.max_dataset_size = max_dataset_size

        # Load models
        self.absa_pipe = pipeline(
            "text-classification",
            model="yangheng/deberta-v3-large-absa-v1.1",
            tokenizer="yangheng/deberta-v3-large-absa-v1.1",
            device=self.device
        )
        self.sbert = SentenceTransformer('all-MiniLM-L6-v2', device=self.device)

        # Preprocess
        self.df = self._prepare_data(self.df)

    def _prepare_data(self, df):
        df = df.sample(n=min(self.max_dataset_size, len(df)), random_state=42)
        df = df[df['reviewText'].str.len() > 15].reset_index(drop=True)

        dataset = Dataset.from_pandas(df)

        def extract_aspects(batch):
            results = self.absa_pipe(batch['reviewText'], batch_size=2)
            aspects = [r['label'].split("#[SEP]")[0].strip() for r in results]
            sentiments = [r['label'].split("#[SEP]")[-1].strip() for r in results]
            return {"aspect": aspects, "aspect_sentiment": sentiments}

        print("🧠 Running ABSA...")
        start = time.time()
        dataset = dataset.map(extract_aspects, batched=True, batch_size=2)
        print(f"✅ ABSA done in {time.time() - start:.2f} sec")

        return dataset.to_pandas()

    def _infer_product_category(self, user_input):
        query_embedding = self.sbert.encode(user_input, convert_to_tensor=True)
        titles = self.df['itemName'].unique().tolist()

        top_title = None
        max_score = -1
        for i in range(0, len(titles), self.chunk_size):
            chunk_titles = titles[i:i + self.chunk_size]
            embeddings = self.sbert.encode(chunk_titles, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(query_embedding, embeddings).squeeze()
            score, idx = torch.max(scores, dim=0)
            if score > max_score:
                max_score = score
                top_title = chunk_titles[idx]

        return top_title.split()[0].lower() if top_title else "product"

    def _generate_complementary_products(self, category):
        url = "https://openrouter.ai/api/v1/chat/completions"
        headers = {
            "Authorization": "Bearer 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",  # Replace with your own key
            "Content-Type": "application/json",
            "HTTP-Referer": "https://yourdomain.com",
            "X-Title": "Product-Recommender"
        }
        data = {
            "model": "deepseek/deepseek-r1:free",
            "messages": [
                {
                    "role": "user",
                    "content": f"List 3 complementary products for a {category} in a comma-separated format only."
                }
            ],
        }

        try:
            response = requests.post(url, headers=headers, json=data)
            response.raise_for_status()
            result = response.json()
            reply = result['choices'][0]['message']['content']
            return [item.strip() for item in reply.split(",") if item.strip()]
        except Exception as e:
            print(f"⚠️ OpenRouter/DeepSeek API error: {e}. Falling back to generic accessory.")
            return ["accessory"]

    def recommend(self, user_review):
        absa_result = self.absa_pipe(user_review)[0]['label']
        parts = absa_result.split("#[SEP]")
        aspect = parts[0].strip().lower() if len(parts) == 2 else "quality"
        sentiment = parts[1].strip().lower() if len(parts) == 2 else "positive"
        target_sentiment = "positive"

        print(f"🔍 Aspect: {aspect}, Sentiment: {sentiment} ➜ Target: {target_sentiment}")

        # Step 1: Infer category
        category = self._infer_product_category(user_review)
        print(f"🔍 Inferred category: {category}")

        # Step 2: Generate complementary products
        complements = self._generate_complementary_products(category)
        print(f"🔗 Complementary items: {complements}")

        # Step 3: Retrieve matching product titles
        product_titles = self.df['itemName'].unique().tolist()
        matches = []
        query_embed = self.sbert.encode(" ".join(complements), convert_to_tensor=True)

        for i in range(0, len(product_titles), self.chunk_size):
            chunk_titles = product_titles[i:i + self.chunk_size]
            embeddings = self.sbert.encode(chunk_titles, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(query_embed, embeddings).squeeze()
            top_ids = torch.topk(scores, k=min(25, len(chunk_titles))).indices.tolist()
            matches.extend([chunk_titles[j] for j in top_ids])

        # Step 4: Filter for those with correct sentiment
        df_filtered = self.df[
            (self.df['itemName'].isin(matches)) &
            (self.df['aspect_sentiment'].str.lower() == target_sentiment)
        ].copy()

        if df_filtered.empty:
            print("⚠️ No perfect complementary matches found. Showing generic positives.")
            df_filtered = self.df[self.df['aspect_sentiment'].str.lower() == "positive"]

        df_filtered['combo_text'] = df_filtered['itemName'] + ": " + df_filtered['reviewText']

        # Step 5: Compute final similarity
        user_embed = self.sbert.encode(user_review, convert_to_tensor=True)
        scores = []

        for i in range(0, len(df_filtered), self.chunk_size):
            chunk = df_filtered['combo_text'].iloc[i:i + self.chunk_size].tolist()
            chunk_embed = self.sbert.encode(chunk, convert_to_tensor=True)
            sims = util.pytorch_cos_sim(user_embed, chunk_embed).squeeze()
            scores.extend(sims.tolist())

        df_filtered['similarity'] = scores

        final = (
            df_filtered.sort_values(by='similarity', ascending=False)
            .drop_duplicates(subset='itemName')
            .head(self.top_n)
        )

        return final[['itemName', 'reviewText', 'aspect', 'aspect_sentiment', 'similarity']]


In [4]:
recommender = ProductRecommender(df)

user_input = "this chocolate is smooth and sweet"
results = recommender.recommend(user_input)

print("\n✅ Final Recommendations:")
print(results)


📦 Device: GPU


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🧠 Running ABSA...


Map:   0%|          | 0/9922 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ ABSA done in 430.30 sec
🔍 Aspect: quality, Sentiment: positive ➜ Target: positive
🔍 Inferred category: lindt
🔗 Complementary items: ['Dark chocolate truffles', 'gourmet coffee beans', 'luxury gift wrapping sets']

✅ Final Recommendations:
                                               itemName  \
8783  Swiss Miss, Dark Chocolate Sensation, Hot Coco...   
5239  Cadbury Chocolate Eclairs 166 gram - Pack of 2...   
5589  Silk Almond Milk Dark Chocolate 32 oz (Pack of...   
9488  Tru-Nut Powdered Peanut Butter 1LB Jars (2-Pac...   
9849  Madelaine Solid Premium Milk Chocolate Mini Bu...   
3038  Cameron's Coffee Holiday Roasted Ground Coffee...   
3353  Endangered Species Chocolate Variety Pack (Pac...   
7366  Ghirardelli Chocolate Intense Dark Chocolate V...   
7104  Chicory Herbal Tea - Chocolate 10/2.12 Ounce (...   
3268                Miles Kimball Dark Chocolate Sticks   

                                             reviewText    aspect  \
8783  Finally found an instant cocoa I a