# Product categorization and insights using zero-shot classification with bge-reranker.

This notebook simplifies the dataset by assigning every product to one of 6 categories using
zero-shot classification with the bge-reranker model, and then derives insights from the categorized data.

In [None]:
import warnings
import re
from collections import Counter
import mlflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sentence_transformers import CrossEncoder
warnings.filterwarnings("ignore")

## Data Loading and Preprocessing

In [None]:
# Input CSV for the whole notebook. The path is absolute, so adjust it when
# running outside the environment it was written for.
FILE_PATH = "/content/Datafiniti_with_sentiments.csv"
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

In [None]:
def create_text_features(row: pd.Series) -> str:
    """Return the row's product name, trimmed and lower-cased.

    Args:
        row: A DataFrame row (or any object exposing ``.get``) that may carry
            a ``"name"`` entry.

    Returns:
        The normalized name, or an empty string when the name is absent,
        null, or consists only of whitespace.
    """
    raw_name = row.get("name", "")
    if pd.isna(raw_name):
        return ""
    # A whitespace-only name strips down to "", which is the "no name" result.
    return str(raw_name).strip().lower()

In [None]:
# Normalize every row's product name, then keep only the rows that have one.
df["product_name_clean"] = df.apply(create_text_features, axis=1)
has_product_name = df["product_name_clean"].str.len() > 0
df = df.loc[has_product_name].reset_index(drop=True)

In [None]:
# Numeric score assigned to each sentiment label.
SENTIMENT_MAPPING = {"Positive": 1.0, "Neutral": 0.5, "Negative": 0.0}
if "predicted_sentiment_roberta" not in df.columns:
    # No sentiment predictions available: keep the column so downstream cells
    # can still reference it, but mark every value as missing.
    df["predicted_sentiment_roberta"] = np.nan
elif not pd.api.types.is_numeric_dtype(df["predicted_sentiment_roberta"]):
    # Map only while the column still holds labels. Once it is numeric the
    # mapping has already been applied; mapping again would look the floats
    # up in a string-keyed dict and turn every score into NaN, so the cell
    # must be safe to re-run.
    df["predicted_sentiment_roberta"] = df["predicted_sentiment_roberta"].map(
        SENTIMENT_MAPPING
    )

In [None]:
# Report how many reviews are left after the cleaning steps.
n_reviews = len(df)
print(f"Dataset size after cleaning: {n_reviews} reviews")

## Zero-Shot Product Categorization

In [None]:
# Candidate category names for zero-shot classification. The list is passed as
# `labels` to classify_with_reranker, which picks one of these per product name.
LABELS = [
    "Fire Tablet Special",
    "AmazonBasics Performance Alkaline(Batteries)",
    "Anon/Uncategorized",
    "Echo White Amazon",
    "Fire Kids Edition",
    "Fire Amazon",
]

In [None]:
def classify_with_reranker(
    texts: list[str], labels: list[str], model_name: str
) -> tuple[list[str], list[float]]:
    """Assign each text the candidate label that the reranker scores highest.

    Every (text, label) pair is scored by a ``CrossEncoder``; for each text the
    label with the maximum score is selected.

    Args:
        texts: Strings to classify.
        labels: Candidate labels to choose from. Must not be empty when there
            is at least one text.
        model_name: Model identifier handed to ``CrossEncoder``.

    Returns:
        ``(pred_labels, pred_scores)``, both aligned with ``texts``:
        the winning label per text and its score as a plain Python ``float``.
        Both lists are empty when ``texts`` is empty.

    Raises:
        ValueError: If ``labels`` is empty while ``texts`` is not.
    """
    if not texts:
        # Nothing to score, so skip loading the model altogether.
        return [], []
    if not labels:
        raise ValueError("labels must contain at least one candidate label")

    model = CrossEncoder(model_name)
    # Text-major ordering: all labels for text 0, then all labels for text 1, ...
    # so the flat score vector reshapes to (n_texts, n_labels).
    pairs = [(text, label) for text in texts for label in labels]
    scores = np.asarray(model.predict(pairs)).reshape(len(texts), len(labels))
    best_indices = scores.argmax(axis=1)
    pred_labels = [labels[i] for i in best_indices]
    # Row-wise maximum is the score at the argmax position; convert NumPy
    # scalars to Python floats to match the declared return type.
    pred_scores = scores.max(axis=1).astype(float).tolist()
    return pred_labels, pred_scores

In [None]:
def compute_category_stats(data_df: pd.DataFrame, cat_col: str) -> pd.DataFrame:
    """Summarize review volume, sentiment and classifier confidence per category.

    Args:
        data_df: Frame containing ``cat_col``, ``predicted_sentiment_roberta``
            and ``zero_shot_score``. ``rating`` and ``doRecommend`` are
            optional and add extra columns to the result when present.
        cat_col: Name of the column to group by.

    Returns:
        One row per category, sorted by ``count`` descending, with columns
        ``category``, ``count``, ``avg_sentiment``, ``positive_pct``,
        ``avg_confidence`` and, when the source columns exist, ``avg_rating``,
        ``high_rating_pct`` and ``recommend_pct``. If ``data_df`` yields no
        groups, an empty frame with those columns is returned.
    """
    # Column presence does not change between groups, so check it once.
    has_rating = "rating" in data_df.columns
    has_recommend = "doRecommend" in data_df.columns

    columns = ["category", "count", "avg_sentiment", "positive_pct", "avg_confidence"]
    if has_rating:
        columns += ["avg_rating", "high_rating_pct"]
    if has_recommend:
        columns.append("recommend_pct")

    stats = []
    for cat, group in data_df.groupby(cat_col):
        sentiment = group["predicted_sentiment_roberta"]
        row = {
            "category": cat,
            "count": len(group),
            "avg_sentiment": sentiment.mean(),
            # Share of rows scoring above 0.5; rows with a missing sentiment
            # compare as False and therefore count as not positive.
            "positive_pct": (sentiment > 0.5).mean() * 100,
            "avg_confidence": group["zero_shot_score"].mean(),
        }
        if has_rating:
            row["avg_rating"] = group["rating"].mean()
            row["high_rating_pct"] = (group["rating"] >= 4).mean() * 100
        if has_recommend:
            # NOTE(review): assumes doRecommend holds booleans / 0-1 values so
            # that its mean is a fraction — confirm against the dataset.
            row["recommend_pct"] = group["doRecommend"].mean() * 100
        stats.append(row)

    # Passing explicit columns keeps "count" available for sorting even when
    # there are no groups (an empty list alone would produce a column-less frame
    # and sort_values("count") would raise KeyError).
    return (
        pd.DataFrame(stats, columns=columns)
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

In [None]:
# Score every cleaned product name against the candidate labels.
product_names = df["product_name_clean"].tolist()
predictions, confidence_scores = classify_with_reranker(
    texts=product_names,
    labels=LABELS,
    model_name="BAAI/bge-reranker-v2-m3",
)

In [None]:
# Attach the winning label and its score to each review, then summarize per label.
df = df.assign(zero_shot_label=predictions, zero_shot_score=confidence_scores)
category_results = compute_category_stats(data_df=df, cat_col="zero_shot_label")
category_results

## Visualization of Results

In [None]:
# Number of reviews assigned to each category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=category_results, x="category", y="count", ax=ax)
# Tilt the category names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
ax.set_title("Review Distribution per Category")
plt.show()

In [None]:
# Mean sentiment score of the reviews in each category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=category_results, x="category", y="avg_sentiment", ax=ax)
# Tilt the category names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
ax.set_title("Average Sentiment per Category")
plt.show()

## Deep Dive into Product Insights

In [None]:
# value_counts() is already sorted by frequency, so the first three entries
# are the three most-reviewed products.
product_review_counts = df["product_name_clean"].value_counts()
top_products = product_review_counts.iloc[:3]
print("Top 3 Products:")
print(top_products)

In [None]:
# Build the aggregation spec from columns that actually exist. Naming a
# missing column in the spec makes .agg() raise KeyError, whatever aggregation
# function is paired with it, so optional columns must be left out entirely.
top_products_agg = {"predicted_sentiment_roberta": "mean"}
for optional_col in ("rating", "doRecommend"):
    if optional_col in df.columns:
        top_products_agg[optional_col] = "mean"
top_products_agg["zero_shot_label"] = "first"

# Per-product averages for the most-reviewed products, plus their category.
top_products_stats = (
    df[df["product_name_clean"].isin(top_products.index)]
    .groupby("product_name_clean")
    .agg(top_products_agg)
)
top_products_stats

In [None]:
def extract_complaints(
    subset: pd.DataFrame,
    top_n: int = 10,
    stop_words: frozenset[str] = frozenset(),
) -> list[tuple[str, int]]:
    """Return the most frequent words in the negative reviews of ``subset``.

    A review is negative when its ``predicted_sentiment_roberta`` equals 0.0.
    Words are lower-cased runs of at least three word characters.

    Args:
        subset: Frame with ``predicted_sentiment_roberta`` and ``text`` columns.
        top_n: Maximum number of (word, count) pairs to return.
        stop_words: Lower-case words to leave out of the count (any container
            supporting ``in``). Empty by default, which counts every word.

    Returns:
        Up to ``top_n`` ``(word, count)`` pairs, most frequent first; an empty
        list when there are no negative reviews with text.
    """
    is_negative = subset["predicted_sentiment_roberta"] == 0.0
    negative_texts = subset.loc[is_negative, "text"].dropna().astype(str)
    # Tokenize review by review instead of joining everything into one large
    # string first; the resulting counts and their order are the same.
    word_counts = Counter(
        word
        for review in negative_texts
        for word in re.findall(r"\b\w{3,}\b", review.lower())
        if word not in stop_words
    )
    return word_counts.most_common(top_n)

In [None]:
# Most frequent words in the negative reviews of each top product.
for product in top_products.index:
    is_this_product = df["product_name_clean"] == product
    product_complaints = extract_complaints(df.loc[is_this_product])
    print(f"\nTop complaints for {product}:")
    print(product_complaints)

## Identifying the Worst Performers

In [None]:
# Average sentiment and review count for every (category, product) pair,
# ordered so that the lowest-sentiment product comes first within its category.
product_groups = df.groupby(["zero_shot_label", "product_name_clean"])
product_sentiment = product_groups.agg(
    avg_sentiment=("predicted_sentiment_roberta", "mean"),
    count=("product_name_clean", "size"),
)
worst_products = product_sentiment.reset_index().sort_values(
    ["zero_shot_label", "avg_sentiment"]
)

In [None]:
# worst_products is sorted by ascending sentiment within each category, so the
# first entry of each group is that category's lowest-sentiment product.
worst_by_category = worst_products.groupby("zero_shot_label", as_index=False).first()
print("Worst products per category:")
worst_by_category

## MLflow Logging

Log the run parameters, per-category metrics, and result CSV files to MLflow.

In [2]:
# This section is commented out to prevent running without a configured MLflow server.
# !mlflow ui --port 5000 &
# from pyngrok import ngrok
# from getpass import getpass
# ngrok.kill()
# NGROK_AUTH_TOKEN = getpass("Enter your ngrok authtoken: ")
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http")
# print("MLflow Tracking UI:", public_url)

In [1]:
# Select the MLflow experiment that the logging run in the next cell records into.
# Requires the `mlflow` import from the first code cell to have been executed.
mlflow.set_experiment("Product Categorization with Reranker")

NameError: name 'mlflow' is not defined

In [None]:
with mlflow.start_run(run_name="zero_shot_reranker"):
    # Run-level parameters: the model used and the labels it chose from.
    mlflow.log_param("model", "BAAI/bge-reranker-v2-m3")
    mlflow.log_param("candidate_labels", LABELS)

    # One family of "cat_<category>_<stat>" metrics per row of the stats table.
    for _, result_row in category_results.iterrows():
        # Replace every character outside [a-zA-Z0-9_.-] with "_" before the
        # category is embedded in a metric name.
        sanitized_category = re.sub(
            r"[^a-zA-Z0-9_.-]", "_", str(result_row["category"]).replace(" ", "_")
        )
        prefix = f"cat_{sanitized_category}"

        mlflow.log_metric(f"{prefix}_count", int(result_row["count"]))

        # Both sentiment metrics are gated on the average being available.
        if not np.isnan(result_row["avg_sentiment"]):
            for stat_name in ("avg_sentiment", "positive_pct"):
                mlflow.log_metric(
                    f"{prefix}_{stat_name}", float(result_row[stat_name])
                )

        if not np.isnan(result_row["avg_confidence"]):
            mlflow.log_metric(
                f"{prefix}_avg_confidence", float(result_row["avg_confidence"])
            )

        # These columns exist only when the source data had rating /
        # recommendation columns, so check presence as well as missing values.
        for stat_name in ("avg_rating", "high_rating_pct", "recommend_pct"):
            if stat_name in result_row and not pd.isna(result_row[stat_name]):
                mlflow.log_metric(
                    f"{prefix}_{stat_name}", float(result_row[stat_name])
                )

    # Write both result tables to disk first, then attach them to the run.
    artifacts = {
        "reranker_products.csv": df,
        "reranker_stats.csv": category_results,
    }
    for csv_name, frame in artifacts.items():
        frame.to_csv(csv_name, index=False)
    for csv_name in artifacts:
        mlflow.log_artifact(csv_name)