## Product Categorization and Insights
This notebook simplifies the dataset by grouping its products into 6 categories, using a zero-shot approach with the bge-reranker model,
and then derives insights from the categorized data (per-category sentiment, top products, common complaints, and worst performers).

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Standard library
import re
import warnings
from collections import Counter
from getpass import getpass

# Third-party
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sentence_transformers import CrossEncoder

# pyngrok is only referenced by the optional, commented-out MLflow UI tunnel in
# the logging cell. Importing it unconditionally aborts this cell (and therefore
# the whole notebook) with ModuleNotFoundError when it is not installed, so it
# is treated as an optional dependency.
try:
    from pyngrok import ngrok
except ModuleNotFoundError:
    ngrok = None

# Silence library warnings so cell outputs stay readable.
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'pyngrok'

## Data Loading and Preprocessing

In [None]:
# Location of the input reviews CSV. This is an absolute path (it looks like a
# Colab "/content" path), so adjust it when running in another environment.
file_path = "/content/Datafiniti_with_sentiments.csv"
# Load the CSV into `df`, the working DataFrame that the later cells filter,
# annotate, and aggregate.
df = pd.read_csv(file_path)

def create_text_features(row: pd.Series) -> str:
    """Return the row's product name, trimmed and lower-cased.

    Args:
        row: A single DataFrame row; its "name" entry is read if present.

    Returns:
        The normalized name, or an empty string when the name is absent,
        null, or contains only whitespace.
    """
    raw_name = row.get("name", "")
    if not pd.notna(raw_name):
        return ""
    trimmed = str(raw_name).strip()
    return trimmed.lower() if trimmed else ""

# Attach a normalized product name to every review, then keep only the rows
# that actually have one (index is renumbered after filtering).
df["product_name_clean"] = df.apply(create_text_features, axis=1)
df = df.loc[df["product_name_clean"].str.len() > 0].reset_index(drop=True)

# Encode the sentiment labels as numbers so they can be averaged later.
# When the sentiment column is absent, create it filled with NaN so that
# downstream cells can still reference it.
sentiment_mapping = {"Positive": 1.0, "Neutral": 0.5, "Negative": 0.0}
if "predicted_sentiment_roberta" not in df.columns:
    df["predicted_sentiment_roberta"] = np.nan
else:
    df["predicted_sentiment_roberta"] = df["predicted_sentiment_roberta"].map(sentiment_mapping)

print(f"Dataset size after cleaning: {len(df)} reviews")

## Zero-Shot Product Categorization

In [None]:
# Candidate category names for the zero-shot step: every cleaned product name
# is scored against each of these labels and assigned the best-scoring one.
# The same list is also logged to MLflow as the "candidate_labels" parameter.
labels = [
    "Fire Tablet Special",
    "AmazonBasics Performance Alkaline(Batteries)",
    "Anon/Uncategorized",
    "Echo White Amazon",
    "Fire Kids Edition",
    "Fire Amazon",
]

In [None]:
def classify_with_reranker(texts: list[str], labels: list[str], model_name: str) -> tuple[list[str], list[float]]:
    """
    Classifies texts into the most relevant label using a CrossEncoder reranker model.

    Every (text, label) pair is scored by the model and each text is assigned
    the label with the highest score. Repeated texts are scored only once and
    the result is reused, which avoids redundant model inference when many
    inputs share the same text (e.g. many reviews of the same product).

    Args:
        texts: Texts to classify. Duplicates are allowed.
        labels: Candidate labels; must contain at least one entry.
        model_name: Model identifier passed to ``CrossEncoder``.

    Returns:
        A ``(pred_labels, pred_scores)`` tuple of two lists aligned with
        ``texts``: the winning label for each text and its score as a plain
        Python float.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    if not labels:
        raise ValueError("labels must contain at least one candidate label")

    texts = list(texts)
    if not texts:
        # Nothing to classify; skip loading the model altogether.
        return [], []

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_texts = list(dict.fromkeys(texts))

    model = CrossEncoder(model_name)
    pairs = [(text, label) for text in unique_texts for label in labels]
    # Pairs are laid out text-major, so the flat scores reshape to
    # (number of unique texts, number of labels).
    scores = np.asarray(model.predict(pairs)).reshape(len(unique_texts), len(labels))
    best_indices = scores.argmax(axis=1)
    best_scores = scores[np.arange(len(unique_texts)), best_indices]

    best_by_text = {
        text: (labels[label_idx], float(score))
        for text, label_idx, score in zip(unique_texts, best_indices, best_scores)
    }

    # Expand the per-unique-text results back to the original order and length.
    pred_labels = [best_by_text[text][0] for text in texts]
    pred_scores = [best_by_text[text][1] for text in texts]
    return pred_labels, pred_scores

In [None]:
def compute_category_stats(df: pd.DataFrame, cat_col: str) -> pd.DataFrame:
    """
    Computes summary statistics for each category.

    Args:
        df: Frame containing ``cat_col``, "predicted_sentiment_roberta" and
            "zero_shot_score". The "rating" and "doRecommend" columns are
            optional and only summarized when present.
        cat_col: Name of the column to group by.

    Returns:
        One row per category, sorted by descending "count", with columns
        "category", "count", "avg_sentiment", "positive_pct" and
        "avg_confidence", plus "avg_rating"/"high_rating_pct" when ``df`` has
        "rating" and "recommend_pct" when it has "doRecommend". When ``df``
        has no groups, an empty frame with those same columns is returned.
    """
    has_rating = "rating" in df.columns
    has_recommend = "doRecommend" in df.columns

    # Fix the output schema up front so that an input with no groups still
    # yields the expected columns instead of failing in sort_values("count").
    columns = ["category", "count", "avg_sentiment", "positive_pct", "avg_confidence"]
    if has_rating:
        columns += ["avg_rating", "high_rating_pct"]
    if has_recommend:
        columns.append("recommend_pct")

    stats = []
    for cat, g in df.groupby(cat_col):
        sentiment = g["predicted_sentiment_roberta"]
        row = {
            "category": cat,
            "count": len(g),
            "avg_sentiment": sentiment.mean(),
            # Share of all rows in the group whose sentiment is above 0.5;
            # rows with missing sentiment count as not positive.
            "positive_pct": (sentiment > 0.5).mean() * 100,
            "avg_confidence": g["zero_shot_score"].mean(),
        }
        if has_rating:
            row["avg_rating"] = g["rating"].mean()
            row["high_rating_pct"] = (g["rating"] >= 4).mean() * 100
        if has_recommend:
            row["recommend_pct"] = g["doRecommend"].mean() * 100
        stats.append(row)

    return (
        pd.DataFrame(stats, columns=columns)
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

In [None]:
# Assign every review's cleaned product name to its best-matching candidate
# label, keeping the winning score alongside it.
preds, scores = classify_with_reranker(
    texts=df["product_name_clean"].tolist(),
    labels=labels,
    model_name="BAAI/bge-reranker-v2-m3",
)

# Store the predictions on the working frame, then summarize per category.
df["zero_shot_label"], df["zero_shot_score"] = preds, scores
cat_results = compute_category_stats(df, cat_col="zero_shot_label")
cat_results

## Visualization of Results

In [None]:
# Draw one bar chart per summary metric: review volume first, then mean
# sentiment. Both charts share the same layout, so they are produced in a loop.
for metric, title in (
    ("count", "Review Distribution per Category"),
    ("avg_sentiment", "Average Sentiment per Category"),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=cat_results, x="category", y=metric)
    # Category names are long; tilt them so they do not overlap.
    plt.xticks(rotation=30, ha="right")
    plt.title(title)
    plt.show()

## Deep Dive into Product Insights

In [None]:
# The three product names with the most reviews.
top_products = df["product_name_clean"].value_counts().head(3)
print("Top 3 Products:")
print(top_products)

# Build the aggregation spec from the columns that actually exist. Naming a
# missing column in the spec raises KeyError regardless of which aggregation
# function is paired with it, so optional columns must be left out entirely.
top_products_agg = {"predicted_sentiment_roberta": "mean"}
for optional_col in ("rating", "doRecommend"):
    if optional_col in df.columns:
        top_products_agg[optional_col] = "mean"
top_products_agg["zero_shot_label"] = "first"

top_products_stats = (
    df[df["product_name_clean"].isin(top_products.index)]
    .groupby("product_name_clean")
    .agg(top_products_agg)
)
top_products_stats

In [None]:
def extract_complaints(subset: pd.DataFrame, top_n: int = 10) -> list:
    """Return the most frequent words found in the negative reviews.

    Args:
        subset: Reviews with a "predicted_sentiment_roberta" column (negative
            reviews are those equal to 0.0) and a "text" column.
        top_n: How many of the most common words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. Words are
        lower-cased and only words of three or more characters are counted.
    """
    is_negative = subset["predicted_sentiment_roberta"] == 0.0
    negative_reviews = subset.loc[is_negative, "text"].dropna().astype(str)
    corpus = " ".join(negative_reviews).lower()
    word_counts = Counter(re.findall(r"\b\w{3,}\b", corpus))
    return word_counts.most_common(top_n)

# Report the most common negative-review words for each of the top products.
for prod in top_products.index:
    is_current_product = df["product_name_clean"] == prod
    complaints = extract_complaints(df[is_current_product])
    print(f"\nTop complaints for {prod}:", complaints, sep="\n")

## Identifying the Worst Performers

In [None]:
# Mean sentiment and review count for every (category, product) pair.
product_sentiment = df.groupby(["zero_shot_label", "product_name_clean"]).agg(
    avg_sentiment=("predicted_sentiment_roberta", "mean"),
    count=("product_name_clean", "size"),
)

# Order products within each category from lowest to highest mean sentiment.
worst_products = product_sentiment.reset_index().sort_values(
    by=["zero_shot_label", "avg_sentiment"]
)

# After the sort, the first entry of each category is its lowest-sentiment product.
worst_by_cat = worst_products.groupby("zero_shot_label").first().reset_index()
print("Worst products per category:")
worst_by_cat

## MLflow Logging


In [None]:
# Log to MLflow
# This section is commented out to prevent running without a configured MLflow server.
# !mlflow ui --port 5000 &
# from pyngrok import ngrok
# from getpass import getpass
# ngrok.kill()
# NGROK_AUTH_TOKEN = getpass("Enter your ngrok authtoken: ")
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http")
# print("MLflow Tracking UI:", public_url)

mlflow.set_experiment("Product Categorization with Reranker")

with mlflow.start_run(run_name="zero_shot_reranker"):
    # Run-level parameters: which model was used and which labels it chose from.
    mlflow.log_param("model", "BAAI/bge-reranker-v2-m3")
    mlflow.log_param("candidate_labels", labels)

    # One set of metrics per category, namespaced by a sanitized category name.
    for _, stats_row in cat_results.iterrows():
        category_key = re.sub(r'[^a-zA-Z0-9_.-]', '_', str(stats_row['category']).replace(' ', '_'))
        prefix = f"cat_{category_key}"

        mlflow.log_metric(f"{prefix}_count", int(stats_row["count"]))

        # Sentiment metrics are logged together, and only when a mean exists.
        avg_sentiment = stats_row["avg_sentiment"]
        if not np.isnan(avg_sentiment):
            mlflow.log_metric(f"{prefix}_avg_sentiment", float(avg_sentiment))
            mlflow.log_metric(f"{prefix}_positive_pct", float(stats_row["positive_pct"]))

        avg_confidence = stats_row["avg_confidence"]
        if not np.isnan(avg_confidence):
            mlflow.log_metric(f"{prefix}_avg_confidence", float(avg_confidence))

        # These columns exist only when the source data had rating/recommendation
        # fields, so each is logged only if present and not missing.
        for metric_name in ("avg_rating", "high_rating_pct", "recommend_pct"):
            if metric_name in stats_row and not pd.isna(stats_row[metric_name]):
                mlflow.log_metric(f"{prefix}_{metric_name}", float(stats_row[metric_name]))

    # Persist the annotated reviews and the per-category summary, then attach
    # both files to the run as artifacts.
    df.to_csv("reranker_products.csv", index=False)
    cat_results.to_csv("reranker_stats.csv", index=False)
    mlflow.log_artifact("reranker_products.csv")
    mlflow.log_artifact("reranker_stats.csv")