In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
# Cell 1: Imports
import warnings
import mlflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sentence_transformers import CrossEncoder

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and dtype warnings
# raised by the libraries imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")


In [None]:
# Cell 2: Load & preprocess data
# Location of the input CSV.
# NOTE(review): hard-coded absolute "/content/..." path (looks like a Colab
# upload directory) — adjust when running in another environment.
file_path = "/content/Datafiniti_with_sentiments.csv"
# Read the whole file into the working DataFrame that the following cells use.
df = pd.read_csv(file_path)

def create_text_features(row: pd.Series) -> str:
    """Return the row's ``name`` field stripped and lower-cased.

    An empty string is returned when the field is absent, is a missing value,
    or stringifies to the literal ``"nan"``.
    """
    raw_name = row.get("name", "")
    if not pd.notna(raw_name):
        return ""
    text = str(raw_name)
    return "" if text == "nan" else text.strip().lower()

# Normalised product name per row; rows that end up with an empty name are dropped.
df["product_name_clean"] = df.apply(create_text_features, axis=1)
df = df.loc[df["product_name_clean"].str.len().gt(0)].reset_index(drop=True)

# Numeric scale for the sentiment labels. Values that are not keys of this
# mapping become NaN after ``map``.
sentiment_mapping = {"Positive": 1.0, "Neutral": 0.5, "Negative": 0.0}
if "predicted_sentiment_roberta" not in df.columns:
    # No sentiment column in the input: create it filled with missing values.
    df["predicted_sentiment_roberta"] = np.nan
else:
    df["predicted_sentiment_roberta"] = df["predicted_sentiment_roberta"].map(sentiment_mapping)

print(f"Dataset size after cleaning: {len(df)} reviews")


In [None]:
# Cell 3: Define categories and helper functions
# Candidate category names. classify_with_reranker scores each product name
# against every entry of this list and keeps the best-scoring one.
labels = [
    "Fire Tablet Special",
    "AmazonBasics Performance Alkaline(Batteries)",
    "Anon/Uncategorized",
    "Echo White Amazon",
    "Fire Kids Edition",
    "Fire Amazon",
]

def classify_with_reranker(texts: list[str], labels: list[str], model_name: str) -> tuple[list[str], list[float]]:
    """Assign each text the candidate label that a cross-encoder scores highest.

    Every (text, label) pair is scored in a single ``predict`` call. For each
    text the label with the largest score wins; on ties the earliest label in
    ``labels`` is kept.

    Args:
        texts: Strings to classify.
        labels: Candidate label strings.
        model_name: Identifier passed to ``CrossEncoder``.

    Returns:
        ``(pred_labels, pred_scores)``: two lists aligned with ``texts`` that
        hold the winning label and its score converted to a Python float.

    Raises:
        ValueError: If ``texts`` is non-empty but ``labels`` is empty.
    """
    if len(texts) == 0:
        # Nothing to score, so do not load the model at all.
        return [], []
    if len(labels) == 0:
        raise ValueError("labels must contain at least one candidate label")

    model = CrossEncoder(model_name)

    # Text-major order: the scores of texts[i] occupy positions
    # [i * n_labels, (i + 1) * n_labels) of the prediction output, so each
    # text's scores can be sliced out directly instead of re-scanning an
    # index map for every text.
    pairs = [(text, label) for text in texts for label in labels]
    scores = model.predict(pairs)

    n_labels = len(labels)
    pred_labels, pred_scores = [], []
    for start in range(0, len(pairs), n_labels):
        text_scores = scores[start:start + n_labels]
        # max() returns the first maximal index, i.e. the earliest label on ties.
        best = max(range(n_labels), key=lambda j: text_scores[j])
        pred_labels.append(labels[best])
        pred_scores.append(float(text_scores[best]))
    return pred_labels, pred_scores

def compute_category_stats(df: pd.DataFrame, cat_col: str) -> pd.DataFrame:
    """Summarise review volume, sentiment and classifier confidence per category.

    Args:
        df: Frame containing ``cat_col``, ``predicted_sentiment_roberta`` and
            ``zero_shot_score``. ``rating`` and ``doRecommend`` are optional.
        cat_col: Name of the column to group by.

    Returns:
        One row per category, sorted by ``count`` descending, with columns
        ``category``, ``count``, ``avg_sentiment``, ``positive_pct`` and
        ``avg_confidence``. ``avg_rating`` and ``high_rating_pct`` are added
        when ``rating`` is present, and ``recommend_pct`` when ``doRecommend``
        is present. A frame without any group yields an empty frame that still
        has these columns.

    Percentages are taken over the non-missing values only, matching how the
    corresponding means skip missing values. They are NaN when a group has no
    known values.
    """
    # The optional columns are the same for every group, so check them once.
    has_rating = "rating" in df.columns
    has_recommend = "doRecommend" in df.columns

    columns = ["category", "count", "avg_sentiment", "positive_pct", "avg_confidence"]
    if has_rating:
        columns += ["avg_rating", "high_rating_pct"]
    if has_recommend:
        columns.append("recommend_pct")

    stats = []
    for cat, g in df.groupby(cat_col):
        sentiment = g["predicted_sentiment_roberta"]
        row = {
            "category": cat,
            "count": len(g),
            "avg_sentiment": sentiment.mean(),
            # Drop missing sentiments first so they are not counted as "not positive".
            "positive_pct": (sentiment.dropna() > 0.5).mean() * 100,
            "avg_confidence": g["zero_shot_score"].mean(),
        }
        if has_rating:
            rating = g["rating"]
            row["avg_rating"] = rating.mean()
            row["high_rating_pct"] = (rating.dropna() >= 4).mean() * 100
        if has_recommend:
            # NOTE(review): assumes doRecommend holds boolean / 0-1 values — confirm.
            row["recommend_pct"] = g["doRecommend"].mean() * 100
        stats.append(row)

    # Passing the explicit column list keeps "count" available for sorting even
    # when no rows were produced.
    return (
        pd.DataFrame(stats, columns=columns)
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )


In [None]:
# Cell 4: Run zero-shot classification
# Score every cleaned product name against the candidate labels and keep the
# best-scoring label together with its score.
preds, scores = classify_with_reranker(
    texts=df["product_name_clean"].tolist(),
    labels=labels,
    model_name="BAAI/bge-reranker-v2-m3",
)

# Store the prediction and its score next to each review.
df["zero_shot_label"], df["zero_shot_score"] = preds, scores

# Per-category summary; left as the last expression so the notebook renders it.
cat_results = compute_category_stats(df=df, cat_col="zero_shot_label")
cat_results


In [None]:
# Shell command: start the MLflow UI on port 5000; the trailing `&` runs it in the background.
!mlflow ui --port 5000 &

In [None]:
from pyngrok import ngrok
from getpass import getpass

ngrok.kill()  # Terminate any existing tunnels
# Ask for the token at run time instead of embedding it in the notebook.
NGROK_AUTH_TOKEN = getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTP tunnel to local port 5000 (the port given to `mlflow ui`)
# and print the object returned by ngrok.connect.
public_url = ngrok.connect(addr="5000", proto="http")
print("MLflow Tracking UI:", public_url)

In [None]:
import re

# Cell 5: Log to MLflow
mlflow.set_experiment("Product Categorization with Reranker")

with mlflow.start_run(run_name="zero_shot_reranker"):
    # Run-level parameters: the reranker used and the label set it chose from.
    mlflow.log_param("model", "BAAI/bge-reranker-v2-m3")
    mlflow.log_param("candidate_labels", labels)

    # Columns that only exist when the source data had rating / recommendation fields.
    optional_metrics = ("avg_rating", "high_rating_pct", "recommend_pct")

    for _, r in cat_results.iterrows():
        # Build the metric-key prefix from the category name: spaces and every
        # character outside [a-zA-Z0-9_.-] are replaced by underscores.
        sanitized_category = re.sub(r'[^a-zA-Z0-9_.-]', '_', str(r['category']).replace(' ', '_'))
        prefix = f"cat_{sanitized_category}"

        mlflow.log_metric(f"{prefix}_count", int(r["count"]))

        # Both sentiment metrics are skipped when the average sentiment is NaN.
        if not np.isnan(r["avg_sentiment"]):
            for metric in ("avg_sentiment", "positive_pct"):
                mlflow.log_metric(f"{prefix}_{metric}", float(r[metric]))

        if not np.isnan(r["avg_confidence"]):
            mlflow.log_metric(f"{prefix}_avg_confidence", float(r["avg_confidence"]))

        # Optional metrics are logged only when the column exists and has a value.
        for metric in optional_metrics:
            if metric in r and not pd.isna(r[metric]):
                mlflow.log_metric(f"{prefix}_{metric}", float(r[metric]))

    # Write both tables to the working directory first, then attach them to the run.
    exports = {"reranker_products.csv": df, "reranker_stats.csv": cat_results}
    for filename, frame in exports.items():
        frame.to_csv(filename, index=False)
    for filename in exports:
        mlflow.log_artifact(filename)

In [None]:
# Cell 6: Visualization
# One bar chart per summary column, each drawn in its own figure.
for metric, chart_title in (
    ("count", "Review Distribution per Category"),
    ("avg_sentiment", "Average Sentiment per Category"),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=cat_results, x="category", y=metric)
    plt.xticks(rotation=30, ha="right")  # slanted tick labels
    plt.title(chart_title)
    plt.show()


In [None]:
# Cell 7: Insights

# Most-reviewed products. The original heading said "Top 3" while five were
# selected; the heading now reports the number actually selected. Change
# TOP_N_PRODUCTS if a different cut-off is wanted.
TOP_N_PRODUCTS = 5
top_products = df["product_name_clean"].value_counts().head(TOP_N_PRODUCTS)
print(f"Top {len(top_products)} Products:")
print(top_products)

# Compare differences (ratings, sentiment, recommend %).
# Only aggregate optional columns that exist: naming a missing column in the
# aggregation spec raises KeyError whatever function is paired with it.
agg_spec = {"predicted_sentiment_roberta": "mean"}
agg_spec.update(
    {col: "mean" for col in ("rating", "doRecommend") if col in df.columns}
)
agg_spec["zero_shot_label"] = "first"

top_products_stats = (
    df[df["product_name_clean"].isin(top_products.index)]
    .groupby("product_name_clean")
    .agg(agg_spec)
)
top_products_stats


In [None]:
# Cell 8: Top complaints per top products (most common negative words)
from collections import Counter

def extract_complaints(subset: pd.DataFrame, top_n=10):
    """Return the ``top_n`` most frequent words in the negative reviews of ``subset``.

    Negative reviews are the rows whose ``predicted_sentiment_roberta`` equals
    0.0. Their non-missing ``text`` values are lower-cased and split into words
    of at least three word characters. The result is a list of
    ``(word, count)`` tuples, most frequent first.
    """
    is_negative = subset["predicted_sentiment_roberta"] == 0.0
    negative_reviews = subset.loc[is_negative, "text"].dropna().astype(str)
    corpus = " ".join(negative_reviews).lower()
    word_counts = Counter(re.findall(r"\b\w{3,}\b", corpus))
    return word_counts.most_common(top_n)

# For each of the most-reviewed products, list its most frequent negative-review words.
for prod in top_products.index:
    complaints = extract_complaints(df.loc[df["product_name_clean"].eq(prod)])
    print(f"\nTop complaints for {prod}:", complaints, sep="\n")


In [None]:
# Cell 9: Worst product in each category
# Mean sentiment and review count for every (category, product) pair.
worst_products = df.groupby(["zero_shot_label", "product_name_clean"]).agg(
    avg_sentiment=("predicted_sentiment_roberta", "mean"),
    count=("product_name_clean", "size"),
)
worst_products = worst_products.reset_index()
# Within each category, order products from lowest to highest mean sentiment.
worst_products = worst_products.sort_values(["zero_shot_label", "avg_sentiment"])

# Take the leading entry of each category after that ordering.
worst_by_cat = worst_products.groupby("zero_shot_label").first().reset_index()
print("Worst products per category:")
worst_by_cat
