In [3]:
import re
import warnings
from collections import Counter

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import pipeline

warnings.filterwarnings("ignore")


In [None]:
def create_text_features(row: pd.Series) -> str:
    """Build a normalized text feature from the row's ``name`` field.

    The name is converted to a string, every character that is neither a word
    character nor whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is stripped and lowercased.

    Args:
        row: A record exposing ``.get``; only the ``"name"`` entry is read.

    Returns:
        The cleaned, lowercase name, or ``""`` when the name is absent,
        missing (NaN/None), or stringifies to ``"nan"``.
    """
    raw_name = row.get("name", "")
    # Treat both real missing values and the literal string "nan" as empty.
    usable = pd.notna(raw_name) and str(raw_name) != "nan"
    candidate = str(raw_name) if usable else ""
    without_punct = re.sub(r"[^\w\s]", " ", candidate)
    collapsed = re.sub(r"\s+", " ", without_punct)
    return collapsed.strip().lower()

In [None]:
def name_clusters(df: pd.DataFrame, cluster_col: str, text_col: str) -> dict:
    """Derive a human-readable name for every cluster from its most common words.

    For each distinct value of ``cluster_col`` (visited in sorted order), the
    texts of that cluster are concatenated, words of three or more word
    characters are counted, and the three most frequent ones are joined and
    title-cased.

    Args:
        df: Frame holding one row per item.
        cluster_col: Column with the cluster identifier of each row.
        text_col: Column with the text used to name the clusters.

    Returns:
        Mapping ``cluster id -> name``. A cluster without any word of three or
        more characters is named ``"Product Group <id + 1>"``.
    """
    names = {}
    for cid in sorted(df[cluster_col].unique()):
        # Drop missing texts and coerce the rest to str: str.join raises
        # TypeError on NaN/float entries, which would abort the whole naming.
        texts = df.loc[df[cluster_col] == cid, text_col].dropna().astype(str)
        word_counts = Counter(re.findall(r"\b\w{3,}\b", " ".join(texts)))
        top_words = [w for w, _ in word_counts.most_common(3)]
        names[cid] = " ".join(top_words).title() if top_words else f"Product Group {cid + 1}"
    return names

In [None]:
def compute_cluster_stats(df: pd.DataFrame, cluster_col: str) -> pd.DataFrame:
    """Aggregate size, sentiment and rating statistics for every cluster.

    Args:
        df: Frame with ``cluster_name`` and ``predicted_sentiment_roberta``
            columns; ``rating`` and ``doRecommend`` are used when present.
        cluster_col: Column to group by.

    Returns:
        One row per cluster, sorted by ``count`` descending, with the columns
        ``cluster_id``, ``cluster_name``, ``count``, ``avg_sentiment``,
        ``positive_pct`` and, when the source columns exist, ``avg_rating``,
        ``high_rating_pct`` and ``recommend_pct``. Percentages are computed
        over rows that actually have a value, so they are NaN (not 0) when a
        cluster has no valid values. An empty input yields an empty frame with
        the base columns.
    """
    stats_rows = []
    for cid, g in df.groupby(cluster_col):
        # Comparisons against NaN are False, so missing values must be removed
        # first or they would be counted as "not positive" in the percentage
        # while being skipped by the mean next to it.
        sentiment = g["predicted_sentiment_roberta"].dropna()
        row = {
            "cluster_id": cid,
            "cluster_name": g["cluster_name"].iloc[0],
            "count": len(g),
            "avg_sentiment": sentiment.mean(),
            "positive_pct": (sentiment > 0.5).mean() * 100,
        }
        if "rating" in g.columns:
            rating = g["rating"].dropna()
            row["avg_rating"] = rating.mean()
            row["high_rating_pct"] = (rating >= 4).mean() * 100
        if "doRecommend" in g.columns:
            row["recommend_pct"] = g["doRecommend"].mean() * 100
        stats_rows.append(row)

    if not stats_rows:
        # Without rows there is no "count" column to sort on.
        return pd.DataFrame(
            columns=["cluster_id", "cluster_name", "count", "avg_sentiment", "positive_pct"]
        )
    return pd.DataFrame(stats_rows).sort_values("count", ascending=False).reset_index(drop=True)

In [None]:
def compute_category_stats(df: pd.DataFrame, cat_col: str) -> pd.DataFrame:
    """Aggregate size, sentiment, confidence and rating statistics per category.

    Args:
        df: Frame with a ``predicted_sentiment_roberta`` column;
            ``zero_shot_score``, ``rating`` and ``doRecommend`` are used when
            present.
        cat_col: Column to group by.

    Returns:
        One row per category, sorted by ``count`` descending, with the columns
        ``category``, ``count``, ``avg_sentiment``, ``positive_pct``,
        ``avg_zero_shot_score`` (NaN when the score column is absent) and, when
        the source columns exist, ``avg_rating``, ``high_rating_pct`` and
        ``recommend_pct``. Percentages are computed over rows that actually
        have a value, so they are NaN (not 0) when a category has no valid
        values. An empty input yields an empty frame with the base columns.
    """
    has_score = "zero_shot_score" in df.columns
    stats_rows = []
    for cat, g in df.groupby(cat_col):
        # Comparisons against NaN are False, so missing values must be removed
        # first or they would be counted as "not positive" in the percentage
        # while being skipped by the mean next to it.
        sentiment = g["predicted_sentiment_roberta"].dropna()
        row = {
            "category": cat,
            "count": len(g),
            "avg_sentiment": sentiment.mean(),
            "positive_pct": (sentiment > 0.5).mean() * 100,
            "avg_zero_shot_score": g["zero_shot_score"].mean() if has_score else np.nan,
        }
        if "rating" in g.columns:
            rating = g["rating"].dropna()
            row["avg_rating"] = rating.mean()
            row["high_rating_pct"] = (rating >= 4).mean() * 100
        if "doRecommend" in g.columns:
            row["recommend_pct"] = g["doRecommend"].mean() * 100
        stats_rows.append(row)

    if not stats_rows:
        # Without rows there is no "count" column to sort on.
        return pd.DataFrame(
            columns=["category", "count", "avg_sentiment", "positive_pct", "avg_zero_shot_score"]
        )
    return pd.DataFrame(stats_rows).sort_values("count", ascending=False).reset_index(drop=True)

In [None]:
def zero_shot_batch(
    texts: list[str],
    labels: list[str],
    model_name: str = "facebook/bart-large-mnli",
    batch_size: int = 16,
) -> tuple[list[str], list[float]]:
    """Assign each text its best-matching candidate label via zero-shot classification.

    Args:
        texts: Texts to classify.
        labels: Candidate labels; exactly one is chosen per text
            (``multi_label=False``).
        model_name: Model identifier handed to the ``zero-shot-classification``
            pipeline.
        batch_size: Number of distinct texts sent to the pipeline per call.

    Returns:
        ``(preds, scores)``: two lists aligned with ``texts`` holding the
        top-ranked label and its score for every text.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``labels`` is empty.
    """
    # Validate before building the pipeline: a non-positive batch size would
    # otherwise either crash inside range() or silently return empty results
    # that no longer line up with ``texts``.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if not labels:
        raise ValueError("labels must contain at least one candidate label")

    texts = list(texts)
    if not texts:
        # Nothing to classify, so skip the expensive model load entirely.
        return [], []

    # Classify every distinct text only once; repeated inputs (common for
    # product names shared by many reviews) reuse the cached result.
    unique_texts = list(dict.fromkeys(texts))

    zsc = pipeline("zero-shot-classification", model=model_name)
    best_by_text = {}
    for start in range(0, len(unique_texts), batch_size):
        batch = unique_texts[start : start + batch_size]
        out = zsc(batch, candidate_labels=labels, multi_label=False)
        # Normalize a bare dict result to a one-element list.
        if isinstance(out, dict):
            out = [out]
        for text, result in zip(batch, out):
            # The first label/score pair is taken as the top prediction.
            best_by_text[text] = (result["labels"][0], float(result["scores"][0]))

    preds = [best_by_text[text][0] for text in texts]
    scores = [best_by_text[text][1] for text in texts]
    return preds, scores

In [5]:
def _log_row_metrics(prefix: str, row: pd.Series, optional_metrics: dict) -> None:
    """Log one statistics row to the active MLflow run as ``<prefix>_<metric>``.

    ``count`` is always logged; the sentiment metrics are logged only when
    ``avg_sentiment`` is not NaN; each ``column -> metric suffix`` entry of
    ``optional_metrics`` is logged only when the column exists and is not NaN.
    """
    mlflow.log_metric(f"{prefix}_count", int(row["count"]))
    if not pd.isna(row["avg_sentiment"]):
        mlflow.log_metric(f"{prefix}_avg_sentiment", float(row["avg_sentiment"]))
        mlflow.log_metric(f"{prefix}_positive_pct", float(row["positive_pct"]))
    for column, suffix in optional_metrics.items():
        if column in row and not pd.isna(row[column]):
            mlflow.log_metric(f"{prefix}_{suffix}", float(row[column]))


def main(file_path: str = "../data/processed/Datafiniti_with_sentiments.csv") -> None:
    """Cluster products by name embedding, zero-shot label them, and log to MLflow.

    Steps:
        1. Read the CSV, build ``product_name_clean`` and drop rows where it is empty.
        2. Map the ``predicted_sentiment_roberta`` labels to 1.0 / 0.5 / 0.0
           (the column is created as NaN when absent).
        3. Embed the cleaned names, fit KMeans for k in 4..6 and keep the k
           with the highest silhouette score.
        4. Name the clusters, compute per-cluster statistics and log
           parameters, metrics, the model and CSV artifacts to an MLflow run.
        5. Zero-shot classify the names against a fixed label list, compute
           per-label statistics and log them to a second MLflow run.

    Args:
        file_path: CSV to read; defaults to the processed Datafiniti file.

    Raises:
        ValueError: If no row has a usable product name after cleaning.

    Side effects:
        Writes ``clustered_products.csv``, ``cluster_stats.csv``,
        ``zero_shot_products.csv`` and ``zero_shot_stats.csv`` to the current
        working directory.
    """
    df = pd.read_csv(file_path)

    df["product_name_clean"] = df.apply(create_text_features, axis=1)
    df = df[df["product_name_clean"].str.len() > 0].reset_index(drop=True)
    if df.empty:
        # Fail with an actionable message instead of an obscure error from the
        # embedding / clustering stack further down.
        raise ValueError(f"No rows with a usable product name found in {file_path!r}")

    sentiment_mapping = {"Positive": 1.0, "Neutral": 0.5, "Negative": 0.0}
    if "predicted_sentiment_roberta" in df.columns:
        df["predicted_sentiment_roberta"] = df["predicted_sentiment_roberta"].map(sentiment_mapping)
    else:
        df["predicted_sentiment_roberta"] = np.nan

    embedding_model_name = "all-MiniLM-L6-v2"
    model = SentenceTransformer(embedding_model_name)
    text_features = model.encode(df["product_name_clean"].tolist(), show_progress_bar=False)

    # Model selection: keep the fitted model with the best silhouette score so
    # the winning k does not have to be fitted a second time (same seed and
    # settings would reproduce the same fit).
    k_range = range(4, 7)
    silhouette_scores = []
    optimal_k, kmeans, clusters = None, None, None
    for k in k_range:
        candidate = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
        candidate_labels = candidate.fit_predict(text_features)
        score = silhouette_score(text_features, candidate_labels)
        # Strict ">" keeps the first k on ties, matching np.argmax semantics.
        if kmeans is None or score > max(silhouette_scores):
            optimal_k, kmeans, clusters = k, candidate, candidate_labels
        silhouette_scores.append(score)

    df["cluster"] = clusters

    cluster_names = name_clusters(df, "cluster", "product_name_clean")
    df["cluster_name"] = df["cluster"].map(cluster_names)

    cluster_results = compute_cluster_stats(df, "cluster")

    # Optional statistics column -> metric suffix, in logging order.
    rating_metrics = {
        "avg_rating": "avg_rating",
        "high_rating_pct": "high_rating_pct",
        "recommend_pct": "recommend_pct",
    }
    zero_shot_metrics = {"avg_zero_shot_score": "avg_confidence", **rating_metrics}

    mlflow.set_experiment("Product Clustering & Sentiment Analysis")

    with mlflow.start_run(run_name="kmeans_clustering_embedding"):
        mlflow.log_param("embedding_model", embedding_model_name)
        mlflow.log_param("cluster_method", "KMeans")
        mlflow.log_param("k_range", list(k_range))
        mlflow.log_param("optimal_k", optimal_k)
        mlflow.log_metric("silhouette_score", float(np.max(silhouette_scores)))

        for _, r in cluster_results.iterrows():
            prefix = f"cluster_{int(r['cluster_id'])}_{str(r['cluster_name']).replace(' ', '_')}"
            _log_row_metrics(prefix, r, rating_metrics)

        df.to_csv("clustered_products.csv", index=False)
        cluster_results.to_csv("cluster_stats.csv", index=False)
        mlflow.sklearn.log_model(kmeans, "kmeans_model")
        mlflow.log_artifact("clustered_products.csv")
        mlflow.log_artifact("cluster_stats.csv")

    zero_shot_model_name = "facebook/bart-large-mnli"
    zero_shot_labels = [
        "Fire Tablet Special",
        "AmazonBasics Performance Alkaline",
        "Anon",
        "Echo White Amazon",
        "Fire Kids Edition",
        "Fire Amazon With",
    ]

    zs_preds, zs_scores = zero_shot_batch(
        df["product_name_clean"].tolist(),
        labels=zero_shot_labels,
        model_name=zero_shot_model_name,
        batch_size=16,
    )
    df["zero_shot_label"] = zs_preds
    df["zero_shot_score"] = zs_scores

    cat_results = compute_category_stats(df, "zero_shot_label")

    # NOTE(review): the clustering run above has already been closed by its
    # ``with`` block, so ``nested=True`` appears to have no parent to attach to
    # here — confirm whether this run was meant to be a child run.
    with mlflow.start_run(run_name="zero_shot_classification", nested=True):
        mlflow.log_param("zero_shot_model", zero_shot_model_name)
        mlflow.log_param("candidate_labels", zero_shot_labels)

        for _, r in cat_results.iterrows():
            prefix = f"zshot_{str(r['category']).replace(' ', '_')}"
            _log_row_metrics(prefix, r, zero_shot_metrics)

        df.to_csv("zero_shot_products.csv", index=False)
        cat_results.to_csv("zero_shot_stats.csv", index=False)
        mlflow.log_artifact("zero_shot_products.csv")
        mlflow.log_artifact("zero_shot_stats.csv")


# Entry point: run the full pipeline when this code is executed as the
# top-level module (its ``__name__`` is ``"__main__"``), not when imported.
if __name__ == "__main__":
    main()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025/08/27 12:04:02 INFO mlflow.tracking.fluent: Experiment with name 'Product Clustering & Sentiment Analysis' does not exist. Creating a new experiment.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


KeyboardInterrupt: 