# MLflow Runs for Flipkart Sentiment Analysis

This notebook logs TF-IDF text-classification experiments (Logistic Regression, Multinomial Naive Bayes, LinearSVC, and Random Forest) to MLflow and saves artifacts (plots, model, vectorizer).
- Experiment: `flipkart-sentiment`
- Metrics: F1 (macro and weighted), macro precision, macro recall, ROC AUC, PR AUC (average precision)
- Artifacts: confusion matrix, ROC, PR curves, model/vectorizer pickles, MLflow sklearn model
- Change params below to create multiple runs; view in MLflow UI: `mlflow ui --backend-store-uri ./mlruns --port 5000`


In [None]:
import os
import tempfile
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    average_precision_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

# Make "flipkart-sentiment" the active MLflow experiment; every mlflow.start_run()
# issued later in this notebook is recorded under it.
mlflow.set_experiment("flipkart-sentiment")


<Experiment: artifact_location=('/Users/vivek/anaconda_projects/Innomatics/data science/DS '
 'Internship/sentimental_analysis_flipkart_reviews/notebooks/mlruns/1'), creation_time=1770629007944, experiment_id='1', last_update_time=1770629007944, lifecycle_stage='active', name='flipkart-sentiment', tags={}>

In [6]:
# Load processed data
# Probe the known relative locations in order; the first one that exists wins.
raw_candidates = [
    Path("../data/processed/clean_reviews.csv"),
    Path("data/processed/clean_reviews.csv"),
]
data_path = None
for candidate in raw_candidates:
    if candidate.exists():
        data_path = candidate
        break
assert data_path is not None, "clean_reviews.csv not found; run preprocessing first"

# `df` is read by every run_tfidf_* function defined further down.
df = pd.read_csv(data_path)
print(df.shape)
print(df.columns)


(7895, 11)
Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review Text', 'Rating', 'sentiment', 'text',
       'clean_text'],
      dtype='object')


In [None]:
def log_metrics_binary(y_true, y_pred, y_prob=None):
    """Log classification metrics for the active MLflow run.

    Always logs ``f1_macro``, ``f1_weighted``, ``precision_macro`` and
    ``recall_macro`` computed from the hard predictions. When ``y_prob`` is
    supplied, additionally logs ``roc_auc`` and ``pr_auc`` (average precision)
    computed from those scores.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Optional positive-class scores; skip score-based metrics when None.
    """
    # (metric name, scorer, averaging mode) for the label-based metrics.
    label_based = (
        ("f1_macro", f1_score, "macro"),
        ("f1_weighted", f1_score, "weighted"),
        ("precision_macro", precision_score, "macro"),
        ("recall_macro", recall_score, "macro"),
    )
    for metric_name, scorer, averaging in label_based:
        mlflow.log_metric(metric_name, scorer(y_true, y_pred, average=averaging))

    if y_prob is None:
        return

    score_based = (
        ("roc_auc", roc_auc_score),
        ("pr_auc", average_precision_score),
    )
    for metric_name, scorer in score_based:
        mlflow.log_metric(metric_name, scorer(y_true, y_prob))


def log_fig(name, plot_fn, *args, **kwargs):
    """Render a plot onto a fresh figure and log it to the active MLflow run.

    Args:
        name: Artifact file name passed to ``mlflow.log_figure`` (e.g. "roc_curve.png").
        plot_fn: Callable that draws onto the axes it receives via the ``ax`` keyword.
        *args: Positional arguments forwarded to ``plot_fn``.
        **kwargs: Extra keyword arguments forwarded to ``plot_fn``.

    The figure is always closed, even if plotting or logging raises, so a
    failing run in a sweep does not leave open figures behind.
    """
    fig, ax = plt.subplots()
    try:
        plot_fn(*args, ax=ax, **kwargs)
        # Lay out this specific figure rather than whatever pyplot considers "current".
        fig.tight_layout()
        mlflow.log_figure(fig, name)
    finally:
        plt.close(fig)


def run_tfidf_logreg(C=1.0, max_features=5000, ngram_range=(1, 2), class_weight="balanced"):
    """Train TF-IDF + LogisticRegression and record the run in MLflow.

    Reads the module-level ``df`` (columns ``clean_text`` and ``sentiment``),
    holds out a stratified 20% test split, and logs parameters, metrics,
    confusion-matrix / ROC / PR figures, pickled vectorizer and model, and an
    MLflow sklearn model. Prints the run id when finished.

    Args:
        C: Inverse regularisation strength passed to ``LogisticRegression``.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        ngram_range: N-gram range passed to ``TfidfVectorizer``.
        class_weight: Class weighting passed to ``LogisticRegression``.

    NOTE(review): column 1 of ``predict_proba`` is used as the positive-class
    score, which assumes ``sentiment`` is a binary label — confirm.
    """
    run_name = f"logreg_C{C}_mf{max_features}_ng{ngram_range}"
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_params({
            "model": "logreg",
            "C": C,
            "max_features": max_features,
            "ngram_range": ngram_range,
            "class_weight": class_weight,
            "test_size": 0.2,
            "random_state": 42,
        })

        y = df["sentiment"].astype(int)

        # Split the raw text BEFORE vectorising: fitting TF-IDF on the full corpus
        # would let test documents influence the vocabulary and IDF weights
        # (data leakage) and inflate the reported metrics.
        text_train, text_test, y_train, y_test = train_test_split(
            df["clean_text"], y, test_size=0.2, stratify=y, random_state=42
        )

        tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        X_train = tfidf.fit_transform(text_train)
        X_test = tfidf.transform(text_test)

        model = LogisticRegression(max_iter=1000, C=C, class_weight=class_weight)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        log_metrics_binary(y_test, y_pred, y_prob)

        log_fig("confusion_matrix.png", ConfusionMatrixDisplay.from_predictions, y_test, y_pred)
        log_fig("roc_curve.png", RocCurveDisplay.from_predictions, y_test, y_prob)
        log_fig("pr_curve.png", PrecisionRecallDisplay.from_predictions, y_test, y_prob)

        with tempfile.TemporaryDirectory() as d:
            vpath = Path(d) / "vectorizer.pkl"
            mpath = Path(d) / "model.pkl"
            # Context managers guarantee the pickles are flushed and closed
            # before MLflow copies them out of the temporary directory.
            with open(vpath, "wb") as fh:
                pickle.dump(tfidf, fh)
            with open(mpath, "wb") as fh:
                pickle.dump(model, fh)
            mlflow.log_artifact(vpath, artifact_path="artifacts")
            mlflow.log_artifact(mpath, artifact_path="artifacts")

        mlflow.sklearn.log_model(model, artifact_path="sk_model")
        mlflow.set_tag("embedding", "tfidf")
        mlflow.set_tag("stage", "baseline")

        print(f"Run {run.info.run_id} logged as {run_name}")


def run_tfidf_nb(alpha=1.0, max_features=5000, ngram_range=(1, 2)):
    """Train TF-IDF + MultinomialNB and record the run in MLflow.

    Reads the module-level ``df`` (columns ``clean_text`` and ``sentiment``),
    holds out a stratified 20% test split, and logs parameters, metrics,
    confusion-matrix / ROC / PR figures, pickled vectorizer and model, and an
    MLflow sklearn model.

    Args:
        alpha: Smoothing parameter passed to ``MultinomialNB``.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        ngram_range: N-gram range passed to ``TfidfVectorizer``.

    NOTE(review): column 1 of ``predict_proba`` is used as the positive-class
    score, which assumes ``sentiment`` is a binary label — confirm.
    """
    run_name = f"nb_alpha{alpha}_mf{max_features}_ng{ngram_range}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            "model": "MultinomialNB",
            "alpha": alpha,
            "max_features": max_features,
            "ngram_range": ngram_range,
            "test_size": 0.2,
            "random_state": 42,
        })

        y = df["sentiment"].astype(int)
        # Split the raw text first so TF-IDF vocabulary/IDF are learned from the
        # training documents only; fitting on the full corpus leaks test data.
        text_tr, text_te, ytr, yte = train_test_split(
            df["clean_text"], y, test_size=0.2, stratify=y, random_state=42
        )
        tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        Xtr = tfidf.fit_transform(text_tr)
        Xte = tfidf.transform(text_te)

        model = MultinomialNB(alpha=alpha)
        model.fit(Xtr, ytr)
        y_pred = model.predict(Xte)
        y_prob = model.predict_proba(Xte)[:, 1]

        log_metrics_binary(yte, y_pred, y_prob)
        log_fig("confusion_matrix.png", ConfusionMatrixDisplay.from_predictions, yte, y_pred)
        log_fig("roc_curve.png", RocCurveDisplay.from_predictions, yte, y_prob)
        log_fig("pr_curve.png", PrecisionRecallDisplay.from_predictions, yte, y_prob)

        with tempfile.TemporaryDirectory() as d:
            vpath = Path(d) / "vectorizer.pkl"
            mpath = Path(d) / "model.pkl"
            # Close the pickle files explicitly before MLflow copies them.
            with open(vpath, "wb") as fh:
                pickle.dump(tfidf, fh)
            with open(mpath, "wb") as fh:
                pickle.dump(model, fh)
            mlflow.log_artifact(vpath, artifact_path="artifacts")
            mlflow.log_artifact(mpath, artifact_path="artifacts")

        mlflow.sklearn.log_model(model, artifact_path="sk_model")
        mlflow.set_tag("embedding", "tfidf")
        mlflow.set_tag("stage", "nb")


def run_tfidf_linearsvc(C=1.0, max_features=5000, ngram_range=(1, 2), calibrated=True):
    """Train TF-IDF + LinearSVC (optionally calibrated) and record the run in MLflow.

    Reads the module-level ``df`` (columns ``clean_text`` and ``sentiment``),
    holds out a stratified 20% test split, and logs parameters, metrics,
    figures, pickled vectorizer and model, and an MLflow sklearn model.

    Args:
        C: Regularisation parameter passed to ``LinearSVC``.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        ngram_range: N-gram range passed to ``TfidfVectorizer``.
        calibrated: When True, wrap the SVC in ``CalibratedClassifierCV``
            (sigmoid, 5-fold) so probabilities are available and ROC/PR
            metrics and curves are logged. When False, only label-based
            metrics and the confusion matrix are logged.

    NOTE(review): column 1 of ``predict_proba`` is used as the positive-class
    score, which assumes ``sentiment`` is a binary label — confirm.
    """
    run_name = f"linearsvc_C{C}_mf{max_features}_ng{ngram_range}_{'cal' if calibrated else 'raw'}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            "model": "LinearSVC",
            "C": C,
            "max_features": max_features,
            "ngram_range": ngram_range,
            "calibrated": calibrated,
            "test_size": 0.2,
            "random_state": 42,
        })

        y = df["sentiment"].astype(int)
        # Split the raw text first so TF-IDF vocabulary/IDF are learned from the
        # training documents only; fitting on the full corpus leaks test data.
        text_tr, text_te, ytr, yte = train_test_split(
            df["clean_text"], y, test_size=0.2, stratify=y, random_state=42
        )
        tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        Xtr = tfidf.fit_transform(text_tr)
        Xte = tfidf.transform(text_te)

        base = LinearSVC(C=C)
        if calibrated:
            # The estimator is passed positionally: the keyword was renamed from
            # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
            # was removed in 1.4, so either keyword fails on some versions.
            model = CalibratedClassifierCV(base, method="sigmoid", cv=5)
        else:
            model = base
        model.fit(Xtr, ytr)

        y_pred = model.predict(Xte)
        # A raw LinearSVC has no predict_proba, so score-based outputs are skipped.
        y_prob = model.predict_proba(Xte)[:, 1] if calibrated else None

        log_metrics_binary(yte, y_pred, y_prob)
        log_fig("confusion_matrix.png", ConfusionMatrixDisplay.from_predictions, yte, y_pred)
        if y_prob is not None:
            log_fig("roc_curve.png", RocCurveDisplay.from_predictions, yte, y_prob)
            log_fig("pr_curve.png", PrecisionRecallDisplay.from_predictions, yte, y_prob)

        with tempfile.TemporaryDirectory() as d:
            vpath = Path(d) / "vectorizer.pkl"
            mpath = Path(d) / "model.pkl"
            # Close the pickle files explicitly before MLflow copies them.
            with open(vpath, "wb") as fh:
                pickle.dump(tfidf, fh)
            with open(mpath, "wb") as fh:
                pickle.dump(model, fh)
            mlflow.log_artifact(vpath, artifact_path="artifacts")
            mlflow.log_artifact(mpath, artifact_path="artifacts")

        mlflow.sklearn.log_model(model, artifact_path="sk_model")
        mlflow.set_tag("embedding", "tfidf")
        mlflow.set_tag("stage", "linearsvc")


def run_tfidf_rf(n_estimators=200, max_depth=None, max_features=5000, ngram_range=(1, 2)):
    """Train TF-IDF + RandomForestClassifier and record the run in MLflow.

    Reads the module-level ``df`` (columns ``clean_text`` and ``sentiment``),
    holds out a stratified 20% test split, and logs parameters, metrics,
    confusion-matrix / ROC / PR figures, pickled vectorizer and model, and an
    MLflow sklearn model.

    Args:
        n_estimators: Number of trees passed to ``RandomForestClassifier``.
        max_depth: Maximum tree depth passed to ``RandomForestClassifier``.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer`` (logged
            as ``max_features_tfidf``; it is NOT the forest's ``max_features``).
        ngram_range: N-gram range passed to ``TfidfVectorizer``.

    NOTE(review): column 1 of ``predict_proba`` is used as the positive-class
    score, which assumes ``sentiment`` is a binary label — confirm.
    """
    run_name = f"rf_{n_estimators}_{max_depth}_mf{max_features}_ng{ngram_range}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            "model": "RandomForest",
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "max_features_tfidf": max_features,
            "ngram_range": ngram_range,
            "test_size": 0.2,
            "random_state": 42,
        })

        y = df["sentiment"].astype(int)
        # Split the raw text first so TF-IDF vocabulary/IDF are learned from the
        # training documents only; fitting on the full corpus leaks test data.
        text_tr, text_te, ytr, yte = train_test_split(
            df["clean_text"], y, test_size=0.2, stratify=y, random_state=42
        )
        tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        Xtr = tfidf.fit_transform(text_tr)
        Xte = tfidf.transform(text_te)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            n_jobs=-1,
            random_state=42,
        )
        model.fit(Xtr, ytr)
        y_pred = model.predict(Xte)
        y_prob = model.predict_proba(Xte)[:, 1]

        log_metrics_binary(yte, y_pred, y_prob)
        log_fig("confusion_matrix.png", ConfusionMatrixDisplay.from_predictions, yte, y_pred)
        log_fig("roc_curve.png", RocCurveDisplay.from_predictions, yte, y_prob)
        log_fig("pr_curve.png", PrecisionRecallDisplay.from_predictions, yte, y_prob)

        with tempfile.TemporaryDirectory() as d:
            vpath = Path(d) / "vectorizer.pkl"
            mpath = Path(d) / "model.pkl"
            # Close the pickle files explicitly before MLflow copies them.
            with open(vpath, "wb") as fh:
                pickle.dump(tfidf, fh)
            with open(mpath, "wb") as fh:
                pickle.dump(model, fh)
            mlflow.log_artifact(vpath, artifact_path="artifacts")
            mlflow.log_artifact(mpath, artifact_path="artifacts")

        mlflow.sklearn.log_model(model, artifact_path="sk_model")
        mlflow.set_tag("embedding", "tfidf")
        mlflow.set_tag("stage", "rf")


In [None]:
# Every sweep below uses the same TF-IDF configuration; only the model
# hyperparameter varies between runs.
tfidf_settings = {"max_features": 5000, "ngram_range": (1, 2)}

# Logistic Regression sweep
for reg_strength in (0.5, 1.0, 2.0):
    run_tfidf_logreg(C=reg_strength, class_weight="balanced", **tfidf_settings)

# MultinomialNB sweep
for smoothing in (0.5, 1.0, 2.0):
    run_tfidf_nb(alpha=smoothing, **tfidf_settings)

# LinearSVC sweep (calibrated for probabilities)
for reg_strength in (0.5, 1.0, 2.0):
    run_tfidf_linearsvc(C=reg_strength, calibrated=True, **tfidf_settings)

# RandomForest sweep
for tree_count in (200, 500):
    run_tfidf_rf(n_estimators=tree_count, max_depth=None, **tfidf_settings)


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Run 104c7aaf326a4b57ab4f1f8d00d39c7e logged as logreg_C0.5_mf5000_ng(1, 2)


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Run 60fef656fb3a46a080ca7e3d36a04ca6 logged as logreg_C1.0_mf5000_ng(1, 2)


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Run 4447323394534e48b07f60b5558fdc97 logged as logreg_C2.0_mf5000_ng(1, 2)


Run MLflow UI locally to explore runs:

```bash
mlflow ui --backend-store-uri ./mlruns --port 5000
```
Open http://127.0.0.1:5000 and view metrics, plots, and artifacts. Use the UI to register the best run as a model if desired.
