# Recommender

In [None]:
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from surprise import (
    NMF,
    SVD,
    BaselineOnly,
    CoClustering,
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    NormalPredictor,
    Reader,
    SlopeOne,
    SVDpp,
    accuracy,
)
from surprise.model_selection import GridSearchCV, KFold, cross_validate
from transformers import pipeline

In [None]:
load_dotenv()

In [None]:
# --- Credentials and endpoints, read from the environment (None when unset) ---
MONGO_URI = os.getenv("MONGO_URI")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_EXPERIMENT_NAME = os.getenv("MLFLOW_EXPERIMENT_NAME")

# --- Presumably vector-index / embedding settings; not referenced by the cells visible here ---
INDEX_NAME = "recommender-system"
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
# NOTE(review): the flag is the string "True", not the boolean -- confirm the consumer expects that.
gpt4all_kwargs = {"allow_download": "True"}

# Model id handed to the sentiment-analysis pipeline (used as both model and tokenizer).
model_path = "nlptown/bert-base-multilingual-uncased-sentiment"

# Seed shared by the train/test split and the Surprise algorithms that are given one.
RANDOM_STATE = 101

## Reviews Data

In [None]:
connection = MongoClient(MONGO_URI)
db = connection["shein-mirror"]

In [None]:
input_data = db["product_reviews"]
data = pd.DataFrame(list(input_data.find()))

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data["rating"].plot(kind="hist")

In [None]:
data["rating"].value_counts()

In [None]:
# Sentiment classifier; the same checkpoint supplies both the model and its tokenizer.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_path,
    tokenizer=model_path,
    device="mps",  # NOTE(review): hard-coded backend -- adjust when running on other hardware
    batch_size=8,
    truncation=True,  # presumably clips reviews that exceed the model's input limit
)

In [None]:
res = sentiment_pipeline(data["review"].to_list())

In [None]:
res[0:5]

In [None]:
# Every prediction is a dict with "label" and "score"; the first character of
# the label is parsed as the integer sentiment class.
data["sentiment"] = [int(pred["label"][:1]) for pred in res]
data["sentiment_score"] = [pred["score"] for pred in res]
# NOTE(review): "score" looks like the classifier's confidence in its label
# rather than a rating -- confirm that scaling it by 5 is intended.
data["rating_from_score"] = (data["sentiment_score"] * 5).round()

In [None]:
data.head()

In [None]:
data["sentiment"].plot(kind="hist")

In [None]:
data["rating_from_score"].plot(kind="hist")

In [None]:
data.head()

In [None]:
data.to_parquet("data/processed/reviews.parquet", engine="pyarrow")

## Collaborative Filtering

In [None]:
data = pd.read_parquet("data/processed/reviews.parquet", engine="pyarrow")
data.head()

In [None]:
data["rating"].plot(kind="hist")

In [None]:
# Overwrite the stored rating with the model-derived sentiment class.
# The sentiment cell writes "sentiment" as an int, so slicing each value
# directly (x[0:1]) raises TypeError. Going through str keeps the
# leading-digit parse working for int values and for raw label strings
# whose first character is the digit.
data["rating"] = data["sentiment"].astype(str).str[:1].astype(int)
data.head()

In [None]:
data["rating"].plot(kind="hist")

In [None]:
db["product_reviews-mirror"].drop()
db["product_reviews-mirror"].insert_many(data.to_dict(orient="records"))

In [None]:
# Reduce the frame to the (user, item, rating) triple that is handed to
# Dataset.load_from_df further down; the model-derived "sentiment" column
# becomes the rating.
data = data[["nickname", "product_id", "sentiment"]].rename(
    columns={"nickname": "userID", "product_id": "itemID", "sentiment": "rating"}
)

In [None]:
data.info()

In [None]:
data["rating"].plot(kind="hist")

In [None]:
users = list(data["userID"].unique())

In [None]:
len(users)

In [None]:
products = list(data["itemID"].unique())

In [None]:
len(products)

In [None]:
train, test = train_test_split(data, test_size=0.3, random_state=RANDOM_STATE, stratify=data[["rating"]])

In [None]:
reader = Reader(rating_scale=(1, 5))
data_sp = Dataset.load_from_df(data, reader=reader)
train_sp = Dataset.load_from_df(train, reader=reader)
test_sp = Dataset.load_from_df(test, reader=reader)

In [None]:
trainset = train_sp.build_full_trainset()
testset = test_sp.build_full_trainset().build_testset()

In [None]:
# One default-configured instance of every Surprise algorithm to benchmark.
# RANDOM_STATE is supplied only to the constructors that receive it here.
algo = [
    SVD(random_state=RANDOM_STATE),
    BaselineOnly(),
    CoClustering(),
    KNNBaseline(),
    KNNWithZScore(),
    KNNWithMeans(),
    SlopeOne(),
    KNNBasic(),
    NormalPredictor(),
    NMF(random_state=RANDOM_STATE),
    SVDpp(random_state=RANDOM_STATE),
]
# Class names serve as labels for the benchmark records and runs.
names = [type(model).__name__ for model in algo]

In [None]:
# Per-algorithm record keyed by class name; the benchmark loop adds the
# cross-validation metrics to each record.
algos = {label: {"algo": model} for label, model in zip(names, algo)}

In [None]:
result = np.zeros((len(names), 4))

In [None]:
# Benchmark every algorithm with 5-fold cross-validation on the full dataset,
# logging each one as a nested MLflow run under a single parent run.
with mlflow.start_run(run_name="Default parameters") as run:
    experiment_id = run.info.experiment_id
    # enumerate() gives the row of `result` directly instead of searching the
    # `algo` list; the position follows the order of `algos`, which is the
    # same order later used to label the rows of `result`.
    for row, (k, v) in enumerate(algos.items()):
        with mlflow.start_run(experiment_id=experiment_id, run_name=k, nested=True):
            tab = cross_validate(v["algo"], data_sp, cv=5, verbose=True, n_jobs=-1)

            # Keep the per-fold arrays; the box plots below read them.
            v["test_RMSE"] = tab["test_rmse"]
            v["test_mae"] = tab["test_mae"]
            v["fit_time"] = tab["fit_time"]
            v["test_time"] = tab["test_time"]

            # Fold averages are what gets logged and tabulated.
            rmse = np.mean(tab["test_rmse"])
            mae = np.mean(tab["test_mae"])
            ft = np.mean(tab["fit_time"])
            tt = np.mean(tab["test_time"])

            mlflow.log_metrics({"rmse_test": rmse, "mae_test": mae, "fit_time": ft, "test_time": tt})
            # NOTE(review): with n_jobs=-1 the folds may be fitted on copies in
            # worker processes, which would leave this instance unfitted --
            # confirm the logged artifact is usable for inference.
            mlflow.sklearn.log_model(v["algo"], k)

            result[row] = [rmse, mae, ft, tt]

In [None]:
# Per-fold RMSE of each benchmarked algorithm, one coloured box per algorithm.
plt.figure(figsize=(12, 8))
for slot, scores in enumerate(algos.values()):
    shade = f"C{slot}"  # matplotlib colour-cycle alias for this position
    plt.boxplot(
        scores["test_RMSE"],
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops={"facecolor": shade, "color": shade, "linewidth": 2},
    )
plt.ylabel("RMSE")
plt.xticks(range(len(algos)), list(algos), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Per-fold MAE of each benchmarked algorithm, one coloured box per algorithm.
plt.figure(figsize=(12, 8))
for slot, scores in enumerate(algos.values()):
    shade = f"C{slot}"  # matplotlib colour-cycle alias for this position
    plt.boxplot(
        scores["test_mae"],
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops={"facecolor": shade, "color": shade, "linewidth": 2},
    )
plt.ylabel("MAE")
plt.xticks(range(len(algos)), list(algos), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Per-fold fit time of each benchmarked algorithm, one coloured box per algorithm.
plt.figure(figsize=(12, 8))
for slot, scores in enumerate(algos.values()):
    shade = f"C{slot}"  # matplotlib colour-cycle alias for this position
    plt.boxplot(
        scores["fit_time"],
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops={"facecolor": shade, "color": shade, "linewidth": 2},
    )
plt.ylabel("Time (s)")
plt.xticks(range(len(algos)), list(algos), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Per-fold test (prediction) time of each benchmarked algorithm, one coloured
# box per algorithm.
plt.figure(figsize=(12, 8))
# enumerate() supplies the box position directly instead of rebuilding the
# key list and searching it several times per algorithm.
for slot, scores in enumerate(algos.values()):
    shade = f"C{slot}"  # matplotlib colour-cycle alias for this position
    plt.boxplot(
        scores["test_time"],
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops={"facecolor": shade, "color": shade, "linewidth": 2},
    )
plt.ylabel("Time (s)")
plt.xticks(range(len(algos)), list(algos), rotation=45)
plt.tight_layout()
# Render explicitly, as the other three box-plot cells do.
plt.show()

In [None]:
# Turn the metric matrix into a labelled table (values rounded to 3 decimals,
# one row per algorithm), ordered by ascending RMSE.
result = pd.DataFrame(
    result.round(3),
    index=list(algos),
    columns=["RMSE", "MAE", "fit_time", "test_time"],
).sort_values(by="RMSE")

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(result, annot=True, linecolor="w", linewidth=2, cmap=sns.color_palette("Blues"))
plt.title("Data summary")
plt.show()

In [None]:
param_grid_svd = {
    "n_epochs": [5, 10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6],
    "n_factors": [15, 30, 100],
    "random_state": [RANDOM_STATE],
}

In [None]:
# Grid-search space for BaselineOnly: candidate values for each baseline option.
param_grid_bso = {
    "bsl_options": {
        "method": ["als", "sgd"],
        "n_epochs": [5, 15],
        "reg_u": [10, 20],
        "reg_i": [5, 15],
    }
}

In [None]:
algos = {
    "SVD": {"algo": SVD, "params": param_grid_svd},
    "BaselineOnly": {"algo": BaselineOnly, "params": param_grid_bso},
}

In [None]:
# Grid-search each selected algorithm on the training split, then refit the
# best-RMSE configuration and score it on the held-out test split. Every
# algorithm gets its own nested MLflow run under one parent run.
# NOTE: `gs`, `algo` and `predictions` are rebound on every iteration, so after
# the loop they refer to the LAST entry of `algos`; the cells below rely on that.
with mlflow.start_run(run_name="Fine-tuned-selected") as run:
    experiment_id = run.info.experiment_id
    for k, v in algos.items():
        print(k)
        with mlflow.start_run(experiment_id=experiment_id, run_name=k, nested=True) as subruns:
            # v["algo"] is the algorithm class (not an instance); v["params"] is its grid.
            gs = GridSearchCV(v["algo"], v["params"], measures=["rmse", "mae"], cv=5, n_jobs=-1)
            gs.fit(train_sp)

            # Cross-validated results: the parameters of the best-RMSE
            # configuration plus the best score recorded for each measure.
            mlflow.log_params(gs.best_params["rmse"])
            mlflow.log_metrics({"rmse": gs.best_score["rmse"], "mae": gs.best_score["mae"]})

            # Rebuild the best-RMSE configuration and evaluate it on the hold-out set.
            # This rebinds `algo`, which previously held the list of benchmark instances.
            algo = v["algo"](**gs.best_params["rmse"])
            algo.fit(trainset)
            predictions = algo.test(testset)
            rmse = accuracy.rmse(predictions)
            mae = accuracy.mae(predictions)

            mlflow.log_metrics({"rmse_test": rmse, "mae_test": mae})

            # Persist the fitted model as an artifact named after the algorithm.
            mlflow.sklearn.log_model(algo, k)

In [None]:
gs.best_params

In [None]:
algo.fit(trainset)

In [None]:
# predictions_train = algo.predict(trainset.build_testset())
predictions_test = algo.test(testset)

In [None]:
print(accuracy.rmse(predictions_test))

In [None]:
# Build the dense user x product matrix of predicted ratings.
n = len(users)
m = len(products)
recomendation = np.zeros((n, m))

# `users` and `products` hold unique ids, so the enumerate() position is the
# same index list.index() would return -- without a linear search per cell.
for row, user_id in enumerate(users):
    for col, item_id in enumerate(products):
        # `.est` is the estimated rating of the returned Prediction.
        recomendation[row, col] = algo.predict(user_id, item_id, verbose=False).est

# Label rows with user ids and columns with product ids.
recomendation = pd.DataFrame(recomendation, index=users, columns=products)

In [None]:
recomendation.head()

In [None]:
# Manual 3-fold cross-validation of BaselineOnly with fixed ALS options.
kf = KFold(n_splits=3)
bsl_options = {"method": "als", "n_epochs": 5, "reg_u": 12, "reg_i": 5}

algo = BaselineOnly(bsl_options=bsl_options)
# NOTE: the loop rebinds the notebook-level `trainset` / `testset`.
for trainset, testset in kf.split(data_sp):
    # Train on the fold, then predict its held-out part (fit() hands back the algorithm).
    predictions = algo.fit(trainset).test(testset)
    # Report the fold's Root Mean Squared Error.
    accuracy.rmse(predictions, verbose=True)

In [None]:
algo.fit(data_sp.build_full_trainset())

In [None]:
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data_sp, measures=["RMSE", "MAE"], cv=3, verbose=False)

In [None]:
# Hold-out evaluation of BaselineOnly on a random 75/25 split of the ratings.
# The `train_test_split` imported at the top of the notebook is scikit-learn's,
# which cannot split a Surprise Dataset. Surprise ships its own splitter that
# returns a (trainset, testset) pair. It is imported under a distinct alias
# so the scikit-learn function used for the DataFrame split stays available.
from surprise.model_selection import train_test_split as surprise_train_test_split

# Seeded so the split, and the error analysis built on it, is reproducible.
trainset, testset = surprise_train_test_split(data_sp, test_size=0.25, random_state=RANDOM_STATE)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [None]:
def get_Iu(uid):
    """Count the items the given user rated in the notebook-level ``trainset``.

    args:
      uid: the raw id of the user
    returns:
      the number of items rated by the user, or 0 when the user is not
      part of the trainset
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
    except ValueError:  # unknown user: nothing rated in the trainset
        return 0
    return len(trainset.ur[inner_uid])


def get_Ui(iid):
    """Count the users who rated the given item in the notebook-level ``trainset``.

    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item, or 0 when the item is
      not part of the trainset
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
    except ValueError:  # unknown item: nobody rated it in the trainset
        return 0
    return len(trainset.ir[inner_iid])


# One row per prediction, extended with the amount of training data behind the
# user (Iu) and the item (Ui) and with the absolute prediction error.
df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"])
df["Iu"] = df["uid"].apply(get_Iu)
df["Ui"] = df["iid"].apply(get_Ui)
df["err"] = (df["est"] - df["rui"]).abs()

In [None]:
df[["Iu", "err"]].plot(kind="scatter", x="Iu", y="err")

In [None]:
df["err"].plot(kind="hist", bins=30)

In [None]:
# Min / max / mean absolute error as a function of the user's training-rating count.
err_by_iu = df.groupby("Iu")["err"]
for stat in ("min", "max", "mean"):
    err_by_iu.agg(stat).plot(label=stat)
plt.legend()

In [None]:
# Same min / max / mean error curves, zoomed in on users with at most 20 ratings.
err_by_iu = df.groupby("Iu")["err"]
for stat in ("min", "max", "mean"):
    err_by_iu.agg(stat).plot(label=stat)
plt.xlim(0, 20)
plt.legend()

In [None]:
# Partition the predictions at an absolute error of 1.0 (boundary goes to "best").
best_predictions = df.loc[df["err"].le(1.0)]
worst_predictions = df.loc[df["err"].gt(1.0)]

In [None]:
worst_predictions["Iu"].plot(kind="hist")

In [None]:
best_predictions["Iu"].plot(kind="hist")

In [None]:
# Error statistics per user rating count, restricted to the well-predicted rows.
best_err_by_iu = best_predictions.groupby("Iu")["err"]
for stat in ("min", "mean", "max"):
    best_err_by_iu.agg(stat).plot(label=stat)
plt.legend()

In [None]:
# Error statistics per user rating count, restricted to the poorly-predicted rows.
worst_err_by_iu = worst_predictions.groupby("Iu")["err"]
for stat in ("min", "mean", "max"):
    worst_err_by_iu.agg(stat).plot(label=stat)
plt.legend()

In [None]:
def get_top_n(predictions, n=10):
    """Return the ``n`` highest-estimated items for every user.

    args:
      predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``
      n: maximum number of items kept per user
    returns:
      dict mapping each uid to a list of ``(iid, est)`` pairs sorted by
      descending estimate; users appear in order of first occurrence and
      ties keep their input order
    """
    # Group the (item, estimate) pairs by user.
    per_user = {}
    for uid, iid, _true_r, est, _details in predictions:
        per_user.setdefault(uid, []).append((iid, est))

    # Rank each user's candidates and keep the head of the ranking.
    return {
        uid: sorted(candidates, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, candidates in per_user.items()
    }

In [None]:
top_n = get_top_n(predictions, n=10)

In [None]:
len(top_n)