In [None]:
import os
from functools import lru_cache

import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from langchain_community.embeddings import GPT4AllEmbeddings
from pinecone import Pinecone
from pymongo import MongoClient
from transformers import pipeline

In [None]:
# Set the flag in the kernel's own environment. The previous `!export ...`
# ran in a throwaway subshell, so the variable never reached this process.
# NOTE(review): the flag is presumably only honoured if it is set before the
# debugger starts - confirm, or set it in the kernel spec / shell profile.
os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1"

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Connection secrets; each is None when the variable is not set.
MONGO_URI = os.getenv("MONGO_URI")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Name of the Pinecone index used by the cells below.
INDEX_NAME = "recommender-system"

# GPT4All embedding model file and the keyword arguments for its loader.
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {"allow_download": "True"}

In [None]:
@lru_cache(maxsize=None)
def _load_embedder(name):
    """Build the GPT4All embedding client for model ``name`` once and reuse it.

    Constructing ``GPT4AllEmbeddings`` is expensive, so the instance is
    memoised per model name instead of being rebuilt for every text.
    Reads the notebook-level ``gpt4all_kwargs`` configuration.
    """
    return GPT4AllEmbeddings(model_name=name, gpt4all_kwargs=gpt4all_kwargs)


def get_embeddings(text: str) -> list:  # noqa
    """Embed ``text`` with the GPT4All model named by ``model_name``.

    Args:
        text: The string to embed.

    Returns:
        The value returned by ``GPT4AllEmbeddings.embed_query`` (a list of
        floats per the LangChain embeddings interface), which is passed
        unchanged to Pinecone as a vector.
    """
    return _load_embedder(model_name).embed_query(text)

In [None]:
def get_recommendations(pinecone_index, search_term, top_k=10):  # noqa
    """Return the ``top_k`` index entries closest to ``search_term``.

    The search term is embedded with ``get_embeddings`` and sent to
    ``pinecone_index.query`` with metadata included; the raw query response
    is returned unchanged.
    """
    query_vector = get_embeddings(search_term)
    return pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

In [None]:
# Load the whole raw CSV into memory for a quick look at its contents.
# NOTE(review): the same file is read again, in chunks, by the upsert cell
# further down, and `data` is later rebound to the MongoDB reviews frame.
data = pd.read_csv("data/raw/shein-mirror.csv")

In [None]:
# Preview the first rows of the raw CSV.
data.head()

In [None]:
# Pinecone client built with the API key read from the environment.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
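# One-time index setup, kept disabled because it DELETES any existing index
# named INDEX_NAME before recreating it.
# To run it, uncomment the lines below and add
# `from pinecone import ServerlessSpec` to the import cell.
# `dimension` must match the length of the vectors returned by get_embeddings.
#
# if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
#     pinecone.delete_index(INDEX_NAME)
#
# pinecone.create_index(
#     name=INDEX_NAME, dimension=384, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
# )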

In [None]:
# Handle to the index named INDEX_NAME. The creation code above is commented
# out, so the index is expected to exist already.
index = pinecone.Index(INDEX_NAME)

In [None]:
# Show the index statistics before any vectors are upserted by this notebook.
index.describe_index_stats()

In [None]:
# Embed the first TOTAL_ROWS rows of the raw CSV and upsert them into the
# index, one CSV chunk (CHUNK_SIZE rows) per upsert call.
CHUNK_SIZE = 400
TOTAL_ROWS = 10000
chunks = pd.read_csv("data/raw/shein-mirror.csv", chunksize=CHUNK_SIZE, nrows=TOTAL_ROWS)
for chunk_num, chunk in enumerate(chunks):
    # Vector ids are the row's position in the file, so re-running the cell
    # overwrites the same ids instead of creating new ones.
    first_row = chunk_num * CHUNK_SIZE
    rows = zip(
        chunk["title"].tolist(),
        chunk["full_description"].tolist(),
        chunk["_id"].tolist(),
    )
    prepped = [
        {
            "id": str(first_row + i),
            "values": get_embeddings(description),
            "metadata": {"title": title, "_id": str(product_id)},
        }
        for i, (title, description, product_id) in enumerate(rows)
    ]
    # Upsert every non-empty chunk. The previous `len(prepped) >= 200` guard
    # silently dropped a final chunk with fewer than 200 rows.
    if prepped:
        index.upsert(prepped)

In [None]:
# Show the index statistics again, after the upsert loop above has run.
index.describe_index_stats()

In [None]:
# Smoke-test the index: fetch the closest matches (default top_k=10) for the
# search term "tooth".
reco = get_recommendations(index, "tooth")

In [None]:
# Print one "<score> : <title>" line per returned match.
for match in reco.matches:
    title = match.metadata["title"]
    print(f"{match.score} : {title}")

# Recommender

In [None]:
# Pull every document of the `product_reviews` collection into a DataFrame.
# NOTE(review): if MONGO_URI is unset (None), MongoClient presumably falls
# back to its default host - confirm that this is intended.
connection = MongoClient(MONGO_URI)
db = connection.get_database("shein-mirror")
input_data = db.get_collection("product_reviews")
review_docs = list(input_data.find())
data = pd.DataFrame(review_docs)

In [None]:
# Preview the first rows of the reviews loaded from MongoDB.
data.head()

In [None]:
# Column dtypes and non-null counts of the reviews frame.
data.info()

In [None]:
# Histogram of the `rating` column.
data["rating"].plot.hist()

In [None]:
# Number of reviews per distinct `rating` value.
data["rating"].value_counts()

In [None]:
# Hugging Face checkpoint used for both the model and its tokenizer.
model_path = "nlptown/bert-base-multilingual-uncased-sentiment"

# Pick the best available accelerator instead of hard-coding "mps", which
# fails on any machine without Apple-silicon support.
if torch.backends.mps.is_available():
    sentiment_device = "mps"
elif torch.cuda.is_available():
    sentiment_device = "cuda"
else:
    sentiment_device = "cpu"

# Sentiment classifier; inputs are truncated and processed 8 at a time.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_path,
    tokenizer=model_path,
    device=sentiment_device,
    batch_size=8,
    truncation=True,
)

In [None]:
# Classify every review text. `res` holds one prediction per review, read
# below through its "label" and "score" keys.
# NOTE(review): presumably every `review` value is a string; missing values
# would reach the pipeline as-is - verify against data.info() above.
res = sentiment_pipeline(data["review"].to_list())

In [None]:
# Peek at the first five predictions.
res[0:5]

In [None]:
# Split the predictions into their label and score parts.
predicted_labels = [prediction["label"] for prediction in res]
predicted_scores = [prediction["score"] for prediction in res]

# The first character of each label is parsed as the integer sentiment
# (presumably labels look like "4 stars" - confirm against the preview above).
data["sentiment"] = [int(label[:1]) for label in predicted_labels]
data["sentiment_score"] = predicted_scores

# NOTE(review): `score` looks like the classifier's confidence in its label,
# not a rating, so this column may not mean what its name suggests.
data["rating_from_score"] = (data["sentiment_score"] * 5).round()

In [None]:
# Preview the reviews frame with the three new sentiment columns.
data.head()

In [None]:
# Histogram of the integer sentiment parsed from the predicted labels.
data["sentiment"].plot.hist()

In [None]:
# Histogram of the rating derived from the prediction score.
data["rating_from_score"].plot.hist()

In [None]:
# Final look at the frame before it is written to disk (same preview as the
# earlier data.head() cell; nothing has changed in between).
data.head()

In [None]:
# Persist the scored reviews as parquet.
reviews_path = "data/processed/reviews.parquet"

# Make sure the target directory exists so the write cannot fail on a fresh
# checkout.
os.makedirs(os.path.dirname(reviews_path), exist_ok=True)

# MongoDB's default `_id` values are ObjectId instances, which pyarrow cannot
# serialise. Convert an object-typed `_id` column to strings on a copy so
# `data` itself is left untouched; string ids pass through unchanged.
if "_id" in data.columns and data["_id"].dtype == object:
    reviews_to_save = data.assign(_id=data["_id"].map(str))
else:
    reviews_to_save = data

reviews_to_save.to_parquet(reviews_path, engine="pyarrow")