In [1]:
import pandas as pd
# Load the processed review subset and keep only the product id (asin)
# and the free-text review body; no other column is used below.
df = pd.read_csv("../data/processed/electronics_subset.csv")[["asin", "reviewText"]]
df.head()

Unnamed: 0,asin,reviewText
0,B003LPUWT0,I had not heard of the TCL brand before and di...
1,B004BFXBXI,For $12 you get about $100 worth of fun. Bough...
2,B004QJ9JLW,"Not much I can say about an HDMI cord. Solid, ..."
3,B00HAWW590,Looks like the 2014 version of the classic 200...
4,B008Z2661W,I bought this for my 4 year old a few months a...


In [3]:
# Missing reviews become empty strings so the join below only ever sees str.
df["reviewText"] = df["reviewText"].fillna("").astype(str)

# Collapse to one row per product: all of a product's reviews are
# concatenated (space-separated) into a single document.
product_text = df.groupby("asin")["reviewText"].agg(" ".join).reset_index()
product_text.head()

Unnamed: 0,asin,reviewText
0,0972683275,I have purchased 3 of these wall mounts and th...
1,1400532655,"I've owned the Sony PRS-500, Kindle 1, 2 & 3, ..."
2,140053271X,"This is a sold update to the older nook, but g..."
3,9983891212,"As everyone should know, any HDMI cable should..."
4,B000001OM5,This arrived in only two days so I popped it i...


In [4]:
import re
def clean_text(text):
    """Normalise a review string for vectorisation.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is replaced by a space, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text (possibly empty).
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())
    return re.sub(r"\s+", " ", letters_only).strip()
# Add a normalised copy of each product's concatenated reviews.
product_text["clean_text"] = product_text["reviewText"].map(clean_text)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn each product's cleaned review document into a TF-IDF feature row.
tfidf = TfidfVectorizer(
    stop_words="english",   # use the built-in English stop-word list
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=5,               # skip terms that occur in fewer than 5 documents
    max_features=20_000,    # upper bound on the vocabulary size
)
tfidf_matrix = tfidf.fit_transform(product_text["clean_text"])
tfidf_matrix.shape

(7370, 20000)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all product rows; with a single
# argument the matrix is compared against itself.
similarity_matrix = cosine_similarity(tfidf_matrix)

# Look-up from ASIN to its row position in product_text (and therefore
# to its row/column in similarity_matrix).
item_index = dict(zip(product_text["asin"], range(len(product_text))))

In [7]:
def recommend_similar_items(asin, top_n=10):
    """Return the ASINs of the products most similar to ``asin``.

    Similarity is read from the module-level ``similarity_matrix``; row
    positions are resolved through ``item_index`` and mapped back to ASINs
    via ``product_text``.

    Parameters
    ----------
    asin : str
        Product id to find neighbours for.
    top_n : int, default 10
        Maximum number of recommendations to return. Values <= 0 yield
        an empty list.

    Returns
    -------
    list
        Up to ``top_n`` ASINs ordered by descending similarity (ties keep
        row order). Empty if ``asin`` is unknown. The queried product is
        never included in the result.
    """
    if asin not in item_index:
        return []
    idx = item_index[asin]
    scores = similarity_matrix[idx]

    # Exclude the query item by position instead of assuming it sorts
    # first: another product can tie with it at the top score, and a row
    # with no in-vocabulary terms has self-similarity 0, so dropping
    # "whatever is ranked first" could return the item itself.
    candidates = [i for i in range(len(scores)) if i != idx]
    candidates.sort(key=lambda i: scores[i], reverse=True)

    similar_indices = candidates[:max(top_n, 0)]
    return product_text.iloc[similar_indices]["asin"].tolist()

In [8]:
# Smoke test: recommendations for the first product in the table.
sample_asin = product_text["asin"].iat[0]
recommend_similar_items(sample_asin, 5)

['B003O1UYHG', 'B000NMFCIA', 'B001GTTEBK', 'B001TIG36C', 'B0032PAOWY']

In [9]:
import os
import pickle

# Artifacts to persist: the fitted vectorizer, the ASIN -> row lookup and
# the per-product text table. The similarity matrix itself is not saved.
# NOTE: pickle files should only ever be loaded from trusted locations.
artifacts = {
    "../models/tfidf_vectorizer.pkl": tfidf,
    "../models/item_index.pkl": item_index,
    "../models/product_text.pkl": product_text,
}

for path, obj in artifacts.items():
    # Create the target directory if needed so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)