In [85]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from nltk import download


df = pd.read_csv("dataset/amazon_products_sample.csv")

In [109]:
import torch

In [86]:
df.shape

(9890, 13)

In [87]:
df.columns

Index(['asin', 'title', 'imgUrl', 'productURL', 'stars', 'reviews', 'price',
       'listPrice', 'category_id', 'isBestSeller', 'boughtInLastMonth', 'id',
       'category_name'],
      dtype='object')

In [88]:
df['isBestSeller'].value_counts()

isBestSeller
False    9822
True       68
Name: count, dtype: int64

In [89]:
df['category_name'].value_counts()

category_name
Abrasive & Finishing Products                     40
Paint, Wall Treatments & Supplies                 40
Party Decorations                                 40
Party Supplies                                    40
Perfumes & Fragrances                             40
                                                  ..
Heavy Duty & Commercial Vehicle Equipment         40
eBook Readers & Accessories                       40
Garment Bags                                      39
Gift Cards                                        29
Smart Home Thermostats - Compatibility Checker    22
Name: count, Length: 248, dtype: int64

In [90]:
df[df['category_name'] == "Men's Watches"]

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,id,category_name
5508,B07PWJ563Y,MTP-VT01L-7B2 Men's Minimalistic Silver Dial B...,https://m.media-amazon.com/images/I/61olBcYExf...,https://www.amazon.com/dp/B07PWJ563Y,4.4,0,35.94,0.0,113,False,0,113,Men's Watches
5509,B077HF3GZZ,Bund Strap Waterproof Leather Watch Strap,https://m.media-amazon.com/images/I/61ciw46Hvl...,https://www.amazon.com/dp/B077HF3GZZ,4.1,0,39.95,0.0,113,False,0,113,Men's Watches
5510,B0CB8JZ1CF,Men's 21Jewel Mechanical Watch 316L Stainless ...,https://m.media-amazon.com/images/I/71KGe2pg1T...,https://www.amazon.com/dp/B0CB8JZ1CF,4.8,0,189.99,0.0,113,False,0,113,Men's Watches
5511,B00OI8IIS6,Men's Stainless Steel Quart Watch with Leather...,https://m.media-amazon.com/images/I/71FsxiYQyn...,https://www.amazon.com/dp/B00OI8IIS6,4.7,0,28.95,0.0,113,False,100,113,Men's Watches
5512,B00PT3RZN8,20mm Black Rubber Watch Band Strap Scuba Diver...,https://m.media-amazon.com/images/I/312VqSXuuq...,https://www.amazon.com/dp/B00PT3RZN8,5.0,0,13.99,0.0,113,False,0,113,Men's Watches
5513,B08XWNCKKW,"Men's Quartz Watch with Stainless Steel Strap,...",https://m.media-amazon.com/images/I/61xZHuZWJz...,https://www.amazon.com/dp/B08XWNCKKW,5.0,0,352.0,440.0,113,False,0,113,Men's Watches
5514,B07RHVR87B,Men's Watch Western Collection (Steel/Black),https://m.media-amazon.com/images/I/71UCMiPUTc...,https://www.amazon.com/dp/B07RHVR87B,5.0,0,59.95,0.0,113,False,0,113,Men's Watches
5515,B07QSC61MT,Timex Unisex Chronograph Watch The Fairfield w...,https://m.media-amazon.com/images/I/71pz8Z8PVa...,https://www.amazon.com/dp/B07QSC61MT,4.6,0,49.99,0.0,113,False,0,113,Men's Watches
5516,B0932S6J8G,BENYAR Men's Watch Quartz Movement Watches for...,https://m.media-amazon.com/images/I/81MMrwVb3c...,https://www.amazon.com/dp/B0932S6J8G,4.1,0,29.99,0.0,113,False,0,113,Men's Watches
5517,B0946SXJCJ,"Men's Wrist Watch, Stylish Stainless Steel Wat...",https://m.media-amazon.com/images/I/717Qt5cwMt...,https://www.amazon.com/dp/B0946SXJCJ,4.2,0,29.99,0.0,113,False,0,113,Men's Watches


In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9890 entries, 0 to 9889
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   asin               9890 non-null   object 
 1   title              9890 non-null   object 
 2   imgUrl             9890 non-null   object 
 3   productURL         9890 non-null   object 
 4   stars              9890 non-null   float64
 5   reviews            9890 non-null   int64  
 6   price              9890 non-null   float64
 7   listPrice          9890 non-null   float64
 8   category_id        9890 non-null   int64  
 9   isBestSeller       9890 non-null   bool   
 10  boughtInLastMonth  9890 non-null   int64  
 11  id                 9890 non-null   int64  
 12  category_name      9890 non-null   object 
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 937.0+ KB


In [93]:
df.isnull().sum()

asin                 0
title                0
imgUrl               0
productURL           0
stars                0
reviews              0
price                0
listPrice            0
category_id          0
isBestSeller         0
boughtInLastMonth    0
id                   0
category_name        0
dtype: int64

In [94]:
df.dropna(inplace=True)

In [95]:
df['title'].duplicated().sum()

0

In [96]:
df.drop_duplicates(subset=['title'], inplace=True)

In [97]:
df_copy = df.copy()

In [98]:
df_copy['isBestSeller'] = df_copy['isBestSeller'].map({True: 'Best Seller', False: 'None'})


In [99]:
df_copy['information'] = df_copy.apply(lambda x : str(x['category_name'])+" "+str(x['title']) +" "+ str(x['isBestSeller']), axis=1)
df_copy.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,id,category_name,information
0,B07XFL44BG,uxcell 5 Inch Wet Dry Sanding Discs 5000 Grit ...,https://m.media-amazon.com/images/I/61Sv9zTj+C...,https://www.amazon.com/dp/B07XFL44BG,4.2,0,9.49,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products uxcell 5 Inch We...
1,B0821XBPC3,uxcell 3-Inch Sanding Sponge Hook and Loop San...,https://m.media-amazon.com/images/I/61NTU3igOY...,https://www.amazon.com/dp/B0821XBPC3,4.3,0,12.99,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products uxcell 3-Inch Sa...
2,B09ZRTGG7D,Steel Wool - 2-Pack (12-Count a Pack) Fine Ste...,https://m.media-amazon.com/images/I/617a9Blyhs...,https://www.amazon.com/dp/B09ZRTGG7D,0.0,0,13.99,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products Steel Wool - 2-P...
3,B08HY4V9GK,"uxcell 2"" Hook and Loop Sanding Disc 5000/7000...",https://m.media-amazon.com/images/I/51kFNDGkKc...,https://www.amazon.com/dp/B08HY4V9GK,3.2,0,4.99,0.0,142,,0,142,Abrasive & Finishing Products,"Abrasive & Finishing Products uxcell 2"" Hook a..."
4,B0002FU670,"3M 07522 Scotch-Brite Multi-Flex, 8-Inch-by-20...",https://m.media-amazon.com/images/I/A1iyKYRvYF...,https://www.amazon.com/dp/B0002FU670,4.1,0,55.44,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products 3M 07522 Scotch-...


In [103]:
df_copy[df_copy['isBestSeller']=='Best Seller']['information'].loc[421]

'Automotive Replacement Parts SUPFINE Magnetic for iPhone 13 Case [Compatible with MagSafe][10 FT Military Grade Drop Protection] [2+Tempered Glass Screen Protector] Non-Slip Full-Body Shockproof Slim Phone Case,Matte Black Best Seller'

In [47]:
df_copy['information'].loc[0]

'Safety & Security 3M P100 Advanced Respirator Filter 2297, 1 Pair, Helps Protect Against Oil and Non-Oil Based Particulates, Nuisance Level Organic Vapor Relief, Mining, Shipbuilding, Abatement, Utilities None'

In [55]:
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# import string
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt_tab')
# from nltk.stem import PorterStemmer
#
# stemmer = PorterStemmer()
# def cleaned_text(text):
#     text = text.lower()
#     stop_words = set(stopwords.words('english'))
#     tokenzie_text = word_tokenize(text)
#     word = [word for word in tokenzie_text if word not in stop_words and string.punctuation and word.isalnum()]
#     stem_word = [stemmer.stem(word) for word in word]
#     return " ".join(stem_word)

[nltk_data] Downloading package stopwords to /Users/ricky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/ricky/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [104]:
import re

def cleaned_text(text):
    text_cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return text_cleaned

In [105]:
text = 'Household Cleaning Supplies Electric Spin Scrubber,Cordless Cleaning Brush,Shower Cleaning Brush with 8 Replaceable Brush Heads, Power Scrubber 3 Adjustable Speeds,Adjustable & Detachable Long Handle,Voice Broadcast Best Seller'
cleaned_text(text)

'household cleaning supplies electric spin scrubbercordless cleaning brushshower cleaning brush with 8 replaceable brush heads power scrubber 3 adjustable speedsadjustable  detachable long handlevoice broadcast best seller'

In [106]:
df_copy['cleaned'] = df_copy['information'].apply(cleaned_text)
df_copy.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,id,category_name,information,cleaned
0,B07XFL44BG,uxcell 5 Inch Wet Dry Sanding Discs 5000 Grit ...,https://m.media-amazon.com/images/I/61Sv9zTj+C...,https://www.amazon.com/dp/B07XFL44BG,4.2,0,9.49,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products uxcell 5 Inch We...,abrasive finishing products uxcell 5 inch wet...
1,B0821XBPC3,uxcell 3-Inch Sanding Sponge Hook and Loop San...,https://m.media-amazon.com/images/I/61NTU3igOY...,https://www.amazon.com/dp/B0821XBPC3,4.3,0,12.99,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products uxcell 3-Inch Sa...,abrasive finishing products uxcell 3inch sand...
2,B09ZRTGG7D,Steel Wool - 2-Pack (12-Count a Pack) Fine Ste...,https://m.media-amazon.com/images/I/617a9Blyhs...,https://www.amazon.com/dp/B09ZRTGG7D,0.0,0,13.99,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products Steel Wool - 2-P...,abrasive finishing products steel wool 2pack...
3,B08HY4V9GK,"uxcell 2"" Hook and Loop Sanding Disc 5000/7000...",https://m.media-amazon.com/images/I/51kFNDGkKc...,https://www.amazon.com/dp/B08HY4V9GK,3.2,0,4.99,0.0,142,,0,142,Abrasive & Finishing Products,"Abrasive & Finishing Products uxcell 2"" Hook a...",abrasive finishing products uxcell 2 hook and...
4,B0002FU670,"3M 07522 Scotch-Brite Multi-Flex, 8-Inch-by-20...",https://m.media-amazon.com/images/I/A1iyKYRvYF...,https://www.amazon.com/dp/B0002FU670,4.1,0,55.44,0.0,142,,0,142,Abrasive & Finishing Products,Abrasive & Finishing Products 3M 07522 Scotch-...,abrasive finishing products 3m 07522 scotchbr...


In [110]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [130]:
from transformers import DistilBertTokenizer, DistilBertModel

model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def get_text_embedding(text):
    tokenized_text = tokenizer.encode(text, max_length=512, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(tokenized_text)
    cls_embedding = model_output.last_hidden_state[:,0,:]
    return cls_embedding.cpu().numpy()

In [131]:
df_copy['text_embedding'] = df_copy['cleaned'].apply(get_text_embedding)
df_copy.head()

KeyboardInterrupt: 

In [116]:
df_copy.columns

Index(['asin', 'title', 'imgUrl', 'productURL', 'stars', 'reviews', 'price',
       'listPrice', 'category_id', 'isBestSeller', 'boughtInLastMonth', 'id',
       'category_name', 'information', 'cleaned', 'text_embedding'],
      dtype='object')

In [115]:
df_copy.to_csv("amz_text_embedding2.csv", index=False)

In [119]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarities(embedding, embeddings):
    similarities = cosine_similarity(embedding.reshape(1, -1), np.vstack(embeddings)).flatten()
    return similarities

def recommendation_function(text, df , top_n):
    text_cleaned = cleaned_text(text)
    embedding = get_text_embedding(text_cleaned)
    embeddings = df['text_embedding'].tolist()
    similarities = cosine_similarities(embedding, embeddings)
    df['similarity'] = similarities
    df_sorted = df.sort_values(by='similarity', ascending=False)
    recommendation = df_sorted.head(top_n)
    return recommendation[['title', 'imgUrl', 'productURL', 'stars', 'reviews', 'price',
       'listPrice', 'category_id', 'isBestSeller', 'boughtInLastMonth',
       'category_name','similarity']]

In [122]:
text = "boy's nike shoes best seller"
recommendation_function(text, df_copy, top_n=10)

Unnamed: 0,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,category_name,similarity
3256,Unisex-Child Handle It Rain Boots,https://m.media-amazon.com/images/I/61G7IJ0Hkq...,https://www.amazon.com/dp/B00RBBEMPK,4.7,0,34.99,39.99,97,Best Seller,300,Girls' Shoes,0.987114
8252,Ice Blocks,https://m.media-amazon.com/images/I/61t9di+u0P...,https://www.amazon.com/dp/B00CWL3M96,4.7,0,0.98,1.65,200,Best Seller,10000,Sports & Outdoors,0.982186
1485,Unisex-Child Stanton Fashion Boot,https://m.media-amazon.com/images/I/81g8CBlUwQ...,https://www.amazon.com/dp/B08YP3V3HH,4.7,0,30.5,46.0,90,,0,Boys' Shoes,0.977412
745,Baby Boys' Trefoil Tee,https://m.media-amazon.com/images/I/71OmZD-I3E...,https://www.amazon.com/dp/B07D7QQ1D2,4.8,0,17.4,20.0,43,,50,Baby Boys' Clothing & Shoes,0.972265
3252,girls Cowboy Cool,https://m.media-amazon.com/images/I/712CWu-RhX...,https://www.amazon.com/dp/B01CVMGPY4,4.8,0,60.99,66.99,97,,0,Girls' Shoes,0.971209
1342,Boys Icebox Beanie,https://m.media-amazon.com/images/I/51Iu7RGuAu...,https://www.amazon.com/dp/B0B4H5G4PP,0.0,0,11.89,14.99,87,,0,Boys' Accessories,0.971163
3230,Unisex-Child Foxtail-k Sandal,https://m.media-amazon.com/images/I/61RIdOWzBE...,https://www.amazon.com/dp/B0BG6LZ1R1,1.0,0,38.85,0.0,97,,0,Girls' Shoes,0.971057
3109,Girl's Classic Script Fleece Hoodie (Big Kids),https://m.media-amazon.com/images/I/71OCYyf9Ld...,https://www.amazon.com/dp/B0B2MQ8XSR,0.0,0,0.0,0.0,91,,0,Girls' Clothing,0.970876
1484,Nebzed Lifestyle Lace Running Shoes Kids',https://m.media-amazon.com/images/I/41vDpeFqKu...,https://www.amazon.com/dp/B0CCGSRJGL,0.0,0,49.95,0.0,90,,0,Boys' Shoes,0.970869
3241,Ultraboost DNA (Big Kid),https://m.media-amazon.com/images/I/51BlqdO21I...,https://www.amazon.com/dp/B08THDF5SR,3.0,0,89.99,0.0,97,,0,Girls' Shoes,0.970464


In [123]:
from IPython.display import display, HTML

def display_recommendations(recommendations):
    # Generate HTML for displaying thumbnails and metadata
    html = '<div style="display: flex; flex-wrap: wrap; justify-content: space-around;">'

    # Iterate over recommendations in pairs (2 thumbnails per row)
    for i in range(0, len(recommendations), 2):
        row_html = '<div style="display: flex; justify-content: space-around; width: 100%;">'
# ['title', 'imgUrl', 'productURL', 'stars', 'reviews', 'price',
#        'listPrice', 'category_id', 'isBestSeller', 'boughtInLastMonth',
#        'category_name','similarity']
        # Iterate through each item in the pair (2 thumbnails per row)
        for j in range(5):
            if i + j < len(recommendations):
                # Fetch data for each recommendation
                recommendation = recommendations.iloc[i + j]
                title = recommendation['title']
                image = recommendation['imgUrl']
                stars = recommendation['stars']
                reviews = recommendation['reviews']
                price = recommendation['price']
                boughtInLastMonth = recommendation['boughtInLastMonth']
                category_name = recommendation['category_name']
                similarity = recommendation['similarity']


                # Generate HTML for each video thumbnail and details
                row_html += f'''
                <div style="width: 20%; margin: 2px; text-align: center; border: 1px solid #ddd; padding: 2px; border-radius: 10px;">
                    <img src="{image}" alt="{title}" style="width: 100%; border-radius: 5px;">
                    <h4>{title}</h4>
                    <p>Price: {price}</p>
                    <p>Stars: {stars} | Reviews: {reviews}</p>
                    <p>Similarity: {similarity:.2f}</p>
                    <p>Bought in last month: {boughtInLastMonth}</p>
                    <p>Category: {category_name}</p>
                </div>
                '''
        row_html += '</div>'
        html += row_html

    html += '</div>'
    display(HTML(html))

In [127]:
text = "1. **Kids' educational toys for learning colors and shapes**"
get_product=recommendation_function(text, df_copy, top_n=10)
display_recommendations(get_product)

In [129]:
from joblib import dump

dump(df_copy, "streamlit_app/dataframe.joblib")

['streamlit_app/dataframe.joblib']