In [1]:
import pandas as pd
import numpy as np
import nltk

In [3]:
product_data = pd.read_csv("/content/amazon_product.csv")

In [None]:
product_data.head()

Unnamed: 0,id,Title,Description,Category
0,1,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...
1,2,Gemini200 Delta CV-880 Gold Crown Livery Airc...,Welcome to the exciting world of GeminiJets! O...,Toys & Games Hobbies Models & Model Kits Pre-...
2,5,Superior Threads 10501-2172 Magnifico Cream P...,"For quilting and embroidery, this product is m...","Arts, Crafts & Sewing Sewing Thread & Floss S..."
3,6,Fashion Angels Color Rox Hair Chox Kit,Experiment with the haute trend of hair chalki...,Beauty & Personal Care Hair Care Hair Colorin...
4,8,Union Creative Giant Killing Figure 05: Daisu...,From Union Creative. Turn your display shelf i...,Toys & Games › Action Figures & Statues › Sta...


Clean the data by removing unnecessary columns and rows. For example, you can remove the "id" column.

In [4]:
product_data.drop('id',axis=1)

Unnamed: 0,Title,Description,Category
0,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...
1,Gemini200 Delta CV-880 Gold Crown Livery Airc...,Welcome to the exciting world of GeminiJets! O...,Toys & Games Hobbies Models & Model Kits Pre-...
2,Superior Threads 10501-2172 Magnifico Cream P...,"For quilting and embroidery, this product is m...","Arts, Crafts & Sewing Sewing Thread & Floss S..."
3,Fashion Angels Color Rox Hair Chox Kit,Experiment with the haute trend of hair chalki...,Beauty & Personal Care Hair Care Hair Colorin...
4,Union Creative Giant Killing Figure 05: Daisu...,From Union Creative. Turn your display shelf i...,Toys & Games › Action Figures & Statues › Sta...
...,...,...,...
663,Rosemery (Rosemary) - Box of Six 20 Stick Hex...,"Six tubes, each containing 20 sticks of incens...",Home & Kitchen Home Décor Home Fragrance Ince...
664,"InterDesign Linus Stacking Organizer Bin, Ext...",The InterDesign Linus Organizer Bins are stack...,Home & Kitchen Kitchen & Dining Storage & Org...
665,Gourmet Rubber Stamps Diagonal Stripes Stenci...,Gourmet Rubber Stamps-Stencil. This delicious ...,Toys & Games Arts & Crafts Printing & Stamping
666,Spenco RX Arch Cushion Full Length Comfort Su...,"Soft, durable arch support. consumers with gen...",Health & Household › Health Care › Foot Healt...


In [5]:
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           668 non-null    int64 
 1   Title        668 non-null    object
 2   Description  668 non-null    object
 3   Category     668 non-null    object
dtypes: int64(1), object(3)
memory usage: 21.0+ KB


In [11]:
product_data.isnull().sum()

id                0
Title             0
Description       0
Category          0
stemmed_tokens    0
dtype: int64

In [12]:
from nltk.stem.snowball import SnowballStemmer

In [13]:
stemmer = SnowballStemmer("english")
def tokensize_stem(text):
    tokens = nltk.word_tokenize(text.lower())
    stem = [stemmer.stem(w) for w in tokens]
    return " ".join(stem)

In [14]:
import nltk
nltk.download('punkt')

product_data['stemmed_tokens'] = product_data.apply(lambda row: tokensize_stem(row['Title']+ ' '+row['Description']),axis=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
product_data.head()

Unnamed: 0,id,Title,Description,Category,stemmed_tokens
0,1,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...,swissmar capstor select storag rack for 18-pac...
1,2,Gemini200 Delta CV-880 Gold Crown Livery Airc...,Welcome to the exciting world of GeminiJets! O...,Toys & Games Hobbies Models & Model Kits Pre-...,gemini200 delta cv-880 gold crown liveri aircr...
2,5,Superior Threads 10501-2172 Magnifico Cream P...,"For quilting and embroidery, this product is m...","Arts, Crafts & Sewing Sewing Thread & Floss S...",superior thread 10501-2172 magnifico cream puf...
3,6,Fashion Angels Color Rox Hair Chox Kit,Experiment with the haute trend of hair chalki...,Beauty & Personal Care Hair Care Hair Colorin...,fashion angel color rox hair chox kit experi w...
4,8,Union Creative Giant Killing Figure 05: Daisu...,From Union Creative. Turn your display shelf i...,Toys & Games › Action Figures & Statues › Sta...,union creativ giant kill figur 05 : daisuk tsu...


In [16]:
product_data['stemmed_tokens']

0      swissmar capstor select storag rack for 18-pac...
1      gemini200 delta cv-880 gold crown liveri aircr...
2      superior thread 10501-2172 magnifico cream puf...
3      fashion angel color rox hair chox kit experi w...
4      union creativ giant kill figur 05 : daisuk tsu...
                             ...                        
663    rosemeri ( rosemari ) - box of six 20 stick he...
664    interdesign linus stack organ bin , extra larg...
665    gourmet rubber stamp diagon stripe stencil , 6...
666    spenco rx arch cushion full length comfort sup...
667                                  car kit kit for car
Name: stemmed_tokens, Length: 668, dtype: object

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
TfidfVectorizer = TfidfVectorizer(tokenizer=tokensize_stem)

In [19]:
def cosine_sim(tex1,tex2):
    tfid_mat = TfidfVectorizer.fit_transform([tex1,tex2])
    return cosine_similarity(tfid_mat)[0][1]

In [20]:
def search_product(query):
    stemmed_query = tokensize_stem(query)
    #calcualting cosine similarity between query and stemmed tokens columns
    product_data['similarity'] = product_data['stemmed_tokens'].apply(lambda x:cosine_sim(stemmed_query,x))
    res = product_data.sort_values(by=['similarity'],ascending=False).head(10)[['Title','Description','Category']]
    return res

In [22]:
search_product(' PURELL ES8 Professional HEALTHY SOAP Foam Refill, Fresh Scent Fragrance, 1200 mL Soap Refill for PURELL ES8 Touch-Free Dispenser (Pack of 2) - 7777-02 ')



Unnamed: 0,Title,Description,Category
10,PURELL ES8 Professional HEALTHY SOAP Foam Ref...,1200 ml refill for Purell ES8 touch-free soap ...,Industrial & Scientific › Janitorial & Sanita...
541,Remington SP290 for F4790 Shaver (2-Pack),Technical Features for Remington SP290-2 The R...,Beauty & Personal Care › Shave & Hair Removal...
176,Angel Soft Professional Series Premium 2-Ply ...,Angel Soft Professional Series offers a qualit...,Health & Household Household Supplies Paper &...
160,"Dixie 8.5""Medium-Weight Paper Plates by GP PR...",The WiseSize product offering provides a packa...,Health & Household › Household Supplies › Pap...
206,Filofax 2018 Personal/Compact Academic Week t...,"Over the years, the Filofax organizer has evol...",Office Products › Office & School Supplies › ...
220,Serta iComfort Premium Infant Sleeper Replace...,Replacement cover for the Serta’s icomfort Pre...,Baby Products › Nursery › Bedding › Baby Bedd...
492,"Seventh Generation Baby Overnight Diapers, Fr...",Everyone could use a little sleep. Seventh Gen...,Baby Products Diapering Disposable Diapers
209,ALEX Toys Artist Studio Ultimate Easel Access...,ALEX Toys Artist Studio Ultimate Easel Accesso...,Toys & Games Arts & Crafts
523,"School Smart 2-Pocket Folders, Green, Pack of...",Compile and file with ease with School Smart's...,Office Products Office & School Supplies Fili...
390,"Elmer's Foam Board, 20 x 28 Inches, 3/16 Inch...",Elmer's Foam Board makes it easier to create e...,Office Products Office & School Supplies Pres...
