In [437]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [438]:
dataset = pd.read_csv("AmazonData.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8232 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

In [439]:
cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
dataset.drop(dataset.columns[cols], axis =1, inplace=True)
dataset.dropna(inplace = True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7216 entries, 0 to 10001
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Product Name           7216 non-null   object
 1   Category               7216 non-null   object
 2   Selling Price          7216 non-null   object
 3   About Product          7216 non-null   object
 4   Product Specification  7216 non-null   object
 5   Shipping Weight        7216 non-null   object
dtypes: object(6)
memory usage: 394.6+ KB


In [440]:
new = dataset["Category"].str.split("|", n = 3, expand = True)
new = new.apply(lambda x: x.str.strip())
  
# making the first category called Main Category
dataset["Main Category"]= new[0] 
  
# making the second category called sub_category 
dataset["Sub Category"]= new[1]

# making the third category called side_category 
dataset["Side Category"]= new[2]

# making the last column consist of the remaining categories
dataset["Other Category"]= new[3]

# Dropping old category columns and the remaining categories 
dataset.drop(columns =["Category"], inplace = True)

In [441]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7216 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Product Name           7216 non-null   object
 1   Selling Price          7216 non-null   object
 2   About Product          7216 non-null   object
 3   Product Specification  7216 non-null   object
 4   Shipping Weight        7216 non-null   object
 5   Main Category          7216 non-null   object
 6   Sub Category           7216 non-null   object
 7   Side Category          6231 non-null   object
 8   Other Category         2791 non-null   object
dtypes: object(9)
memory usage: 563.8+ KB


In [442]:
def preprocessamento(texto):
    texto = texto.replace('[^a-zA-Z]',' ').lower()
    stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
    texto = texto.replace(stop_re, '')
    texto = texto.split()
    return texto

In [443]:
dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocessamento)
dataset["Processed Product Name"]

0        [db, longboards, coreflex, crossbow, 41", bamb...
1        [electronic, snap, circuits, mini, kits, class...
2        [3doodler, create, flexy, 3d, printing, filame...
3        [guillow, airplane, design, studio, with, trav...
4                   [woodstock-, collage, 500, pc, puzzle]
                               ...                        
9995     [cozy, line, home, fashions, size, 2, piece, o...
9996           [lego, 8-brick, storage, box,, bright, red]
9998     [trends, international, nfl, la, chargers, hg,...
9999     [newpath, learning, 10, piece, science, owls, ...
10001    [hasegawa, ladders, lucano, step, ladder,, ora...
Name: Processed Product Name, Length: 7216, dtype: object

In [444]:
modelo = Word2Vec(sentences=dataset["Processed Product Name"], vector_size=100, window=5, min_count=1, workers=4)

In [445]:
def vectorize_product(product_name):
    words = [word for word in product_name if word in modelo.wv]
    if len(words) > 0:
        return np.mean([modelo.wv[word] for word in words], axis=0)
    else:
        return np.zeros(modelo.wv.vector_size)

In [446]:
dataset["Product Vector"] = dataset["Processed Product Name"].apply(vectorize_product)
dataset["Product Vector"]

0        [-0.026015468, 0.038651183, 0.035992764, 0.000...
1        [-0.103402406, 0.14822036, 0.1363379, -0.00632...
2        [-0.11704521, 0.18137215, 0.16436759, -0.01115...
3        [-0.2694781, 0.3530375, 0.3001987, -0.03014556...
4        [-0.12634698, 0.21866429, 0.1763436, -0.016193...
                               ...                        
9995     [-0.17574261, 0.27484408, 0.22719797, -0.01636...
9996     [-0.14175738, 0.21316409, 0.19264893, -0.00976...
9998     [-0.1412848, 0.21853943, 0.19099298, -0.012475...
9999     [-0.15240571, 0.24434316, 0.19968817, -0.01123...
10001    [-0.03310827, 0.055054367, 0.043399375, -0.001...
Name: Product Vector, Length: 7216, dtype: object

In [447]:
def recomendar_produtos(input_text, top_n=5):
    # Pré-processar a entrada do usuário
    input_text_processed = preprocessamento(input_text)
    
    # Gerar vetor para a entrada
    input_vector = vectorize_product(input_text_processed)
    
    # Calcular similaridades cosseno
    similarities = dataset["Product Vector"].apply(lambda x: cosine_similarity([input_vector], [x])[0][0])
    
    # Ordenar por similaridade e pegar os top_n produtos
    top_indices = similarities.nlargest(top_n).index
    
    # Retornar o DataFrame com os produtos recomendados, mas mantendo os nomes originais
    return dataset.loc[top_indices, dataset.columns != 'Product Vector']

In [448]:
input_text = input("Digite o nome do produto: ")

In [449]:
recomendacoes = recomendar_produtos(input_text, top_n=5)
print(recomendacoes)

                                        Product Name Selling Price  \
0  DB Longboards CoreFlex Crossbow 41" Bamboo Fib...       $237.68   
1  Electronic Snap Circuits Mini Kits Classpack, ...        $99.95   
2  3Doodler Create Flexy 3D Printing Filament Ref...        $34.99   
3  Guillow Airplane Design Studio with Travel Cas...        $28.91   
4                   Woodstock- Collage 500 pc Puzzle        $17.49   

                                       About Product  \
0  Make sure this fits by entering your model num...   
1  Make sure this fits by entering your model num...   
2  Make sure this fits by entering your model num...   
3  Make 8 different Planes at one time. | Experim...   
4  Make sure this fits by entering your model num...   

                               Product Specification Shipping Weight  \
0  Shipping Weight: 10.7 pounds (View shipping ra...     10.7 pounds   
1  Product Dimensions:         14.7 x 11.1 x 10.2...        4 pounds   
2  ProductDimensions:10.3x