In [1]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset = pd.read_csv("../data/AmazonData.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8232 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

In [3]:
cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
dataset.drop(dataset.columns[cols], axis =1, inplace=True)
dataset.dropna(inplace = True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7216 entries, 0 to 10001
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Product Name           7216 non-null   object
 1   Category               7216 non-null   object
 2   Selling Price          7216 non-null   object
 3   About Product          7216 non-null   object
 4   Product Specification  7216 non-null   object
 5   Shipping Weight        7216 non-null   object
dtypes: object(6)
memory usage: 394.6+ KB


In [4]:
new = dataset["Category"].str.split("|", n = 3, expand = True)
new = new.apply(lambda x: x.str.strip())
  
# making the first category called Main Category
dataset["Main Category"]= new[0] 
  
# making the second category called sub_category 
dataset["Sub Category"]= new[1]

# making the third category called side_category 
dataset["Side Category"]= new[2]

# making the last column consist of the remaining categories
dataset["Other Category"]= new[3]

# Dropping old category columns and the remaining categories 
dataset.drop(columns =["Category"], inplace = True)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7216 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Product Name           7216 non-null   object
 1   Selling Price          7216 non-null   object
 2   About Product          7216 non-null   object
 3   Product Specification  7216 non-null   object
 4   Shipping Weight        7216 non-null   object
 5   Main Category          7216 non-null   object
 6   Sub Category           7216 non-null   object
 7   Side Category          6231 non-null   object
 8   Other Category         2791 non-null   object
dtypes: object(9)
memory usage: 563.8+ KB


In [6]:
def preprocessamento(texto):
    texto = texto.replace('[^a-zA-Z]',' ').lower()
    stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
    texto = texto.replace(stop_re, '')
    texto = texto.split()
    return texto

In [7]:
dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocessamento)
dataset["Processed Product Name"]

0        [db, longboards, coreflex, crossbow, 41", bamb...
1        [electronic, snap, circuits, mini, kits, class...
2        [3doodler, create, flexy, 3d, printing, filame...
3        [guillow, airplane, design, studio, with, trav...
4                   [woodstock-, collage, 500, pc, puzzle]
                               ...                        
9995     [cozy, line, home, fashions, size, 2, piece, o...
9996           [lego, 8-brick, storage, box,, bright, red]
9998     [trends, international, nfl, la, chargers, hg,...
9999     [newpath, learning, 10, piece, science, owls, ...
10001    [hasegawa, ladders, lucano, step, ladder,, ora...
Name: Processed Product Name, Length: 7216, dtype: object

In [8]:
modelo = Word2Vec(sentences=dataset["Processed Product Name"], vector_size=100, window=5, min_count=1, workers=4)

In [9]:
def vectorize_product(product_name):
    words = [word for word in product_name if word in modelo.wv]
    if len(words) > 0:
        return np.mean([modelo.wv[word] for word in words], axis=0)
    else:
        return np.zeros(modelo.wv.vector_size)

In [10]:
dataset["Product Vector"] = dataset["Processed Product Name"].apply(vectorize_product)
dataset["Product Vector"]

0        [-0.028427698, 0.038930576, 0.034446027, 0.000...
1        [-0.11887617, 0.15092024, 0.1340297, -0.003559...
2        [-0.13145433, 0.18151474, 0.15550838, -0.00980...
3        [-0.30230486, 0.341934, 0.29935572, -0.0174990...
4        [-0.14529756, 0.22786859, 0.17120734, -0.01370...
                               ...                        
9995     [-0.1954298, 0.27134368, 0.21675357, -0.014566...
9996     [-0.15959293, 0.20706719, 0.17997284, -0.00754...
9998     [-0.15552968, 0.22217631, 0.18348745, -0.01008...
9999     [-0.17371441, 0.24964409, 0.19012699, -0.01401...
10001    [-0.03730512, 0.056377154, 0.043323267, 0.0004...
Name: Product Vector, Length: 7216, dtype: object

In [11]:
def recomendar_produtos(input_text, top_n=5):
    # Pré-processar a entrada do usuário
    input_text_processed = preprocessamento(input_text)
    
    # Gerar vetor para a entrada
    input_vector = vectorize_product(input_text_processed)
    
    # Calcular similaridades cosseno
    similarities = dataset["Product Vector"].apply(lambda x: cosine_similarity([input_vector], [x])[0][0])
    
    # Ordenar por similaridade e pegar os top_n produtos
    top_indices = similarities.nlargest(top_n).index
    
    # Retornar o DataFrame com os produtos recomendados, mas mantendo os nomes originais
    return dataset.loc[top_indices, dataset.columns != 'Product Vector']

In [12]:
input_text = input("Digite o nome do produto: ")

In [13]:
recomendacoes = recomendar_produtos(input_text, top_n=5)
print(recomendacoes)

                                           Product Name Selling Price  \
3349                  LEGO Storage Brick 8, Bright Pink        $39.99   
3940                     LEGO Round Storage Box 1, Blue        $11.92   
8755  Ashley Productions Big Hardcover Blank Book Pa...        $51.35   
2423  School Smart Blunt Tip Student Scissors, 6-1/4...        $25.32   
3821  Mega Construx Heroes Battle of Eternia Collection        $10.19   

                                          About Product  \
3349  Make sure this fits by entering your model num...   
3940  Make sure this fits by entering your model num...   
8755  Make sure this fits by entering your model num...   
2423  Make sure this fits by entering your model num...   
3821  Make sure this fits by entering your model num...   

                                  Product Specification Shipping Weight  \
3349  ProductDimensions:19.7x9.8x7.1inches|ItemWeigh...      4.8 pounds   
3940  ProductDimensions:4.8x4.8x7.2inches|ItemWeight... 