In [30]:
import pandas as pd
import numpy as np
import nltk
import gensim


from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 

dataset = pd.read_csv("AmazonData.csv")

# Excluding columns that we dont use
cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
dataset.drop(dataset.columns[cols], axis =1, inplace=True)
dataset.dropna(inplace = True)

# Splitting Category in 3 parts
new = dataset["Category"].str.split("|", n = 2, expand = True)
  
# making the first category called Main Category
dataset["Main Category"]= new[0] 
  
# making the second category called sub_category 
dataset["Sub-Category"]= new[1]

# making the third category called side_category 
dataset["Side Category"]= new[2]


dataset.info()

[nltk_data] Downloading package punkt to C:\Users\ricar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ricar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ricar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
Index: 7216 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Product Name           7216 non-null   object
 1   Category               7216 non-null   object
 2   Selling Price          7216 non-null   object
 3   About Product          7216 non-null   object
 4   Product Specification  7216 non-null   object
 5   Shipping Weight        7216 non-null   object
 6   Main Category          7216 non-null   object
 7   Sub-Category           7216 non-null   object
 8   Side Category          6231 non-null   object
dtypes: object(9)
memory usage: 563.8+ KB


In [31]:
# Setting Column Selling Price as float value
# Database Price and weight treatment
dataset.rename(columns = {'Uniq Id':'Id','Shipping Weight':'Shipping Weight(Pounds)', 'Selling Price':'Selling Price($)'}, inplace = True)

# Removing units from Price and Weight
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.strip('ounces')
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.strip('pounds')
dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace('$', '')

# Removing rows with Total Price invalid
indexes = dataset[dataset['Selling Price($)'] == 'Total price:'].index
dataset.drop(indexes, inplace=True)

# Removing rows with '-' character
dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace(',', '', regex=False)
indexes = dataset[dataset['Selling Price($)'].str.contains('-', na=False)].index
dataset.drop(indexes, inplace=True)

# Removing rows with '&' character
indexes = dataset[dataset['Selling Price($)'].str.contains('&', na=False)].index
dataset.drop(indexes, inplace=True)

# Removing rows with 'Currently' character
indexes = dataset[dataset['Selling Price($)'].str.contains('Currently', na=False)].index
dataset.drop(indexes, inplace=True)

# Removing rows with 'from' character
indexes = dataset[dataset['Selling Price($)'].str.contains('from', na=False)].index
dataset.drop(indexes, inplace=True)

# Adjusting values with wrong format
dataset['Selling Price($)'] = dataset['Selling Price($)'].str.split(' ').str[0]
dataset['Selling Price($)'] = dataset['Selling Price($)'].astype(float)


dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7137 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product Name             7137 non-null   object 
 1   Category                 7137 non-null   object 
 2   Selling Price($)         7137 non-null   float64
 3   About Product            7137 non-null   object 
 4   Product Specification    7137 non-null   object 
 5   Shipping Weight(Pounds)  7137 non-null   object 
 6   Main Category            7137 non-null   object 
 7   Sub-Category             7137 non-null   object 
 8   Side Category            6155 non-null   object 
dtypes: float64(1), object(8)
memory usage: 557.6+ KB


In [32]:
# Setting Column Shipping Weight as float value
indexes = dataset[dataset['Shipping Weight(Pounds)'].str.contains(r'\. ', na=False)].index

dataset.at[1619, 'Shipping Weight(Pounds)']
dataset.drop(1619, inplace=True)
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace(',', '', regex=False)
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].astype(float)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7136 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product Name             7136 non-null   object 
 1   Category                 7136 non-null   object 
 2   Selling Price($)         7136 non-null   float64
 3   About Product            7136 non-null   object 
 4   Product Specification    7136 non-null   object 
 5   Shipping Weight(Pounds)  7136 non-null   float64
 6   Main Category            7136 non-null   object 
 7   Sub-Category             7136 non-null   object 
 8   Side Category            6155 non-null   object 
dtypes: float64(2), object(7)
memory usage: 557.5+ KB


In [33]:
# use of lemmatization
def preprocessamento(texto):
    texto = texto.replace('[^a-zA-Z]',' ').lower()
    stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
    texto = texto.replace(stop_re, '')
    texto = texto.split()

    # Add lemmatization using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in texto]
    return lemmatized_text

In [34]:
# Preprocess with lemmatization
dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocessamento)
dataset["Processed Product Name"]

0        [db, longboards, coreflex, crossbow, 41", bamb...
1        [electronic, snap, circuit, mini, kit, classpa...
2        [3doodler, create, flexy, 3d, printing, filame...
3        [guillow, airplane, design, studio, with, trav...
4                   [woodstock-, collage, 500, pc, puzzle]
                               ...                        
9995     [cozy, line, home, fashion, size, 2, piece, oc...
9996           [lego, 8-brick, storage, box,, bright, red]
9998     [trend, international, nfl, la, charger, hg, -...
9999     [newpath, learning, 10, piece, science, owl, a...
10001    [hasegawa, ladder, lucano, step, ladder,, orange]
Name: Processed Product Name, Length: 7136, dtype: object

In [35]:
#Inicio do modelo de recomendacao
modelo = Word2Vec(sentences=dataset["Processed Product Name"], vector_size=100, window=5, min_count=1, workers=4)

def vectorize_product(product_name):
    words = [word for word in product_name if word in modelo.wv]
    if len(words) > 0:
        return np.mean([modelo.wv[word] for word in words], axis=0)
    else:
        return np.zeros(modelo.wv.vector_size)
    
dataset["Product Vector"] = dataset["Processed Product Name"].apply(vectorize_product)
dataset["Product Vector"]

0        [-0.0062452788, 0.029205084, 0.013634283, 0.01...
1        [-0.020676203, 0.15442109, 0.053754345, 0.1054...
2        [-0.018686125, 0.13685417, 0.044907585, 0.0983...
3        [-0.0619574, 0.26906204, 0.09278533, 0.1580728...
4        [-0.023948986, 0.16697064, 0.046132583, 0.1102...
                               ...                        
9995     [-0.040653158, 0.21899302, 0.07272695, 0.15713...
9996     [-0.024467006, 0.16234973, 0.060247432, 0.1109...
9998     [-0.031669613, 0.1703979, 0.048933074, 0.12644...
9999     [-0.03083203, 0.20535116, 0.06601711, 0.155057...
10001    [-0.004292644, 0.041014943, 0.017365305, 0.029...
Name: Product Vector, Length: 7136, dtype: object

In [36]:
def recomendar_produtos(input_text, top_n=5):
    # Pré-processar a entrada do usuário
    input_text_processed = preprocessamento(input_text)
    
    # Gerar vetor para a entrada
    input_vector = vectorize_product(input_text_processed)
    
    # Calcular similaridades cosseno
    similarities = dataset["Product Vector"].apply(lambda x: cosine_similarity([input_vector], [x])[0][0])
    
    # Ordenar por similaridade e pegar os top_n produtos
    top_indices = similarities.nlargest(top_n).index
    
    # Retornar o DataFrame com os produtos recomendados, mas mantendo os nomes originais
    return dataset.loc[top_indices, dataset.columns != 'Product Vector']

In [37]:
input_text = input("Digite o nome do produto: ")

In [38]:
recomendacoes = recomendar_produtos(input_text, top_n=5)
print(recomendacoes)


                                           Product Name  \
2762                    Foil Snowflake Danglers (2/Pkg)   
2305                 Plaid Hat Pandemic: Rapid Response   
1313                             Lemming 7" (Item 4707)   
4910  Sunstaches Marvel Avengers Hulk Character Sung...   
370                    Marvel Hulk Fist Pewter Key Ring   

                                               Category  Selling Price($)  \
2762       Toys & Games | Party Supplies | Party Favors             13.35   
2305   Toys & Games | Games & Accessories | Board Games             34.34   
1313  Toys & Games | Stuffed Animals & Plush Toys | ...             21.75   
4910  Clothing, Shoes & Jewelry | Costumes & Accesso...              8.98   
370   Toys & Games | Collectible Toys | Collectible ...              5.51   

                                          About Product  \
2762  Make sure this fits by entering your model num...   
2305  Make sure this fits by entering your model num...   
1313 