In [1]:
import pandas as pd                                             # importando pandas
pd.set_option('display.max_rows', None)                         # 
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_colwidth', None)


In [2]:
dataset = pd.read_csv('../data/AmazonData.csv')                    # atribuindo a base de dados à variavel dataset
dataset["Category"].head()                                      # printando as 5 primeiras linhas da base de dados, somente a coluna "Category"

0    Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards
1                                                                                   Toys & Games | Learning & Education | Science Kits & Toys
2                                                                                                   Toys & Games | Arts & Crafts | Craft Kits
3                                                             Toys & Games | Hobbies | Models & Model Kits | Model Kits | Airplane & Jet Kits
4                                                                                                     Toys & Games | Puzzles | Jigsaw Puzzles
Name: Category, dtype: object

In [3]:
dataset.info()                                          # printando um resumo da base de dados

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8230 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

In [4]:
cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
dataset.drop(dataset.columns[cols], axis =1, inplace=True)
dataset.dropna(inplace = True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7216 entries, 0 to 10001
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Product Name           7216 non-null   object
 1   Category               7216 non-null   object
 2   Selling Price          7216 non-null   object
 3   About Product          7216 non-null   object
 4   Product Specification  7216 non-null   object
 5   Shipping Weight        7216 non-null   object
dtypes: object(6)
memory usage: 394.6+ KB


In [5]:
index_to_drop = pd.read_csv('../data/index_to_drop.csv', header=0).squeeze().tolist()
dataset.drop(index_to_drop, inplace=True)

In [6]:
                                                                                                        # trattamento de dados   
dataset['Selling Price_processed'] = dataset['Selling Price'].apply(lambda x: str(x).replace('$',''))   # retirando "$" dos preços
# dataset['Selling Price_processed'] = dataset['Selling Price_processed'].astype(float)                   # definindo os preços como float, e não object
dataset.to_csv('../data/ReferenceData.csv', index=False)

We can compute the similarity between product descriptions using TfidfVectorizer

# TF-IDF 

In [None]:
dataset = pd.read_csv('../data/ReferenceData.csv') 

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer         # importando feature TF-IDF de sklearn                
                                                                    # TF-IDF da valores as palavras de um documento com base na sua 
                                                                    # frequencia de uso

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')                       # função que remove palavras de pouca relevancia

#Replace NaN with an empty string
dataset["Category"] = dataset["Category"].fillna("")                # preenche valores NA da base de dados com "nada"

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(dataset["Category"])             # retorna uma matriz com o vocabulário e o IDF aprendidos

#Output the shape of tfidf_matrix
tfidf_matrix.shape                                                  # printa as dimensãoes da matriz de vocabulário

(7136, 1017)

We can observe that there are 1133 vocabularies in our dataset of 10002 products.

In [9]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[0:20]                                     # printa os vocabulários

array(['accent', 'accents', 'accessories', 'action', 'activities',
       'activity', 'additives', 'adhesives', 'adirondack', 'advent',
       'agility', 'aids', 'air', 'airbrush', 'aircraft', 'airplane',
       'airplanes', 'albums', 'alternators', 'amazonpets'], dtype=object)

We will use this matrix to calculate the similarity score with linear kernel, cosine similarity and sigmoid score.

# Content-Based Filtering

In [10]:
# Import linear_kernel, cosine_similarity, and sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel                      # importa os métodos para calcular distancias de 3 maneiras:
from sklearn.metrics.pairwise import cosine_similarity                  # linear, cosseno e sigmoide
from sklearn.metrics.pairwise import sigmoid_kernel                     # ver https://scikit-learn.org/stable/modules/metrics.html#linear-kernel

# Compute the cosine similarity matrix
linear = linear_kernel(tfidf_matrix, tfidf_matrix)                      # retorna a matriz de distancias de todos os vocabuláriros por todos os vocabulários   
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)              # obtendo as distancias calculadas por 3 métodos diferentes
sig_score = sigmoid_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
print(linear.shape)                                 # printa as dimensões das matrizes 
print(cosine_sim.shape)
print(sig_score.shape)

(7136, 7136)
(7136, 7136)
(7136, 7136)


In [12]:
print("linear:\n")                                  # printa as matrizes de similaridade (quanto maior o valor, mais similar)
print(linear)                                       # tirando a diagonal principal, q é 1(ou 0,76...) por ser a distncia da palavra,
print("\ncosine:\n")                                # por ela mesma
print(cosine_sim)
print("\nsigmoid:\n")
print(sig_score)

linear:

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.2762659  ... 0.09542959 0.26489059 0.        ]
 [0.         0.2762659  1.         ... 0.64177978 0.07873749 0.        ]
 ...
 [0.         0.09542959 0.64177978 ... 1.         0.12268616 0.        ]
 [0.         0.26489059 0.07873749 ... 0.12268616 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]

cosine:

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.2762659  ... 0.09542959 0.26489059 0.        ]
 [0.         0.2762659  1.         ... 0.64177978 0.07873749 0.        ]
 ...
 [0.         0.09542959 0.64177978 ... 1.         0.12268616 0.        ]
 [0.         0.26489059 0.07873749 ... 0.12268616 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]

sigmoid:

[[0.7620068  0.76159416 0.76159416 ... 0.76159416 0.76159416 0.76159416]
 [0.76159

This matrix represents each product category's similarity score with every other product category.

We need to define a function that takes the product name as an input and outputs a list of the 10 most similar products. For this we need a reverse mapping of products and DataFrame indices. This means we need a mechanism to identify the index of a product in our DataFrame.

In [13]:
#Construct a reverse map of indices and product names
indices = pd.Series(dataset.index, index=dataset["Product Name"])       # constroi um vetor de indices com os nomes dos produtos da base de dados

In [14]:
indices[:20]                                                            #  printa os 20 primeiros valores de indices

Product Name
DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete                                                                             0
Electronic Snap Circuits Mini Kits Classpack, FM Radio, Motion Detector, Music Box (Set of 5)                                                        1
3Doodler Create Flexy 3D Printing Filament Refill Bundle (X5 Pack, Over 1000'. of Extruded Plastics! - Innovate                                      2
Guillow Airplane Design Studio with Travel Case Building Kit                                                                                         3
Woodstock- Collage 500 pc Puzzle                                                                                                                     4
Rubie's Child's Pokemon Deluxe Pikachu Costume, X-Small                                                                                              5
ARTSCAPE Etched Glass 24" x 36" Window Film, 24-by-36-Inch                       

# UDF

In [15]:
from fuzzywuzzy import fuzz                                             # importa fuzzywuzzy
from fuzzywuzzy import process



In [16]:
choices = list(indices.index)                                           # cria vetor com as escolha a serem selecionadas

In [17]:
%%time

extracted = process.extract("lego", choices, limit=1)
extracted[0][0]

CPU times: total: 781 ms
Wall time: 1.97 s


'LEGO Lunch Box, Medium Pink'

In [18]:
print(dataset.loc[39])

Product Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               VTech Twist and Hug Koala Rattle
Category                                                                                                                                                                                                                                                                                                                    

In [19]:
# Function that takes in product name as input and outputs most similar product
def rec_lin(user_input, linear=linear):
    
    # use fuzzywuzzy to grab the product with name closest to user input
    extracted = process.extract(user_input, choices, limit=1)               # usa extract para extrair do vetor de escolhas, um produto que possui similaridade com o input do usuário
    product_name = extracted[0][0]                                          # atribui essa escolha a uma variável
    
    # Get the index of the product that matches the product name
    idx = indices[product_name]                                             # cria uma variável para armazenar o indice do produto

    # Get the pairwise similarity scores
    sim_scores = list(enumerate(linear[idx]))                               # cria lista com os valaores de produtos similares

    # Sort the products based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)       # organiza a lista

    # Get the scores of the 10 most similar products
    sim_scores = sim_scores[1:11]                                           # pega os 10 produtos mais similares

    # Get the product indices
    product_indices = [i[0] for i in sim_scores]                            # pega os indices dos produtos

    df_return = dataset[["Product Name","Selling Price", "Selling Price_processed"]].loc[product_indices]
    # Return the top 10 most similar products
    return df_return.sort_values(by="Selling Price_processed", ascending=True)[["Product Name","Selling Price"]]

In [40]:
name = input("What would you like to search for today? ")
rec_lin(name)

Unnamed: 0,Product Name,Selling Price
88,Firefly: The Game - Esmeralda Game Expansion,$12.50
291,Pressman Toys Giant Snakes & Ladders Game (4 Player),$14.90
118,Indie Boards and Cards Flash Point Fire Rescue 2nd Story,$14.99
278,Smart Play Ingenio Colors & Shapes Memory Match Game,$15.20
252,"Toysmith Get Outside GO! Neon Dart Ball Set, Packaging may vary",$16.78
223,Schylling Shuffle Shot,$19.99
250,"Carson-Dellosa Publishing Language Arts Learning Games, Grade 2",$24.99
302,Ninja Division NAS Howl & Yip Board Game,$3.07
193,GreenBrier Games Yashima Legends from Fairytale Board Game,$35.00
299,Inlaid Cribbage Box with Cards,$43.00
