# 0.0. Importações

## 0.1. Bibliotecas

In [2]:
import ast
import nltk
import sklearn
import numpy as np
import pandas as pd

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 0.2. Dataset

In [4]:
df_raw = pd.read_csv('../data/amazon.csv')

## 0.3. Funções

# 1.0. Descrição dos Dados

In [7]:
df1 = df_raw.copy()

## 1.1. Shape

In [8]:
print(f'Number of rows: {df1.shape[0]}')
print(f'Number of cols: {df1.shape[1]}')

Number of rows: 1465
Number of cols: 16


## 1.2. Types

In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

# 2.0. Filtros

In [10]:
df2 = df1.copy()

## 2.1. Colunas

In [13]:
df2 = df2[['product_id', 'product_name', 'category', 'about_product', 'review_title', 'review_content']]

# 3.0. Análise Exploratória de Dados

In [15]:
df3 = df2.copy()

## 3.1. Check NA

In [16]:
df3.isnull().sum()

product_id        0
product_name      0
category          0
about_product     0
review_title      0
review_content    0
dtype: int64

## 3.2. Shape

In [20]:
df3.shape

(1465, 6)

# 4.0. Processamento de Texto

In [33]:
df4 = df3.copy()

In [34]:
df4.head(1)

Unnamed: 0,product_id,product_name,category,about_product,review_title,review_content
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...


## 4.1. Limpeza dos Dados de Texto

In [35]:
df4['category'] = df4['category'].apply(lambda x:x.split())
df4['about_product'] = df4['about_product'].apply(lambda x:x.split())
df4['review_title'] = df4['review_title'].apply(lambda x:x.split())
df4['review_content'] = df4['review_content'].apply(lambda x:x.split())

In [37]:
df4.head(1)

Unnamed: 0,product_id,product_name,category,about_product,review_title,review_content
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,[Computers&Accessories|Accessories&Peripherals...,"[High, Compatibility, :, Compatible, With, iPh...","[Satisfied,Charging, is, really, fast,Value, f...","[Looks, durable, Charging, is, fine, tooNo, co..."


## 4.2. Preparando os Dados para Vetorização em Outro Espaço Vetorial

In [38]:
df4['tags'] = df4['category'] + \
              df4['about_product'] + \
              df4['review_title'] + \
              df4['review_content']

In [40]:
# selecionar features que vão ser usadas
df4 = df4[['product_id', 'product_name', 'tags']]
# simplificar o vetor
df4['tags'] = df4['tags'].apply(lambda x:" ".join(x))
# colocar tudo em minúsculo
df4['tags'] = df4['tags'].apply(lambda x:x.lower())

In [42]:
df4.head(1)

Unnamed: 0,product_id,product_name,tags
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,computers&accessories|accessories&peripherals|...
