In [1]:
# Import pandas and display the installed version (2.2.2 when this notebook was last run)
import pandas as pd
pd.__version__

'2.2.2'

In [2]:
# Load the Amazon India sales-review dataset (the path is relative to the working directory)
csv_path = '../dados/amazon.csv'
amazon = pd.read_csv(csv_path)

In [3]:
# Column names of the raw dataset
amazon.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [4]:
# Row index of the raw, untreated dataset (its range shows how many records were loaded)
amazon.index

RangeIndex(start=0, stop=1465, step=1)

In [5]:
# Keep only the columns that are relevant to the analysis
relevant_columns = [
    'category',
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'rating',
    'rating_count',
]
amazon = amazon[relevant_columns]
amazon.head()

Unnamed: 0,category,discounted_price,actual_price,discount_percentage,rating,rating_count
0,Computers&Accessories|Accessories&Peripherals|...,₹399,"₹1,099",64%,4.2,24269
1,Computers&Accessories|Accessories&Peripherals|...,₹199,₹349,43%,4.0,43994
2,Computers&Accessories|Accessories&Peripherals|...,₹199,"₹1,899",90%,3.9,7928
3,Computers&Accessories|Accessories&Peripherals|...,₹329,₹699,53%,4.2,94363
4,Computers&Accessories|Accessories&Peripherals|...,₹154,₹399,61%,4.2,16905


In [6]:
# Strip the currency symbol, thousands separators and percent sign from the string columns.
# Each column maps to the characters removed from it, in the order they are removed.
symbols_by_column = {
    'discounted_price': ('₹', ','),
    'actual_price': ('₹', ','),
    'discount_percentage': ('%',),
    'rating_count': (',',),
}

for column, symbols in symbols_by_column.items():
    for symbol in symbols:
        amazon[column] = amazon[column].str.replace(symbol, '')

In [7]:
# Drop the row(s) whose 'rating' holds the inconsistent placeholder value '|'
invalid_rating_labels = amazon.loc[amazon['rating'] == '|'].index
amazon.drop(index=invalid_rating_labels, inplace=True)

In [8]:
# Remove every row that has at least one missing value
amazon = amazon.dropna(axis=0, how='any')

In [9]:
# Convert the cleaned string columns to numeric dtypes (float and int)
numeric_dtypes = {
    'discounted_price': float,
    'actual_price': float,
    'discount_percentage': float,
    'rating': float,
    'rating_count': int,
}
amazon = amazon.astype(numeric_dtypes)

In [10]:
# First look at the data after the cleaning and type conversions
amazon.head(n=5)

Unnamed: 0,category,discounted_price,actual_price,discount_percentage,rating,rating_count
0,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269
1,Computers&Accessories|Accessories&Peripherals|...,199.0,349.0,43.0,4.0,43994
2,Computers&Accessories|Accessories&Peripherals|...,199.0,1899.0,90.0,3.9,7928
3,Computers&Accessories|Accessories&Peripherals|...,329.0,699.0,53.0,4.2,94363
4,Computers&Accessories|Accessories&Peripherals|...,154.0,399.0,61.0,4.2,16905


In [11]:
# Linear correlation matrix over all numeric columns, to check whether any
# variable is strongly described by another
numeric_columns = ['discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count']
amazon.loc[:, numeric_columns].corr()

Unnamed: 0,discounted_price,actual_price,discount_percentage,rating,rating_count
discounted_price,1.0,0.96191,-0.242298,0.121132,-0.027304
actual_price,0.96191,1.0,-0.117855,0.122467,-0.036215
discount_percentage,-0.242298,-0.117855,1.0,-0.155679,0.011294
rating,0.121132,0.122467,-0.155679,1.0,0.102235
rating_count,-0.027304,-0.036215,0.011294,0.102235,1.0


In [12]:
# Count how many records exist for each product category.
# Series.to_dict() is the idiomatic conversion and yields native Python ints,
# so the displayed dict does not depend on how NumPy prints its scalar types.
categories_value_counts = amazon['category'].value_counts().to_dict()
categories_value_counts

{'Computers&Accessories|Accessories&Peripherals|Cables&Accessories|Cables|USBCables': 231,
 'Electronics|WearableTechnology|SmartWatches': 76,
 'Electronics|Mobiles&Accessories|Smartphones&BasicMobiles|Smartphones': 68,
 'Electronics|HomeTheater,TV&Video|Televisions|SmartTelevisions': 63,
 'Electronics|Headphones,Earbuds&Accessories|Headphones|In-Ear': 52,
 'Electronics|HomeTheater,TV&Video|Accessories|RemoteControls': 49,
 'Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAppliances|MixerGrinders': 27,
 'Electronics|HomeTheater,TV&Video|Accessories|Cables|HDMICables': 24,
 'Home&Kitchen|Kitchen&HomeAppliances|Vacuum,Cleaning&Ironing|Irons,Steamers&Accessories|Irons|DryIrons': 24,
 'Computers&Accessories|Accessories&Peripherals|Keyboards,Mice&InputDevices|Mice': 24,
 'Home&Kitchen|Heating,Cooling&AirQuality|WaterHeaters&Geysers|InstantWaterHeaters': 23,
 'Home&Kitchen|Kitchen&HomeAppliances|Vacuum,Cleaning&Ironing|Irons,Steamers&Accessories|LintShavers': 22,
 'Home&Kitchen|Heating,Cooli

In [13]:
# Split the cleaned data into one dataframe per top-level category prefix.
# The four segments hold 1457 records in total (451 + 526 + 449 + 31), i.e. 99.45%
# of the 1465 rows in the original .csv.
# NOTE(review): the counts come from this notebook's recorded outputs; re-check them
# with len() if the input data changes.
def _segment_by_category_prefix(frame, prefix):
    """Return the rows of `frame` whose 'category' starts with `prefix`.

    The result is a new dataframe with a fresh 0..n-1 index; the previous row
    labels are kept in an 'index' column (the default `reset_index()` behaviour).
    """
    return frame[frame['category'].str.startswith(prefix)].reset_index()


computers_accessories = _segment_by_category_prefix(amazon, 'Computers')  # 451 records

electronics = _segment_by_category_prefix(amazon, 'Electronics')  # 526 records

# NOTE(review): 'Home' matches every category whose name starts with "Home" - confirm
# that only the intended top-level category is captured.
home = _segment_by_category_prefix(amazon, 'Home')  # 449 records

office = _segment_by_category_prefix(amazon, 'Office')  # 31 records

In [14]:
# Central tendency and dispersion measures for the Computers & Accessories segment
summary_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
summary_stats = ['mean', 'median', 'min', 'max', 'std']
computers_accessories_agg = computers_accessories[summary_columns].agg(summary_stats)
computers_accessories_agg

Unnamed: 0,discounted_price,actual_price,rating,rating_count
mean,845.393836,1686.659157,4.155654,17136.782705
median,349.0,999.0,4.2,7732.0
min,39.0,39.0,3.3,5.0
max,37247.0,59890.0,5.0,253105.0
std,2388.047248,3885.71959,0.252776,30133.915819


In [15]:
# Central tendency and dispersion measures for the Electronics segment
electronics_agg = electronics[['discounted_price', 'actual_price', 'rating', 'rating_count']].agg(['mean', 'median', 'min', 'max', 'std'])
# Backward-compatible alias: this summary was originally bound to the misspelled
# name `eletronics_agg`, so keep that name pointing at the same dataframe.
eletronics_agg = electronics_agg
electronics_agg

Unnamed: 0,discounted_price,actual_price,rating,rating_count
mean,5965.887833,10127.311787,4.081749,29997.809886
median,1399.0,3199.5,4.1,10689.0
min,79.0,171.0,2.8,7.0
max,77990.0,139900.0,4.7,426973.0
std,10279.670967,15495.772671,0.26962,61729.576162


In [16]:
# Central tendency and dispersion measures for the Home segment
summary_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
summary_stats = ['mean', 'median', 'min', 'max', 'std']
home_agg = home[summary_columns].agg(summary_stats)
home_agg

Unnamed: 0,discounted_price,actual_price,rating,rating_count
mean,2322.251247,4150.797327,4.041648,6678.492205
median,1199.0,1999.0,4.1,2326.0
min,79.0,79.0,2.0,2.0
max,42990.0,75990.0,4.8,270563.0
std,3466.06991,6797.371662,0.334648,16418.584549


In [17]:
# Central tendency and dispersion measures for the Office segment
summary_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
summary_stats = ['mean', 'median', 'min', 'max', 'std']
office_agg = office[summary_columns].agg(summary_stats)
office_agg

Unnamed: 0,discounted_price,actual_price,rating,rating_count
mean,301.580645,397.193548,4.309677,4828.225806
median,178.0,210.0,4.3,4426.0
min,50.0,50.0,4.0,388.0
max,1399.0,2999.0,4.5,12179.0
std,317.288699,552.781839,0.149119,2853.095766


In [18]:
# Linear correlation matrix restricted to the Computers & Accessories segment
numeric_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
computers_accessories.loc[:, numeric_columns].corr()

Unnamed: 0,discounted_price,actual_price,rating,rating_count
discounted_price,1.0,0.966529,0.079644,-0.005534
actual_price,0.966529,1.0,0.084355,-0.007987
rating,0.079644,0.084355,1.0,0.159041
rating_count,-0.005534,-0.007987,0.159041,1.0


In [19]:
# Linear correlation matrix restricted to the Electronics segment
numeric_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
electronics.loc[:, numeric_columns].corr()

Unnamed: 0,discounted_price,actual_price,rating,rating_count
discounted_price,1.0,0.963839,0.212982,-0.104925
actual_price,0.963839,1.0,0.212889,-0.123015
rating,0.212982,0.212889,1.0,0.122618
rating_count,-0.104925,-0.123015,0.122618,1.0


In [20]:
# Linear correlation matrix restricted to the Home segment
numeric_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
home.loc[:, numeric_columns].corr()

Unnamed: 0,discounted_price,actual_price,rating,rating_count
discounted_price,1.0,0.93417,0.202371,-0.027408
actual_price,0.93417,1.0,0.19236,-0.039668
rating,0.202371,0.19236,1.0,0.062597
rating_count,-0.027408,-0.039668,0.062597,1.0


In [21]:
# Linear correlation matrix restricted to the Office segment
numeric_columns = ['discounted_price', 'actual_price', 'rating', 'rating_count']
office.loc[:, numeric_columns].corr()

Unnamed: 0,discounted_price,actual_price,rating,rating_count
discounted_price,1.0,0.9006,0.206934,0.023848
actual_price,0.9006,1.0,0.070703,0.028086
rating,0.206934,0.070703,1.0,0.205024
rating_count,0.023848,0.028086,0.205024,1.0
