In [None]:
!pip install pandas

In [None]:
import pandas as pd

### Importação dos arquivos CSV

Abaixo importei todos os arquivos .CSV do dataset "Brazilian E-Commerce Public Dataset by Olist" utilizando a biblioteca pandas. Cada DataFrame representa uma das entidades principais e seus relacionamentos.

In [None]:
# Orders and their detail tables.
# read_csv already uses "," as its default separator, so it is not passed explicitly.

df_orders = pd.read_csv("olist_orders_dataset.csv")
df_order_items = pd.read_csv("olist_order_items_dataset.csv")
df_payments = pd.read_csv("olist_order_payments_dataset.csv")
df_reviews = pd.read_csv("olist_order_reviews_dataset.csv")

# Related entities
df_customers = pd.read_csv("olist_customers_dataset.csv")
df_products = pd.read_csv("olist_products_dataset.csv")
df_sellers = pd.read_csv("olist_sellers_dataset.csv")
df_category = pd.read_csv("product_category_name_translation.csv")
df_geolocation = pd.read_csv("olist_geolocation_dataset.csv")

### Limpeza dos Dados

In [None]:
# Report, for every loaded DataFrame, the number of missing values per column
# and the number of rows that are exact duplicates of an earlier row.

dataframes = dict(
    orders=df_orders,
    order_items=df_order_items,
    payments=df_payments,
    reviews=df_reviews,
    customers=df_customers,
    sellers=df_sellers,
    products=df_products,
    category=df_category,
    geolocation=df_geolocation,
)

for name, df in dataframes.items():
    print(f"\n {name.upper()}")
    # isna() is the same check as isnull(); the sum is taken column by column.
    print("Quantidade de linhas com valores nulos:\n", df.isna().sum())
    # duplicated() marks rows whose values repeat an earlier row in all columns.
    print("Linhas com valores duplicados:", df.duplicated().sum())

In [48]:
# Parse the date columns as datetime so that time-based analysis can be done on them.

date_cols = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
             'order_delivered_customer_date', 'order_estimated_delivery_date']
df_orders[date_cols] = df_orders[date_cols].apply(pd.to_datetime)

# No column is filled with a mean or median: missing dates in the approved /
# delivered columns may represent cancelled orders, so they are left as they are.

# Same conversion for the two date columns of the reviews table.
for review_col in ('review_creation_date', 'review_answer_timestamp'):
    df_reviews[review_col] = pd.to_datetime(df_reviews[review_col])

# The products table has missing values, and whether to keep or drop them
# depends on how the data is used later.
# isna().sum() gives the number of missing values in each column.
print(df_products.isna().sum())

# Drop only the product rows that have no product_category_name: missing
# categories would get in the way of per-category aggregations and charts.
df_products.dropna(subset=['product_category_name'], inplace=True)

# Remove exact duplicate rows from every table. pandas treats a row as a
# duplicate only when it matches another row in all columns. The frames are
# changed in place, so every existing reference to them sees the cleaned data.
# Per the original note, geolocation may contain duplicates because there are
# several entries per city.
for frame in (
    df_orders,
    df_order_items,
    df_payments,
    df_reviews,
    df_customers,
    df_sellers,
    df_products,
    df_category,
    df_geolocation,
):
    frame.drop_duplicates(inplace=True)

product_id                    0
product_category_name         0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
product_weight_g              1
product_length_cm             1
product_height_cm             1
product_width_cm              1
dtype: int64


In [49]:
def remover_espacos(df):
    """Strip leading and trailing whitespace from the string values of a DataFrame.

    Only columns with ``object`` dtype are processed. Inside those columns only
    values that are real ``str`` instances are stripped; every other value
    (numbers, ``None``/``NaN``, ...) is kept exactly as it is. The ``.str``
    accessor is deliberately not used, because on a column that mixes strings
    with other types it replaces the non-string values with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    colunas_str = df.select_dtypes(include='object').columns
    if len(colunas_str) == 0:
        # No object columns: nothing to strip.
        return df

    def _strip_strings(coluna):
        # Strip only genuine strings and leave any other value untouched.
        valores = [v.strip() if isinstance(v, str) else v for v in coluna]
        # dtype=object stops pandas from re-inferring (and changing) the column type.
        return pd.Series(valores, index=coluna.index, dtype=object)

    df[colunas_str] = df[colunas_str].apply(_strip_strings)
    return df

# Run the whitespace clean-up on every loaded table, in the same order as they
# are listed, and rebind each name to the frame returned by remover_espacos.
(
    df_orders,
    df_order_items,
    df_payments,
    df_reviews,
    df_customers,
    df_sellers,
    df_products,
    df_category,
    df_geolocation,
) = [
    remover_espacos(frame)
    for frame in (
        df_orders,
        df_order_items,
        df_payments,
        df_reviews,
        df_customers,
        df_sellers,
        df_products,
        df_category,
        df_geolocation,
    )
]

In [50]:
# Show df_orders as the output of this cell.
df_orders

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26
...,...,...,...,...,...,...,...,...
99436,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28
99437,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02
99438,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27
99439,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15
