# Preprocesado de datasets

En este proyecto, trabajamos con 6 datasets de información de e-commerce en Brasil, proporcionados por Olist.

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

In [2]:
# The project root is taken to be the parent of the kernel's working directory
# (i.e. the notebook is expected to run from a sub-folder of the project).
BASE_DIR = Path.cwd().parent
# Absolute path of the project's data folder.
DATA_DIR = BASE_DIR.joinpath("data").resolve()

In [3]:
# Folder holding the untouched Olist CSV exports.
RAW_DIR = DATA_DIR / "raw"

# `pd.read_csv` already returns a DataFrame, so re-wrapping the result in
# `pd.DataFrame(...)` is redundant (and does not copy the data by default).
# Each raw frame is kept exactly as loaded, and every `df_*` name is an
# explicit, independent working copy that later cells are free to modify.
customers = pd.read_csv(RAW_DIR / "olist_customers_dataset.csv")
df_customers = customers.copy()

items = pd.read_csv(RAW_DIR / "olist_order_items_dataset.csv")
df_items = items.copy()

payments = pd.read_csv(RAW_DIR / "olist_order_payments_dataset.csv")
df_payments = payments.copy()

reviews = pd.read_csv(RAW_DIR / "olist_order_reviews_dataset.csv")
df_reviews = reviews.copy()

orders = pd.read_csv(RAW_DIR / "olist_orders_dataset.csv")
df_orders = orders.copy()

# The products dataset (olist_products_dataset.csv) is intentionally not
# loaded: it is not used in this notebook.

In [4]:
# List the column names of every loaded table to see which keys they share.
# (The products dataset is not loaded, so it is not listed here.)
tables_to_describe = (
    ("customer", df_customers),
    ("items", df_items),
    ("payments", df_payments),
    ("reviews", df_reviews),
    ("orders", df_orders),
)
for table_label, table in tables_to_describe:
    print(f"\n{table_label} columns:\n", table.columns)


customer columns:
 Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object')

items columns:
 Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value'],
      dtype='object')

payments columns:
 Index(['order_id', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value'],
      dtype='object')

reviews columns:
 Index(['review_id', 'order_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp'],
      dtype='object')

orders columns:
 Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object')


Podemos observar que no todos los DataFrames tienen las mismas columnas, por lo que tendremos que hacer varios merges entre ellos.

Primero comenzaré por los que puedo unir a través de la columna *"order_id"*.

In [5]:
# For each table, print how many times its most repeated order_id appears
# (1 means order_id is already unique in that table).
for table in (df_items, df_payments, df_reviews, df_orders):
    repetitions_per_order = table["order_id"].value_counts()
    print(repetitions_per_order.max())

21
29
3
1


Todos los dataframes tienen valores de *"order_id"* repetidos, excepto el que utilizaré como referencia, *"df_orders"*.

Para limpiar los dataframes, lo haré en scripts separados y luego importaré los .csv procesados.

## Unión de datasets por order_id

Una vez procesados, lo primero será importarlos.

In [6]:
# Folder holding the CSVs produced by the separate preprocessing scripts.
PROCESSED_DIR = DATA_DIR / "processed"

# NOTE: from this cell on, df_items / df_payments / df_reviews refer to the
# processed tables and no longer to the raw ones loaded earlier.
# `pd.read_csv` already returns a DataFrame, so the redundant
# `pd.DataFrame(...)` re-wrap is replaced by an explicit working copy; the
# `*_processed` names keep the frames exactly as read from disk.
items_processed = pd.read_csv(PROCESSED_DIR / "processed_items.csv")
df_items = items_processed.copy()

payments_processed = pd.read_csv(PROCESSED_DIR / "processed_payments.csv")
df_payments = payments_processed.copy()

reviews_processed = pd.read_csv(PROCESSED_DIR / "processed_reviews.csv")
df_reviews = reviews_processed.copy()

El dataset de orders, no es necesario procesarlo, ya que es nuestra referencia y no tiene duplicados en "order_id"

Antes de nada, un health check para comprobar que los order_id no están duplicados en ninguno de los datasets.

Es importante que orders sea el dataset con más filas, ya que es la tabla de referencia y los demás datasets pueden no contener todos los pedidos.

In [7]:
# Health check: for every table, report its row count, the number of distinct
# order_id values and how many rows repeat an order_id that appeared earlier.
tables_to_check = {
    "orders": df_orders,
    "payments": df_payments,
    "reviews": df_reviews,
    "items": df_items,
}
for table_label, table in tables_to_check.items():
    order_ids = table["order_id"]
    n_rows = len(table)
    n_unique = order_ids.nunique()
    n_dups = order_ids.duplicated().sum()
    print(f"{table_label:<8}  filas = {n_rows:>6,}   pedidos únicos = {n_unique:>6,}   duplicados = {n_dups}")

orders    filas = 99,441   pedidos únicos = 99,441   duplicados = 0
payments  filas = 99,440   pedidos únicos = 99,440   duplicados = 0
reviews   filas = 98,673   pedidos únicos = 98,673   duplicados = 0
items     filas = 98,666   pedidos únicos = 98,666   duplicados = 0


Procedo con los merges, partiendo siempre de df_orders como tabla base. Al ser *inner joins*, solo se conservan los pedidos presentes en todas las tablas.

In [8]:
# Build the order-level table by joining payments, reviews and items on order_id.
#
# * The chain starts from the raw `orders` frame (loaded in the first data cell,
#   same content as df_orders at that point) rather than from df_orders itself,
#   so re-running this cell does not merge already-merged data again.
# * how="inner" keeps only the orders that appear in all of the tables.
# * validate="one_to_one" makes pandas raise a MergeError if order_id is
#   duplicated on either side, so a join can never silently multiply rows.
df_orders = (
    orders
    .merge(df_payments, how="inner", on="order_id", validate="one_to_one")
    .merge(df_reviews, how="inner", on="order_id", validate="one_to_one")
    .merge(df_items, how="inner", on="order_id", validate="one_to_one")
)

df_orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,total_payment,payment_type,max_installments,total_reviews,worst_review,mean_review_score,last_review,review_comment_message,review_comment_title,total_price,item_count,total_freight_value
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,38.71,voucher,1,1,4,4.0,4,"Não testei o produto ainda, mas ele veio corre...",,29.99,1,8.72
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,141.46,boleto,1,1,4,4.0,4,Muito bom o produto.,Muito boa a loja,118.7,1,22.76
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,179.12,credit_card,3,1,5,5.0,5,,,159.9,1,19.22
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,72.2,credit_card,1,1,5,5.0,5,O produto foi exatamente o que eu esperava e e...,,45.0,1,27.2
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,28.62,credit_card,1,1,5,5.0,5,,,19.9,1,8.72


## Customers Dataset

In [9]:
# Preview the first three rows of the customers table before any column is removed.
df_customers.head(3)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP


En el caso de este dataset, lo haré en este notebook, dado que es más sencillo.

Las columnas que no me interesan son:
- *"customer_zip_code_prefix"* -> es solo el prefijo, no me aporta suficiente información
- *"customer_unique_id"* -> id que identifica a los clientes, no me sirve y ya tengo la columna *"customer_id"*

In [10]:
# Derive the trimmed customers table from the raw `customers` frame instead of
# dropping columns in place: an in-place drop raises KeyError when the cell is
# run a second time, whereas this assignment gives the same result on every run.
df_customers = customers.drop(
    columns=["customer_zip_code_prefix", "customer_unique_id"]
)

## Preparado final de datos

Por último, uno todos los datos en un dataset final y compruebo que todo esté correcto.

In [11]:
# Attach the customer columns to every order; only orders whose customer_id
# is present in the customers table are kept (inner join).
df_processed = pd.merge(
    df_orders,
    df_customers,
    on="customer_id",
    how="inner",
)

In [12]:
# Preview the first rows of the final merged dataset.
df_processed.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,total_payment,payment_type,...,worst_review,mean_review_score,last_review,review_comment_message,review_comment_title,total_price,item_count,total_freight_value,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,38.71,voucher,...,4,4.0,4,"Não testei o produto ainda, mas ele veio corre...",,29.99,1,8.72,sao paulo,SP
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,141.46,boleto,...,4,4.0,4,Muito bom o produto.,Muito boa a loja,118.7,1,22.76,barreiras,BA
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,179.12,credit_card,...,5,5.0,5,,,159.9,1,19.22,vianopolis,GO
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,72.2,credit_card,...,5,5.0,5,O produto foi exatamente o que eu esperava e e...,,45.0,1,27.2,sao goncalo do amarante,RN
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,28.62,credit_card,...,5,5.0,5,,,19.9,1,8.72,santo andre,SP


In [13]:
# Show each column's dtype and non-null count to spot missing values before exporting.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97916 entries, 0 to 97915
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       97916 non-null  object 
 1   customer_id                    97916 non-null  object 
 2   order_status                   97916 non-null  object 
 3   order_purchase_timestamp       97916 non-null  object 
 4   order_approved_at              97902 non-null  object 
 5   order_delivered_carrier_date   96925 non-null  object 
 6   order_delivered_customer_date  95829 non-null  object 
 7   order_estimated_delivery_date  97916 non-null  object 
 8   total_payment                  97916 non-null  float64
 9   payment_type                   97916 non-null  object 
 10  max_installments               97916 non-null  int64  
 11  total_reviews                  97916 non-null  int64  
 12  worst_review                   97916 non-null 

Puedo ver que tengo algunos datos nulos en mi dataset, esto lo tendré en cuenta para el EDA.

In [14]:
# Write the final dataset next to the other processed files; the index is
# omitted so it does not come back as an extra column when the CSV is read.
output_path = DATA_DIR / "processed" / "processed_dataset.csv"
df_processed.to_csv(output_path, index=False)