# Preprocesado de datasets

En este proyecto, trabajamos con 6 datasets de información de e-commerce en Brasil, proporcionados por Olist.

In [52]:
from pathlib import Path


import pandas as pd
import numpy as np

In [53]:
# Project root is taken to be the parent of the current working directory;
# the data folder is stored as an absolute, resolved path beneath it.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data").resolve()

In [54]:
# Read the six raw Olist CSV files from <DATA_DIR>/raw.
raw_dir = DATA_DIR / "raw"

customers = pd.read_csv(raw_dir / "olist_customers_dataset.csv")
items = pd.read_csv(raw_dir / "olist_order_items_dataset.csv")
payments = pd.read_csv(raw_dir / "olist_order_payments_dataset.csv")
reviews = pd.read_csv(raw_dir / "olist_order_reviews_dataset.csv")
orders = pd.read_csv(raw_dir / "olist_orders_dataset.csv")
products = pd.read_csv(raw_dir / "olist_products_dataset.csv")

# Working frames used by the rest of the notebook. Each one is a separate
# DataFrame object wrapping the corresponding raw frame, so rebinding a df_*
# name later leaves the raw name untouched.
df_customers = pd.DataFrame(customers)
df_items = pd.DataFrame(items)
df_payments = pd.DataFrame(payments)
df_reviews = pd.DataFrame(reviews)
df_orders = pd.DataFrame(orders)
df_products = pd.DataFrame(products)

In [55]:
# Preview the first three rows of the order-items table.
df_items.head(n=3)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87


In [56]:
# List the column names of every table to see which keys they share.
column_reports = (
    ("\ncustomer columns:\n", df_customers),
    ("\nitems columns:\n", df_items),
    ("\npayments columns:\n", df_payments),
    ("\nreviews columns:\n", df_reviews),
    ("\norders columns:\n", df_orders),
    ("\nproducts columns:\n", df_products),
)
for heading, frame in column_reports:
    print(heading, frame.columns)


customer columns:
 Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object')

items columns:
 Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value'],
      dtype='object')

payments columns:
 Index(['order_id', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value'],
      dtype='object')

reviews columns:
 Index(['review_id', 'order_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp'],
      dtype='object')

orders columns:
 Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object')

products columns:
 Index(['product_id', 'product_category_name', 'product_name

Podemos observar que no todos los DataFrames tienen las mismas columnas, por lo que tendremos que hacer varios merges entre ellos.

Primero comenzaré por los que puedo unir a través de la columna *"order_id"*.

In [57]:
# For each table keyed by order_id, show the most repeated order_id and how
# many rows it has; a count above 1 means the key is not unique in that table.
for frame in (df_items, df_payments, df_reviews, df_orders):
    print(frame["order_id"].value_counts().head(1))

order_id
8272b63d03f5f79c56e9e4120aec44ef    21
Name: count, dtype: int64
order_id
fa65dad1b0e818e3ccc5cb0e39231352    29
Name: count, dtype: int64
order_id
c88b1d1b157a9999ce368f218a407141    3
Name: count, dtype: int64
order_id
e481f51cbdc54678b7cc49136f2d6af7    1
Name: count, dtype: int64


La columna *order_id* tiene valores duplicados en todos los DataFrames excepto en *df_orders*; por lo tanto, habrá que tratar esto antes de realizar el join.

### Payments Dataset

In [58]:
# Collapse the payments table to one row per order_id so that merging it with
# the other order-level tables does not multiply rows.
#
# Every aggregate below reads from `payments` (the frame returned by read_csv
# in the loading cell) instead of `df_payments`: this cell rebinds
# `df_payments` to the aggregated result, so reading from it would raise a
# KeyError on "payment_value" the second time the cell is executed.


def _main_payment_type(types):
    """Return the most frequent value in `types`, or NaN if it has no non-null value."""
    modes = types.mode()  # nulls are dropped; tied values come back sorted
    return modes.iloc[0] if not modes.empty else np.nan


payments_by_order = payments.groupby("order_id")

# Total amount paid per order, summed over all of its payment records.
payments_total = (
    payments_by_order["payment_value"]
    .sum()
    .rename("total_payment")
    .reset_index()
)

# Most frequent payment type per order (ties resolve to the first sorted value).
payment_type_main = (
    payments_by_order["payment_type"]
    .agg(_main_payment_type)
    .reset_index()
)

# Largest number of installments used by any payment record of the order.
max_installments = (
    payments_by_order["payment_installments"]
    .max()
    .rename("max_installments")
    .reset_index()
)

# Column order: order_id, total_payment, payment_type, max_installments.
df_payments = (
    payments_total
    .merge(payment_type_main, on="order_id", how="left")
    .merge(max_installments, on="order_id", how="left")
)

# The whole point of this cell: order_id must now be a unique key.
assert df_payments["order_id"].is_unique, "order_id is still duplicated after aggregation"

df_payments.head(3)

Unnamed: 0,order_id,total_payment,payment_type,max_installments
0,00010242fe8c5a6d1ba2dd792cb16214,72.19,credit_card,2
1,00018f77f2f0320c557190d7a144bdd3,259.83,credit_card,3
2,000229ec398224ef6ca0657da4fc703e,216.87,credit_card,5
