# Preprocess Items Dataset

In [7]:
from pathlib import Path

import pandas as pd
import numpy as np

In [8]:
# Directories where the data lives. The project root is taken to be the parent
# of the current working directory, with the data folder directly under it.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data").resolve()

In [9]:
# Load the raw order-items table. `read_csv` already returns a DataFrame, so no
# extra `pd.DataFrame(...)` wrapper is needed; the explicit copy keeps the raw
# frame `items` untouched by any later changes to the working frame `df_items`.
items = pd.read_csv(DATA_DIR / "raw" / "olist_order_items_dataset.csv")
df_items = items.copy()

In [10]:
# List the column names available in the items table.
print(f"\nitems columns:\n {df_items.columns!s}")


items columns:
 Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value'],
      dtype='object')


In [11]:
# Largest number of rows that share a single order_id.
max_rows_per_order = df_items["order_id"].value_counts().max()
print(max_rows_per_order)

21


Se puede observar que un mismo order_id se repite hasta 21 veces, es decir, hay pedidos con hasta 21 ítems.

In [12]:
# Peek at the first three rows to see what each column holds.
df_items.head(n=3)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87


Lo primero que se observa en este dataset es que hay columnas que podemos descartar porque no aportan a nuestra hipótesis de predicción de churn. Estas son:
- "seller_id"
- "product_id"
- "shipping_limit_date"

Eliminaremos *"shipping_limit_date"* porque ya tenemos la fecha estimada de entrega y la fecha de entrega en el dataset.

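Además, agregaremos los datos por pedido: crearemos una nueva columna, *"item_count"*, con el número total de ítems por pedido, junto con el precio total (*"total_price"*) y el coste total de envío (*"total_freight_value"*).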

In [13]:
# Collapse the item-level rows into one row per order. A single named
# aggregation replaces three separate groupbys and two merges, and reading from
# the untouched raw frame `items` (instead of overwriting this cell's own input)
# keeps the cell safe to re-run.
df_items = (
    items
    .groupby("order_id")
    .agg(
        total_price=("price", "sum"),  # sum of item prices in the order
        item_count=("order_item_id", "size"),  # number of item rows in the order
        total_freight_value=("freight_value", "sum"),  # sum of freight charges
    )
    .reset_index()
)

# Invariant after aggregation: exactly one row per order.
assert df_items["order_id"].is_unique

df_items

Unnamed: 0,order_id,total_price,item_count,total_freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,58.90,1,13.29
1,00018f77f2f0320c557190d7a144bdd3,239.90,1,19.93
2,000229ec398224ef6ca0657da4fc703e,199.00,1,17.87
3,00024acbcdf0a6daa1e931b038114c75,12.99,1,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,199.90,1,18.14
...,...,...,...,...
98661,fffc94f6ce00a00581880bf54a75a037,299.99,1,43.41
98662,fffcd46ef2263f404302a634eb57f7eb,350.00,1,36.53
98663,fffce4705a9662cd70adb13d4a31832d,99.90,1,16.95
98664,fffe18544ffabc95dfada21779c9644f,55.99,1,8.72


Con esto he obtenido, para cada pedido, el total gastado, la cantidad de artículos y el coste total de transporte. Más adelante esto puede servirnos para hacer ingeniería de características.