## Import libraries

In [1]:
import pandas as pd
import numpy as np

import os

import utils

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the raw CSV exports. The trailing slash matters because
# paths in this notebook are built by plain string concatenation.
data_dir = '../data/'

# NOTE: os.listdir returns entries in arbitrary, platform-dependent order,
# so the positions printed below are only meaningful for this particular run.
filenames = os.listdir(data_dir)
for position, entry in enumerate(filenames):
    print(position, entry)

0 olist_customers_dataset.csv
1 olist_geolocation_dataset.csv
2 olist_orders_dataset.csv
3 olist_order_items_dataset.csv
4 olist_order_payments_dataset.csv
5 olist_order_reviews_dataset.csv
6 olist_products_dataset.csv
7 olist_sellers_dataset.csv
8 processed
9 product_category_name_translation.csv


## Orders data

* Parse the datetime columns as dates, convert `order_status` to a categorical variable, and store its integer codes.

In [3]:
# Load the orders table by its explicit file name. Picking the file by its
# position in the os.listdir() output is unreliable: the listing order is
# arbitrary and differs between platforms/filesystems.
orders_date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
df_orders = pd.read_csv(data_dir + 'olist_orders_dataset.csv',
                        parse_dates=orders_date_cols)

# Build the categorical once and reuse it. `t` is kept under this name because
# the following cell displays the label -> code mapping from it.
t = df_orders['order_status'].astype('category')
df_orders['order_status'] = t.cat.codes.values

# Project-local helper; the captured output shows it reports the memory usage
# of the returned frame.
df_orders = utils.reduce_mem_usage(df_orders)

Mem. usage decreased to  5.41 Mb (0.0% reduction)


In [4]:
# Display the label -> integer code mapping of the categorical currently held
# in `t` (set by the preceding cell).
{label: code for label, code in zip(t, t.cat.codes)}

{'approved': 0,
 'canceled': 1,
 'created': 2,
 'delivered': 3,
 'invoiced': 4,
 'processing': 5,
 'shipped': 6,
 'unavailable': 7}

In [5]:
# Hand the encoded orders frame to the project helper for persistence.
# NOTE(review): utils.to_pickles is project-local; presumably it writes pickle
# file(s) under this path -- confirm in utils.
orders_target = '../data/processed/orders'
utils.to_pickles(df_orders, orders_target)

0it [00:00, ?it/s]1it [00:02,  2.12s/it]3it [00:02,  1.36it/s]


## Payments data

* Store `payment_type` as a categorical variable, keeping only its integer codes.

In [6]:
# Load the payments table by its explicit file name; a positional index into
# the os.listdir() output depends on an arbitrary listing order and can
# silently select a different CSV on another machine.
df_payments = pd.read_csv(data_dir + 'olist_order_payments_dataset.csv')

# Build the categorical once and reuse it. `t` is kept under this name because
# the following cell displays the label -> code mapping from it.
t = df_payments['payment_type'].astype('category')
df_payments['payment_type'] = t.cat.codes.values

# Project-local helper; the captured output shows it reports the memory usage
# of the returned frame.
df_payments = utils.reduce_mem_usage(df_payments)

Mem. usage decreased to  1.29 Mb (60.6% reduction)


In [7]:
# Display which integer code each payment type label was assigned, using the
# categorical left in `t` by the preceding cell.
payment_codes = t.cat.codes
dict(zip(t, payment_codes))

{'boleto': 0,
 'credit_card': 1,
 'debit_card': 2,
 'not_defined': 3,
 'voucher': 4}

In [8]:
# Hand the encoded payments frame to the project helper for persistence.
# NOTE(review): utils.to_pickles is project-local; presumably it writes pickle
# file(s) under this path -- confirm in utils.
payments_target = '../data/processed/payments'
utils.to_pickles(df_payments, payments_target)

0it [00:00, ?it/s]3it [00:00, 25.64it/s]


## Customers data

* `order_id` from the orders table is joined in here purely to define the relationship between the two datasets when using FeatureTools.
* The customers and sellers data both carry state- and city-level columns; the category codes built for the customer columns are stored and reused for the seller columns.


In [14]:
# Load the customers table by its explicit file name rather than by its
# position in the (arbitrarily ordered) os.listdir() output.
df_customers = pd.read_csv(data_dir + 'olist_customers_dataset.csv')

# Attach order_id to each customer row (merge on customer_id with pandas'
# default inner join).
df_customers = df_customers.merge(df_orders[['order_id', 'customer_id']], on='customer_id')

# Build each categorical once. The label -> code dictionaries are kept so the
# seller table can be encoded with the same codes.
city_cat = df_customers['customer_city'].astype('category')
cust_city = dict(zip(city_cat, city_cat.cat.codes))

# `t` is intentionally left bound to the state categorical: a later cell
# reads it to build the inverse (code -> state) mapping.
t = df_customers['customer_state'].astype('category')
cust_state = dict(zip(t, t.cat.codes))

# Replace the string columns by their integer codes.
df_customers['customer_state'] = t.cat.codes.values
df_customers['customer_city'] = city_cat.cat.codes.values

# Project-local helper; the captured output shows it reports the memory usage
# of the returned frame.
df_customers = utils.reduce_mem_usage(df_customers)

Mem. usage decreased to  3.70 Mb (9.3% reduction)


In [11]:
import pickle

In [15]:
# Invert the state -> code dictionary into code -> state and pickle it.
# The inverse is derived from `cust_state` rather than from the scratch
# variable `t`, because `t` is reassigned by several cells and would yield the
# wrong mapping if the cells were run in a different order.
cust_state_t = {code: state for state, code in cust_state.items()}
with open('../data/processed/cust_state.pickle', 'wb') as f:
    pickle.dump(cust_state_t, f)

In [8]:
# Hand the encoded customers frame to the project helper for persistence.
# NOTE(review): utils.to_pickles is project-local; presumably it writes pickle
# file(s) under this path -- confirm in utils.
customers_target = '../data/processed/customers'
utils.to_pickles(df_customers, customers_target)

0it [00:00, ?it/s]1it [00:00,  5.06it/s]2it [00:00,  6.02it/s]3it [00:00,  6.42it/s]


## Products data

* `product_category_name` is stored as integer category codes; products with a missing category name get the code -1.

In [11]:
# Load the products table by its explicit file name rather than by its
# position in the (arbitrarily ordered) os.listdir() output.
df_products = pd.read_csv(data_dir + 'olist_products_dataset.csv')

# Build the categorical once and reuse it. Rows with a missing category name
# receive the code -1. `t` is kept under this name because the following cell
# displays the label -> code mapping from it.
t = df_products['product_category_name'].astype('category')
df_products['product_category_name'] = t.cat.codes.values

# Project-local helper; the captured output shows it reports the memory usage
# of the returned frame.
df_products = utils.reduce_mem_usage(df_products)

Mem. usage decreased to  0.72 Mb (64.6% reduction)


In [12]:
# Display the category name -> integer code mapping from the categorical left
# in `t` by the preceding cell (a missing name shows up as nan -> -1).
dict((name, code) for name, code in zip(t, t.cat.codes))

{'perfumaria': 62,
 'artes': 3,
 'esporte_lazer': 32,
 'bebes': 9,
 'utilidades_domesticas': 72,
 'instrumentos_musicais': 45,
 'cool_stuff': 26,
 'moveis_decoracao': 54,
 'eletrodomesticos': 28,
 'brinquedos': 12,
 'cama_mesa_banho': 13,
 'construcao_ferramentas_seguranca': 25,
 'informatica_acessorios': 44,
 'beleza_saude': 11,
 'malas_acessorios': 50,
 'ferramentas_jardim': 40,
 'moveis_escritorio': 55,
 'automotivo': 8,
 'eletronicos': 30,
 'fashion_calcados': 34,
 'telefonia': 70,
 'papelaria': 59,
 'fashion_bolsas_e_acessorios': 33,
 'pcs': 61,
 'casa_construcao': 16,
 'relogios_presentes': 66,
 'construcao_ferramentas_construcao': 21,
 'pet_shop': 63,
 'eletroportateis': 31,
 'agro_industria_e_comercio': 0,
 nan: -1,
 'moveis_sala': 57,
 'sinalizacao_e_seguranca': 68,
 'climatizacao': 19,
 'consoles_games': 20,
 'livros_interesse_geral': 48,
 'construcao_ferramentas_ferramentas': 22,
 'fashion_underwear_e_moda_praia': 39,
 'fashion_roupa_masculina': 38,
 'moveis_cozinha_area_de_

In [10]:
# Hand the encoded products frame to the project helper for persistence.
# NOTE(review): utils.to_pickles is project-local; presumably it writes pickle
# file(s) under this path -- confirm in utils.
products_target = '../data/processed/products'
utils.to_pickles(df_products, products_target)

0it [00:00, ?it/s]3it [00:00, 37.12it/s]


## Sellers data

* The state and city category codes built from the customers dataset are reused here; seller values that never occur among customers are encoded as -1.

In [11]:
# Load the sellers table by its explicit file name rather than by its
# position in the (arbitrarily ordered) os.listdir() output.
df_sellers = pd.read_csv(data_dir + 'olist_sellers_dataset.csv')

# Encode seller city/state with the codes already assigned to the customer
# columns, so the same place has the same code in both tables. Values that
# never occur among customers map to NaN and are replaced by -1.
#
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# mutates a temporary and does not update the frame under pandas
# Copy-on-Write. Mapping the plain string column (not a categorical) keeps the
# result dtype independent of whether every value was found.
# The codes are cast to integers to match the integer codes stored for the
# customer columns.
df_sellers['seller_city'] = (
    df_sellers['seller_city'].map(cust_city).fillna(-1).astype('int64')
)
df_sellers['seller_state'] = (
    df_sellers['seller_state'].map(cust_state).fillna(-1).astype('int64')
)

# Project-local helper; the captured output shows it reports the memory usage
# of the returned frame.
df_sellers = utils.reduce_mem_usage(df_sellers)

Mem. usage decreased to  0.04 Mb (53.1% reduction)


In [12]:
# Hand the encoded sellers frame to the project helper for persistence.
# NOTE(review): utils.to_pickles is project-local; presumably it writes pickle
# file(s) under this path -- confirm in utils.
sellers_target = '../data/processed/sellers'
utils.to_pickles(df_sellers, sellers_target)

0it [00:00, ?it/s]3it [00:00, 51.22it/s]
