# Imports

In [1]:
import pandas as pd
import numpy as np

# Data Load

In [2]:
df_aisles = pd.read_csv('../data/aisles.csv')
df_departments = pd.read_csv('../data/departments.csv')
df_orders = pd.read_csv('../data/orders.csv')
df_products = pd.read_csv('../data/products.csv')
df_order_products_prior = pd.read_csv('../data/order_products__prior.csv')
df_order_products_train = pd.read_csv('../data/order_products__train.csv')

# Sample Order

In [3]:
np.random.seed(42)

NUM_ORDERS_SAMPLE = 1000

unique_orders = df_orders['order_id'].unique()
total_unique_orders = len(unique_orders)

sampled_orders_id = np.random.choice(unique_orders, size=NUM_ORDERS_SAMPLE, replace=False)

df_order_products_prior_sample = df_order_products_prior[df_order_products_prior['order_id'].isin(sampled_orders_id)]

In [6]:
df_market = pd.merge(
    df_order_products_prior_sample,
    df_products,
    on='product_id',
    how='left'
)

df_market_aisles = pd.merge(
    df_market,
    df_aisles,on='aisle_id',
    how='left'
)

df_full = pd.merge(
    df_market_aisles,
    df_departments,
    on='department_id',
    how='left'
)

In [7]:
df_full

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,322,13819,1,1,Roasted Salted Cashews,117,19,nuts seeds dried fruit,snacks
1,322,432,2,1,Vanilla Almond Breeze Almond Milk,91,16,soy lactosefree,dairy eggs
2,322,19311,3,1,Almond Flour Tortillas,128,3,tortillas flat bread,bakery
3,322,36646,4,0,Lactose Free Sour Cream,108,16,other creams cheeses,dairy eggs
4,322,28842,5,1,Bunched Cilantro,16,4,fresh herbs,produce
...,...,...,...,...,...,...,...,...,...
9351,3420574,40568,21,0,Paleo Pancake & Waffle Mix,130,14,hot cereal pancake mixes,breakfast
9352,3420574,29439,22,0,"Wild Non-Pareil Capers, Sunkissed in the Medit...",110,13,pickled goods olives,pantry
9353,3420574,35750,23,0,Organic Garbanzo Beans No Salt Added,59,15,canned meals beans,canned goods
9354,3420574,15123,24,0,Gluten Free Fudge Covered Pretzels,45,19,candy chocolate,snacks


# 1.0 Análise Descritiva

## 1.1 Dimensão dos Dados

In [8]:
print('Quantidade de Linhas: {}'.format(df_full.shape[0]))
print('Quantidade de Colunas: {}'.format(df_full.shape[1]))

Quantidade de Linhas: 9356
Quantidade de Colunas: 9


## 1.2 Tipo dos Dados

In [9]:
df_full.dtypes

order_id              int64
product_id            int64
add_to_cart_order     int64
reordered             int64
product_name         object
aisle_id              int64
department_id         int64
aisle                object
department           object
dtype: object

## 1.3 Check Na

In [10]:
df_full.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
product_name         0
aisle_id             0
department_id        0
aisle                0
department           0
dtype: int64

# 2.0 Exploratory Data Analysis (EDA)

In [None]:
##### Tamanho da Cesta de Compras #####7
basket_sizes = df_full.groupby('order_id').size()
print(basket_sizes.describe())
print()
print('Em média o tamanho do carrinho de compras é de {:.2f} itens'.format(basket_sizes.mean()))

count    938.000000
mean       9.974414
std        7.137720
min        1.000000
25%        5.000000
50%        8.000000
75%       13.000000
max       44.000000
dtype: float64

Em média o tamanho do carrinho de compras é de 9.97 itens


In [18]:
basket_size_category = df_full.groupby(['order_id', 'department']).size()

In [43]:
basket_size_category = df_full[['order_id', 'department']].groupby('order_id').nunique().reset_index()
print(f'Tamanho médio da cesta de compras em departamentos: {basket_size_category["department"].mean():.2f}')
print(f'Tamanha minimo da cesta de compras em departamentos: {basket_size_category["department"].min():.2f}')
print(f'Tamanho maximo da cesta de compras em departamentos: {basket_size_category["department"].max():.2f}')
print(f'Mediana do tamanho da cesta de compras em departamentos: {basket_size_category["department"].median():.2f}')

Tamanho médio da cesta de compras em departamentos: 4.75
Tamanha minimo da cesta de compras em departamentos: 1.00
Tamanho maximo da cesta de compras em departamentos: 13.00
Mediana do tamanho da cesta de compras em departamentos: 4.00


In [37]:
df_full.loc[df_full['order_id'] == 322]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,322,13819,1,1,Roasted Salted Cashews,117,19,nuts seeds dried fruit,snacks
1,322,432,2,1,Vanilla Almond Breeze Almond Milk,91,16,soy lactosefree,dairy eggs
2,322,19311,3,1,Almond Flour Tortillas,128,3,tortillas flat bread,bakery
3,322,36646,4,0,Lactose Free Sour Cream,108,16,other creams cheeses,dairy eggs
4,322,28842,5,1,Bunched Cilantro,16,4,fresh herbs,produce
5,322,41259,6,0,Poblano Pepper,83,4,fresh vegetables,produce
6,322,4605,7,0,Yellow Onions,83,4,fresh vegetables,produce
7,322,46526,8,0,French Green Beans,123,4,packaged vegetables fruits,produce
8,322,37011,9,0,Artichokes,83,4,fresh vegetables,produce
