# Imports

In [69]:
import pandas as pd
import numpy as np

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Data Load

In [70]:
df_aisles = pd.read_csv('../data/aisles.csv')
df_departments = pd.read_csv('../data/departments.csv')
df_orders = pd.read_csv('../data/orders.csv')
df_products = pd.read_csv('../data/products.csv')
df_order_products_prior = pd.read_csv('../data/order_products__prior.csv')
df_order_products_train = pd.read_csv('../data/order_products__train.csv')

# Sample Order

In [102]:
np.random.seed(42)

NUM_ORDERS_SAMPLE = 10000

unique_orders = df_orders['order_id'].unique()
total_unique_orders = len(unique_orders)

sampled_orders_id = np.random.choice(unique_orders, size=NUM_ORDERS_SAMPLE, replace=False)

df_order_products_prior_sample = df_order_products_prior[df_order_products_prior['order_id'].isin(sampled_orders_id)]

In [103]:
df_market = pd.merge(
    df_order_products_prior_sample,
    df_products,
    on='product_id',
    how='left'
)

df_market_aisles = pd.merge(
    df_market,
    df_aisles,on='aisle_id',
    how='left'
)

df_full = pd.merge(
    df_market_aisles,
    df_departments,
    on='department_id',
    how='left'
)

In [104]:
df_full

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,322,13819,1,1,Roasted Salted Cashews,117,19,nuts seeds dried fruit,snacks
1,322,432,2,1,Vanilla Almond Breeze Almond Milk,91,16,soy lactosefree,dairy eggs
2,322,19311,3,1,Almond Flour Tortillas,128,3,tortillas flat bread,bakery
3,322,36646,4,0,Lactose Free Sour Cream,108,16,other creams cheeses,dairy eggs
4,322,28842,5,1,Bunched Cilantro,16,4,fresh herbs,produce
...,...,...,...,...,...,...,...,...,...
96598,3420968,40708,6,0,Fennel,83,4,fresh vegetables,produce
96599,3420968,39275,7,0,Organic Blueberries,123,4,packaged vegetables fruits,produce
96600,3420968,29487,8,1,Roma Tomato,83,4,fresh vegetables,produce
96601,3420968,24852,9,1,Banana,24,4,fresh fruits,produce


# 1.0 Análise Descritiva

## 1.1 Dimensão dos Dados

In [105]:
print('Quantidade de Linhas: {}'.format(df_full.shape[0]))
print('Quantidade de Colunas: {}'.format(df_full.shape[1]))

Quantidade de Linhas: 96603
Quantidade de Colunas: 9


## 1.2 Tipo dos Dados

In [106]:
df_full.dtypes

order_id              int64
product_id            int64
add_to_cart_order     int64
reordered             int64
product_name         object
aisle_id              int64
department_id         int64
aisle                object
department           object
dtype: object

## 1.3 Check Na

In [107]:
df_full.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
product_name         0
aisle_id             0
department_id        0
aisle                0
department           0
dtype: int64

# 2.0 Exploratory Data Analysis (EDA)

In [108]:
##### Tamanho da Cesta de Compras #####7
basket_sizes = df_full.groupby('order_id').size()
print(basket_sizes.describe())
print()
print('Em média o tamanho do carrinho de compras é de {:.2f} itens'.format(basket_sizes.mean()))

count    9414.000000
mean       10.261632
std         7.645446
min         1.000000
25%         5.000000
50%         8.000000
75%        14.000000
max        72.000000
dtype: float64

Em média o tamanho do carrinho de compras é de 10.26 itens


In [109]:
basket_size_category = df_full.groupby(['order_id', 'department']).size()

In [110]:
basket_size_category = df_full[['order_id', 'department']].groupby('order_id').nunique().reset_index()
print(f'Tamanho médio da cesta de compras em departamentos: {basket_size_category["department"].mean():.2f}')
print(f'Tamanha minimo da cesta de compras em departamentos: {basket_size_category["department"].min():.2f}')
print(f'Tamanho maximo da cesta de compras em departamentos: {basket_size_category["department"].max():.2f}')
print(f'Mediana do tamanho da cesta de compras em departamentos: {basket_size_category["department"].median():.2f}')

Tamanho médio da cesta de compras em departamentos: 4.80
Tamanha minimo da cesta de compras em departamentos: 1.00
Tamanho maximo da cesta de compras em departamentos: 15.00
Mediana do tamanho da cesta de compras em departamentos: 5.00


# 3.0 Data Preparation

In [111]:
transactions = df_full.groupby('order_id')['product_name'].apply(list).reset_index()
transactions_list = transactions['product_name'].tolist()

In [112]:
te = TransactionEncoder()
transactions_encoded = te.fit(transactions_list).transform(transactions_list)

df_encoded = pd.DataFrame(transactions_encoded, columns=te.columns_)

In [113]:
min_support = 0.003
min_confidence = 0.01
min_lift = 0.4

frequent_itemsets = apriori(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
    verbose=1
)


Processing 76 combinations | Sampling itemset size 4e 32


In [114]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.004461,(0% Greek Strained Yogurt)
1,0.004355,(1% Lowfat Milk)
2,0.013278,(100% Raw Coconut Water)
3,0.008710,(100% Recycled Paper Towels)
4,0.020289,(100% Whole Wheat Bread)
...,...,...
822,0.003399,"(Banana, Organic Baby Spinach, Organic Avocado)"
823,0.004037,"(Banana, Organic Strawberries, Organic Avocado)"
824,0.003081,"(Banana, Organic Strawberries, Organic Whole M..."
825,0.003081,"(Organic Strawberries, Organic Hass Avocado, O..."


In [115]:
rules =association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=min_confidence
)
print(f'Total de Regras: {len(rules)}')

ant_sizes = rules['antecedents'].apply(len)
conseq_sizes = rules['consequents'].apply(len)

recommendations = rules[
    (ant_sizes == 1) &
    (conseq_sizes >=2)
].copy()

print('Total de Recomendações: {}'.format(len(recommendations)))

Total de Regras: 658
Total de Recomendações: 30


In [116]:
recommendations

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
601,(Bag of Organic Bananas),"(Organic Hass Avocado, Organic Baby Spinach)",0.118122,0.014871,0.004461,0.03777,2.539748,1.0,0.002705,1.023797,0.687465,0.034711,0.023244,0.168885
602,(Organic Hass Avocado),"(Bag of Organic Bananas, Organic Baby Spinach)",0.070215,0.015721,0.004461,0.06354,4.041665,1.0,0.003358,1.051063,0.80941,0.054759,0.048583,0.173662
603,(Organic Baby Spinach),"(Bag of Organic Bananas, Organic Hass Avocado)",0.076801,0.021245,0.004461,0.058091,2.734357,1.0,0.00283,1.039119,0.687049,0.047673,0.037646,0.134046
607,(Organic Raspberries),"(Bag of Organic Bananas, Organic Hass Avocado)",0.04249,0.021245,0.004355,0.1025,4.824675,1.0,0.003453,1.090535,0.82791,0.073345,0.083019,0.15375
608,(Bag of Organic Bananas),"(Organic Raspberries, Organic Hass Avocado)",0.118122,0.008923,0.004355,0.036871,4.13213,1.0,0.003301,1.029018,0.859523,0.035498,0.028199,0.262483
609,(Organic Hass Avocado),"(Organic Raspberries, Bag of Organic Bananas)",0.070215,0.014022,0.004355,0.062027,4.423669,1.0,0.003371,1.05118,0.832389,0.054521,0.048688,0.186317
613,(Bag of Organic Bananas),"(Organic Strawberries, Organic Hass Avocado)",0.118122,0.013491,0.004568,0.038669,2.866382,1.0,0.002974,1.026191,0.738343,0.035953,0.025523,0.188626
614,(Organic Strawberries),"(Bag of Organic Bananas, Organic Hass Avocado)",0.085723,0.021245,0.004568,0.053284,2.508067,1.0,0.002746,1.033842,0.657664,0.044606,0.032734,0.134142
615,(Organic Hass Avocado),"(Bag of Organic Bananas, Organic Strawberries)",0.070215,0.018271,0.004568,0.065053,3.560514,1.0,0.003285,1.050037,0.773449,0.05443,0.047653,0.157526
619,(Organic Raspberries),"(Bag of Organic Bananas, Organic Strawberries)",0.04249,0.018271,0.004249,0.1,5.473256,1.0,0.003473,1.09081,0.853561,0.075188,0.08325,0.166279
