### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import warnings
warnings.filterwarnings("ignore")

root_csv = '../csv files/'

### Creating a function to reduce the memory of dataframe

In [2]:
def reduce_mem_usage(train_data):
    start_mem = train_data.memory_usage().sum() / 1024**2
    
    for col in train_data.columns:
        col_type = train_data[col].dtype
        
        if col_type != object:
            if pd.api.types.is_categorical_dtype(train_data[col]):
                train_data[col] = train_data[col].cat.as_ordered()
            else:
                c_min = train_data[col].min()
                c_max = train_data[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        train_data[col] = train_data[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        train_data[col] = train_data[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        train_data[col] = train_data[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        train_data[col] = train_data[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        train_data[col] = train_data[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        train_data[col] = train_data[col].astype(np.float32)
                    else:
                        train_data[col] = train_data[col].astype(np.float64)
    
    end_mem = train_data.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB --> {end_mem:.2f} MB (Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%)')
    
    return train_data

### Reading the csv files

In [3]:
orders = pd.read_csv(root_csv + 'orders.csv')
order_products_prior = pd.read_csv(root_csv + 'order_products_prior.csv')
order_products_train = pd.read_csv(root_csv + 'order_products_train.csv')
products = pd.read_csv(root_csv + 'products.csv')

### Reducing memory usage for all dataframes

In [4]:
reduce_mem_usage(order_products_prior)
reduce_mem_usage(order_products_train)
reduce_mem_usage(products)
reduce_mem_usage(orders)

Memory usage of dataframe is 989.82 MB --> 340.25 MB (Decreased by 65.6%)
Memory usage of dataframe is 42.26 MB --> 13.20 MB (Decreased by 68.7%)
Memory usage of dataframe is 1.52 MB --> 0.71 MB (Decreased by 53.1%)
Memory usage of dataframe is 182.71 MB --> 68.51 MB (Decreased by 62.5%)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


### Merging train and prior data in single dataframe

In [5]:
order_products = order_products_prior.append(order_products_train)
order_products.shape
order_products.head(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


### Getting unique product ids from dataframe

In [6]:
order_products.product_id.nunique()

49685

### Getting the count of how frequently the product is ordered

In [7]:
product_counts = order_products.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
product_counts = product_counts.sort_values('frequency', ascending=False)[0:100].reset_index(drop = True)
product_counts = product_counts.merge(products, on = 'product_id', how = 'left')
product_counts.head(5)

Unnamed: 0,product_id,frequency,product_name,aisle_id,department_id
0,24852,491291,Banana,24,4
1,13176,394930,Bag of Organic Bananas,24,4
2,21137,275577,Organic Strawberries,24,4
3,21903,251705,Organic Baby Spinach,123,4
4,47209,220877,Organic Hass Avocado,24,4


### Getting the list of frequently ordered products

In [8]:
freq_products = list(product_counts.product_id)
print("Top 10 frequently purchased item\n",freq_products[1:10])

print("\nGetting the length of frequent items: ",len(freq_products))

Top 10 frequently purchased item
 [13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]

Getting the length of frequent items:  100


### Getting the dataframe which will have frequent products in their order

In [9]:
order_products = order_products[order_products.product_id.isin(freq_products)]
print("Getting the shape of dataframe: ", order_products.shape)
print("\nGetting count of order having frequent products: ",order_products.order_id.nunique())

Getting the shape of dataframe:  (7795471, 4)

Getting count of order having frequent products:  2444982


### Getting product name in order_products

In [10]:
order_products = order_products.merge(products, on = 'product_id', how='left')
order_products.head(5)
reduce_mem_usage(order_products)

Memory usage of dataframe is 223.03 MB --> 215.60 MB (Decreased by 3.3%)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,28985,2,1,Michigan Organic Kale,83,4
1,2,17794,6,1,Carrots,83,4
2,3,24838,2,1,Unsweetened Almondmilk,91,16
3,3,21903,4,1,Organic Baby Spinach,123,4
4,3,46667,6,1,Organic Ginger Root,83,4
...,...,...,...,...,...,...,...
7795466,3420998,8174,27,0,Organic Navel Orange,24,4
7795467,3421026,7781,6,0,Organic Sticks Low Moisture Part Skim Mozzarel...,21,16
7795468,3421056,21709,3,1,Sparkling Lemon Water,115,7
7795469,3421063,49235,1,1,Organic Half & Half,53,16


### Creating pivot for final basket having orders and their frequent products

In [11]:
basket = order_products.groupby(['order_id', 'product_name'])['reordered'].count().unstack().reset_index().fillna(0).set_index('order_id')
reduce_mem_usage(basket)
basket.head(5)

Memory usage of dataframe is 1884.03 MB --> 485.00 MB (Decreased by 74.3%)


product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Deleting temporary data frames for memory space

In [12]:
del product_counts, products, order_products, order_products_prior, order_products_train

### Coverting float to integer for pivotted basket

In [13]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1 
    
basket = basket.applymap(encode_units)
basket.head(5)

product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Final basket size and shape

In [14]:
print("Checking size of basket: ",basket.size)
print("\nFinal shape of a basket: ",basket.shape)

Checking size of basket:  244498200

Final shape of a basket:  (2444982, 100)


### Getting the support for frequently ordered products

In [15]:
frequent_items = apriori(basket, min_support=0.01, use_colnames=True, low_memory=True)
reduce_mem_usage(frequent_items)
frequent_items.head(5)

print("\nChecking the shape of frequent item: ",frequent_items.shape)

Memory usage of dataframe is 0.00 MB --> 0.00 MB (Decreased by 35.3%)

Checking the shape of frequent item:  (129, 2)


### Using association rule to find the association between 2 frequently ordered products

In [16]:
rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules.sort_values('lift', ascending=False)
reduce_mem_usage(rules)

Memory usage of dataframe is 0.00 MB --> 0.00 MB (Decreased by 58.3%)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Organic Baby Spinach),(Bag of Organic Bananas),0.102966,0.161499,0.021515,0.208984,1.293945,0.004887,1.05957,0.253174
1,(Bag of Organic Bananas),(Organic Baby Spinach),0.161499,0.102966,0.021515,0.133179,1.293945,0.004887,1.035156,0.270752
2,(Organic Hass Avocado),(Bag of Organic Bananas),0.090332,0.161499,0.026489,0.293213,1.81543,0.011902,1.186523,0.493896
3,(Bag of Organic Bananas),(Organic Hass Avocado),0.161499,0.090332,0.026489,0.164062,1.81543,0.011902,1.087891,0.535645
4,(Bag of Organic Bananas),(Organic Raspberries),0.161499,0.058319,0.017288,0.107056,1.835938,0.007874,1.054688,0.542969
5,(Organic Raspberries),(Bag of Organic Bananas),0.058319,0.161499,0.017288,0.296387,1.835938,0.007874,1.191406,0.483398
6,(Bag of Organic Bananas),(Organic Strawberries),0.161499,0.112732,0.026459,0.163818,1.453125,0.008255,1.061523,0.37207
7,(Organic Strawberries),(Bag of Organic Bananas),0.112732,0.161499,0.026459,0.234741,1.453125,0.008255,1.095703,0.351562
8,(Organic Whole Milk),(Bag of Organic Bananas),0.058411,0.161499,0.011292,0.193359,1.197266,0.001859,1.039062,0.174805
9,(Bag of Organic Bananas),(Organic Whole Milk),0.161499,0.058411,0.011292,0.069946,1.197266,0.001859,1.012695,0.196289
