## Notebook 3 - Association Rules
In this notebook we will study the possible association rules that can be created with the data.
The data will be analysed by each aggregated quarter and by product sales.

In [None]:
#Import Libraries
import pandas as pd
import numpy as np

import datetime as dt
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Data Load 

In [3]:
# Load the clustered sales data, reading the ID/flag columns as int16 and the
# unit/price columns as float32.
df = pd.read_csv(
    '../Databases/df_clusters.csv',
    dtype={
        'ProductFamily_ID': np.int16,
        # The key has to match the CSV header exactly; a key that names no
        # column (e.g. one with trailing spaces) is not applied and the column
        # keeps the parser's default integer width.
        'ProductCategory_ID': np.int16,
        'ProductBrand_ID': np.int16,
        'ProductName_ID': np.int16,
        'ProductPackSKU_ID': np.int16,
        'Point-of-Sale_ID': np.int16,
        'Value_units': np.float32,
        'Value_price': np.float32,
        'Unit_Price': np.float32,
        'Retail_price': np.float32,
        'Is_Promo': np.int16,
    },
)
# 'Unnamed: 0' is presumably the index written out when the CSV was saved;
# it carries no information for this analysis.
df = df.drop(columns='Unnamed: 0')

df['Date'] = pd.to_datetime(df['Date'])
# 'Quarter' is a label such as '1Q2017' (quarter + 'Q' + year);
# 'Quarter_number' is the bare calendar quarter taken from the date.
df['Quarter'] = df['Date'].dt.quarter.astype(str) + 'Q' + df['Date'].dt.year.astype(str)
df['Quarter_number'] = df['Date'].dt.quarter
df.head()

Unnamed: 0,ProductFamily_ID,ProductCategory_ID,ProductBrand_ID,ProductName_ID,ProductPackSKU_ID,Point-of-Sale_ID,Date,Value_units,Value_price,Unit_Price,Retail_price,Is_Promo,cluster_value,cluster_product,Quarter,Quarter_number
0,16,11,306,649,1970,1,2017-03-04,2.0,1540.0,770.0,810.0,0,1,9,1Q2017,1
1,16,11,306,649,1970,1,2016-05-02,4.0,3080.0,770.0,810.0,0,1,9,2Q2016,2
2,16,11,306,649,1970,1,2016-10-24,2.0,1540.0,770.0,810.0,0,1,9,4Q2016,4
3,16,11,306,649,1970,1,2017-10-13,2.0,1620.0,810.0,810.0,0,1,9,4Q2017,4
4,16,11,306,649,1970,1,2017-10-14,2.0,1620.0,810.0,810.0,0,1,9,4Q2017,4


In [10]:
# Which product has the most sales records, per product and calendar quarter.
# NOTE: .count() counts rows with a non-null 'Value_units' (number of sales
# records), not the number of units sold.
#
# The grouping key is derived from 'Date' inside this cell: the frame built
# above has 'Quarter' and 'Quarter_number' but no 'quarter' column, so grouping
# by the string "quarter" fails on a fresh kernel. A quarterly period prints
# as e.g. '2017Q2'.
quarter_period = df['Date'].dt.to_period('Q').rename('quarter')
df_product = (
    df.groupby(["ProductName_ID", quarter_period])['Value_units']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)
df_product

Unnamed: 0_level_0,Unnamed: 1_level_0,Value_units
ProductName_ID,quarter,Unnamed: 2_level_1
253,2017Q2,77755
356,2016Q1,70314
567,2016Q1,70068
567,2016Q2,69635
253,2019Q2,69317
...,...,...
479,2016Q3,1
267,2016Q3,1
479,2016Q4,1
1444,2018Q3,1


# Association Rules

In [5]:
# Mine association rules per quarter number and keep, for each quarter, the
# top "complementary" rules (high confidence and lift) and the top "substitute"
# rules (lowest lift). Both result tables are written to CSV at the end.

# Thresholds (same values as before, now named so they can be tuned in one place).
MIN_SUPPORT = 0.2       # minimum itemset support passed to apriori
MIN_CONFIDENCE = 0.50   # minimum confidence for a complementary rule
MIN_LIFT = 1.5          # minimum lift for a complementary rule
TOP_N_RULES = 10        # rules kept per quarter, for each rule type

# Column layout used only if no quarter produces a frame at all.
RULE_COLUMNS = ['antecedents', 'consequents', 'antecedent support',
                'consequent support', 'support', 'confidence', 'lift', 'leverage',
                'conviction', 'quarter']

# Per-quarter results are collected here and concatenated once after the loop,
# instead of re-concatenating a growing frame on every iteration.
complementary_frames = []
substitutes_frames = []

# Iterate over the quarter numbers present in the data. The filter uses
# 'Quarter_number', so rows from the same quarter of different years are pooled.
for quarter in df.Quarter_number.unique():
    print(f'Quarter: {quarter}')
    df1 = df[df['Quarter_number']==quarter]
    pt_aux = df1[["ProductName_ID","Point-of-Sale_ID"]]
    # Point-of-sale x product table fed to apriori (missing combinations -> 0).
    # NOTE(review): pt_aux is also passed positionally, which binds to the
    # `values` argument of DataFrame.pivot_table; this looks carried over from
    # the pd.pivot_table(data, ...) form. Left as-is because the mined itemsets
    # depend on the resulting column labels - confirm the intent.
    pt = pt_aux.pivot_table(pt_aux, index='Point-of-Sale_ID', columns=["ProductName_ID"],
                            aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)

    # Apply the APRIORI algorithm to get frequent itemsets (pairs at most).
    frequent_itemsets = apriori(pt, min_support=MIN_SUPPORT, use_colnames=True, max_len=2)

    # Complementary products: rules with high confidence and lift, best first.
    rulesConfidence = association_rules(
        frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE
    ).sort_values(by='confidence', ascending=False)
    is_complementary = (
        (rulesConfidence['confidence'] >= MIN_CONFIDENCE)
        & (rulesConfidence['lift'] >= MIN_LIFT)
    )
    # .assign returns a new frame, so the quarter tag is never written onto a
    # filtered slice of rulesConfidence (avoids SettingWithCopyWarning).
    complementary_temp = (
        rulesConfidence[is_complementary]
        .head(TOP_N_RULES)
        .assign(quarter=quarter)
    )
    complementary_frames.append(complementary_temp)

    # Substitute products: rules with the lowest lift.
    # After sorting and renumbering, only the rows at odd positions are kept -
    # presumably to drop the mirrored copy of each rule (A->B and B->A share the
    # same lift and tend to sort next to each other); verify if ties matter.
    substitutes_temp = (
        association_rules(frequent_itemsets, metric="lift", min_threshold=0.0)
        .sort_values(by='lift', ascending=True)
        .reset_index(drop=True)
        .iloc[1::2]
        .head(TOP_N_RULES)
        .assign(quarter=quarter)
    )
    substitutes_frames.append(substitutes_temp)

# Build the final tables in one concat each; fall back to an empty frame with
# the expected columns when the loop produced nothing.
complementary = (
    pd.concat(complementary_frames) if complementary_frames
    else pd.DataFrame(columns=RULE_COLUMNS)
)
substitutes = (
    pd.concat(substitutes_frames) if substitutes_frames
    else pd.DataFrame(columns=RULE_COLUMNS)
)

# Save the dataframes for later use.
substitutes.to_csv('../Databases/substitutes.csv')
complementary.to_csv('../Databases/complementary.csv')



Quarter: 1
Quarter: 2
Quarter: 4
Quarter: 3
