# 0. Setting Up The Data

In [86]:
pip install mlxtend  

Note: you may need to restart the kernel to use updated packages.


In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# read dataframe from local file
df = pd.read_csv('drone_prod_groups.csv')
df

Unnamed: 0,ID,Prod1,Prod2,Prod3,Prod4,Prod5,Prod6,Prod7,Prod8,Prod9,...,Prod11,Prod12,Prod13,Prod14,Prod15,Prod16,Prod17,Prod18,Prod19,Prod20
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
1,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1,1,1,1,1,1
2,3,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,4,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
4,5,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,99997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99997,99998,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
99998,99999,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


# 1. Business Understanding

# 2. Data Understanding

# 3. Data Preparation

Dataset is otherwise clean and sufficient.  
Only necessary modification is dropping the "ID" column as well as converting numerical values to boolean.  

In [88]:
df = df.drop(columns='ID')
df = df.apply(lambda x: x.map(lambda y: True if y == 1.0 else False))
df

Unnamed: 0,Prod1,Prod2,Prod3,Prod4,Prod5,Prod6,Prod7,Prod8,Prod9,Prod10,Prod11,Prod12,Prod13,Prod14,Prod15,Prod16,Prod17,Prod18,Prod19,Prod20
0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True
1,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,True,True,True,True,True
2,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True
3,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,True
4,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
99997,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False
99998,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True


Due to the large number of categories the minimum support needs to be set sufficiently low to find any results from the dataset.  
Sufficient minimum support was determined manually by gradually reducing the value from a starting point of 10%. 
At 3% the results display a first itemset of three products with 38 individual itemsets and at 2.5% provides four different itemsets of three with only 59 individual itemsets.  
At this level the results are still manageable enough for a quick human analysis.  

However due to the large sample size of the data, as well as its sizeable number of categories it is worth it to push the minimum support even lower for further item combinations.  

At 0.01% the code execution time begins to suffer and the size of the result set increases to 10080. More than 10% of the original sample size. At this point the findings become too expansive to indicate anything meaningful, so 0.01% can be considered the lower limit.  

At 0.15% the result set is 1081, 1% of the original sample size, and yields an instance of itemset with 6 products.  
For that reason it is deemed the acceptable minimum for support.  

In [89]:
frequent_itemsets = apriori(df, min_support=0.0015, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.10998,(Prod1)
1,0.13098,( Prod2)
2,0.03271,( Prod3)
3,0.03585,( Prod4)
4,0.10459,( Prod5)
...,...,...
1077,0.00358,"( Prod15, Prod9, Prod20, Prod14, Prod19)"
1078,0.00359,"( Prod16, Prod15, Prod9, Prod20, Prod19)"
1079,0.00154,"( Prod17, Prod15, Prod9, Prod20, Prod19)"
1080,0.00325,"( Prod15, Prod9, Prod20, Prod18, Prod19)"


# 4. Modeling

Due to the relatively wide angle approach in determining support for the itemsets of the products it would be prudent to use a high threshold for confidence to increase the chances that the relation between the items is genuine.  
Minimum confidence thresholds at values 70%, 80% and 90% provide a very similiar result size of 241 (at 90%) and 284 (at 70%).  
In addition the products 9 and 19 or their combined set appear consistently as the consequent products in these ranges
Choosing the middlepoint value at 80% seems like a good cautious choice.

In [90]:
rules_70 = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.70)
rules_80 = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.80)
rules_90 = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.90)

rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.80)

rules = rules.sort_values(by='confidence', ascending=False)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
223,"( Prod5, Prod16, Prod12, Prod15)",( Prod9),0.00171,0.19853,0.00168,0.982456,4.948653,1.0,0.001341,45.683790,0.799292,0.008461,0.978110,0.495459
173,"( Prod17, Prod20, Prod15)",( Prod9),0.00172,0.19853,0.00167,0.970930,4.890597,1.0,0.001329,27.570568,0.796897,0.008410,0.963729,0.489671
72,"( Prod15, Prod2, Prod16)",( Prod9),0.00360,0.19853,0.00349,0.969444,4.883113,1.0,0.002775,26.229927,0.798086,0.017569,0.961876,0.493512
261,"( Prod17, Prod15, Prod19, Prod20)",( Prod9),0.00159,0.19853,0.00154,0.968553,4.878625,1.0,0.001224,25.486746,0.796290,0.007755,0.960764,0.488155
267,"( Prod5, Prod15, Prod12, Prod20, Prod19)",( Prod9),0.00183,0.19853,0.00177,0.967213,4.871874,1.0,0.001407,24.444835,0.796197,0.008913,0.959092,0.488064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,"( Prod20, Prod15)","( Prod19, Prod9)",0.02241,0.04996,0.01919,0.856314,17.139995,1.0,0.018070,6.611924,0.963243,0.360850,0.848758,0.620211
217,"( Prod15, Prod2, Prod20)","( Prod19, Prod9)",0.00401,0.04996,0.00343,0.855362,17.120929,1.0,0.003230,6.568380,0.945383,0.067867,0.847755,0.462008
243,"( Prod8, Prod15, Prod20)","( Prod19, Prod9)",0.00484,0.04996,0.00412,0.851240,17.038424,1.0,0.003878,6.386380,0.945887,0.081294,0.843417,0.466853
256,"( Prod15, Prod14, Prod20)","( Prod19, Prod9)",0.00421,0.04996,0.00358,0.850356,17.020742,1.0,0.003370,6.348680,0.945228,0.070765,0.842487,0.461007


In [91]:
print('Threshold set at 70: ')
rules_70['consequents'].unique()

Threshold set at 70: 


array([frozenset({' Prod9'}), frozenset({' Prod19'}),
       frozenset({' Prod12'}), frozenset({' Prod20'}),
       frozenset({' Prod19', ' Prod9'})], dtype=object)

In [92]:
print('Threshold set at 80: ')
rules_80['consequents'].unique()

Threshold set at 80: 


array([frozenset({' Prod9'}), frozenset({' Prod19'}),
       frozenset({' Prod19', ' Prod9'})], dtype=object)

In [93]:
print('Threshold set at 90: ')
rules_90['consequents'].unique()

Threshold set at 90: 


array([frozenset({' Prod9'}), frozenset({' Prod19'})], dtype=object)

In [95]:
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=2)

rules = rules.sort_values(by='lift', ascending=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2913,"( Prod12, Prod19, Prod9)","( Prod5, Prod15, Prod20)",0.01042,0.00301,0.00177,0.169866,56.433768,1.0,0.001739,1.200998,0.992623,0.151801,0.167359,0.378953
2898,"( Prod5, Prod15, Prod20)","( Prod12, Prod19, Prod9)",0.00301,0.01042,0.00177,0.588040,56.433768,1.0,0.001739,2.402126,0.985246,0.151801,0.583702,0.378953
2903,"( Prod5, Prod20, Prod9)","( Prod15, Prod12, Prod19)",0.00513,0.00625,0.00177,0.345029,55.204678,1.0,0.001738,1.517243,0.986949,0.184183,0.340910,0.314115
2908,"( Prod15, Prod12, Prod19)","( Prod5, Prod20, Prod9)",0.00625,0.00513,0.00177,0.283200,55.204678,1.0,0.001738,1.387932,0.988061,0.184183,0.279504,0.314115
2904,"( Prod5, Prod19, Prod9)","( Prod15, Prod12, Prod20)",0.00700,0.00465,0.00177,0.252857,54.377880,1.0,0.001737,1.332208,0.988530,0.179150,0.249367,0.316751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,( Prod5),"( Prod15, Prod2, Prod12)",0.10459,0.00428,0.00160,0.015298,3.574259,1.0,0.001152,1.011189,0.804349,0.014916,0.011065,0.194565
896,( Prod5),"( Prod8, Prod15, Prod12)",0.10459,0.00496,0.00185,0.017688,3.566152,1.0,0.001331,1.012957,0.803638,0.017177,0.012792,0.195336
891,"( Prod8, Prod15, Prod12)",( Prod5),0.00496,0.10459,0.00185,0.372984,3.566152,1.0,0.001331,1.428049,0.723173,0.017177,0.299744,0.195336
867,"( Prod10, Prod8, Prod12)",( Prod5),0.00440,0.10459,0.00162,0.368182,3.520239,1.0,0.001160,1.417196,0.719092,0.015088,0.294381,0.191835


# 5. Evaluation

# 6. Deployment