In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import pyodbc

In [2]:
# Open an ODBC connection to the AdventureWorks2012 database on server
# DESKTOP-ATL660H using a trusted connection (no user name or password is
# part of the string). The adjacent literals are joined into one
# connection string by Python.
connection = pyodbc.connect(
    'Driver={SQL Server Native Client 11.0};'
    'Server=DESKTOP-ATL660H;'
    'Database=AdventureWorks2012;'
    'Trusted_Connection=yes;'
)


In [3]:
# One row per product / order line: the order id, the product name, the
# ordered quantity and the sales territory name (aliased as Region).
# The query starts from Production.Product and LEFT JOINs outward, so a
# product that has no matching order line still produces a row whose
# SalesOrderID, OrderQty and Region are NULL.
query ='''
SELECT 
C.SalesOrderID
,A.Name
,B.OrderQty
,D.Name as Region
FROM[Production].[Product]  AS A
LEFT JOIN [Sales].[SalesOrderDetail] AS B
ON A.ProductID = B.ProductID
LEFT JOIN [Sales].[SalesOrderHeader] AS C
ON B.SalesOrderID = C.SalesOrderID
LEFT JOIN [Sales].[SalesTerritory] AS D
ON C.[TerritoryID] = D.[TerritoryID]
'''

In [4]:
# Run the query over the open connection and load the result set into a DataFrame.
data = pd.read_sql(sql=query, con=connection)
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,SalesOrderID,Name,OrderQty,Region
0,43659.0,"Mountain-100 Black, 42",1.0,Southeast
1,43659.0,"Mountain-100 Black, 44",3.0,Southeast
2,43659.0,"Mountain-100 Black, 48",1.0,Southeast
3,43659.0,"Mountain-100 Silver, 38",1.0,Southeast
4,43659.0,"Mountain-100 Silver, 42",1.0,Southeast


In [5]:
# List the column labels of the loaded frame
data.columns

Index(['SalesOrderID', 'Name', 'OrderQty', 'Region'], dtype='object')

In [6]:
# Show the dtype pandas assigned to each column
data.dtypes

SalesOrderID    float64
Name             object
OrderQty        float64
Region           object
dtype: object

In [7]:
# Count the missing values in each column
data.isnull().sum()

SalesOrderID    238
Name              0
OrderQty        238
Region          238
dtype: int64

In [8]:
# Strip leading and trailing whitespace from the product names
data['Name'] = data['Name'].str.strip()

In [9]:
# Remove, in place, every row that has no SalesOrderID.
data.dropna(subset=['SalesOrderID'], axis='index', inplace=True)
# Re-type the order id column as text. The values were loaded as float64,
# so their string form keeps the trailing ".0" (e.g. "43659.0").
data['SalesOrderID'] = data['SalesOrderID'].astype(str)


In [10]:
# The 238 rows that had no SalesOrderID have been dropped; preview the cleaned frame
data.head()


Unnamed: 0,SalesOrderID,Name,OrderQty,Region
0,43659.0,"Mountain-100 Black, 42",1.0,Southeast
1,43659.0,"Mountain-100 Black, 44",3.0,Southeast
2,43659.0,"Mountain-100 Black, 48",1.0,Southeast
3,43659.0,"Mountain-100 Silver, 38",1.0,Southeast
4,43659.0,"Mountain-100 Silver, 42",1.0,Southeast


In [11]:
# Count how many rows fall in each Region
data['Region'].value_counts()

Southwest         25644
Canada            19064
Northwest         16865
Australia         15058
United Kingdom    10426
France             9088
Germany            7528
Southeast          5976
Northeast          5836
Central            5832
Name: Region, dtype: int64

In [12]:
# Pivot the order lines into an order x product matrix: one row per
# SalesOrderID, one column per product Name, each cell holding the total
# quantity of that product in that order (0 when the product is absent).
basket = (
    data
    .groupby(['SalesOrderID', 'Name'])['OrderQty']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('SalesOrderID')
)

In [13]:
# One-hot encoding helper: collapse every quantity to a 0/1 purchase flag

def encode_units(x):
    """Return 1 when quantity ``x`` is positive, otherwise 0.

    Any positive quantity counts as "purchased". Zero, negative and NaN
    values all map to 0, so the function never falls through and returns
    None (which happened before for values strictly between 0 and 1 and
    for NaN).

    Parameters
    ----------
    x : int or float
        A single quantity value.

    Returns
    -------
    int
        1 if ``x > 0``, else 0.
    """
    # NaN compares False against everything, so it lands in the 0 branch.
    return 1 if x > 0 else 0

# Apply encode_units to every cell of the basket matrix.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
# (DataFrame.map replaces it) - confirm against the pandas version in use.
basket_sets = basket.applymap(encode_units)


In [14]:
# Inspect the encoded order x product matrix
basket_sets

Name,AWC Logo Cap,All-Purpose Bike Stand,Bike Wash - Dissolver,Cable Lock,Chain,"Classic Vest, L","Classic Vest, M","Classic Vest, S",Fender Set - Mountain,Front Brakes,...,"Touring-3000 Yellow, 54","Touring-3000 Yellow, 58","Touring-3000 Yellow, 62",Water Bottle - 30 oz.,"Women's Mountain Shorts, L","Women's Mountain Shorts, M","Women's Mountain Shorts, S","Women's Tights, L","Women's Tights, M","Women's Tights, S"
SalesOrderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43659.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43660.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43661.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43662.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43663.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43664.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43665.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43666.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43667.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43668.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training the model

In [35]:
# Generate the frequent itemsets.
# min_support=0.04 keeps only itemsets present in at least 4% of all orders.
# This is the threshold the write-up describes and the only one consistent
# with the itemsets listed below, some of which have a support between 0.04
# and 0.05; a threshold of 0.05 would filter those out.
# use_colnames=True reports itemsets by product name instead of column index.
frequent_itemsets = apriori(basket_sets, min_support=0.04, use_colnames=True)

`min_support` is the minimum support an itemset must reach to be kept. The support of an itemset is the number of transactions that contain it, expressed as a proportion of the total number of transactions; it indicates how common that itemset is within a particular list of transactions.
Here we generate the frequent itemsets that have a support value of at least 4% (this number was chosen by trial so that the result is close enough to be useful).

In [16]:
# Display the frequent itemsets together with their support values
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.107485,(AWC Logo Cap)
1,0.042174,(Bike Wash - Dissolver)
2,0.067408,(Fender Set - Mountain)
3,0.044367,(HL Mountain Tire)
4,0.051962,"(Long-Sleeve Logo Jersey, L)"
5,0.064357,(Mountain Bottle Cage)
6,0.098363,(Mountain Tire Tube)
7,0.106595,(Patch Kit/8 Patches)
8,0.05441,(Road Bottle Cage)
9,0.075512,(Road Tire Tube)


In [32]:
# Derive association rules from the frequent itemsets, scoring each rule by
# lift and keeping only the rules whose lift is at least 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

In [33]:
# Display the generated association rules and their metrics
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Mountain Bottle Cage),(Water Bottle - 30 oz.),0.064357,0.148991,0.053774,0.835556,5.608096,0.044185,5.175055
1,(Water Bottle - 30 oz.),(Mountain Bottle Cage),0.148991,0.064357,0.053774,0.360922,5.608096,0.044185,1.46405


A rule with a high lift value means that its products are bought together more frequently than would be expected by chance, given the number of transactions and product combinations.
There are four rules that could be recommended, but we only consider the ones that pair a high lift value with a high confidence, since those are the rules most likely to hold true.

## Overall, two product pairings are recommended most
## 1) If a customer bought a Mountain Bottle Cage, they are very likely to also buy a Water Bottle - 30 oz.
## 2) If a customer bought a Road Bottle Cage, they might buy a Water Bottle - 30 oz. as well