# APRIORI Rule Mining


## Preparing Data and Models

Importing required libraries

In [None]:
import pandas as pd
import numpy as np

import csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules

Loading data

In [None]:
# Load the raw survey export (path as used in the original run).
responses = pd.read_csv('/content/ML Data Collection (Responses) Final - Form Responses 1.csv')
# Replace the CSV's own column headers with short names.
# The `inplace` keyword of `set_axis` was deprecated in pandas 1.5 and removed in 2.0;
# since pandas 1.0 the method returns a new frame by default, so it is simply omitted.
responses = responses.set_axis(
    ['Timestamp', 'Program', 'Coops Completed', 'Role', 'Pay', 'Adjusted Pay',
     'WFH Status', 'Weekly Hours', 'Returning', 'Obtain Job', 'Top Choice',
     'Location', 'Team Size', 'Other Co-ops', 'Enjoyment', 'Enjoyed'],
    axis=1,
)
responses.head()

Unnamed: 0,Timestamp,Program,Coops Completed,Role,Pay,Adjusted Pay,WFH Status,Weekly Hours,Returning,Obtain Job,Top Choice,Location,Team Size,Other Co-ops,Enjoyment,Enjoyed
0,3/5/2022 17:46:59,Management Engineering,3,Project Manager,20.0,-2.55,Remote,37.5,No,WaterlooWorks,No,Toronto (GTA),10,No,4,1
1,3/5/2022 17:50:01,Management Engineering,3,Consultant,22.0,-0.55,Remote,37.5,No,WaterlooWorks,Yes,Toronto (GTA),7,No,4,1
2,3/5/2022 17:55:04,Management Engineering,3,Developer,25.0,2.45,Remote,37.5,No,WaterlooWorks,No,Toronto (GTA),7,No,3,0
3,3/5/2022 17:55:27,Management Engineering,3,Project Manager,23.5,0.95,Remote,37.5,No,WaterlooWorks,Yes,Toronto (GTA),4,No,5,1
4,3/5/2022 17:59:27,Management Engineering,3,Business Analyst,26.5,3.95,Remote,37.5,No,WaterlooWorks,Yes,Toronto (GTA),14,No,2,0


Making pay a categorical variable

In [None]:
#taken from https://stackoverflow.com/questions/49382207/how-to-map-numeric-data-into-categories-bins-in-pandas-dataframe
# Bin edges for 'Pay'. Intervals are closed on the right, so a pay of exactly
# 20 is labelled '15-20' and 20.01 would be labelled '20-30'.
bins = [0, 15, 20, 30, 40, 50, np.inf]
names = ['-15', '15-20', '20-30', '30-40','40-50', '50+']
# include_lowest=True makes the first bin also cover a pay of exactly 0.
# Without it such a row gets NaN, which would end up as a literal 'nan' item
# once the frame is cast to str for the TransactionEncoder.
responses['PayRange'] = pd.cut(responses['Pay'], bins, labels=names, include_lowest=True)

Refactoring answers to make itemsets more comprehensible

In [None]:
# Prefix each Yes/No answer with its question so the items stay distinguishable
# once every row is flattened into a single transaction.
# Any answer other than exactly 'Yes' (including a missing one) gets the "No ..." label.
responses['Returning'] = responses['Returning'].eq('Yes').map(
    {True: "Yes Returning", False: 'No Returning'}
)
responses['Top Choice'] = responses['Top Choice'].eq('Yes').map(
    {True: "Yes Top Choice", False: 'No Top Choice'}
)
responses['Other Co-ops'] = responses['Other Co-ops'].eq('Yes').map(
    {True: "Yes Other Coops", False: 'No Other Coops'}
)
responses.head()

Unnamed: 0,Timestamp,Program,Coops Completed,Role,Pay,Adjusted Pay,WFH Status,Weekly Hours,Returning,Obtain Job,Top Choice,Location,Team Size,Other Co-ops,Enjoyment,Enjoyed,PayRange
0,3/5/2022 17:46:59,Management Engineering,3,Project Manager,20.0,-2.55,Remote,37.5,No Returning,WaterlooWorks,No Top Choice,Toronto (GTA),10,No Other Coops,4,1,15-20
1,3/5/2022 17:50:01,Management Engineering,3,Consultant,22.0,-0.55,Remote,37.5,No Returning,WaterlooWorks,Yes Top Choice,Toronto (GTA),7,No Other Coops,4,1,20-30
2,3/5/2022 17:55:04,Management Engineering,3,Developer,25.0,2.45,Remote,37.5,No Returning,WaterlooWorks,No Top Choice,Toronto (GTA),7,No Other Coops,3,0,20-30
3,3/5/2022 17:55:27,Management Engineering,3,Project Manager,23.5,0.95,Remote,37.5,No Returning,WaterlooWorks,Yes Top Choice,Toronto (GTA),4,No Other Coops,5,1,20-30
4,3/5/2022 17:59:27,Management Engineering,3,Business Analyst,26.5,3.95,Remote,37.5,No Returning,WaterlooWorks,Yes Top Choice,Toronto (GTA),14,No Other Coops,2,0,20-30


Dropping numerical columns and preparing data for TransactionEncoder

In [None]:
# Remove the timestamp and the raw numeric columns; pay stays represented
# through the binned 'PayRange' column created earlier.
responses = responses.drop(
    columns=['Timestamp', 'Pay', 'Adjusted Pay', 'Weekly Hours', 'Enjoyment', 'Team Size']
)
# Turn the 0/1 flag into a readable item label (anything other than 1 -> 'Not Enjoyed').
responses['Enjoyed'] = responses['Enjoyed'].eq(1).map(
    {True: 'Enjoyed', False: 'Not Enjoyed'}
)

In [None]:
# Flatten the frame into a list of transactions: one list of string items per response.
dataset = (
    responses
    .astype(str)
    .to_numpy()
    .tolist()
)

In [None]:
# Preview only the first few transactions; printing the whole list floods the
# notebook output without adding information.
dataset[:5]

[['Management Engineering', '3', 'Project Manager', 'Remote', 'No Returning', 'WaterlooWorks', 'No Top Choice', 'Toronto (GTA)', 'No Other Coops', 'Enjoyed', '15-20'], ['Management Engineering', '3', 'Consultant', 'Remote', 'No Returning', 'WaterlooWorks', 'Yes Top Choice', 'Toronto (GTA)', 'No Other Coops', 'Enjoyed', '20-30'], ['Management Engineering', '3', 'Developer', 'Remote', 'No Returning', 'WaterlooWorks', 'No Top Choice', 'Toronto (GTA)', 'No Other Coops', 'Not Enjoyed', '20-30'], ['Management Engineering', '3', 'Project Manager', 'Remote', 'No Returning', 'WaterlooWorks', 'Yes Top Choice', 'Toronto (GTA)', 'No Other Coops', 'Enjoyed', '20-30'], ['Management Engineering', '3', 'Business Analyst', 'Remote', 'No Returning', 'WaterlooWorks', 'Yes Top Choice', 'Toronto (GTA)', 'No Other Coops', 'Not Enjoyed', '20-30'], ['Management Engineering', '3', 'Business Analyst', 'Remote', 'No Returning', 'WaterlooWorks', 'No Top Choice', 'Toronto (GTA)', 'No Other Coops', 'Enjoyed', '15-2

Generating dataframe to extract itemsets

In [None]:
# One-hot encode the transactions: one boolean column per distinct item,
# one row per response.
oht = TransactionEncoder()
oht.fit(dataset)  # learns the set of distinct items (exposed as oht.columns_)
oht_array = oht.transform(dataset)
df = pd.DataFrame(data=oht_array, columns=oht.columns_)
df.head()

Unnamed: 0,-15,1,15-20,2,20-30,3,30-40,4,40-50,5,...,Software Engineering,Student Planner,Supply Chain Analyst,Systems Engineer,Toronto (GTA),Waterloo,WaterlooWorks,Yes Other Coops,Yes Returning,Yes Top Choice
0,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
1,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True
2,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
3,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True
4,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True


## Rule Mining

In [None]:
# Baseline run: keep every itemset present in at least 60% of the responses.
# use_colnames=True reports item labels instead of column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.6,
    use_colnames=True,
)
print(frequent_itemsets)

     support                        itemsets
0   0.634146                         (20-30)
1   0.707317                       (Enjoyed)
2   0.845528                  (No Returning)
3   0.731707                        (Remote)
4   0.788618                 (WaterlooWorks)
5   0.780488                (Yes Top Choice)
6   0.626016       (Enjoyed, Yes Top Choice)
7   0.642276          (Remote, No Returning)
8   0.731707   (WaterlooWorks, No Returning)
9   0.626016  (Yes Top Choice, No Returning)
10  0.601626         (WaterlooWorks, Remote)


In [None]:
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.7.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
# Confidence alone overstates rules whose consequent is already very common
# (e.g. 'No Returning' has support ~0.85), so lift is reported next to it:
# a lift close to 1 means antecedent and consequent are nearly independent.
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

        antecedents       consequents   support  confidence
0         (Enjoyed)  (Yes Top Choice)  0.626016    0.885057
1  (Yes Top Choice)         (Enjoyed)  0.626016    0.802083
2          (Remote)    (No Returning)  0.642276    0.877778
3    (No Returning)          (Remote)  0.642276    0.759615
4   (WaterlooWorks)    (No Returning)  0.731707    0.927835
5    (No Returning)   (WaterlooWorks)  0.731707    0.865385
6  (Yes Top Choice)    (No Returning)  0.626016    0.802083
7    (No Returning)  (Yes Top Choice)  0.626016    0.740385
8   (WaterlooWorks)          (Remote)  0.601626    0.762887
9          (Remote)   (WaterlooWorks)  0.601626    0.822222


### Second attempt

Here we increase the minimum support by 10 percentage points (from 60% to 70%).

In [None]:
# Second attempt: stricter support, itemsets must appear in at least 70% of the responses.
# use_colnames=True reports item labels instead of column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.7,
    use_colnames=True,
)
print(frequent_itemsets)

    support                       itemsets
0  0.707317                      (Enjoyed)
1  0.845528                 (No Returning)
2  0.731707                       (Remote)
3  0.788618                (WaterlooWorks)
4  0.780488               (Yes Top Choice)
5  0.731707  (WaterlooWorks, No Returning)


In [None]:
# Rules from the 70%-support itemsets, keeping those with confidence >= 0.7.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
# Report lift as well: with consequents this common, a high confidence can sit
# barely above the consequent's own support (lift close to 1).
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

       antecedents      consequents   support  confidence
0  (WaterlooWorks)   (No Returning)  0.731707    0.927835
1   (No Returning)  (WaterlooWorks)  0.731707    0.865385


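Here we see that students who obtained their job through WaterlooWorks are very unlikely to be returning to it (confidence ≈ 0.93), and that most students who are not returning found their job through WaterlooWorks (confidence ≈ 0.87). The reverse reading (if you are returning to the job, you did not apply through WaterlooWorks) is suggested by these rules but not directly established by them.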


### Third attempt

From original, increasing confidence to 80%

In [None]:
# Third attempt: back to the baseline support of 60%, because the previous
# attempt overwrote `frequent_itemsets` with the 70% result.
frequent_itemsets = apriori(
    df,
    min_support=0.6,
    use_colnames=True,
)
print(frequent_itemsets)

     support                        itemsets
0   0.634146                         (20-30)
1   0.707317                       (Enjoyed)
2   0.845528                  (No Returning)
3   0.731707                        (Remote)
4   0.788618                 (WaterlooWorks)
5   0.780488                (Yes Top Choice)
6   0.626016       (Enjoyed, Yes Top Choice)
7   0.642276          (Remote, No Returning)
8   0.731707   (WaterlooWorks, No Returning)
9   0.626016  (Yes Top Choice, No Returning)
10  0.601626         (WaterlooWorks, Remote)


In [None]:
# Same itemsets as the baseline, but only rules with confidence >= 0.8 are kept.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.8)
# Report lift as well: with consequents this common, a high confidence can sit
# barely above the consequent's own support (lift close to 1).
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

        antecedents       consequents   support  confidence
0         (Enjoyed)  (Yes Top Choice)  0.626016    0.885057
1  (Yes Top Choice)         (Enjoyed)  0.626016    0.802083
2          (Remote)    (No Returning)  0.642276    0.877778
3   (WaterlooWorks)    (No Returning)  0.731707    0.927835
4    (No Returning)   (WaterlooWorks)  0.731707    0.865385
5  (Yes Top Choice)    (No Returning)  0.626016    0.802083
6          (Remote)   (WaterlooWorks)  0.601626    0.822222


### Fourth Attempt

From the original, decreasing the minimum support to 50% and increasing the minimum confidence to 90%. (TODO: the rationale for 90% is unclear; 70% is probably the more appropriate threshold.)

In [None]:
# Fourth attempt: looser support, itemsets only need to appear in 50% of the responses.
# use_colnames=True reports item labels instead of column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.5,
    use_colnames=True,
)
print(frequent_itemsets)

     support                                       itemsets
0   0.634146                                        (20-30)
1   0.585366                                            (3)
2   0.707317                                      (Enjoyed)
3   0.552846                               (No Other Coops)
4   0.845528                                 (No Returning)
5   0.731707                                       (Remote)
6   0.569106                                (Toronto (GTA))
7   0.788618                                (WaterlooWorks)
8   0.780488                               (Yes Top Choice)
9   0.552846                          (20-30, No Returning)
10  0.536585                         (20-30, WaterlooWorks)
11  0.552846                        (Enjoyed, No Returning)
12  0.536585                       (Enjoyed, WaterlooWorks)
13  0.626016                      (Enjoyed, Yes Top Choice)
14  0.642276                         (Remote, No Returning)
15  0.731707                  (WaterlooW

In [None]:
# Rules from the 50%-support itemsets, keeping only those with confidence >= 0.9.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.9)
# Report lift as well: a consequent such as 'No Returning' is already present
# in ~85% of responses, so confidence on its own says little about how much
# the antecedent actually adds (lift close to 1 = nearly independent).
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

                       antecedents     consequents   support  confidence
0                  (WaterlooWorks)  (No Returning)  0.731707    0.927835
1          (WaterlooWorks, Remote)  (No Returning)  0.569106    0.945946
2  (WaterlooWorks, Yes Top Choice)  (No Returning)  0.520325    0.901408


### Fifth attempt

From the original, increasing min_support to 80% and decreasing confidence to 60%


In [None]:
# Fifth attempt: very strict support, itemsets must appear in at least 80% of the responses.
# use_colnames=True reports item labels instead of column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.8,
    use_colnames=True,
)
print(frequent_itemsets)

    support        itemsets
0  0.845528  (No Returning)


In [None]:
# Rules from the 80%-support itemsets, keeping those with confidence >= 0.6.
# In the recorded run only a single one-item itemset met the support threshold,
# so no rule could be formed and the result was empty.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)
# Lift is shown next to confidence, consistent with how rules should be judged
# when consequents are very common.
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Empty DataFrame
Columns: [antecedents, consequents, support, confidence]
Index: []
