# Part I: Analyzing Transactions

In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

### Question 1. Read in the data and generate a file in which every row represents a transaction, with True identifying items that were part of that transaction, and False identifying items that were not (as in the example from class). Name the file groupxxtransactions01.csv, where xx is your group number.

In [2]:
# Reading the original dataset
mb = pd.read_csv('/Users/ramyavissapragada/Downloads/transactions.csv')

# Grouping by Transaction and Product and then applying a count to aggregate
mb['Count'] = 1
mb_new = mb.pivot_table(index='Transaction', columns='Product', values='Count', fill_value=0)

# Resetting the index
mb_new = mb_new.reset_index()
mb_new.columns.name = None  # Remove the column name

# Save the transformed dataset to a new CSV file
mb_new.to_csv('group11transactions01.csv', index=False)


In [3]:
# Reading the new dataset
marketbasket = pd.read_csv('group11transactions01.csv')
marketbasket

Unnamed: 0,Transaction,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
0,12359,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,12362,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,12365,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,12371,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,12380,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,1221845,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0
199996,1221854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
199997,1221857,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
199998,1221863,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [4]:
# Converting the data present into boolean type as apriori implementation prefers boolean datatype
# Specifying the Transaction column as the index of the records
marketbasket.set_index('Transaction', inplace = True)
marketbasket = marketbasket.astype(bool)
marketbasket

Unnamed: 0_level_0,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
12359,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12362,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
12365,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
12371,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12380,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1221845,False,False,False,True,True,False,False,False,True,False,True,False,False,False,False,True,False
1221854,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1221857,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1221863,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False


### Question 2. Identify the frequent itemsets using a minimum support threshold of 1%. How many itemsets are frequent?

In [5]:
# Generating the frequent itemsets with a minimum support threshold of 1%
fi = apriori(marketbasket, min_support = 0.01, use_colnames = True)
fi

Unnamed: 0,support,itemsets
0,0.054645,(Bow)
1,0.171005,(Candy Bar)
2,0.146885,(Greeting Cards)
3,0.241305,(Magazine)
4,0.0267,(Pain Reliever)
5,0.134925,(Pencils)
6,0.143575,(Pens)
7,0.08996,(Perfume)
8,0.05848,(Photo Processing)
9,0.014505,(Prescription Med)


In [6]:
# Number of Frequent Itemsets:
len(fi)

40

### Question 3. Identify all association rules with a minimum confidence of 10%. How many rules are generated? 

In [7]:
# Generating association rules with a minimum confidence of 10%
rules = association_rules(fi, metric="confidence", min_threshold=0.1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bow),(Toothbrush),0.054645,0.06735,0.01134,0.207521,3.081236,0.00766,1.176877,0.714499
1,(Toothbrush),(Bow),0.06735,0.054645,0.01134,0.168374,3.081236,0.00766,1.136755,0.724232
2,(Candy Bar),(Greeting Cards),0.171005,0.146885,0.04366,0.255314,1.738191,0.018542,1.145604,0.512294
3,(Greeting Cards),(Candy Bar),0.146885,0.171005,0.04366,0.297239,1.738191,0.018542,1.179626,0.49781
4,(Magazine),(Candy Bar),0.241305,0.171005,0.040535,0.167982,0.982325,-0.000729,0.996367,-0.023167
5,(Candy Bar),(Magazine),0.171005,0.241305,0.040535,0.23704,0.982325,-0.000729,0.99441,-0.021244
6,(Pencils),(Candy Bar),0.134925,0.171005,0.033015,0.244691,1.430903,0.009942,1.097558,0.348109
7,(Candy Bar),(Pencils),0.171005,0.134925,0.033015,0.193065,1.430903,0.009942,1.07205,0.36326
8,(Toothpaste),(Candy Bar),0.160425,0.171005,0.03978,0.247966,1.450053,0.012347,1.102338,0.369675
9,(Candy Bar),(Toothpaste),0.171005,0.160425,0.03978,0.232625,1.450053,0.012347,1.094087,0.374393


In [8]:
# Number of rules generated:
len(rules)

50

### Question 4. Which rules have the highest lift? Using the results from the previous questions, show exactly how this lift value was calculated for one of the rules with highest lift.

In [17]:
# Sorting rules based on highest Lift values
rules2 = rules.sort_values(['lift','confidence'], ascending=[False, False])
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
26,(Perfume),(Toothbrush),0.08996,0.06735,0.02182,0.242552,3.60137,0.015761,1.231306,0.793732
27,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
0,(Bow),(Toothbrush),0.054645,0.06735,0.01134,0.207521,3.081236,0.00766,1.176877,0.714499
1,(Toothbrush),(Bow),0.06735,0.054645,0.01134,0.168374,3.081236,0.00766,1.136755,0.724232
28,"(Magazine, Candy Bar)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
31,(Greeting Cards),"(Magazine, Candy Bar)",0.146885,0.040535,0.016665,0.113456,2.798966,0.010711,1.082253,0.753386
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Magazine, Greeting Cards)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
47,"(Toothpaste, Magazine)",(Greeting Cards),0.031665,0.146885,0.011945,0.37723,2.568202,0.007294,1.369873,0.63059
38,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124


In [10]:
''' By sorting the rules in descending order of lift values, we determined that 2 rules which are actually symmetric had the same highest lift value. They are:
{Perfume} -> {Toothbrush}
{Toothbrush} -> {Perfume}
As their lift values are same -> 3.601370, we decided to pick the one that has the higher confidence (%). So, we choose the association rule, {Toothbrush} -> {Perfume} which has a confidence of around 32%

Lift is a metric that measures how likely it is for a consequent to occur, with the presence of the corresponding antecedent.
           Lift(rule) = support(rule)/(support(a) * (support(c)) 
      => Lift(rule) = confidence(rule)/support(c)         { where a -> antecedent, c -> consequent}

support for toothbrush: 0.06735
support for perfume: 0.08996

support for the rule {Toothbrush -> Perfume}: 0.02182
Lift(Toothbrush -> Perfume) = (0.02182)/(0.06735* 0.08996)
                            = 3.60137

The Rules with top 3 highest lift are ordered as follows: 
1. {Toothbrush} -> {Perfume}
2. {Bow} -> {Toothbrush}
3. {Magazine, Candy Bar} -> {Greeting Cards}
'''

' By sorting the rules in descending order of lift values, we determined that 2 rules which are actually symmetric had the same highest lift value. They are:\n{Perfume} -> {Toothbrush}\n{Toothbrush} -> {Perfume}\nAs their lift values are same -> 3.601370, we decided to pick the one that has the higher confidence (%). So, we choose the association rule, {Toothbrush} -> {Perfume} which has a confidence of around 32%\n\nLift is a metric that measures how likely it is for a consequent to occur, with the presence of the corresponding antecedent.\n           Lift(rule) = support(rule)/(support(a) * (support(c)) \n      => Lift(rule) = confidence(rule)/support(c)         { where a -> antecedent, c -> consequent}\n\nsupport for toothbrush: 0.06735\nsupport for perfume: 0.08996\n\nsupport for the rule {Toothbrush -> Perfume}: 0.02182\nLift(Toothbrush -> Perfume) = (0.02182)/(0.06735* 0.08996)\n                            = 3.60137\n\nThe Rules with top 3 highest lift are ordered as follows: \n1

### Question 5. For the same rule, show how leverage and conviction were obtained

In [11]:
''' 5) Leverage is the measure of deviation from independence. It explains how much more likely is the co-occurrence of the antecedent and consequent with regards to the independence. Leverage measures how much more often items  in antecedent and consequent occur together in transactions than if they were statistically independent.

Leverage(rule) = support(rule) – support(consequent) * support(antecedent)

For the rule {Toothbrush) -> {Perfume} the leverage is calculated as follows:
Support(rule) = 0.02182, support(consequent) = 
                                0.02182 - ( 0.08996 * 0.06735 )
                                = 0.01576

Conviction measures the ratio of the expected frequency that antecedent occurs without consequent, to the observed frequency that antecedent occurs without the consequent. 
Conviction(rule) = (1 - Support(consequent)) / (1 - Confidence(rule))

Confidence(Toothbrush -> Perfume) = 0.323979
Conviction(Toothbrush -> Perfume) = (1 - 0.089960)/1-0.323979 = 1.346172

'''

' 5) Leverage is the measure of deviation from independence. It explains how much more likely is the co-occurrence of the antecedent and consequent with regards to the independence. Leverage measures how much more often items  in antecedent and consequent occur together in transactions than if they were statistically independent.\n\nLeverage(rule) = support(rule) – support(consequent) * support(antecedent)\n\nFor the rule {Toothbrush) -> {Perfume} the leverage is calculated as follows:\nSupport(rule) = 0.02182, support(consequent) = \n                                0.02182 - ( 0.08996 * 0.06735 )\n                                = 0.01576\n\nConviction measures the ratio of the expected frequency that antecedent occurs without consequent, to the observed frequency that antecedent occurs without the consequent. \nConviction(rule) = (1 - Support(consequent)) / (1 - Confidence(rule))\n\nConfidence(Toothbrush -> Perfume) = 0.323979\nConviction(Toothbrush -> Perfume) = (1 - 0.089960)/1-0.3

### Question 6. Interpret and discuss the 5 rules with 
### a. the highest confidence,
### b. the highest lift,
### c. the highest leverage, and
### d. the highest conviction.
### If there are more than five meeting the required criterion, pick any five. Are any of these surprising? Comment on the extent of their redundancy and utility. 

In [12]:
# 6(a) Sorting Rules by Highest Confidence
rules6a = rules.sort_values(['confidence'], ascending=[False])
rules6a

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Magazine, Greeting Cards)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
38,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124
28,"(Magazine, Candy Bar)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
33,"(Toothpaste, Greeting Cards)",(Candy Bar),0.03208,0.171005,0.013175,0.410692,2.401637,0.007689,1.406726,0.60296
30,"(Candy Bar, Greeting Cards)",(Magazine),0.04366,0.241305,0.016665,0.381699,1.581813,0.00613,1.227065,0.384606
35,"(Pencils, Magazine)",(Candy Bar),0.03163,0.171005,0.012005,0.379545,2.219495,0.006596,1.336108,0.567394
47,"(Toothpaste, Magazine)",(Greeting Cards),0.031665,0.146885,0.011945,0.37723,2.568202,0.007294,1.369873,0.63059
45,"(Pencils, Greeting Cards)",(Magazine),0.02924,0.241305,0.010955,0.374658,1.552633,0.003899,1.213248,0.366654
48,"(Toothpaste, Greeting Cards)",(Magazine),0.03208,0.241305,0.011945,0.37235,1.543069,0.004204,1.208787,0.363605


In [13]:
# 6(b) Sorting Rules by Highest Lift
rules6b = rules.sort_values(['lift'], ascending=[False])
rules6b

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
26,(Perfume),(Toothbrush),0.08996,0.06735,0.02182,0.242552,3.60137,0.015761,1.231306,0.793732
27,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
1,(Toothbrush),(Bow),0.06735,0.054645,0.01134,0.168374,3.081236,0.00766,1.136755,0.724232
0,(Bow),(Toothbrush),0.054645,0.06735,0.01134,0.207521,3.081236,0.00766,1.176877,0.714499
31,(Greeting Cards),"(Magazine, Candy Bar)",0.146885,0.040535,0.016665,0.113456,2.798966,0.010711,1.082253,0.753386
28,"(Magazine, Candy Bar)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Magazine, Greeting Cards)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
47,"(Toothpaste, Magazine)",(Greeting Cards),0.031665,0.146885,0.011945,0.37723,2.568202,0.007294,1.369873,0.63059
38,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124


In [14]:
# 6(c) Sorting Rules based on Highest Leverage
rules6c = rules.sort_values(['leverage'], ascending=[False])
rules6c

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(Candy Bar),(Greeting Cards),0.171005,0.146885,0.04366,0.255314,1.738191,0.018542,1.145604,0.512294
3,(Greeting Cards),(Candy Bar),0.146885,0.171005,0.04366,0.297239,1.738191,0.018542,1.179626,0.49781
27,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
26,(Perfume),(Toothbrush),0.08996,0.06735,0.02182,0.242552,3.60137,0.015761,1.231306,0.793732
8,(Toothpaste),(Candy Bar),0.160425,0.171005,0.03978,0.247966,1.450053,0.012347,1.102338,0.369675
9,(Candy Bar),(Toothpaste),0.171005,0.160425,0.03978,0.232625,1.450053,0.012347,1.094087,0.374393
28,"(Magazine, Candy Bar)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
31,(Greeting Cards),"(Magazine, Candy Bar)",0.146885,0.040535,0.016665,0.113456,2.798966,0.010711,1.082253,0.753386
29,"(Magazine, Greeting Cards)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
6,(Pencils),(Candy Bar),0.134925,0.171005,0.033015,0.244691,1.430903,0.009942,1.097558,0.348109


In [15]:
# 6(d) Sorting Rules based on Highest Conviction
rules6c = rules.sort_values(['conviction'], ascending=[False])
rules6c

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Magazine, Greeting Cards)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
38,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124
28,"(Magazine, Candy Bar)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
33,"(Toothpaste, Greeting Cards)",(Candy Bar),0.03208,0.171005,0.013175,0.410692,2.401637,0.007689,1.406726,0.60296
47,"(Toothpaste, Magazine)",(Greeting Cards),0.031665,0.146885,0.011945,0.37723,2.568202,0.007294,1.369873,0.63059
27,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
35,"(Pencils, Magazine)",(Candy Bar),0.03163,0.171005,0.012005,0.379545,2.219495,0.006596,1.336108,0.567394
44,"(Pencils, Magazine)",(Greeting Cards),0.03163,0.146885,0.010955,0.346348,2.357956,0.006309,1.305152,0.594715
43,"(Pencils, Candy Bar)",(Toothpaste),0.033015,0.160425,0.01139,0.344995,2.150505,0.006094,1.281784,0.553259


In [16]:
# Questions 7 and 8 answered in the report (word document)