# Recommender System

In [1]:
# Imports required packages

import pandas as pd
import numpy as np

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

## Loading Data Set

In [2]:
# Loads Data Set

# Contains list of orders and list of items in each order.
# The phrases "order" and "transaction" are used interchangeably.
# (The variable keeps its original spelling because later cells refer to it.)
transctions = []

# Opens file and streams it line by line
with open("./../../../Data/groceries.csv", mode="r") as file:
    for line in file:
        # Splits the line by commas and trims the whitespace around every item,
        # so that e.g. 'cream cheese ' and 'cream cheese' count as the same item
        items = [item.strip() for item in line.split(",")]

        # Drops empty items (produced by blank lines, trailing or doubled commas)
        items = [item for item in items if item]

        # Skips lines without any item so that no empty transaction is recorded
        if items:
            transctions.append(items)

# Shows few transactions
transctions[:10]

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk'],
 ['pip fruit', 'yogurt', 'cream cheese ', 'meat spreads'],
 ['other vegetables',
  'whole milk',
  'condensed milk',
  'long life bakery product'],
 ['whole milk', 'butter', 'yogurt', 'rice', 'abrasive cleaner'],
 ['rolls/buns'],
 ['other vegetables',
  'UHT-milk',
  'rolls/buns',
  'bottled beer',
  'liquor (appetizer)'],
 ['pot plants'],
 ['whole milk', 'cereals']]

## Data Transformation

Encoding data into a tabular format where each row represents a transaction and each column represents an item.

In [3]:
# One-hot encodes the items of every transaction

# Creates the encoder
transaction_encoder = TransactionEncoder()

# Fits the encoder, transforms the transactions and wraps the resulting
# matrix in a DataFrame in a single step.
# Note: the positional argument is evaluated before the keyword argument,
# so `columns_` is read only after `fit_transform` has run.
transctions_one_hot_encoded = pd.DataFrame(
    transaction_encoder.fit_transform(transctions),
    columns=transaction_encoder.columns_,
)

In [4]:
# Previews the first ten rows of the one-hot encoded DataFrame
first_transactions = transctions_one_hot_encoded.head(n=10)
display(first_transactions)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
5,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


The above table can be interpreted as follows:
- The value 'True' indicates that the item was purchased in that transaction, whereas 'False' indicates that it was not.
- The above table is sparse because each transaction contains only a few of the available items.

In [5]:
# Reports the dimensions (rows, columns) of the encoded data set
row_and_column_counts = transctions_one_hot_encoded.shape
print(row_and_column_counts)

(9835, 171)


In [6]:
# The number of columns equals the number of distinct items
unique_item_count = transctions_one_hot_encoded.shape[1]
message = (
    "The above shape indicates that there are {} "
    "unique items across all the transactions"
)
print(message.format(unique_item_count))

The above shape indicates that there are 171 unique items across all the transactions


## Association Rules

### Applying Apriori Algorithm

In [7]:
# Mines frequent itemsets with the Apriori algorithm:
# - min_support=0.02 sets the support threshold to 2% of all transactions
# - use_colnames=True uses the input DataFrame's column names as item names
itemsets = apriori(transctions_one_hot_encoded, min_support=0.02, use_colnames=True)

In [8]:
# Reports how many frequent itemsets were found (one row per itemset)
print("Count of itemsets: {}".format(len(itemsets)))

Count of itemsets: 122


In [9]:
# Draws a reproducible random sample of 10 itemsets (fixed seed) and shows it
sampled_itemsets = itemsets.sample(n=10, random_state=42)
display(sampled_itemsets)

Unnamed: 0,support,itemsets
18,0.053279,(curd)
45,0.037824,(salty snack)
47,0.098526,(shopping bags)
89,0.035892,"(tropical fruit, other vegetables)"
4,0.080529,(bottled beer)
40,0.088968,(pastry)
62,0.024199,"(bottled water, rolls/buns)"
107,0.025826,"(root vegetables, yogurt)"
31,0.037417,(long life bakery product)
55,0.071683,(whipped/sour cream)


Note that the Apriori algorithm has kept only the itemsets whose support is at least 2% (`min_support = 0.02`); all less frequent itemsets were filtered out.
These frequent itemsets will be used to create _Association Rules_.

### Generating Association Rules

In [10]:
# Derives association rules from the frequent itemsets.
# Rules are selected by "lift" (the library default metric is "confidence")
# with a threshold of 1.
rules = association_rules(itemsets, metric="lift", min_threshold=1)

In [11]:
# Reports how many association rules were generated (one row per rule)
print("Count of Assocition Rules: {}".format(len(rules)))

Count of Assocition Rules: 126


In [12]:
# Draws a reproducible random sample of 10 association rules (fixed seed) and shows it
sampled_rules = rules.sample(n=10, random_state=42)
display(sampled_rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
73,(whole milk),(pip fruit),0.255516,0.075648,0.030097,0.117788,1.557043,0.010767,1.047765,0.480544
19,(citrus fruit),(other vegetables),0.082766,0.193493,0.028876,0.348894,1.80314,0.012862,1.238674,0.485603
116,"(root vegetables, whole milk)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,0.013719,1.53332,0.62223
67,(pastry),(soda),0.088968,0.174377,0.021047,0.236571,1.356665,0.005533,1.081467,0.288572
94,(sausage),(soda),0.09395,0.174377,0.024301,0.258658,1.483324,0.007918,1.113687,0.359626
77,(root vegetables),(rolls/buns),0.108998,0.183935,0.024301,0.222948,1.212101,0.004252,1.050206,0.196393
31,(whole milk),(frankfurter),0.255516,0.058973,0.020539,0.080382,1.363029,0.00547,1.02328,0.357751
53,(other vegetables),(sausage),0.193493,0.09395,0.026945,0.139254,1.482209,0.008766,1.052633,0.403383
117,(other vegetables),"(root vegetables, whole milk)",0.193493,0.048907,0.023183,0.119811,2.44977,0.013719,1.080555,0.733779
44,(pip fruit),(other vegetables),0.075648,0.193493,0.026131,0.34543,1.785237,0.011494,1.232118,0.475847


In [13]:
# Shows the 10 association rules with the highest confidence
rules_by_confidence = rules.sort_values("confidence", ascending=False)
display(rules_by_confidence.head(10))

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
121,"(other vegetables, yogurt)",(whole milk),0.043416,0.255516,0.022267,0.512881,2.007235,0.011174,1.52834,0.524577
16,(butter),(whole milk),0.055414,0.255516,0.027555,0.497248,1.946053,0.013395,1.480817,0.514659
25,(curd),(whole milk),0.053279,0.255516,0.026131,0.490458,1.919481,0.012517,1.461085,0.505984
115,"(other vegetables, root vegetables)",(whole milk),0.047382,0.255516,0.023183,0.48927,1.914833,0.011076,1.457687,0.501524
116,"(root vegetables, whole milk)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,0.013719,1.53332,0.62223
28,(domestic eggs),(whole milk),0.063447,0.255516,0.029995,0.472756,1.850203,0.013783,1.41203,0.490649
108,(whipped/sour cream),(whole milk),0.071683,0.255516,0.032232,0.449645,1.759754,0.013916,1.352735,0.465077
90,(root vegetables),(whole milk),0.108998,0.255516,0.048907,0.448694,1.756031,0.021056,1.350401,0.483202
51,(root vegetables),(other vegetables),0.108998,0.193493,0.047382,0.434701,2.246605,0.026291,1.426693,0.622764
32,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,0.008149,1.294636,0.418855


The above table shows that the probability of a customer buying "whole milk", given that they also purchased both "other vegetables" and "yogurt", is more than 50% (confidence 0.512881). Such rules can be used to create strategies like keeping these items together on store shelves.