<a href="https://colab.research.google.com/github/OptimusJet/OptimusJet/blob/main/Assoc_Rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Market Basket Analysis

#### Source of this notebook is the following: https://medium.com/analytics-vidhya/association-analysis-in-python-2b955d0180c

#### Market Basket Analysis or Association Rules mining is used to discover associations among items sold in a small grocery store  
#### Data file is available here: https://gist.github.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751

In [None]:
# If necessary, install the mlxtend package
# To install, in Conda prompt, type and run the following command:
# conda install -c conda-forge mlxtend
# or type this command: pip install mlxtend

import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt

In [None]:
# Read the raw transaction table from disk and preview its first rows.
store_data_path = './desktop/data/store_data.csv'
df1 = pd.read_csv(store_data_path, sep=',')
df1.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [None]:
# Collect every distinct product name that occurs anywhere in the table.
# Scanning only column '0' would silently drop any product that never
# happens to be listed first in a transaction, so all columns are flattened.
all_cells = df1.to_numpy().ravel()
# Rows with fewer products than columns hold NaN in the unused cells;
# those are not products and must not become an item.
items = pd.unique(all_cells[pd.notna(all_cells)])
items

array(['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water',
       'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables',
       'french fries', 'eggs', 'cookies', 'spaghetti', 'meatballs',
       'red wine', 'rice', 'parmesan cheese', 'ground beef',
       'sparkling water', 'herb & pepper', 'pickles', 'energy bar',
       'fresh tuna', 'escalope', 'avocado', 'tomato sauce',
       'clothes accessories', 'energy drink', 'chocolate',
       'grated cheese', 'yogurt cake', 'mint', 'asparagus', 'champagne',
       'ham', 'muffins', 'french wine', 'chicken', 'pasta', 'tomatoes',
       'pancakes', 'frozen smoothie', 'carrots', 'yams', 'shallot',
       'butter', 'light mayo', 'pepper', 'candy bars', 'cooking oil',
       'milk', 'green tea', 'bug spray', 'oil', 'olive oil', 'salmon',
       'cake', 'almonds', 'salt', 'strong cheese', 'hot dogs', 'pet food',
       'whole wheat rice', 'antioxydant juice', 'honey', 'sandwich',
       'salad', 'magazines', 'protein bar', '

## One-hot encoding the data frame

One-hot encoding refers to transforming categorical data into 0 and 1 values. The following code transforms the initial data frame, which contains categorical values (product names), into numbers (0 or 1).

In [None]:
# From df1, create a list of records, one per sales transaction.
# Each record maps every entry of `items` to 1 if that product occurs in
# the transaction's row and to 0 otherwise.
#
# Keys are inserted by walking `items` in order, so the columns of the
# resulting frame follow the order of `items` on every run. Building the
# records by iterating over sets of strings would not guarantee a stable
# column order across kernel restarts.
encoded_vals = [
    {item: int(item in basket) for item in items}
    # One set per row gives constant-time membership tests for that row.
    for basket in map(set, df1.itertuples(index=False, name=None))
]

# one-hot encoded data frame
ohe_df = pd.DataFrame(encoded_vals)

In [None]:
# Preview the one-hot encoded frame produced by the previous cell:
# one row per transaction, one 0/1 column per product.
PREVIEW_ROWS = 8
ohe_df.head(PREVIEW_ROWS)

Unnamed: 0,salt,nonfat milk,mushroom cream sauce,chocolate,corn,black tea,eggs,melons,mayonnaise,milk,...,shrimp,vegetables mix,salmon,frozen smoothie,olive oil,salad,spinach,green grapes,yams,whole weat flour
0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Mine the itemsets whose support (share of transactions containing them)
# is at least MIN_SUPPORT.
MIN_SUPPORT = 0.01

# apriori only needs presence/absence. Casting the 0/1 integers to bool leaves
# the supports unchanged and avoids the deprecation warning that recent
# mlxtend releases emit for non-bool frames (NOTE(review): confirm against
# the installed mlxtend version).
freq_items = apriori(
    ohe_df.astype(bool),
    min_support=MIN_SUPPORT,
    use_colnames=True,  # report product names instead of column indices
    verbose=1,
)
freq_items.head(7)

Processing 80 combinations | Sampling itemset size 4 3


Unnamed: 0,support,itemsets
0,0.010399,(nonfat milk)
1,0.019064,(mushroom cream sauce)
2,0.163845,(chocolate)
3,0.014265,(black tea)
4,0.179709,(eggs)
5,0.011998,(melons)
6,0.129583,(milk)


In [None]:
# Turn the frequent itemsets into association rules, filtering on the
# confidence metric with a threshold of 0.45, then show the ten rules
# with the highest confidence.
rules = association_rules(
    freq_items,
    metric="confidence",
    min_threshold=0.45,
)
rules_by_confidence = rules.sort_values(by='confidence', ascending=False)
rules_by_confidence.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,"(eggs, ground beef)",(mineral water),0.019997,0.238368,0.010132,0.506667,2.125563,0.005365,1.543848
4,"(milk, ground beef)",(mineral water),0.021997,0.238368,0.011065,0.50303,2.110308,0.005822,1.532552
1,"(ground beef, chocolate)",(mineral water),0.023064,0.238368,0.010932,0.473988,1.988472,0.005434,1.447937
3,"(frozen vegetables, milk)",(mineral water),0.023597,0.238368,0.011065,0.468927,1.967236,0.00544,1.434136
0,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255
5,"(spaghetti, pancakes)",(mineral water),0.025197,0.238368,0.011465,0.455026,1.908923,0.005459,1.397557
