<a href="https://colab.research.google.com/github/Rajeeb321123/Machine-learning-Journey/blob/master/17_apriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Apriori

## Importing the libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
!pip install apyori # apyori library isnot in google colab

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5955 sha256=c099e4aaceb62a2cf25ca3164ff28e6209996912836d1bfc220d89f71fede27b
  Stored in directory: /root/.cache/pip/wheels/c4/1a/79/20f55c470a50bb3702a8cb7c94d8ada15573538c7f4baebe2d
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


## Data Preprocessing

In [5]:
dataset = pd.read_csv("Market_Basket_Optimisation.csv", header = None) # header = None as there is no header name or column names
dataset.values

array([['shrimp', 'almonds', 'avocado', ..., 'frozen smoothie',
        'spinach', 'olive oil'],
       ['burgers', 'meatballs', 'eggs', ..., nan, nan, nan],
       ['chutney', nan, nan, ..., nan, nan, nan],
       ...,
       ['chicken', nan, nan, ..., nan, nan, nan],
       ['escalope', 'green tea', nan, ..., nan, nan, nan],
       ['eggs', 'frozen smoothie', 'yogurt cake', ..., nan, nan, nan]],
      dtype=object)

In [6]:
# apyroi is expect certain data format (no panda data format)
# recreate the dataset to list
transactions = [] # creating the transactions list
for i in range(0, 7501): # for all data
  transactions.append([str(dataset.values[i, j]) for j in range(0, 20)])
transactions[:2]

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers',
  'meatballs',
  'eggs',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan']]

## Training the Apriori model on the dataset

In [7]:
from apyori import apriori

# only consider transaction that appear at least 3 times a day
# perweek  3 * 7 = 21
# support = no of times product appear in transcation / total transaction
# min_support = 21 ( minimum 3 times a day or 21 times a week) / 7052 (total transactions) = 0.003
# for min_confidence, start high like 0.8 (if few rules) decrease it
# for this condition 0.2 min_confidence is best.
# min_lift = 3 is default and good 3 is a good value for min_lift
rules = apriori( # return final rules.
    transactions = transactions,
    min_support = 0.003,
    min_confidence = 0.2, min_lift=3,
    # Most important for a business and anlaysis point of view below comment
    # Here task is buy one product and get another product deal for a local shop so here rule must have only two product
    min_length=2,
    max_length=3,
    # imagine situation of buy 2 product and get one free then, min_length=3 and max_length = 3
    # situation for flexible with multiple option like combination of  buy 1 product get 1, buy 2 get 2 , buy 10 get  1 then min_length =2 , max_length =11
    )

## Visualising the results

### Displaying the first results coming directly from the output of the apriori function

In [8]:
results = list(rules)
results
#{{if buy, then also buy}} , this is the order of each rules
# confidence = % also chance of buying
# lift > 3 is only here as per our hyperparameter we provided before
# looks like they arenot sorted by lift

[RelationRecord(items=frozenset({'light cream', 'chicken'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)]),
 RelationRecord(items=frozenset({'escalope', 'mushroom cream sauce'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'mushroom cream sauce'}), items_add=frozenset({'escalope'}), confidence=0.3006993006993007, lift=3.790832696715049)]),
 RelationRecord(items=frozenset({'escalope', 'pasta'}), support=0.005865884548726837, ordered_statistics=[OrderedStatistic(items_base=frozenset({'pasta'}), items_add=frozenset({'escalope'}), confidence=0.3728813559322034, lift=4.700811850163794)]),
 RelationRecord(items=frozenset({'fromage blanc', 'honey'}), support=0.003332888948140248, ordered_statistics=[OrderedStatistic(items_base=frozenset({'fromage blanc'}), items_add=frozenset({'honey'}), confidence=0

### Putting the results well organised into a Pandas DataFrame

In [22]:
results[1][2][0][0], results[1][2][0][1]

(frozenset({'mushroom cream sauce'}), frozenset({'escalope'}))

In [24]:
tuple(results[1][2][0][1])

('escalope',)

In [25]:
tuple(results[1][2][0][1])[0]

'escalope'

In [44]:
def inspect (results): # very specific to this case, so donot try to over analyze this function
  lhs =         [tuple(result[2][0][0])[0] for result in results]
  rhs =         [tuple(result[2][0][1])[0] for result in results]
  supports =    [result[1] for result in results]
  confidence =  [result[2][0][2] for result in results]
  lifts =       [result[2][0][3] for result in results]
  return list ( zip(lhs, rhs, supports, confidence, lifts))

In [45]:
# for showing in panda dataframe
resultsinDataFrame = pd.DataFrame(inspect(results=results), columns = ['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])

### Displaying the results non sorted

In [46]:
resultsinDataFrame

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
0,light cream,chicken,0.004533,0.290598,4.843951
1,mushroom cream sauce,escalope,0.005733,0.300699,3.790833
2,pasta,escalope,0.005866,0.372881,4.700812
3,fromage blanc,honey,0.003333,0.245098,5.164271
4,herb & pepper,ground beef,0.015998,0.323450,3.291994
...,...,...,...,...,...
60,whole wheat pasta,olive oil,0.007999,0.271493,4.130772
61,pasta,,0.005066,0.322034,4.515096
62,pancakes,olive oil,0.005066,0.201058,3.052910
63,olive oil,spaghetti,0.004399,0.611111,3.509912


### Displaying the results sorted by descending lifts

In [47]:
resultsinDataFrame.nlargest(n = 10, columns='Lift') # n = 10, return only 10 best, columns = which columns to sort by

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
59,whole wheat pasta,olive oil,0.003866,0.402778,6.115863
50,tomato sauce,ground beef,0.003066,0.216981,5.535971
28,fromage blanc,honey,0.003333,0.245098,5.178818
3,fromage blanc,honey,0.003333,0.245098,5.164271
0,light cream,chicken,0.004533,0.290598,4.843951
16,light cream,,0.004533,0.290598,4.843951
2,pasta,escalope,0.005866,0.372881,4.700812
26,pasta,escalope,0.005866,0.372881,4.700812
27,ground beef,herb & pepper,0.0032,0.230769,4.665768
61,pasta,,0.005066,0.322034,4.515096
