# Example 1

In [1]:
## Importing libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

In [2]:
## Creating a Dataset 
dataset = [['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
 ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
 ['b', 'f', 'h', 'j', 'o'],
 ['b', 'c', 'k', 's', 'p'],
 ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']]

In [3]:
## We use the TransactionEncoder Object to transform the dataset 
## into an array format encoded in NumPy boolean array
te = TransactionEncoder()

## Now using the fit method we transform it into a boolean array
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)



In [4]:
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,s
0,True,False,True,True,False,True,True,False,True,False,False,False,True,False,False,True,False
1,True,True,True,False,False,True,False,False,False,False,False,True,True,False,True,False,False
2,False,True,False,False,False,True,False,True,False,True,False,False,False,False,True,False,False
3,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,True,True
4,True,False,True,False,True,True,False,False,False,False,False,True,True,True,False,True,False


In [6]:
from mlxtend.frequent_patterns import apriori

In [7]:
## The apriori algorithm defines frequent items
## Here the min_support is set to 0.5 so it will 
## extract items that occurs at least 0.5 times in the dataset
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
## We add a length column to frequent_itemsets 
## We apply a lambda function to make x equals len(x)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x : len(x))

In [8]:
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.6,(a),1
1,0.6,(b),1
2,0.8,(c),1
3,0.8,(f),1
4,0.6,(m),1
5,0.6,(p),1
6,0.6,"(a, c)",2
7,0.6,"(a, f)",2
8,0.6,"(a, m)",2
9,0.6,"(f, c)",2


In [12]:
frequent_itemsets[(frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.6)]
## When we use the first constraints line it will takes the items
## That contains f and m in their itemsets column
frequent_itemsets[frequent_itemsets['itemsets'] == {'f','m'}]

## When we use the second constraints line it will takes the items
## That contains f and m in their itemsets column
## and the pat is a literal string
frequent_itemsets[frequent_itemsets['itemsets'].str.contains('m', regex = False)]

Unnamed: 0,support,itemsets,length
4,0.6,(m),1
8,0.6,"(a, m)",2
10,0.6,"(m, c)",2
12,0.6,"(f, m)",2
14,0.6,"(a, m, c)",3
15,0.6,"(a, f, m)",3
16,0.6,"(f, m, c)",3
17,0.6,"(a, f, m, c)",4


## Example 2 

In [13]:
from mlxtend.frequent_patterns import association_rules

In [18]:
jogging_data = pd.read_csv('JoggingTitre.csv', sep=',')
## We use the getDummies to convert the categorical variables into indicator variables
x_dum = pd.get_dummies(jogging_data)
x_dum

Unnamed: 0,Perspective_Couvert,Perspective_Pluie,Perspective_Soleil,Temps_Bon,Temps_Chaud,Temps_Frais,Humidité_Haute,Humidité_Normale,Vent_Doux,Vent_Fort,Jogging_Non,Jogging_Oui
0,0,0,1,0,1,0,1,0,1,0,1,0
1,0,0,1,0,1,0,1,0,0,1,1,0
2,1,0,0,0,1,0,1,0,1,0,0,1
3,0,1,0,1,0,0,1,0,1,0,0,1
4,0,1,0,0,0,1,0,1,1,0,0,1
5,0,1,0,0,0,1,0,1,0,1,1,0
6,1,0,0,0,0,1,0,1,0,1,0,1
7,0,0,1,1,0,0,1,0,1,0,1,0
8,0,0,1,0,0,1,0,1,1,0,0,1
9,0,1,0,1,0,0,0,1,1,0,0,1


In [19]:
## It will give us the colnames that have an occurence >= 20% 
frequent_itemsets02 = apriori(x_dum, min_support=0.2, use_colnames=True)
frequent_itemsets02

Unnamed: 0,support,itemsets
0,0.285714,(Perspective_Couvert)
1,0.357143,(Perspective_Pluie)
2,0.357143,(Perspective_Soleil)
3,0.428571,(Temps_Bon)
4,0.285714,(Temps_Chaud)
5,0.285714,(Temps_Frais)
6,0.5,(Humidité_Haute)
7,0.5,(Humidité_Normale)
8,0.571429,(Vent_Doux)
9,0.428571,(Vent_Fort)


In [20]:
## It gives the rules that can occure with level of confidence that is above 80%
ARules02 = association_rules(frequent_itemsets02, metric="confidence", min_threshold=0.8)
ARules02

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Perspective_Couvert),(Jogging_Oui),0.285714,0.642857,0.285714,1.0,1.555556,0.102041,inf
1,(Temps_Frais),(Humidité_Normale),0.285714,0.5,0.285714,1.0,2.0,0.142857,inf
2,(Humidité_Normale),(Jogging_Oui),0.5,0.642857,0.428571,0.857143,1.333333,0.107143,2.5
3,"(Perspective_Pluie, Vent_Doux)",(Jogging_Oui),0.214286,0.642857,0.214286,1.0,1.555556,0.076531,inf
4,"(Perspective_Pluie, Jogging_Oui)",(Vent_Doux),0.214286,0.571429,0.214286,1.0,1.75,0.091837,inf
5,"(Perspective_Soleil, Humidité_Haute)",(Jogging_Non),0.214286,0.357143,0.214286,1.0,2.8,0.137755,inf
6,"(Perspective_Soleil, Jogging_Non)",(Humidité_Haute),0.214286,0.5,0.214286,1.0,2.0,0.107143,inf
7,"(Temps_Frais, Jogging_Oui)",(Humidité_Normale),0.214286,0.5,0.214286,1.0,2.0,0.107143,inf
8,"(Humidité_Normale, Vent_Doux)",(Jogging_Oui),0.285714,0.642857,0.285714,1.0,1.555556,0.102041,inf


In [23]:
## Here we select the columns we want to see
ARules02.iloc[0:,0:4]
ARules02.iloc[0:,[0,1,4,5]]


Unnamed: 0,antecedents,consequents,support,confidence
0,(Perspective_Couvert),(Jogging_Oui),0.285714,1.0
1,(Temps_Frais),(Humidité_Normale),0.285714,1.0
2,(Humidité_Normale),(Jogging_Oui),0.428571,0.857143
3,"(Perspective_Pluie, Vent_Doux)",(Jogging_Oui),0.214286,1.0
4,"(Perspective_Pluie, Jogging_Oui)",(Vent_Doux),0.214286,1.0
5,"(Perspective_Soleil, Humidité_Haute)",(Jogging_Non),0.214286,1.0
6,"(Perspective_Soleil, Jogging_Non)",(Humidité_Haute),0.214286,1.0
7,"(Temps_Frais, Jogging_Oui)",(Humidité_Normale),0.214286,1.0
8,"(Humidité_Normale, Vent_Doux)",(Jogging_Oui),0.285714,1.0


In [24]:
## We fix the consequents to see the antecedents thats suits with
ARules02[(ARules02['consequents'] == {'Jogging_Oui'})|(ARules02['consequents'] == {'Jogging_Non'})]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Perspective_Couvert),(Jogging_Oui),0.285714,0.642857,0.285714,1.0,1.555556,0.102041,inf
2,(Humidité_Normale),(Jogging_Oui),0.5,0.642857,0.428571,0.857143,1.333333,0.107143,2.5
3,"(Perspective_Pluie, Vent_Doux)",(Jogging_Oui),0.214286,0.642857,0.214286,1.0,1.555556,0.076531,inf
5,"(Perspective_Soleil, Humidité_Haute)",(Jogging_Non),0.214286,0.357143,0.214286,1.0,2.8,0.137755,inf
8,"(Humidité_Normale, Vent_Doux)",(Jogging_Oui),0.285714,0.642857,0.285714,1.0,1.555556,0.102041,inf


## Part 2

### Exercice 1 

In [37]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

In [39]:
def size(obj):
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

In [40]:
orders = pd.read_csv('../input/order_products__prior.csv')
print('orders -- dimensions: {0};   size: {1}'.format(orders.shape, size(orders)))
display(orders.head())

FileNotFoundError: [Errno 2] No such file or directory: '../input/order_products__prior.csv'