**Project 02**

Project Title:
Discovering Edibility Patterns in Mushrooms using Association Rule Mining

In [4]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
df = pd.read_csv('/content/drive/MyDrive/Data Mining/Project 02/mushroom_cleaned.csv')
df

Unnamed: 0,class,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8122,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


**Objectives:**
1. Preprocess the mushroom dataset for association rule mining.
2. Use the Apriori algorithm to mine frequent itemsets. Set support = 0.3.
3. Generate the top 10 association rules based on confidence and lift. Set confidence = 0.7.
4. Select any one rule and explain its meaning in the report.

In [7]:
import pandas as pd

df_encoded = pd.get_dummies(df, columns=df.columns)
df_encoded

Unnamed: 0,class_0,class_1,cap-shape_c_0,cap-shape_c_1,cap-shape_f_0,cap-shape_f_1,cap-shape_k_0,cap-shape_k_1,cap-shape_s_0,cap-shape_s_1,...,habitat_l_0,habitat_l_1,habitat_m_0,habitat_m_1,habitat_p_0,habitat_p_1,habitat_u_0,habitat_u_1,habitat_w_0,habitat_w_1
0,False,True,False,True,False,True,False,True,False,True,...,False,True,False,True,False,True,False,True,False,True
1,True,False,True,False,True,False,True,False,True,False,...,True,False,True,False,True,False,True,False,True,False
2,True,False,True,False,True,False,True,False,True,False,...,True,False,True,False,True,False,True,False,True,False
3,False,True,False,True,False,True,False,True,False,True,...,False,True,False,True,False,True,False,True,False,True
4,True,False,True,False,True,False,True,False,True,False,...,True,False,True,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,True,False,True,False,True,False,True,False,True,False,...,True,False,True,False,True,False,True,False,True,False
8120,True,False,True,False,True,False,True,False,True,False,...,True,False,True,False,True,False,True,False,True,False
8121,True,False,True,False,True,False,True,False,True,False,...,True,False,True,False,True,False,True,False,True,False
8122,False,True,False,True,False,True,False,True,False,True,...,False,True,False,True,False,True,False,True,False,True


In [42]:
import gc

columns_to_use = ['class', 'cap-shape_c', 'gill-color_e', 'spore-print-color_o', 'habitat_w']
df_small = df[columns_to_use]

df_encoded = pd.get_dummies(df_small)

frequent_itemsets = apriori(df_encoded, min_support=0.3, use_colnames=True) #Apriori (support = 0.3)


rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7) #Generate rules (confidence = 0.7)

top_10_rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False]).head(10)

print(top_10_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

del df_encoded, frequent_itemsets, rules
gc.collect() #cleaned memory because of pc crashed everytime I loaded dummies

             antecedents            consequents   support  confidence  \
0                (class)          (cap-shape_c)  0.482029         1.0   
1          (cap-shape_c)                (class)  0.482029         1.0   
2         (gill-color_e)                (class)  0.482029         1.0   
3                (class)         (gill-color_e)  0.482029         1.0   
4  (spore-print-color_o)                (class)  0.482029         1.0   
5                (class)  (spore-print-color_o)  0.482029         1.0   
6                (class)            (habitat_w)  0.482029         1.0   
7            (habitat_w)                (class)  0.482029         1.0   
8         (gill-color_e)          (cap-shape_c)  0.482029         1.0   
9          (cap-shape_c)         (gill-color_e)  0.482029         1.0   

       lift  
0  2.074566  
1  2.074566  
2  2.074566  
3  2.074566  
4  2.074566  
5  2.074566  
6  2.074566  
7  2.074566  
8  2.074566  
9  2.074566  


75

In [49]:
import gc

[
'class', 'cap-shape_c', 'cap-shape_f', 'cap-shape_k', 'cap-shape_s', 'cap-shape_x', 'cap-surface_g', 'cap-surface_s', 'cap-surface_y', 'cap-color_c', 'cap-color_e', 'cap-color_g', 'cap-color_n', 'cap-color_p', 'cap-color_r', 'cap-color_u', 'cap-color_w', 'cap-color_y', 'bruises_t', 'odor_c', 'odor_f', 'odor_l', 'odor_m', 'odor_n', 'odor_p', 'odor_s', 'odor_y', 'gill-attachment_f', 'gill-spacing_w', 'gill-size_n', 'gill-color_e', 'gill-color_g', 'gill-color_h', 'gill-color_k', 'gill-color_n', 'gill-color_o', 'gill-color_p', 'gill-color_r', 'gill-color_u', 'gill-color_w', 'gill-color_y', 'stalk-shape_t', 'stalk-root_b', 'stalk-root_c', 'stalk-root_e', 'stalk-root_r', 'stalk-surface-above-ring_k', 'stalk-surface-above-ring_s', 'stalk-surface-above-ring_y', 'stalk-surface-below-ring_k', 'stalk-surface-below-ring_s', 'stalk-surface-below-ring_y', 'stalk-color-above-ring_c', 'stalk-color-above-ring_e', 'stalk-color-above-ring_g', 'stalk-color-above-ring_n', 'stalk-color-above-ring_o', 'stalk-color-above-ring_p', 'stalk-color-above-ring_w', 'stalk-color-above-ring_y', 'stalk-color-below-ring_c', 'stalk-color-below-ring_e', 'stalk-color-below-ring_g', 'stalk-color-below-ring_n', 'stalk-color-below-ring_o', 'stalk-color-below-ring_p', 'stalk-color-below-ring_w', 'stalk-color-below-ring_y', 'ring-number_o', 'ring-number_t', 'ring-type_f', 'ring-type_l', 'ring-type_n', 'ring-type_p', 'spore-print-color_h', 'spore-print-color_k', 'spore-print-color_n', 'spore-print-color_o', 'spore-print-color_r', 'spore-print-color_u', 'spore-print-color_w', 'spore-print-color_y', 'population_c', 'population_n', 'population_s', 'population_v', 'population_y', 'habitat_g', 'habitat_l', 'habitat_m', 'habitat_p', 'habitat_u', 'habitat_w'
]
df_small = df[columns_to_use]

df_encoded = pd.get_dummies(df_small)

frequent_itemsets = apriori(df_encoded, min_support=0.3, use_colnames=True) #Apriori (support = 0.3)


rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7) #Generate rules (confidence = 0.7)

top_10_rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False]).head(10)

print(top_10_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

del df_encoded, frequent_itemsets, rules
gc.collect() #cleaned memory because of pc crashed everytime I loaded dummies

             antecedents            consequents   support  confidence  \
0                (class)          (cap-shape_c)  0.482029         1.0   
1          (cap-shape_c)                (class)  0.482029         1.0   
2         (gill-color_e)                (class)  0.482029         1.0   
3                (class)         (gill-color_e)  0.482029         1.0   
4  (spore-print-color_o)                (class)  0.482029         1.0   
5                (class)  (spore-print-color_o)  0.482029         1.0   
6                (class)            (habitat_w)  0.482029         1.0   
7            (habitat_w)                (class)  0.482029         1.0   
8         (gill-color_e)          (cap-shape_c)  0.482029         1.0   
9          (cap-shape_c)         (gill-color_e)  0.482029         1.0   

       lift  
0  2.074566  
1  2.074566  
2  2.074566  
3  2.074566  
4  2.074566  
5  2.074566  
6  2.074566  
7  2.074566  
8  2.074566  
9  2.074566  


43

In [14]:
columns_to_use = ['class', 'cap-shape_c', 'gill-color_e', 'spore-print-color_o', 'habitat_w']
df = df[columns_to_use]
df_encoded = pd.get_dummies(df)

In [15]:
frequent_itemsets = apriori(df_encoded, min_support=0.5, use_colnames=True)

In [38]:
import gc

for var in ['df_encoded', 'frequent_itemsets', 'rules']:
    if var in locals():
        del globals()[var]
print(top_10_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
rd = gc.collect()
print(f"gc: {rd}")

             antecedents            consequents   support  confidence  \
0                (class)          (cap-shape_c)  0.482029         1.0   
1          (cap-shape_c)                (class)  0.482029         1.0   
2         (gill-color_e)                (class)  0.482029         1.0   
3                (class)         (gill-color_e)  0.482029         1.0   
4  (spore-print-color_o)                (class)  0.482029         1.0   
5                (class)  (spore-print-color_o)  0.482029         1.0   
6                (class)            (habitat_w)  0.482029         1.0   
7            (habitat_w)                (class)  0.482029         1.0   
8         (gill-color_e)          (cap-shape_c)  0.482029         1.0   
9          (cap-shape_c)         (gill-color_e)  0.482029         1.0   

       lift  
0  2.074566  
1  2.074566  
2  2.074566  
3  2.074566  
4  2.074566  
5  2.074566  
6  2.074566  
7  2.074566  
8  2.074566  
9  2.074566  
gc: 18
