# Data Preprocessing:

In [1]:
# Installing MLxtend library

!pip install mlxtend


Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Importing required libraries

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [3]:
xls = pd.ExcelFile(r"C:\\Users\\sahil\\Desktop\\excelr data science\\Assignments\\Association Rules\\Online_retail.xlsx")
print(xls.sheet_names)

['Sheet1']


In [4]:
# Reading the dataset

df = pd.read_excel(r"C:\\Users\\sahil\\Desktop\\excelr data science\\Assignments\\Association Rules\\Online_retail.xlsx", sheet_name ='Sheet1')

In [5]:
df.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [6]:
df.tail()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"
7499,"eggs,frozen smoothie,yogurt cake,low fat yogurt"


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [8]:
# Checking presence of null values

df.isnull().sum()

shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil    0
dtype: int64

In [9]:
# Checking for duplicate values

df.duplicated().sum()

2325

In [10]:
df[df.duplicated()]


Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
33,cookies
41,spaghetti
59,spaghetti
63,"turkey,eggs"
64,french fries
...,...
7490,herb & pepper
7491,"chocolate,escalope"
7494,"pancakes,light mayo"
7497,chicken


In [11]:
# Droping duplicate values

df.drop_duplicates()


Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7492,"burgers,salmon,pancakes,french fries,frozen sm..."
7493,"turkey,burgers,dessert wine,shrimp,pasta,tomat..."
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."


# Association Rule Mining:

In [12]:
data = [
    "shrimp,almonds,avocado,vegetables mix,green grapes,whole wheat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxidant juice,frozen smoothie,spinach,olive oil",
    "burgers,meatballs,eggs",
    "chutney",
    "turkey,avocado",
    "mineral water,milk,energy bar,whole wheat rice",
    "low fat yogurt"
]

In [13]:
# Convert the string data to a list of transactions
transactions = [transaction.split(',') for transaction in data]


In [14]:
# Convert data into the required format using TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(data).transform(data)



In [20]:
df_trans = pd.DataFrame(te_ary, columns=te.columns_)


In [21]:
df_trans.head()

Unnamed: 0,Unnamed: 1,",",a,b,c,d,e,f,g,h,...,p,r,s,t,u,v,w,x,y,z
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,False,True,True,True,False,False,True,False,True,False,...,False,True,True,True,True,False,False,False,False,False
2,False,False,False,False,True,False,True,False,False,True,...,False,False,False,True,True,False,False,False,True,False
3,False,True,True,False,True,True,True,False,False,False,...,False,True,False,True,True,True,False,False,True,False
4,True,True,True,True,True,False,True,False,True,True,...,False,True,False,True,False,False,True,False,True,False


In [15]:
# Applying Apriori to find frequent itemsets

frequent_itemsets = apriori(df_trans, min_support=0.3, use_colnames=True, max_len=3)


# View the frequent itemsets
print(frequent_itemsets)



       support   itemsets
0     0.500000        ( )
1     0.666667        (,)
2     0.833333        (a)
3     0.500000        (b)
4     0.666667        (c)
...        ...        ...
1325  0.666667  (u, t, y)
1326  0.333333  (t, v, y)
1327  0.500000  (t, w, y)
1328  0.333333  (u, v, y)
1329  0.333333  (u, w, y)

[1330 rows x 2 columns]


In [16]:
# Applying association rules

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)


In [17]:
# Display the rules

print(rules)

     antecedents consequents  antecedent support  consequent support  \
0            ( )         (a)            0.500000            0.833333   
1            (a)         ( )            0.833333            0.500000   
2            ( )         (b)            0.500000            0.500000   
3            (b)         ( )            0.500000            0.500000   
4            ( )         (f)            0.500000            0.333333   
...          ...         ...                 ...                 ...   
5221         (u)      (y, v)            0.833333            0.333333   
5222         (v)      (u, y)            0.333333            0.666667   
5223         (y)      (u, v)            0.833333            0.333333   
5224      (u, w)         (y)            0.333333            0.833333   
5225         (y)      (u, w)            0.833333            0.333333   

       support  confidence      lift  leverage  conviction  zhangs_metric  
0     0.500000    1.000000  1.200000  0.083333         inf 

In [22]:
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

     antecedents consequents   support  confidence      lift
0            ( )         (a)  0.500000    1.000000  1.200000
1            (a)         ( )  0.500000    0.600000  1.200000
2            ( )         (b)  0.333333    0.666667  1.333333
3            (b)         ( )  0.333333    0.666667  1.333333
4            ( )         (f)  0.333333    0.666667  2.000000
...          ...         ...       ...         ...       ...
5221         (u)      (y, v)  0.333333    0.400000  1.200000
5222         (v)      (u, y)  0.333333    1.000000  1.500000
5223         (y)      (u, v)  0.333333    0.400000  1.200000
5224      (u, w)         (y)  0.333333    1.000000  1.200000
5225         (y)      (u, w)  0.333333    0.400000  1.200000

[5226 rows x 5 columns]


# Analysis and Interpretation:

Support Insights:

High support values indicate products that are commonly purchased together. For example, if you find that "avocado" and "shrimp" have high support, it suggests that customers frequently buy these two items together.
Confidence Insights:

High confidence values indicate strong relationships. If a rule states that "if a customer buys avocado, they are likely to buy shrimp" with a confidence of 70%, it suggests that purchasing avocado significantly leads to purchasing shrimp. This could inform cross-promotional strategies.
Lift Insights:

A high lift value indicates a strong correlation. For instance, if the rule {"avocado"} -> {"shrimp"} has a lift of 2.5, it means that customers are 2.5 times more likely to buy shrimp when they purchase avocado than if they were buying without the presence of avocado. This suggests a strong affinity between these two products.
Insights into Customer Purchasing Behavior
Complementary Products: If certain products are frequently bought together, consider bundling them in promotions or cross-promotions. For instance, if salmon and green tea often appear together, consider offering discounts when customers purchase both.

Customer Preferences: Understanding that customers who buy low fat yogurt also often buy energy drinks may indicate a trend towards health-conscious consumers looking for convenient, nutritious options.

Targeted Marketing: If a specific rule shows that "if customers buy mineral water, they are likely to buy yams," you can tailor marketing campaigns to target this customer segment with recipes or health benefits associated with both products.

Seasonal Trends: Monitor how purchasing patterns change over time. If certain combinations of products spike during specific seasons (e.g., smoothies in summer), you can adjust inventory and marketing strategies accordingly.