## Market Basket Analysis 

### Packages Importing

In [1]:
# External package that must be installed before running this notebook:
# !pip install apyori

In [2]:
import pandas as pd
from apyori import apriori

### Data Loading 

In [3]:
# The CSV has no header row (every line is one basket), so header=None makes
# pandas number the columns instead of consuming the first basket as names.
df = pd.read_csv(
    filepath_or_buffer="E:/Data Analytics Projects/Grocery Bill Analysis/data/Market_Basket_Optimisation.csv",
    header=None,
)

In [4]:
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


### Data Cleaning 

In [5]:
# Baskets shorter than the widest one leave empty cells; mark them with the
# placeholder 0, which the transaction-building step filters out again.
df.fillna(value=0, inplace=True)

### Data Pre-processing 

In [None]:
# apyori expects the data as a list of transactions, each a list of item names,
# e.g. [['apple', 'almonds'], ['apple'], ['banana', 'apple']].
#
# Each row of df is one basket. Rows shorter than the widest basket are padded:
# with NaN straight after loading, or with 0 once the cleaning step has run.
# Both kinds of padding are dropped here so neither shows up as an item.
# Iterating over the rows directly (instead of indexing a fixed 20 columns)
# keeps this correct for files with a different number of columns.
transactions = [
    [str(item) for item in basket if pd.notna(item) and str(item) != '0']
    for basket in df.values
]

In [None]:
## Sanity check: show the first transaction (index 0).
transactions[0]

In [None]:
## Sanity check: show the second transaction (index 1).
transactions[1]

### Apriori Model 

In [None]:
# Mine association rules with apyori. apriori() reads its thresholds from
# keyword arguments by name, so a misspelled keyword (e.g. "min_confidance")
# is silently ignored and the library default is used instead.
#   min_support=0.003   -> keep itemsets found in at least 0.3% of transactions
#   min_confidence=0.2  -> keep rules with a confidence of at least 20%
#   min_lift=3          -> keep rules with a lift of at least 3
# No min_length is passed: the options apyori recognises are min_support,
# min_confidence, min_lift and max_length.
# NOTE(review): a stricter filter can change how many rules come back and how
# many items sit on each side of a rule; re-check the later cells that assume
# a fixed column layout after re-running.
rules = apriori(transactions, min_support=0.003, min_confidence=0.2, min_lift=3)

In [None]:
# apriori() returns a generator, so evaluating it here only shows the
# generator object, not the rules themselves.
rules

In [None]:
# Materialise the generator into a list so the rules can be inspected and
# reused (a generator can only be consumed once).
Results = list(rules)
Results

In [None]:
# Load the rule records into a DataFrame for further processing.
df_results = pd.DataFrame(Results)

In [None]:
# The 'ordered_statistics' column holds a list in every row, so it has to be
# unpacked into plain columns before it can be used.
df_results.head()

In [None]:
# Keep the support column on its own so it can be joined back in later.
support = df_results['support']

In [None]:
# Unpack 'ordered_statistics' into four parallel lists.
# A rule record can carry several ordered statistics (the same itemset split
# into lhs => rhs in different ways); for convenience only the first one,
# df_results['ordered_statistics'][i][0], is kept.

# The column is only touched when there are rows, mirroring a row-by-row loop
# that simply does nothing on an empty frame.
stat_column = df_results['ordered_statistics'] if df_results.shape[0] else ()
first_rules = [stats[0] for stats in stat_column]

# Positions 0 and 1 are frozensets (lhs and rhs items) and are converted to
# lists; positions 2 and 3 are the confidence and lift values.
first_values = [list(rule[0]) for rule in first_rules]
second_values = [list(rule[1]) for rule in first_rules]
third_values = [rule[2] for rule in first_rules]
fourth_value = [rule[3] for rule in first_rules]

In [None]:
# Wrap each list in its own DataFrame so they can be joined column-wise.
# lhs and rhs get one column per item position.
lhs = pd.DataFrame(data=first_values)
rhs = pd.DataFrame(data=second_values)
confidence = pd.DataFrame(data=third_values, columns=['Confidence'])
lift = pd.DataFrame(data=fourth_value, columns=['lift'])

In [None]:
# Place lhs, rhs, support, confidence and lift side by side, one row per rule.
rule_parts = [lhs, rhs, support, confidence, lift]
df_final = pd.concat(rule_parts, axis="columns")
df_final

In [None]:
# Rules have different numbers of items on each side (one item in some rows,
# three or more in others), so the unused item columns are empty. Replace the
# gaps with a single space so the item columns can be joined as text later,
# e.g. coffee, None, None becomes coffee, ' ', ' '.
df_final.fillna(' ', inplace=True)
df_final.head()

In [None]:
# Name the columns by position: 'lhs' and 1 are the left-hand-side items,
# 'rhs', 2 and 3 the right-hand-side items, followed by the three metrics.
# NOTE(review): this assumes exactly eight columns (2 lhs + 3 rhs + 3 metrics);
# a different rule set can produce a different width and make this fail.
column_names = ['lhs', 1, 'rhs', 2, 3, 'support', 'confidence', 'lift']
df_final.columns = column_names
df_final.head()

In [None]:
# Fold the helper item columns into 'lhs' and 'rhs' as comma-separated text.
separator = ", "
df_final['lhs'] = df_final['lhs'] + separator + df_final[1]
df_final['rhs'] = df_final['rhs'] + separator + df_final[2] + separator + df_final[3]

In [None]:
# Check the combined 'lhs' and 'rhs' text.
df_final.head()

In [None]:
# Columns 1, 2 and 3 are now folded into 'lhs' / 'rhs', so remove them.
df_final.drop([1, 2, 3], axis=1, inplace=True)

In [None]:
# This is the final rule table; it can be sorted by support, confidence or lift.
df_final.head()

In [None]:
# The ten rules with the highest lift.
df_final.sort_values('lift', ascending=False).head(10)

In [None]:
## Top 10 rules for each metric, sorted in descending order.
TOP_N = 10
df_lift = df_final.sort_values(by='lift', ascending=False).head(TOP_N)
df_support = df_final.sort_values(by='support', ascending=False).head(TOP_N)
df_confidence = df_final.sort_values(by='confidence', ascending=False).head(TOP_N)

In [None]:
# The ten rules with the lowest lift (tail of the descending sort).
df_final.sort_values('lift', ascending=False).tail(10)

In [None]:
import matplotlib.pyplot as plt


def _rule_labels(frame):
    """Return one 'lhs -> rhs' label per rule in *frame*.

    The 'lhs' and 'rhs' columns hold comma-separated item text that can end in
    blank padding (", " followed by spaces). The padding is stripped so each
    label shows only real items, and an explicit arrow keeps the two sides of
    the rule from running together.
    """
    antecedent = frame['lhs'].astype(str).str.strip(' ,')
    consequent = frame['rhs'].astype(str).str.strip(' ,')
    return antecedent + ' -> ' + consequent


def _plot_top_rules(frame, metric, axis_label, color):
    """Draw a horizontal bar chart of *metric* for the rules in *frame*.

    frame      -- rule table with 'lhs', 'rhs' and the *metric* column,
                  already sorted so the best rule comes first
    metric     -- name of the column to plot ('lift', 'support', 'confidence')
    axis_label -- human-readable metric name used for the x-axis and the title
    color      -- bar colour passed to matplotlib
    """
    plt.figure(figsize=(10, 6))
    plt.barh(_rule_labels(frame), frame[metric], color=color)
    plt.xlabel(axis_label)
    plt.ylabel('Itemsets')
    plt.title('Top 10 Itemsets Based on ' + axis_label)
    plt.gca().invert_yaxis()  # Invert y-axis so the highest value is on top
    plt.show()


# One chart per metric, each using the matching top-10 table.
_plot_top_rules(df_lift, 'lift', 'Lift', 'skyblue')
_plot_top_rules(df_support, 'support', 'Support', 'lightgreen')
_plot_top_rules(df_confidence, 'confidence', 'Confidence', 'salmon')