In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from ast import literal_eval
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt             #visualisation
import seaborn as sns

# RGBA colour: (red, green, blue, alpha/transparency)
color = (0.2, 0.4, 0.2, 0.6)

import os

# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

ModuleNotFoundError: No module named 'mlxtend'

# Objectives
1. Which products are frequently bought together?
2. If a customer buys a product, which product are they likely to buy next?

In [None]:
# Read the groceries CSV into a DataFrame
csv_path = '/kaggle/input/groceries-dataset/Groceries_dataset.csv'
df = pd.read_csv(csv_path)

# Preview the first rows
df.head()

# Data Cleaning

In [None]:
# Build a renamed working frame; `rename` returns a new DataFrame, so the
# raw `df` keeps its original column names.
dc_grocery = df.rename(
    columns={"Member_number": "Member_Number", "itemDescription": "Item"}
)

# Number of missing values per column
dc_grocery.isna().sum()

In [None]:
# Remember the original size, de-duplicate once, then report both counts
rows_before = len(dc_grocery)
dc_grocery = dc_grocery.drop_duplicates()

print("Rows before remove duplicates " + str(rows_before))
print("Rows after remove duplicates " + str(len(dc_grocery)))

In [None]:
# Object Types
# Print the summary produced by DataFrame.info() for the cleaned frame
dc_grocery.info()

# EDA

In [None]:
def set_custom_palette(series, max_color = 'blue', other_color = 'lightgrey'):
    """Build a colour list that highlights the maximum value(s) of ``series``.

    Parameters
    ----------
    series : object exposing ``.max()`` and iteration (e.g. a pandas Series)
        Values to colour, in plotting order.
    max_color : str
        Colour assigned to every entry equal to the maximum.
    other_color : str
        Colour assigned to all remaining entries.

    Returns
    -------
    list
        One colour per entry of ``series``, in the same order.
    """
    peak = series.max()
    return [max_color if value == peak else other_color for value in series]

In [None]:
# Ten most frequent items and their counts (value_counts computed once)
top_items = dc_grocery.Item.value_counts().nlargest(10)
x = top_items.index.tolist()
y = top_items.tolist()

plt.rcParams['figure.figsize'] = (10, 5)

fig, ax = plt.subplots()
sns.barplot(x=x, y=y, ax=ax)

# Write the count above each bar and leave headroom for the labels
ax.bar_label(ax.containers[0], labels=y, padding=3)
ax.margins(y=0.1)

ax.set_title("Most items put in groceries")
ax.set_ylabel('Number of items')
ax.set_xlabel('Items')
plt.xticks(rotation='vertical')

fig.tight_layout()

In [None]:
# Share of all item rows accounted for by the five most frequent items
item_counts = dc_grocery.Item.value_counts()
top_5_sum = item_counts.nlargest(5).sum()
sum_all = item_counts.sum()
summ = top_5_sum / sum_all
summ

# Data Analysis

In [None]:
# Pivot to a member x item matrix of purchase counts: one row per
# Member_Number, one column per Item, 0 where the member never bought it.
basket = (
    dc_grocery
    .groupby(['Member_Number', 'Item'])['Date']
    .count()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('Member_Number')
)
basket.head()

In [None]:
def encode_units(x):
    """Binarise a purchase count.

    Parameters
    ----------
    x : int or float
        Number of times an item was bought.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0. A value that fails the
        ``>= 1`` comparison for any reason (including NaN) maps to 0, so the
        function never falls through and returns ``None``.
    """
    return 1 if x >= 1 else 0


# Convert purchase counts to 0/1 flags with a vectorised comparison.
# This replaces the element-wise DataFrame.applymap call, which is
# deprecated in recent pandas releases, and yields the same integer flags.
basket = basket.ge(1).astype(int)
basket.head(10)

In [None]:
#Run apriori algorithm
df_trans = pd.DataFrame(basket)

frequent_itemsets = apriori(df_trans, min_support=0.1, use_colnames=True).sort_values(by="support", ascending=False)
frequent_itemsets

In [None]:
# Derive association rules with lift >= 1, then keep those whose
# confidence exceeds 0.5
all_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules = all_rules[all_rules['confidence'] > 0.5]

# Show the strongest rules first
rules.sort_values('confidence', ascending=False).reset_index(drop=True)

In [None]:
def frequently_bought_together(item, top_n=None):
    """Return the consequents of the association rules mined for ``item``.

    The global ``basket`` matrix is restricted to the rows where ``item`` is
    flagged (== 1). Frequent itemsets are mined on that subset with apriori
    (``min_support=0.1``) and turned into association rules. Only rules with
    exactly one antecedent, exactly one consequent and lift > 1 are kept.

    Parameters
    ----------
    item : str
        Name of a column of ``basket``.
    top_n : int, optional
        Maximum number of consequents to return. ``None`` (the default)
        returns all of them.

    Returns
    -------
    pandas.Series
        Unique consequent itemsets, ordered by descending rule confidence.

    Raises
    ------
    KeyError
        Raised by the column lookup when ``item`` is not in ``basket``.
    """
    # Rows of the basket matrix in which the requested item is present
    item_df = basket.loc[basket[item] == 1]

    # Frequent itemsets within that subset
    frequent_itemsets = apriori(item_df, min_support=0.1, use_colnames=True)

    # Association rules plus the size of each side of the rule
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    rules["antecedent_len"] = rules["antecedents"].apply(len)
    rules["consequents_len"] = rules["consequents"].apply(len)

    one_to_one = rules[
        (rules['antecedent_len'] == 1)
        & (rules['consequents_len'] == 1)
        & (rules['lift'] > 1)
    ]

    # sort_values returns a new frame; the result must be kept, otherwise
    # the ordering is silently lost.
    ranked = one_to_one.sort_values('confidence', ascending=False).reset_index(drop=True)

    print('Items frequently bought together with {0}'.format(item))

    # drop_duplicates keeps the first (highest-confidence) occurrence
    bt = ranked['consequents'].drop_duplicates()

    if top_n is not None:
        bt = bt.head(top_n)
    return bt

In [None]:
# Example: run the helper for the 'yogurt' column of the basket matrix
frequently_bought_together('yogurt')