<a href="https://colab.research.google.com/github/SarathSabu/Python-Notebooks/blob/main/Association_Rule.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install apyori # to install apyori

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Location of the market-basket CSV on the mounted Drive
file_path = '/content/drive/MyDrive/Infor648/Data/Market_Basket.csv'

# header=None: every line is read as data and the columns are labelled 0, 1, 2, ...
df = pd.read_csv(filepath_or_buffer=file_path, header=None)

# Preview the first rows (this dataset shows 20 columns, labelled 0 - 19)
df.head()

# Data Pre-processing

In [None]:
## Data cleaning step

# Substitute 0 for every missing cell, then preview the cleaned frame.
df = df.fillna(0)
df.head()


In [None]:
# The mining step needs the data as a list of transactions, each a list of
# item names, e.g. transactions = [['apple', 'almonds'], ['apple'], ['banana', 'apple']]

# Convert the frame to an array once; calling df.values inside the loop would
# rebuild the whole array for every single cell.
basket_rows = df.to_numpy()

# Iterating over each row covers however many columns the frame has, so no
# hard-coded column range needs adjusting. Missing cells and the 0 placeholder
# used for empty cells are skipped; everything else is kept as a string.
transactions = [
    [str(cell) for cell in row if pd.notna(cell) and str(cell) != '0']
    for row in basket_rows
]

In [None]:
# Sanity check: show the first transaction (a list of item-name strings)
transactions[0]

# Check item frequency

In [None]:
# Support count = number of transactions that contain the item.
support_count = {}

for transaction in transactions:
    # dict.fromkeys de-duplicates while keeping order, so an item that appears
    # more than once in the same basket is counted once for that basket.
    for item in dict.fromkeys(transaction):
        support_count[item] = support_count.get(item, 0) + 1

df_support = pd.DataFrame(list(support_count.items()), columns=['Item', 'Support Count'])

# Total number of transactions (the denominator of the support ratio)
total_transactions = len(transactions)

# Support = share of all transactions that contain the item
df_support['Support'] = df_support['Support Count'] / total_transactions

# Sort by support count, highest first; the stable sort keeps tied items in
# first-seen order so the table is reproducible.
df_support = df_support.sort_values('Support Count', ascending=False, kind='stable').reset_index(drop=True)

# Display the top 20 items
df_support.head(20)




In [None]:
import plotly.graph_objects as go

# Top 20 rows of the support table (change 20 to show more or fewer items)
df_top_20 = df_support.head(20)

# One bar per item; bar height and the label above it are both the support count.
support_bars = go.Bar(
    x=df_top_20["Item"],
    y=df_top_20["Support Count"],
    hovertext=df_top_20["Item"],
    text=df_top_20["Support Count"],
    textposition="outside",
    marker=dict(
        color='MediumPurple',
        line=dict(color='MediumPurple', width=1.5),
    ),
    opacity=0.85,
)

fig = go.Figure(data=[support_bars])

# Figure size, theme, title, and rotated x-axis labels
fig.update_layout(
    title=dict(
        text="Top 20 Items by Frequency (Support Count)",
        font=dict(size=24),
    ),
    template="simple_white",
    width=800,
    height=600,
    xaxis=dict(tickangle=-45),
)

# Render the chart
fig.show()


# Check which items customers most often pick first

In [None]:
# First item of every transaction that has at least one item
first_choices = [basket[0] for basket in transactions if basket]

# Tally how often each item is the first pick
first_choice_count = {}
for item in first_choices:
    first_choice_count[item] = first_choice_count.get(item, 0) + 1

df_first_choice = pd.DataFrame(list(first_choice_count.items()), columns=['Item', 'First Choice Count'])

# Keep the 20 most frequent first picks (change 20 to show more or fewer)
df_top_20_first_choice = (
    df_first_choice
    .sort_values('First Choice Count', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

# One bar per item; bar height and the label above it are both the first-pick count.
first_choice_bars = go.Bar(
    x=df_top_20_first_choice["Item"],
    y=df_top_20_first_choice["First Choice Count"],
    hovertext=df_top_20_first_choice["Item"],
    text=df_top_20_first_choice["First Choice Count"],
    textposition="outside",
    marker=dict(
        color='LightSkyBlue',
        line=dict(color='LightSkyBlue', width=1.5),
    ),
    opacity=0.85,
)

fig = go.Figure(data=[first_choice_bars])

# Figure size, theme, title, and rotated x-axis labels
fig.update_layout(
    title=dict(
        text="Top 20 First Choice Items by Customers",
        font=dict(size=24),
    ),
    template="simple_white",
    width=800,
    height=600,
    xaxis=dict(tickangle=-45),
)

# Render the chart
fig.show()

# Association Rule Mining

In [None]:
# Parameters passed to apyori's apriori():
# transactions: Our main dataset. A list of transactions where each transaction is a list of items
# min_support: Minimum support threshold (0.01 means itemsets must appear in at least 1% of transactions)
# min_confidence: Minimum confidence threshold (0.1 means rules must have at least 10% confidence)
# min_lift: Minimum lift threshold (2 means the consequent is at least twice as likely to appear when the antecedent is present)
# max_length: Optional maximum itemset size (e.g. max_length = 3). It is not set here, so itemset size is not capped.
#
# NOTE: apyori has no `min_length` option. It reads only the keyword arguments
# it knows and ignores the rest, so the `min_length = 2` that used to be passed
# here had no effect and has been removed. Single-item results are still
# excluded, because a rule with an empty antecedent has lift 1, below min_lift.

from apyori import apriori
rules = apriori(transactions = transactions, min_support = 0.01, min_confidence = 0.1, min_lift = 2)

# apriori() returns a lazy generator; materialize it so later cells can iterate more than once
results = list(rules)

In [None]:
# Loop through the generated rules and print relevant details
for rule in results:
    # Display the items involved in the rule (antecedent → consequent)
    print(f"Rule: {rule.items}")

    # Display the support of the rule
    print(f"Support: {rule.support}")

    # Display the confidence of the rule (how likely the consequent is, given the antecedent)
    print(f"Confidence: {rule.ordered_statistics[0].confidence}")

    # Display the lift of the rule (how much more likely the consequent is, given the antecedent)
    print(f"Lift: {rule.ordered_statistics[0].lift}")

    # Print a separator for readability between rules
    print("=" * 30)

# Check our results

In [None]:
# Column buffers for the rule table, one entry per directed rule
antecedents = []
consequents = []
supports = []
confidences = []
lifts = []

# Flatten the results: each record (itemset) contributes one row per ordered statistic
for rule in results:
    for ordered_stat in rule.ordered_statistics:
        # items_base / items_add are frozensets, whose iteration order can change
        # between kernel sessions; sorting makes the labels reproducible.
        antecedents.append(', '.join(sorted(ordered_stat.items_base)))  # Antecedent (may be empty)
        consequents.append(', '.join(sorted(ordered_stat.items_add)))  # Consequent
        supports.append(rule.support)  # Support of the whole itemset
        confidences.append(ordered_stat.confidence)  # Confidence
        lifts.append(ordered_stat.lift)  # Lift

# Assemble the columns into a DataFrame for easier viewing
df_rules = pd.DataFrame({
    'Antecedent': antecedents,
    'Consequent': consequents,
    'Support': supports,
    'Confidence': confidences,
    'Lift': lifts
})

# Display the DataFrame
display(df_rules)

In [None]:
# Rank the rules from highest to lowest lift, renumbering the index from 0
df_rules_sorted_lift = df_rules.sort_values(by='Lift', ascending=False, ignore_index=True)

# Show the ranked table
display(df_rules_sorted_lift)


In [None]:
# Rank the rules from highest to lowest support, renumbering the index from 0
df_rules_sorted_support = df_rules.sort_values(by='Support', ascending=False, ignore_index=True)

# Show only the ten best-supported rules
display(df_rules_sorted_support.head(10))

In [None]:
# Bar chart of the lift of the top 20 rules (change 20 to show more or fewer).
# barmode='group' places rules that share an antecedent side by side; with the
# default stacked mode their bars pile up and the bar height shows a sum of
# lifts, which is not a meaningful quantity.
fig = px.bar(df_rules_sorted_lift.head(20),
             x='Antecedent',
             y='Lift',
             color='Consequent',
             barmode='group',
             title="Top 20 Association Rules by Lift")

# Display the bar chart
fig.show()

