In [15]:
import pandas as pd

# Step 1: Load the Dataset

In [26]:
# Load the dataset from the provided Excel file
file_path = 'D:\Assignments Data Science ExcelR\Association Rules\Association Rules\Online retail.xlsx'
df = pd.read_excel(file_path, sheet_name='Sheet1', header=None)


In [27]:
df

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [28]:
df.head()

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


# Step 2: Data Preprocessing

In [29]:
# Check for missing values
df.isnull().sum()

0    0
dtype: int64

In [31]:
# Check for duplicate rows
df.duplicated().sum()

2325

In [32]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)

In [33]:
# Confirm no duplicates
df.duplicated().sum()

0

In [34]:
# Create a list of lists from the dataframe
transactions = df[0].apply(lambda x: x.split(',')).tolist()

# Create a set of all unique items
items = sorted(set(item for sublist in transactions for item in sublist))

# Create an empty dataframe for one-hot encoding
one_hot = pd.DataFrame(0, index=range(len(transactions)), columns=items)

# Populate the dataframe with one-hot encoding
for i, transaction in enumerate(transactions):
    for item in transaction:
        one_hot.at[i, item] = 1

# Display the first few rows of the one-hot encoded dataframe
one_hot.head()


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# Step 3: Generating Association Rules

In [37]:
!pip install mlxtend



Collecting mlxtend
  Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
Collecting scikit-learn>=1.0.2
  Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl (9.3 MB)
Collecting joblib>=0.13.2
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn, mlxtend
  Attempting uninstall: joblib
    Found existing installation: joblib 1.0.1
    Uninstalling joblib-1.0.1:
      Successfully uninstalled joblib-1.0.1
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed joblib-1.4.2 mlxtend-0.23.1 scikit-learn-1.3.2


In [38]:
from mlxtend.frequent_patterns import apriori, association_rules

# Generate frequent itemsets
frequent_itemsets = apriori(one_hot, min_support=0.01, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the generated rules
rules.head()




Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(almonds),(mineral water),0.029366,0.299845,0.011012,0.375,1.250644,0.002207,1.120247,0.206476
1,(mineral water),(almonds),0.299845,0.029366,0.011012,0.036727,1.250644,0.002207,1.007641,0.28624
2,(chocolate),(avocado),0.205178,0.045981,0.01024,0.049906,1.085347,0.000805,1.004131,0.098935
3,(avocado),(chocolate),0.045981,0.205178,0.01024,0.222689,1.085347,0.000805,1.022528,0.082426
4,(french fries),(avocado),0.19262,0.045981,0.011592,0.060181,1.3088,0.002735,1.015108,0.292231


# Step 4: Analyzing the Rules

In [39]:
# Sort rules by lift
rules = rules.sort_values('lift', ascending=False)

# Display the top 10 rules
rules.head(10)


  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
425,(whole wheat pasta),(olive oil),0.040572,0.087713,0.011012,0.271429,3.094525,0.007454,1.252159,0.705471
424,(olive oil),(whole wheat pasta),0.087713,0.040572,0.011012,0.125551,3.094525,0.007454,1.09718,0.741925
796,(soup),"(milk, mineral water)",0.070904,0.067813,0.012365,0.174387,2.571586,0.007557,1.129085,0.657774
793,"(milk, mineral water)",(soup),0.067813,0.070904,0.012365,0.182336,2.571586,0.007557,1.136281,0.655593
298,(ground beef),(herb & pepper),0.135819,0.066461,0.022798,0.167852,2.525588,0.013771,1.121843,0.698989
299,(herb & pepper),(ground beef),0.066461,0.135819,0.022798,0.343023,2.525588,0.013771,1.31539,0.647056
736,"(mineral water, shrimp)",(frozen vegetables),0.033617,0.12983,0.010433,0.310345,2.390394,0.006068,1.261747,0.601893
737,(frozen vegetables),"(mineral water, shrimp)",0.12983,0.033617,0.010433,0.080357,2.390394,0.006068,1.050825,0.668443
718,"(spaghetti, frozen vegetables)",(ground beef),0.039026,0.135819,0.012558,0.321782,2.369196,0.007257,1.274194,0.601386
719,(ground beef),"(spaghetti, frozen vegetables)",0.135819,0.039026,0.012558,0.092461,2.369196,0.007257,1.058879,0.668744


# Interview Questions

Q1. What is lift and why is it important in Assossiation Rules?

Lift is a measure used in association rule mining to evaluate the strength of an association rule. Specifically, it is used to compare the observed frequency of co-occurrence of items in a rule to the expected frequency if the items were independent. It helps determine whether the occurrence of one item has any impact on the occurrence of another item.

Interpretation of Lift

Lift > 1: Indicates a positive correlation between A and B. The presence of A increases the likelihood of the presence of B.
Lift = 1: Indicates no correlation between A and B. The presence of A has no effect on the likelihood of the presence of B.
Lift < 1: Indicates a negative correlation between A and B. The presence of A decreases the likelihood of the presence of B.

Why is Lift Important in Association Rules?

1. Identifies Strong Associations: Lift helps identify strong associations between items. A high lift value suggests that the items co-occur more frequently than would be expected by chance, indicating a strong association.

2. Evaluates Rule Strength Beyond Support and Confidence: While support and confidence are useful measures, they can sometimes be misleading. For instance, an item may have high support and confidence simply because it is very common, not necessarily because it is strongly associated with another item. Lift corrects for this by taking into account the individual frequencies of the items.

3. Filters Out Uninteresting Rules: By considering lift, one can filter out rules that appear strong in terms of support and confidence but are actually weak due to the commonness of the items involved. This helps in focusing on truly interesting and actionable rules.

4. Provides a Normalized Measure: Lift provides a normalized measure that is independent of the dataset size. This makes it easier to compare rules across different datasets or subsets of a dataset.

5. Useful in Market Basket Analysis: In market basket analysis, lift helps in identifying product combinations that are bought together more frequently than expected. This can inform decisions about product placement, bundling, and promotions.

.

Q2. What is suppoert and confidence? How do you calculate them?

Support
Support is a measure used in association rule mining to indicate how frequently an itemset appears in the dataset. It helps in identifying how common a particular itemset is within all the transactions.

Calculation:
Support(A) = (Number of transactions containing a) / Total number of transactions.

Confidence
Confidence is a measure used in association rule mining to evaluate the reliability of an association rule. It indicates the likelihood of the consequent given that the antecedent is already present in the transaction. Essentially, it measures how often items in the consequent appear in transactions that contain the antecedent.

Calculation:
Confidence (A → B) = Support (A∪B) / Support (A)



.

Q3. What are some limitations or challenges of Association rules mining?

Association rule mining is a powerful technique for discovering interesting relationships between variables in large datasets. However, it also has several limitations and challenges:

1. Scalability:

Computational Complexity: As the dataset grows larger, the number of possible itemsets increases exponentially. This makes the computation of frequent itemsets and association rules very resource-intensive in terms of time and memory.

High Dimensionality: Datasets with a large number of items can lead to a very high-dimensional search space, which can be computationally prohibitive to explore fully.

2. Redundancy :

Too Many Rules: Association rule mining can generate an overwhelming number of rules, many of which may be redundant or not particularly interesting. Filtering out irrelevant rules can be challenging.

Redundant Rules: Many generated rules may convey similar information. 

3. Interpretability :

Complex Rules: Some rules may be complex and difficult to interpret, especially if they involve a large number of items. This can make it hard for users to draw actionable insights.

Threshold Setting: Choosing appropriate thresholds for support and confidence is often non-trivial. Too high a threshold might miss interesting rules, while too low a threshold might include too many uninteresting rules.

4.  Data Quality :

Missing Values: Datasets often contain missing or incomplete data, which can affect the accuracy of the discovered rules.

Noisy Data: Noise and outliers in the data can lead to misleading association rules. Proper preprocessing is necessary to mitigate this issue.

5. Sparsity :

Sparse Data: In datasets with many items, most transactions will only contain a small subset of the possible items. This sparsity can make it difficult to find meaningful and statistically significant associations.