In [1]:
# Import necessary libraries
import ast
import random

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

## 1. Simulate Transaction Data 

#### Marilyn: 
- Imported the pandas and random libraries
- Simulated the transactions
- Saved the transactions to a CSV file called 'supermarket_transactions.csv'

In [2]:
# Catalogue of products that may show up in a simulated transaction.
# Keep the order as-is: the transactions are drawn from this list with a seeded
# random.sample call, so reordering it would change the generated data.
item_pool = [
    'Milk', 'Eggs', 'Butter', 'Cheese', 'Bananas', 'Bread', 'Apples', 'Chicken',
    'pork', 'beef', 'strawberries', 'pineapples', 'watermelon', 'lemon', 'melon',
    'avocadoes', 'dragonfruit', 'orange', 'kiwi', 'flour', 'rice', 'sugar',
    'salt', 'noodles', 'potato', 'onion', 'fruitjuice', 'garlic', 'okra',
    'cucumbers',
]


In [3]:
# Fix the RNG state so every run produces the same baskets
random.seed(42)

# Build 3000 synthetic baskets, each holding between 2 and 7 distinct items.
# For every basket the size is drawn first and the items second, so the
# sequence of random draws is: randint, sample, randint, sample, ...
transactions = [
    random.sample(item_pool, random.randint(2, 7))
    for _ in range(3000)
]

# One row per basket, with its items flattened into a comma-separated string
basket_strings = [', '.join(basket) for basket in transactions]
transactions_df = pd.DataFrame({'Transaction': basket_strings})

# Persist the simulated baskets to disk
transactions_df.to_csv('supermarket_transactions.csv', index=False)

# Preview the first few baskets
transactions_df.head()

Unnamed: 0,Transaction
0,"Cheese, Milk, noodles, pork, Chicken, onion, B..."
1,"Cheese, sugar, noodles, orange, Butter, kiwi, ..."
2,"Milk, Butter"
3,"Chicken, dragonfruit, flour"
4,"orange, Apples"


## 2. Preprocessing: One-Hot Encoding

CHAD:

1. Extracted all unique items across all transactions using a set and `sorted()` to create a consistent column order.

2. Created a one-hot encoded structure, where:

    - Each row represents a single transaction.
    - Each column represents an item.
    - Cell values are True if the item is present in that transaction, otherwise False.

3. Stored the encoded transactions as a list of dictionaries (`encoded_data`), each mapping item names to boolean values.

4. Converted the list of dictionaries into a Pandas DataFrame (`df`), suitable for input into the `mlxtend.frequent_patterns.apriori()` algorithm.


In [4]:
# Column universe: every item that occurs in at least one transaction, sorted
# so the column order is stable
all_items = sorted(set().union(*transactions))

# One {item: bool} record per transaction; True when the basket contains the item
encoded_data = [
    {item: item in basket for item in all_items}
    for basket in transactions
]

# Boolean transaction-by-item matrix
df = pd.DataFrame(encoded_data)

##  3. Generate Frequent Itemsets

Hetal:

- Used 'apriori()' from 'mlxtend.frequent_patterns' to generate itemsets with min_support = 0.05.

- Displayed and exported the **top 10 itemsets.**


In [6]:
# Hetal:
# Mine every itemset whose support is at least 0.05 (5% of the transactions)
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.05)

# Rank by support, highest first, and keep the ten best
top_10_itemsets = (
    frequent_itemsets
    .sort_values(by='support', ascending=False)
    .iloc[:10]
)

# Show the ten most frequent itemsets
print("Top 10 most Frequent Itemsets:\n", top_10_itemsets)

# Write only those ten itemsets to CSV.
# NOTE(review): the closed and maximal itemset cells read this file, so they
# work from these ten rows rather than from every frequent itemset; confirm
# that this is intended.
top_10_itemsets.to_csv('frequent_itemsets.csv', index=False)


Top 10 most Frequent Itemsets:
      support      itemsets
28  0.170000       (sugar)
4   0.162333      (Cheese)
8   0.161000   (avocadoes)
25  0.159333        (rice)
1   0.159000     (Bananas)
26  0.156667        (salt)
16  0.154333       (lemon)
18  0.154000     (noodles)
17  0.154000       (melon)
29  0.152667  (watermelon)


### 4: Identify Closed Frequent Itemsets  
Rita:

- A *Closed Frequent Itemset* is one where **no proper superset has the same support**.  
- To identify closed itemsets, compare each frequent itemset against all others.  
- If a larger itemset exists that contains the current one *and* has the same support, the current itemset is excluded (it is not closed).


In [12]:
# Load the exported frequent itemsets. A dedicated name is used (instead of
# `df`) so the one-hot matrix built earlier is not overwritten and the earlier
# cells remain re-runnable.
exported_itemsets = pd.read_csv("frequent_itemsets.csv")

# Each 'itemsets' cell is the text form of a frozenset, e.g. "frozenset({'sugar'})".
# Strip the wrapper and the quotes, then split on ', ' to recover the item names.
parsed_itemsets = [
    frozenset(
        text.replace("frozenset({", "").replace("})", "").replace("'", "").split(', ')
    )
    for text in exported_itemsets['itemsets']
]

# Pair every itemset with its support once, so the sets are not rebuilt on
# every comparison
candidates = list(zip(parsed_itemsets, exported_itemsets['support']))

# An itemset is closed when no proper superset (`<` on sets) has the same support.
# The label is built from the sorted items: joining the raw set would give an
# arbitrary item order for itemsets with more than one item.
closed_itemsets = [
    (','.join(sorted(items)), support)
    for items, support in candidates
    if not any(
        items < other_items and support == other_support
        for other_items, other_support in candidates
    )
]

# Save closed itemsets to CSV
closed_df = pd.DataFrame(closed_itemsets, columns=['itemset', 'support'])
closed_df.to_csv("closed_itemsets.csv", index=False)

# Show confirmation and preview
print("Closed itemsets saved:", len(closed_df))
closed_df.head()


Closed itemsets saved: 10


Unnamed: 0,itemset,support
0,sugar,0.17
1,Cheese,0.162333
2,avocadoes,0.161
3,rice,0.159333
4,Bananas,0.159


### 5: Identify Maximal Frequent Itemsets  
Rita:

- A *Maximal Frequent Itemset* is one where **no frequent superset exists**.  
- To identify maximal itemsets, compare each frequent itemset against all others.  
- If a larger frequent itemset exists that contains the current one, the current itemset is excluded (it is not maximal).


In [13]:
def _parse_itemset(text):
    """Convert text such as "frozenset({'a', 'b'})" into frozenset({'a', 'b'}).

    The optional ``frozenset(...)`` wrapper is removed and the remaining literal
    is parsed with ``ast.literal_eval``, so text read from the CSV is only ever
    treated as data and never executed (unlike ``eval``).
    """
    text = text.strip()
    if text.startswith("frozenset(") and text.endswith(")"):
        text = text[len("frozenset("):-1]
    # "frozenset()" leaves nothing inside the wrapper: that is the empty set
    return frozenset(ast.literal_eval(text)) if text else frozenset()


# Read the exported frequent itemsets. A dedicated name is used (instead of
# `df`) so the one-hot matrix built earlier is not overwritten and the earlier
# cells remain re-runnable.
maximal_input = pd.read_csv("frequent_itemsets.csv")

# Parse the text in the 'itemsets' column into real sets
frequent_sets = [_parse_itemset(text) for text in maximal_input['itemsets']]

# An itemset is maximal when no other itemset in the file is a proper superset
# of it (`<` on sets). Support is deliberately not compared: a superset never
# has higher support than its subset, so a support condition would turn this
# into the closed-itemset test instead.
is_maximal = pd.Series(
    [not any(items < other for other in frequent_sets) for items in frequent_sets],
    index=maximal_input.index,
    dtype=bool,
)

# Keep the original 'support' and 'itemsets' columns for the maximal rows.
# Boolean row selection also works when no row qualifies or the file is empty.
maximal_df = maximal_input.loc[is_maximal]

# Save to CSV
maximal_df.to_csv("maximal_itemsets.csv", index=False)

# Show top 5
maximal_df.head()


Unnamed: 0,support,itemsets
0,0.17,frozenset({'sugar'})
1,0.162333,frozenset({'Cheese'})
2,0.161,frozenset({'avocadoes'})
3,0.159333,frozenset({'rice'})
4,0.159,frozenset({'Bananas'})
