## *Exploring Frequent Itemsets: Closed vs Maximal in Supermarket Data*

#### *[Student: Mohammed] Import necessary libraries*

In [2]:
import random
from itertools import combinations
from pathlib import Path

import pandas as pd
from mlxtend.frequent_patterns import apriori

##### *Generate 3000 supermarket transactions*
##### *Each transaction will have between 2 and 7 items randomly chosen from a pool of 30 unique items*

In [3]:
# Generate 3000 synthetic supermarket transactions.
# Each transaction holds between 2 and 7 distinct items drawn at random
# from a pool of 30 unique items.

# [Student: Mohammed] Define the item pool
item_pool = [
    'Milk', 'Bread', 'Butter', 'Eggs', 'Cheese', 'Apples', 'Bananas', 'Chicken',
    'Beef', 'Fish', 'Rice', 'Pasta', 'Cereal', 'Juice', 'Soda', 'Yogurt',
    'Tomatoes', 'Onions', 'Potatoes', 'Carrots', 'Cookies', 'Chips', 'Ice Cream',
    'Coffee', 'Tea', 'Sugar', 'Flour', 'Salt', 'Pepper', 'Oil'
]

# Seed the generator so a fresh "Restart & Run All" reproduces the same data.
random.seed(42)

# random.sample draws without replacement, so an item never repeats inside a
# basket. The basket size is drawn first (argument evaluation), then the
# items, which keeps the seeded sequence identical to a plain for-loop.
transactions = [
    random.sample(item_pool, k=random.randint(2, 7))  # 2 to 7 items per transaction
    for _ in range(3000)
]

# Save as CSV. The output folder is created first because to_csv does not
# create missing directories and would otherwise fail on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)
df_transactions = pd.DataFrame({'Transaction': transactions})  # one row per basket
df_transactions.to_csv('data/supermarket_transactions.csv', index=False)
df_transactions.head()  # Displaying the first few transactions

Unnamed: 0,Transaction
0,"[Eggs, Milk, Coffee, Beef, Chicken, Sugar, Che..."
1,"[Eggs, Chips, Coffee, Onions, Butter, Potatoes..."
2,"[Milk, Butter]"
3,"[Chicken, Tomatoes, Carrots]"
4,"[Onions, Bananas]"


In [4]:
# [Student: Snit] Import required libraries
from collections import defaultdict
import pandas as pd


def count_itemset_support(transactions, max_size=None):
    """Count how many transactions contain each non-empty itemset.

    Every subset of every transaction is enumerated, so the counts are
    correct for transactions of any length (not only for 1-, 2- and
    3-item sets).

    Parameters
    ----------
    transactions : iterable of iterables
        Each element is one transaction; its items must be hashable and
        mutually orderable (they are sorted for a stable enumeration order).
    max_size : int or None, default None
        Largest itemset size to count. None counts all sizes; note that the
        number of subsets grows exponentially with transaction length.

    Returns
    -------
    collections.defaultdict
        Maps frozenset(itemset) -> number of transactions containing it.
    """
    support = defaultdict(int)
    for transaction in transactions:
        items = sorted(set(transaction))
        largest = len(items) if max_size is None else min(max_size, len(items))
        for size in range(1, largest + 1):
            for subset in combinations(items, size):
                support[frozenset(subset)] += 1
    return support


def find_closed_itemsets(support_count):
    """Return the closed itemsets found in a support table.

    An itemset is closed when no proper superset has the same support.

    Parameters
    ----------
    support_count : mapping
        Maps frozenset(itemset) -> support count.

    Returns
    -------
    list of (set, int)
        One (itemset, support) pair per closed itemset, in the iteration
        order of `support_count`.
    """
    closed = []
    for itemset, count in support_count.items():
        # `itemset < other` is the proper-subset test on frozensets.
        has_equal_superset = any(
            itemset < other and count == other_count
            for other, other_count in support_count.items()
        )
        if not has_equal_superset:
            closed.append((set(itemset), count))
    return closed


# [Student: Snit] Sample dataset with 5 transactions (each a set of items).
# Kept under its own name so it does not overwrite the generated supermarket
# `transactions` list.
sample_transactions = [
    {"A", "B", "C"},
    {"A", "B"},
    {"A", "C"},
    {"B", "C"},
    {"A", "B", "C"},
]

# [Student: Snit] Count support for all item combinations
support_count = count_itemset_support(sample_transactions)

# [Student: Snit] Identify closed itemsets
# A closed itemset has no superset with the same support
closed_itemsets = find_closed_itemsets(support_count)

# [Student: Snit] Display closed itemsets and their support
# (no minimum-support threshold is applied in this toy example)
print("Closed Frequent Itemsets:")
for itemset, count in closed_itemsets:
    print(f"{itemset} -> support: {count}")

# [Student: Snit] Save closed itemsets to a CSV file inside the data folder.
# The folder is created first because to_csv does not create it.
closed_df = pd.DataFrame(closed_itemsets, columns=["Itemset", "Support"])
Path("data").mkdir(parents=True, exist_ok=True)
closed_df.to_csv("data/manual_closed_itemsets.csv", index=False)


Closed Frequent Itemsets:
{'C'} -> support: 4
{'A'} -> support: 4
{'B'} -> support: 4
{'C', 'A'} -> support: 3
{'A', 'B'} -> support: 3
{'C', 'B'} -> support: 3
{'C', 'A', 'B'} -> support: 2


In [5]:
# [Student: Lesala] Step 2: One-Hot Encode the Transactions
# Convert each basket (a list of items) into one boolean row with a column
# per item in item_pool.

# The baskets are read from df_transactions rather than the bare name
# `transactions`, so this cell always encodes the generated supermarket data
# regardless of what other cells bind `transactions` to. Encoding data that
# contains none of the item_pool items yields an all-False table and an
# empty Apriori result.
encoded_data = []
for basket in df_transactions['Transaction']:
    basket_items = set(basket)  # set membership test per item
    encoded_data.append({item: (item in basket_items) for item in item_pool})

df = pd.DataFrame(encoded_data)

# Generating Frequent Itemsets using Apriori
# Minimum support is set to 0.05 (5% of all transactions)
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

# Sorting by support (highest first) and exporting the top 10 itemsets to the
# data folder; the folder is created first because to_csv does not create it.
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
Path('data').mkdir(parents=True, exist_ok=True)
frequent_itemsets.head(10).to_csv('data/frequent_itemsets.csv', index=False)

