In [1]:
import pandas as pd
from collections import Counter
from itertools import combinations

#basic imports
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from collections import Counter
from itertools import combinations

url = 'https://raw.githubusercontent.com/Rk-Pudasaini/Applied_Machine_Learning/main/Data_Science_Projects/Data_mining/basket_data.csv'
df = pd.read_csv(url, header = None)
df.head()

# # Sample transactions (lists of items)
# transactions = [
#     ['IA', 'IB', 'IC'],
#     ['IA', 'IB', 'ID'],
#     ['IB', 'IC', 'ID'],
#     ['IA', 'IC', 'ID'],
#     ['IA', 'IB'],
#     ['IA', 'IC', 'IE'],
#     ['IB', 'IC', 'IE'],
#     ['IB','IE']
# ]

# Define the function to extract all products from the DataFrame
def extract_all_products(df):
    all_products = []
    for index, row in df.iterrows():
        row_products = []
        for cell in row:
            if pd.notnull(cell):
                cell_products = cell.split(',')
                cell_products = [product.strip() for product in cell_products if product.strip()]
                if cell_products:
                    row_products.extend(cell_products)
        all_products.append(row_products)
    return all_products

# Call the function to extract products
all_products = extract_all_products(df)

# Sample transactions data (replace with your actual transactions)
transactions = all_products[:34]

def count_items(transactions):
    """
    Count the number of occurrences of each item in the dataset.
    """
    item_counts = Counter(item for transaction in transactions for item in transaction)
    return item_counts
#calling the count_items function
item_counts = count_items(transactions)

# Convert to DataFrame
item_counts_df = pd.DataFrame(list(item_counts.items()), columns=['Item', 'Count'])

print(item_counts_df)

#You can simply print the items and its count using for loop
# # Print the item counts
# for item, count in item_counts.items():
#     print(f"{item}: {count}")


              Item  Count
0           shrimp      4
1          almonds      1
2          avocado      5
3   vegetables mix      1
4     green grapes      1
..             ...    ...
63   herb & pepper      1
64    tomato sauce      1
65       magazines      1
66    strawberries      1
67   strong cheese      1

[68 rows x 2 columns]


In [2]:
#function to generate frequent itemsets based on the minimum support
def generate_frequent_1_itemsets(transactions, min_support):
    item_counts = Counter(item for transaction in transactions for item in transaction)  #count the number of occurance of the items in the dataset
    frequent_1_itemsets = {frozenset([item]): support for item, support in item_counts.items() if support >= min_support} #filtering items based on the minimum support
    return frequent_1_itemsets  #returning the stored filtered items


#generating the candidates itemsets based on the k value
def generate_candidate_itemsets(itemsets, k):
    candidate_itemsets = set()  #empty sets of candidates itemsets
    for itemset1 in itemsets:
        for itemset2 in itemsets:
            union_set = itemset1.union(itemset2) #get the union of the itemsets
            if len(union_set) == k:  #check condition if the number of union set is equal to 2
                candidate_itemsets.add(frozenset(union_set)) # then add the union set to candidate_itemsets
    return candidate_itemsets #return the candidate_itemsets

#defines a function named prune_itemsets that is used to prune candidate itemsets
#by checking whether all of their (k-1)-subsets are frequent, given a
#set of previously identified frequent itemsets.
def prune_itemsets(candidate_itemsets, prev_frequent_itemsets):
    pruned_itemsets = set() # Initialize an empty set to store pruned itemsets
    for candidate_set in candidate_itemsets:  # Iterate through each candidate itemset
        is_valid = True
        for subset in combinations(candidate_set, len(candidate_set) - 1): # Check all (k-1)-subsets of the candidate set
            # If any (k-1)-subset is not in the set of previously frequent itemsets,
            # set is_valid to False and break out of the loop
            if frozenset(subset) not in prev_frequent_itemsets:
                is_valid = False
                break
        # If is_valid is still True after checking all (k-1)-subsets, add the candidate set to the pruned set
        if is_valid:
            pruned_itemsets.add(candidate_set)
    return pruned_itemsets  ## Return the pruned set of candidate itemsets

def apriori_algorithm(transactions, min_support):
    frequent_itemsets = []

    # Step 1: Generate frequent 1-itemsets
    frequent_1_itemsets = generate_frequent_1_itemsets(transactions, min_support)
    frequent_itemsets.append(frequent_1_itemsets)

    k = 2
    while len(frequent_itemsets[k-2]) > 0:
        # Step 2: Generate candidate k-itemsets
        candidate_itemsets = generate_candidate_itemsets(frequent_itemsets[k-2].keys(), k)

        # Step 3: Prune candidate k-itemsets
        candidate_itemsets = prune_itemsets(candidate_itemsets, frequent_itemsets[k-2].keys())

        # Step 4: Count the support of candidate k-itemsets
        item_counts = Counter(frozenset(transaction) for transaction in transactions)
        for transaction in transactions:
            for candidate_set in candidate_itemsets:
                if candidate_set.issubset(transaction):
                    item_counts[candidate_set] += 1

        # Step 5: Filter candidate k-itemsets by support
        frequent_k_itemsets = {itemset: support for itemset, support in item_counts.items() if support >= min_support}

        # Append frequent k-itemsets to the list
        frequent_itemsets.append(frequent_k_itemsets)

        k += 1

    return frequent_itemsets[:-1]


# Example usage:
min_support = 2
result = apriori_algorithm(transactions, min_support)

# Print the result
for i, itemsets in enumerate(result):
    print(f"Frequent {i+1}-itemsets:")
    for itemset, support in itemsets.items():
        print(f"{set(itemset)} : {support}")
    print()


Frequent 1-itemsets:
{'shrimp'} : 4
{'avocado'} : 5
{'yams'} : 2
{'low fat yogurt'} : 3
{'green tea'} : 4
{'honey'} : 4
{'mineral water'} : 11
{'salmon'} : 4
{'frozen smoothie'} : 2
{'burgers'} : 3
{'meatballs'} : 2
{'eggs'} : 9
{'turkey'} : 5
{'milk'} : 4
{'energy bar'} : 2
{'french fries'} : 5
{'soup'} : 2
{'light cream'} : 2
{'frozen vegetables'} : 3
{'spaghetti'} : 7
{'cookies'} : 2
{'cooking oil'} : 2
{'chocolate'} : 5
{'chicken'} : 3
{'black tea'} : 2
{'pasta'} : 2
{'sparkling water'} : 2

Frequent 2-itemsets:
{'turkey', 'avocado'} : 2
{'salmon', 'mineral water'} : 5
{'turkey', 'eggs'} : 4
{'frozen smoothie', 'salmon'} : 2
{'shrimp', 'avocado'} : 2
{'honey', 'shrimp'} : 3
{'honey', 'avocado'} : 2
{'mineral water', 'yams'} : 2
{'low fat yogurt', 'shrimp'} : 2
{'low fat yogurt', 'honey'} : 2
{'frozen smoothie', 'mineral water'} : 2
{'green tea', 'mineral water'} : 3
{'mineral water', 'avocado'} : 2
{'eggs', 'burgers'} : 2
{'milk', 'energy bar'} : 2
{'mineral water', 'milk'} : 2
{'m

In [3]:
def calculate_support(itemset, transactions):
    """
    Calculate the support of the given itemset in the transactions.
    """
    count = 0
    for transaction in transactions:
        if itemset.issubset(transaction):
            count += 1
    return count


def generate_association_rules(frequent_itemsets, min_confidence):
    association_rules = []
    max_itemset_number = len(frequent_itemsets) - 1

    for I in frequent_itemsets[max_itemset_number]:
        subsets = get_subsets(I)
        for S in subsets:
            S = frozenset(S)  # Convert the tuple to a frozenset
            I_S = I - S
            if len(I_S) <= 0:
                continue
            I_support = calculate_support(I, transactions)
            S_support = calculate_support(S, transactions)

            confidence = I_support / S_support
            if confidence >= min_confidence:
                association_rules.append((S, I_S, confidence))
    return association_rules

def get_subsets(itemset):
    subsets = []
    for i in range(1, len(itemset)):
        subsets.extend(combinations(itemset, i))
    return subsets

# Define the minimum confidence threshold
min_confidence = 0.2

# Generate association rules
association_rules = generate_association_rules(result, min_confidence)

print("Number of association rules generated:", len(association_rules))  # Debugging statement

# Print the association rules
print("Association Rules:")
for antecedent, consequent, confidence in association_rules:
    print(f"Antecedent: {antecedent} => Consequent: {consequent} | Confidence: {confidence:.2f}")


Number of association rules generated: 13
Association Rules:
Antecedent: frozenset({'black tea'}) => Consequent: frozenset({'spaghetti', 'salmon', 'mineral water'}) | Confidence: 1.00
Antecedent: frozenset({'spaghetti'}) => Consequent: frozenset({'salmon', 'mineral water', 'black tea'}) | Confidence: 0.29
Antecedent: frozenset({'salmon'}) => Consequent: frozenset({'spaghetti', 'mineral water', 'black tea'}) | Confidence: 0.50
Antecedent: frozenset({'mineral water', 'black tea'}) => Consequent: frozenset({'spaghetti', 'salmon'}) | Confidence: 1.00
Antecedent: frozenset({'spaghetti', 'mineral water'}) => Consequent: frozenset({'salmon', 'black tea'}) | Confidence: 0.50
Antecedent: frozenset({'salmon', 'mineral water'}) => Consequent: frozenset({'spaghetti', 'black tea'}) | Confidence: 0.50
Antecedent: frozenset({'spaghetti', 'black tea'}) => Consequent: frozenset({'salmon', 'mineral water'}) | Confidence: 1.00
Antecedent: frozenset({'salmon', 'black tea'}) => Consequent: frozenset({'spag