In [1]:
pip install mrjob

Collecting mrjob
  Downloading mrjob-0.7.4-py2.py3-none-any.whl.metadata (7.3 kB)
Downloading mrjob-0.7.4-py2.py3-none-any.whl (439 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.6/439.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mrjob
Successfully installed mrjob-0.7.4


In [3]:
import pandas as pd
from itertools import combinations
from mrjob.job import MRJob

# Step 1: Load and Prepare Dataset
file_path = "/content/retail_data.xlsx"  # Adjust this path as needed
data = pd.ExcelFile(file_path)
data_2010_2011 = data.parse("Year 2010-2011")



In [5]:
# Data Cleaning
data_cleaned = data_2010_2011.dropna().copy()
data_cleaned.loc[:, 'Invoice'] = data_cleaned['Invoice'].astype(str)
data_cleaned.loc[:, 'Description'] = data_cleaned['Description'].str.strip()

# Group transactions by Invoice
transactions = data_cleaned.groupby("Invoice")["Description"].apply(list)

# Save transactions to a text file
transactions_file = "transactions.txt"
with open(transactions_file, "w") as f:
    for transaction in transactions:
        f.write(",".join(transaction) + "\n")


In [8]:
# Step 2: Define the MRJob class for Frequent Itemsets Mining
class MRFrequentItemsets(MRJob):
    def configure_args(self):
        super(MRFrequentItemsets, self).configure_args()
        self.add_passthru_arg('--min-support', type=int, default=2, help='Minimum support threshold')

    def mapper(self, _, line):
        items = line.strip().split(',')
        for itemset in combinations(items, 2):  # Generate pairs
            yield itemset, 1

    def combiner(self, itemset, counts):
        yield itemset, sum(counts)

    def reducer(self, itemset, counts):
        total_count = sum(counts)
        if total_count >= self.options.min_support:
            yield itemset, total_count




In [14]:
with open('frequent_itemsets_mr.py', 'w') as f:
    f.write("""
from mrjob.job import MRJob
from itertools import combinations

class MRFrequentItemsets(MRJob):
    def configure_args(self):
        super(MRFrequentItemsets, self).configure_args()
        self.add_passthru_arg('--min-support', type=int, default=2, help='Minimum support threshold')

    def mapper(self, _, line):
        items = line.strip().split(',')
        for itemset in combinations(items, 2):  # Generate pairs
            yield itemset, 1

    def combiner(self, itemset, counts):
        yield itemset, sum(counts)

    def reducer(self, itemset, counts):
        total_count = sum(counts)
        if total_count >= self.options.min_support:
            yield itemset, total_count

if __name__ == '__main__':
    MRFrequentItemsets.run()
""")


In [15]:
# Prepare transactions data
transactions = [
    ['milk', 'bread', 'butter'],
    ['bread', 'butter', 'jam'],
    ['milk', 'bread'],
    ['milk', 'butter'],
    ['bread', 'butter'],
]

# Save transactions to a file
with open('transactions.txt', 'w') as f:
    for transaction in transactions:
        f.write(','.join(transaction) + '\n')


In [16]:
!python frequent_itemsets_mr.py transactions.txt --min-support 2 > output.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/frequent_itemsets_mr.root.20241120.172030.526101
Running step 1 of 1...
job output is in /tmp/frequent_itemsets_mr.root.20241120.172030.526101/output
Streaming final output from /tmp/frequent_itemsets_mr.root.20241120.172030.526101/output...
Removing temp directory /tmp/frequent_itemsets_mr.root.20241120.172030.526101...


In [17]:
# Display frequent itemsets
with open('output.txt', 'r') as f:
    print(f.read())


["milk", "bread"]	2
["bread", "butter"]	3
["milk", "butter"]	2



In [25]:
# Step 1: Parse frequent itemsets from the output file
frequent_itemsets = []
with open('output.txt', 'r') as f:
    for line in f:
        # Remove leading/trailing whitespace and check for empty lines
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        # Split the line into itemset and count using a tab delimiter
        parts = line.rsplit('\t', 1)
        if len(parts) != 2:  # Skip malformed lines
            print(f"Skipping malformed line: {line}")
            continue

        itemset_str, count_str = parts[0], parts[1]

        # Parse itemset: Remove brackets and split items
        itemset = itemset_str.strip('[]').replace('"', '').split(', ')
        itemset = [item.strip() for item in itemset if item]  # Clean up item names
        try:
            count = int(count_str)  # Convert count to integer
        except ValueError:
            print(f"Skipping line with invalid count: {line}")
            continue

        frequent_itemsets.append((set(itemset), count))

# Add singleton itemsets to frequent_itemsets
singleton_itemsets = {}
for itemset, count in frequent_itemsets:
    for item in itemset:
        if frozenset([item]) in singleton_itemsets:
            singleton_itemsets[frozenset([item])] += count
        else:
            singleton_itemsets[frozenset([item])] = count

# Add singletons to the frequent_itemsets list
for itemset, count in singleton_itemsets.items():
    frequent_itemsets.append((set(itemset), count))

print("Frequent Itemsets Parsed (including singletons):")
for itemset, count in frequent_itemsets:
    print(f"Itemset: {itemset}, Count: {count}")

# Step 2: Generate Association Rules
def generate_association_rules(frequent_itemsets, min_confidence=0.4):  # Lowered confidence threshold
    rules = []
    print("Generating Rules...")
    for itemset, support in frequent_itemsets:
        if len(itemset) > 1:  # Only generate rules for itemsets with 2 or more items
            for item in itemset:
                antecedent = itemset - {item}
                consequent = {item}
                antecedent_support = next(
                    (sup for items, sup in frequent_itemsets if items == antecedent), 0
                )
                print(f"Analyzing: Itemset={itemset}, Antecedent={antecedent}, Consequent={consequent}, Support={support}, Antecedent Support={antecedent_support}")
                if antecedent_support > 0:  # Avoid division by zero
                    confidence = support / antecedent_support
                    print(f"Confidence for rule {antecedent} -> {consequent}: {confidence:.2f}")
                    if confidence >= min_confidence:
                        rules.append((antecedent, consequent, confidence))
    return rules

# Generate rules with a confidence threshold of 0.4
association_rules = generate_association_rules(frequent_itemsets, min_confidence=0.4)

# Step 3: Display Association Rules
print("Association Rules:")
if not association_rules:
    print("No association rules generated.")
else:
    for antecedent, consequent, confidence in association_rules:
        print(f"Rule: {antecedent} -> {consequent} (Confidence: {confidence:.2f})")


Frequent Itemsets Parsed (including singletons):
Itemset: {'milk', 'bread'}, Count: 2
Itemset: {'bread', 'butter'}, Count: 3
Itemset: {'milk', 'butter'}, Count: 2
Itemset: {'milk'}, Count: 4
Itemset: {'bread'}, Count: 5
Itemset: {'butter'}, Count: 5
Generating Rules...
Analyzing: Itemset={'milk', 'bread'}, Antecedent={'bread'}, Consequent={'milk'}, Support=2, Antecedent Support=5
Confidence for rule {'bread'} -> {'milk'}: 0.40
Analyzing: Itemset={'milk', 'bread'}, Antecedent={'milk'}, Consequent={'bread'}, Support=2, Antecedent Support=4
Confidence for rule {'milk'} -> {'bread'}: 0.50
Analyzing: Itemset={'bread', 'butter'}, Antecedent={'butter'}, Consequent={'bread'}, Support=3, Antecedent Support=5
Confidence for rule {'butter'} -> {'bread'}: 0.60
Analyzing: Itemset={'bread', 'butter'}, Antecedent={'bread'}, Consequent={'butter'}, Support=3, Antecedent Support=5
Confidence for rule {'bread'} -> {'butter'}: 0.60
Analyzing: Itemset={'milk', 'butter'}, Antecedent={'butter'}, Consequent={