<a href="https://colab.research.google.com/github/Rujan0833/DM_DW_LAB/blob/main/LAB2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mlxtend



In [None]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1


In [None]:
from google.colab import files
import fitz
def read_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path handed unchanged to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` result of each page, joined in page
        order. No separator is inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        # join() builds the result once instead of re-copying the growing
        # string for every page.
        return "".join(page.get_text() for page in doc)

# Extract the raw text of both source PDFs. The paths are relative, so the
# files are looked up in the current working directory.
# NOTE(review): `files` from google.colab is imported above but is not used in
# the visible cells — presumably the PDFs are uploaded by hand; confirm.
sports_text = read_pdf("sports.pdf")
space_text = read_pdf("space.pdf")

In [None]:
def parse(text):
    """Turn comma-separated transaction text into a list of item lists.

    The first line of ``text`` is treated as a header and discarded. On every
    remaining line the first comma-separated field is treated as the
    transaction ID and discarded; the other fields become the items.

    Blank (whitespace-only) lines are skipped and empty fields are dropped,
    so stray blank lines or trailing commas in the input do not create empty
    transactions or ``''`` items.

    Args:
        text: The full text to parse, one transaction per line.

    Returns:
        list[list[str]]: One list of whitespace-stripped item names per
        non-blank data line, in input order.
    """
    rows = text.strip().split("\n")
    transactions = []
    for row in rows[1:]:  # skip header
        if not row.strip():
            # A blank line would otherwise be recorded as an empty
            # transaction and inflate the transaction count.
            continue
        fields = row.split(",")[1:]  # skip transaction ID
        cleaned = (field.strip() for field in fields)
        # Drop empty fields (e.g. from a trailing comma) so '' never
        # shows up as an item.
        transactions.append([item for item in cleaned if item])
    return transactions

# Convert each extracted text blob into a list of transactions
# (one list of item strings per data line).
sports_transactions = parse(sports_text)
space_transactions = parse(space_text)


In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

def encode(transactions):
    """Encode transactions as a DataFrame with one column per item.

    A fresh ``TransactionEncoder`` is fitted on ``transactions`` and then
    used to transform the same data.

    Args:
        transactions: The transaction lists passed to the encoder.

    Returns:
        pandas.DataFrame: The transformed array, with columns labelled by the
        encoder's ``columns_`` attribute.
    """
    encoder = TransactionEncoder()
    fitted = encoder.fit(transactions)
    encoded_rows = fitted.transform(transactions)
    item_names = encoder.columns_
    return pd.DataFrame(encoded_rows, columns=item_names)

# Build the encoded item DataFrame for each dataset; these frames are the
# inputs to the frequent-itemset mining below.
sports_df = encode(sports_transactions)
space_df = encode(space_transactions)


In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

def apply_apriori(df, min_support=0.2, min_confidence=0.6):
    """Mine frequent itemsets with Apriori and derive association rules.

    Args:
        df: Encoded transaction DataFrame passed to ``apriori``.
        min_support: Minimum support passed to ``apriori``.
        min_confidence: Minimum confidence a rule must reach; passed to
            ``association_rules`` as ``min_threshold`` with
            ``metric="confidence"``. Defaults to 0.6.

    Returns:
        tuple: ``(freq_items, rules)`` — the result of ``apriori`` and the
        result of ``association_rules`` computed from it.
    """
    # use_colnames=True makes the itemsets carry the DataFrame's column
    # labels instead of column indices.
    freq_items = apriori(df, min_support=min_support, use_colnames=True)
    rules = association_rules(
        freq_items, metric="confidence", min_threshold=min_confidence
    )
    return freq_items, rules

# Run Apriori on both datasets with the function's default thresholds.
sports_freq, sports_rules = apply_apriori(sports_df)
space_freq, space_rules = apply_apriori(space_df)

# Only the sports results are printed in this cell; the space results are
# computed above but not displayed here.
print("Sports - Frequent Itemsets:\n", sports_freq)
print("\nSports - Association Rules:\n", sports_rules[['antecedents','consequents','support','confidence','lift']])


Sports - Frequent Itemsets:
      support                                           itemsets
0   0.214286                                          (cricket)
1   0.571429                                     (cricket ball)
2   0.928571                                      (cricket bat)
3   0.928571                                         (football)
4   0.785714                                           (gloves)
5   0.642857                                            (juice)
6   0.357143                                     (water bottle)
7   0.214286                             (cricket bat, cricket)
8   0.214286                                (cricket, football)
9   0.571429                        (cricket bat, cricket ball)
10  0.500000                           (cricket ball, football)
11  0.357143                             (cricket ball, gloves)
12  0.285714                              (cricket ball, juice)
13  0.214286                       (water bottle, cricket ball)
14  0.85714

In [None]:
from mlxtend.frequent_patterns import fpgrowth

def apply_fpgrowth(df, min_support=0.2, min_confidence=0.6):
    """Mine frequent itemsets with FP-Growth and derive association rules.

    Args:
        df: Encoded transaction DataFrame passed to ``fpgrowth``.
        min_support: Minimum support passed to ``fpgrowth``.
        min_confidence: Minimum confidence a rule must reach; passed to
            ``association_rules`` as ``min_threshold`` with
            ``metric="confidence"``. Defaults to 0.6.

    Returns:
        tuple: ``(freq_items, rules)`` — the result of ``fpgrowth`` and the
        result of ``association_rules`` computed from it.
    """
    # use_colnames=True makes the itemsets carry the DataFrame's column
    # labels instead of column indices.
    freq_items = fpgrowth(df, min_support=min_support, use_colnames=True)
    rules = association_rules(
        freq_items, metric="confidence", min_threshold=min_confidence
    )
    return freq_items, rules

# Run FP-Growth on both datasets with the function's default thresholds.
sports_fp_freq, sports_fp_rules = apply_fpgrowth(sports_df)
space_fp_freq, space_fp_rules = apply_fpgrowth(space_df)

# Only the sports results are printed in this cell; the space results are
# computed above but not displayed here.
print("Sports - FP-Growth Frequent Itemsets:\n", sports_fp_freq)
print("\nSports - FP-Growth Association Rules:\n", sports_fp_rules[['antecedents','consequents','support','confidence','lift']])


Sports - FP-Growth Frequent Itemsets:
      support                                           itemsets
0   0.928571                                         (football)
1   0.928571                                      (cricket bat)
2   0.785714                                           (gloves)
3   0.642857                                            (juice)
4   0.571429                                     (cricket ball)
5   0.214286                                          (cricket)
6   0.357143                                     (water bottle)
7   0.857143                            (cricket bat, football)
8   0.785714                                 (gloves, football)
9   0.714286                              (cricket bat, gloves)
10  0.714286                    (cricket bat, gloves, football)
11  0.571429                                    (juice, gloves)
12  0.571429                               (cricket bat, juice)
13  0.571429                                  (juice, football)
1

In [None]:
def compare_results(name, apriori_rules, fp_rules):
    """Print how many rules each algorithm produced for one dataset.

    Only the rule counts (``len`` of each argument) are reported; the rules
    themselves are not compared.

    Args:
        name: Dataset label shown in the heading line.
        apriori_rules: Rules produced by Apriori; only its length is used.
        fp_rules: Rules produced by FP-Growth; only its length is used.

    Returns:
        None. A heading, the two counts, and a trailing blank line are
        written to standard output.
    """
    apriori_count = len(apriori_rules)
    fp_count = len(fp_rules)
    # The trailing empty string yields the blank separator line after the
    # two counts.
    print(
        f"---- {name} ----",
        f"Apriori Rules Count: {apriori_count}",
        f"FP-Growth Rules Count: {fp_count}",
        "",
        sep="\n",
    )

# Report the Apriori vs FP-Growth rule counts for each dataset.
compare_results("Sports", sports_rules, sports_fp_rules)
compare_results("Space", space_rules, space_fp_rules)


---- Sports ----
Apriori Rules Count: 110
FP-Growth Rules Count: 110

---- Space ----
Apriori Rules Count: 3
FP-Growth Rules Count: 3

