# Data Mining CS-634_Midterm_Project

# Importing_Libraries

In [51]:
import numpy as np 
import pandas as pd
import itertools
from itertools import combinations
import warnings
warnings.filterwarnings("ignore")
import time

# Importing Datasets

In [52]:
import os

def select_dataset():
    # Get list of files in the folder
    files = os.listdir("Datasets")

    # Print the list of datasets
    print("Available datasets:")
    for i, file in enumerate(files):
        print(f"{i+1}. {file}")

    # Ask user to select a dataset
    choice = int(input("Enter the number corresponding to the dataset you want to select: "))

    # Validate user input
    if 1 <= choice <= len(files):
        selected_dataset = files[choice-1]
        print(f"You selected '{selected_dataset}' as the dataset.")
        return selected_dataset
    else:
        print("Invalid choice. Please select a number within the range.")
        return None

data = select_dataset()
if data:
    # Now you can use 'selected_dataset' variable to further process the selected dataset
    print(f"Processing '{data}'...")
    # Add your code to process the selected dataset here


Available datasets:
1. AmazonBooks.csv
2. BestBuy.csv
3. Generic.csv
4. Grocery Store.csv
5. K-Mart.csv
6. Nike.csv


Enter the number corresponding to the dataset you want to select:  1


You selected 'AmazonBooks.csv' as the dataset.
Processing 'AmazonBooks.csv'...


# Loading_Data 

In [53]:
df = pd.read_csv("Datasets//" + data)

In [54]:
print(df.head())

  A Beginner’s Guide Java: The Complete Reference Java For Dummies  \
0                  t                          NaN                t   
1                  t                          NaN                t   
2                  t                          NaN              NaN   
3                  t                          NaN                t   
4                NaN                            t                t   

  Android Programming: The Big Nerd Ranch Head First Java 2nd Edition  \
0                                     NaN                         NaN   
1                                     NaN                           t   
2                                     NaN                           t   
3                                     NaN                         NaN   
4                                     NaN                           t   

  Beginning Programming with Java Java 8 Pocket Guide  \
0                               t                   t   
1                         

# Assigining an index to each item listed in 

In [55]:
item_list = list(df.columns)
item_dict = dict()

for i, item in enumerate(item_list):
    item_dict[item] = i + 1

item_dict

{'A Beginner’s Guide': 1,
 'Java: The Complete Reference': 2,
 'Java For Dummies': 3,
 'Android Programming: The Big Nerd Ranch': 4,
 'Head First Java 2nd Edition': 5,
 'Beginning Programming with Java': 6,
 'Java 8 Pocket Guide': 7,
 'C++ Programming in Easy Steps': 8,
 'Effective Java (2nd Edition)': 9,
 'HTML and CSS: Design and Build Websites': 10}

# Extracting Transactions from the Data

In [56]:
transactions = list()

for i, row in df.iterrows():
    transaction = set()
    
    for item in item_dict:
        if row[item] == 't':
            transaction.add(item_dict[item])
    transactions.append(transaction)

In [57]:
transactions

[{1, 3, 6, 7, 8, 9},
 {1, 3, 5},
 {1, 5, 9},
 {1, 3, 9, 10},
 {2, 3, 5, 10},
 {1, 4, 5, 7, 9},
 {1, 3, 4, 7, 9, 10},
 {4, 10},
 {2, 4, 6, 7},
 {2, 5, 6, 7, 8, 10},
 {4, 7},
 {2, 4, 6},
 {2, 6},
 {1, 2, 3, 4, 6, 7, 8},
 {8},
 set(),
 {4, 5, 6, 7, 9, 10},
 {8},
 {1, 2, 3, 4, 6, 7}]

# Get Support Function that evaluates the support value for a set given all the transactions

In [58]:
def get_support(transactions, item_set):
    match_count = 0
    for transaction in transactions:
        if item_set.issubset(transaction):
            match_count += 1
            
    return float(match_count/len(transactions))

# self_join performs join based on the last level valid sets. It joins each sets together by performing union and if the length exceeds the current level, it will skip that set.

In [59]:
def self_join(frequent_item_sets_per_level, level):
    current_level_candidates = list()
    last_level_items = frequent_item_sets_per_level[level - 1]
    
    if len(last_level_items) == 0:
        return current_level_candidates
    
    for i in range(len(last_level_items)):
        for j in range(i+1, len(last_level_items)):
            itemset_i = last_level_items[i][0]
            itemset_j = last_level_items[j][0]
            union_set = itemset_i.union(itemset_j)
            
            if union_set not in current_level_candidates and len(union_set) == level:
                current_level_candidates.append(union_set)
                
    return current_level_candidates

In [60]:
def get_single_drop_subsets(item_set):
    single_drop_subsets = list()
    for item in item_set:
        temp = item_set.copy()
        temp.remove(item)
        single_drop_subsets.append(temp)
        
    return single_drop_subsets

def is_valid_set(item_set, prev_level_sets):
    single_drop_subsets = get_single_drop_subsets(item_set)
    
    for single_drop_set in single_drop_subsets:
        if single_drop_set not in prev_level_sets:
            return False
    return True

def pruning(frequent_item_sets_per_level, level, candidate_set):
    post_pruning_set = list()
    if len(candidate_set) == 0:
        return post_pruning_set
    
    prev_level_sets = list()
    for item_set, _ in frequent_item_sets_per_level[level - 1]:
        prev_level_sets.append(item_set)
        
    for item_set in candidate_set:
        if is_valid_set(item_set, prev_level_sets):
            post_pruning_set.append(item_set)
            
    return post_pruning_set

# This is the main function which uses all the above described Utility functions to implement the Apriori Algorithm and generate the list of frequent itemsets for each level for the provided transactions and min_support value.

In [61]:
from collections import defaultdict
def apriori(min_support):
    frequent_item_sets_per_level = defaultdict(list)
    print("level : 1", end = " ")
    
    for item in range(1, len(item_list) + 1):
        support = get_support(transactions, {item})
        if support >= min_support:
            frequent_item_sets_per_level[1].append(({item}, support))
        
    for level in range(2, len(item_list) + 1):
        print(level, end = " ")
        current_level_candidates = self_join(frequent_item_sets_per_level, level)

        post_pruning_candidates = pruning(frequent_item_sets_per_level, level, current_level_candidates)
        if len(post_pruning_candidates) == 0:
            break

        for item_set in post_pruning_candidates:
            support = get_support(transactions, item_set)
            if support >= min_support:
                frequent_item_sets_per_level[level].append((item_set, support))
                
    return frequent_item_sets_per_level

# Entering the Minimum Support Value

In [62]:
### print("Enter the Minimum Support : ")
min_support = int(input("Enter Minimum Support : "))
min_support = (min_support)/100
frequent_item_sets_per_level = apriori(min_support)

Enter Minimum Support :  30


level : 1 2 3 

# The below code produces a dictionary called item_support_dict which from frequent_item_sets_per_level that maps items to their support values

In [63]:
item_support_dict = dict()
item_list = list()

key_list = list(item_dict.keys())
val_list = list(item_dict.values())

for level in frequent_item_sets_per_level:
    for set_support_pair in frequent_item_sets_per_level[level]:
        for i in set_support_pair[0]:
            item_list.append(key_list[val_list.index(i)])
        item_support_dict[frozenset(item_list)] = set_support_pair[1]
        item_list = list()

# The find_subset function takes the item and item_length as parameter and it returns all the possible combinations of elements inside the items

In [64]:
def find_subset(item, item_length):
    combs = []
    for i in range(1, item_length + 1):
        combs.append(list(combinations(item, i)))
        
    subsets = []
    for comb in combs:
        for elt in comb:
            subsets.append(elt)
            
    return subsets

# This function generates the association rules in accordance withe the minimum confidence value and the provided dictionary of itemsets against their support values. It takes the mininmum confidence value and support_dict as a parameter, and returns rules as a list.

In [65]:
def association_rules(min_confidence, support_dict):
    rules = list()
    for item, support in support_dict.items():
        item_length = len(item)
       
        if item_length > 1:
            subsets = find_subset(item, item_length)
           
            for A in subsets:
                B = item.difference(A)
               
                if B:
                    A = frozenset(A)
                    
                    AB = A | B
                    
                    confidence = support_dict[AB] / support_dict[A]
                    if confidence >= min_confidence:
                        rules.append((A, B, confidence))
    
    return rules

In [66]:
print("Enter the Minimum Confidence : ")
confidence = int(input())
min_confidence = (confidence)/100
association_rule = association_rules(min_confidence, support_dict = item_support_dict)

Enter the Minimum Confidence : 


 20


# Output

In [67]:
print("Number of rules: ", len(association_rule))

for rule in association_rule:
    print('{0} -> {1} <confidence: {2}>'.format(set(rule[0]), set(rule[1]), rule[2]))


Number of rules:  8
{'Java For Dummies'} -> {'A Beginner’s Guide'} <confidence: 0.8571428571428571>
{'A Beginner’s Guide'} -> {'Java For Dummies'} <confidence: 0.75>
{'Java: The Complete Reference'} -> {'Beginning Programming with Java'} <confidence: 0.8571428571428571>
{'Beginning Programming with Java'} -> {'Java: The Complete Reference'} <confidence: 0.75>
{'Java 8 Pocket Guide'} -> {'Android Programming: The Big Nerd Ranch'} <confidence: 0.7777777777777778>
{'Android Programming: The Big Nerd Ranch'} -> {'Java 8 Pocket Guide'} <confidence: 0.7777777777777778>
{'Beginning Programming with Java'} -> {'Java 8 Pocket Guide'} <confidence: 0.75>
{'Java 8 Pocket Guide'} -> {'Beginning Programming with Java'} <confidence: 0.6666666666666666>


# Using the_mlxtend Librarry and implementing the the apriori and FP Growth algorithm

In [68]:
# 1] Apriori Algorithm

In [69]:
from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules

# Convert the 'Items' column to a one-hot encoded format
df_encoded = df.replace('t', 1)

start_apriori = time.time()
# Apply Apriori algorithm
frequent_itemsets = apriori(df_encoded.fillna(0), min_support=min_support, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
end_apriori = time.time()

print("Apriori results \n")

# Display the frequent itemsets and association rules
print("Frequent Itemsets:")
print(frequent_itemsets)

Apriori results 

Frequent Itemsets:
     support                                           itemsets
0   0.421053                               (A Beginner’s Guide)
1   0.368421                     (Java: The Complete Reference)
2   0.368421                                 (Java For Dummies)
3   0.473684          (Android Programming: The Big Nerd Ranch)
4   0.315789                      (Head First Java 2nd Edition)
5   0.421053                  (Beginning Programming with Java)
6   0.473684                              (Java 8 Pocket Guide)
7   0.315789                     (Effective Java (2nd Edition))
8   0.315789          (HTML and CSS: Design and Build Websites)
9   0.315789             (Java For Dummies, A Beginner’s Guide)
10  0.315789  (Java: The Complete Reference, Beginning Progr...
11  0.368421  (Java 8 Pocket Guide, Android Programming: The...
12  0.315789  (Beginning Programming with Java, Java 8 Pocke...


In [70]:
print("\nAssociation Rules:")
print(rules[['antecedents', 'consequents']])



Association Rules:
                                 antecedents  \
0                         (Java For Dummies)   
1                       (A Beginner’s Guide)   
2             (Java: The Complete Reference)   
3          (Beginning Programming with Java)   
4                      (Java 8 Pocket Guide)   
5  (Android Programming: The Big Nerd Ranch)   
6          (Beginning Programming with Java)   
7                      (Java 8 Pocket Guide)   

                                 consequents  
0                       (A Beginner’s Guide)  
1                         (Java For Dummies)  
2          (Beginning Programming with Java)  
3             (Java: The Complete Reference)  
4  (Android Programming: The Big Nerd Ranch)  
5                      (Java 8 Pocket Guide)  
6                      (Java 8 Pocket Guide)  
7          (Beginning Programming with Java)  


In [46]:
print(" Time taken for apriori : ", end_apriori - start_apriori , "seconds")

 Time taken for apriori :  0.007979869842529297 seconds


In [71]:
# 2] FP- Growthn Algorithm

In [72]:
start_FPgrowth = time.time()
# Apply FPgrowth algorithm
frequent_itemsets = fpgrowth(df_encoded.fillna(0), min_support=min_support, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
end_FPgrowth = time.time()

print("\nFPgrowth results \n\n")
# Display the frequent itemsets and association rules
print("Frequent Itemsets:")
print(frequent_itemsets)



FPgrowth results 


Frequent Itemsets:
     support                                           itemsets
0   0.473684                              (Java 8 Pocket Guide)
1   0.421053                  (Beginning Programming with Java)
2   0.421053                               (A Beginner’s Guide)
3   0.368421                                 (Java For Dummies)
4   0.315789                     (Effective Java (2nd Edition))
5   0.315789                      (Head First Java 2nd Edition)
6   0.315789          (HTML and CSS: Design and Build Websites)
7   0.368421                     (Java: The Complete Reference)
8   0.473684          (Android Programming: The Big Nerd Ranch)
9   0.315789  (Beginning Programming with Java, Java 8 Pocke...
10  0.315789             (Java For Dummies, A Beginner’s Guide)
11  0.315789  (Java: The Complete Reference, Beginning Progr...
12  0.368421  (Java 8 Pocket Guide, Android Programming: The...


In [73]:
print("\nAssociation Rules:")
print(rules[['antecedents', 'consequents']])
end_FPgrowth = time.time()


Association Rules:
                                 antecedents  \
0          (Beginning Programming with Java)   
1                      (Java 8 Pocket Guide)   
2                         (Java For Dummies)   
3                       (A Beginner’s Guide)   
4             (Java: The Complete Reference)   
5          (Beginning Programming with Java)   
6                      (Java 8 Pocket Guide)   
7  (Android Programming: The Big Nerd Ranch)   

                                 consequents  
0                      (Java 8 Pocket Guide)  
1          (Beginning Programming with Java)  
2                       (A Beginner’s Guide)  
3                         (Java For Dummies)  
4          (Beginning Programming with Java)  
5             (Java: The Complete Reference)  
6  (Android Programming: The Big Nerd Ranch)  
7                      (Java 8 Pocket Guide)  


In [74]:
print(" Time taken for FPgrowth : ", end_FPgrowth - start_FPgrowth , "seconds")

 Time taken for FPgrowth :  0.3782620429992676 seconds
