<a href="https://colab.research.google.com/github/Rishikesh623/Data_Mining/blob/main/Frequent_pattern_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
def preprocess_dataset(file_path, id_col, items_col, delimiter=','):
    """Load a transactions CSV and one-hot encode its item-list column.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    id_col : str
        Name of the column holding the transaction identifier.
    items_col : str
        Name of the column holding the delimiter-separated items.
    delimiter : str, default ','
        Separator between items inside ``items_col``.

    Returns
    -------
    pandas.DataFrame
        ``id_col`` followed by one 0/1 column per distinct item, one row per
        input row. Rows without any item are all zeros.
    """
    df = pd.read_csv(file_path)

    def split_items(cell):
        # A missing cell means "no items"; str(NaN) would otherwise become a
        # bogus item literally called 'nan'.
        if pd.isna(cell):
            return []
        tokens = (token.strip() for token in str(cell).split(delimiter))
        # Drop empty tokens produced by trailing or doubled delimiters.
        return [token for token in tokens if token]

    item_lists = df[items_col].apply(split_items)

    # One (row, item) pair per line; empty lists explode to NaN and are dropped.
    exploded = item_lists.explode().dropna().rename_axis('_row').reset_index()
    one_hot = pd.crosstab(exploded['_row'], exploded[items_col])

    # crosstab counts occurrences, so an item repeated inside one transaction
    # would yield 2 and fail the `== 1` membership tests used downstream.
    one_hot = one_hot.clip(upper=1)
    # Rows that contributed no items are absent from the crosstab; restore them.
    one_hot = one_hot.reindex(df.index, fill_value=0)

    return df[[id_col]].join(one_hot)

In [None]:
def confidence(item, dataset):
    """Return the fraction of rows in ``dataset`` that contain every item in ``item``.

    Despite the name, the value computed here is the relative *support* of the
    itemset (matching rows / total rows), not a rule confidence.

    Parameters
    ----------
    item : sequence
        Column labels that must all be present in a row.
    dataset : pandas.DataFrame
        One-hot frame; a cell counts as "present" when it is not equal to 0.

    Returns
    -------
    float
        Share of matching rows, or 0.0 when ``dataset`` has no rows.
    """
    total_tuples = dataset.shape[0]
    if total_tuples == 0:
        # Avoid dividing by zero on an empty frame.
        return 0.0

    # Vectorised equivalent of checking every required cell row by row.
    contains_all = (dataset[list(item)] != 0).all(axis=1)
    return int(contains_all.sum()) / total_tuples

In [None]:
def find_frequent_1_itemsets(df, min_sup):
    """Collect the single-item itemsets whose column total reaches ``min_sup``.

    The first column of ``df`` is dropped before counting; every remaining
    column is summed and compared against ``min_sup`` as an absolute count.

    Returns a list of one-element lists, in column order.
    """
    item_columns = df.drop(df.columns[0], axis=1)
    totals = item_columns.sum()

    frequent_items = []
    for name, total in totals.items():
        if total >= min_sup:
            frequent_items.append([name])
    return frequent_items

In [None]:
def apriori(dataset, min_sup):
    """Mine frequent itemsets level by level.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One-hot frame; a row contains an item when that column equals 1.
    min_sup : number
        Minimum number of rows an itemset must appear in. It is compared
        against absolute counts, not fractions.

    Returns
    -------
    list of list of list
        ``result[k - 1]`` holds the frequent k-itemsets. The final entry is
        always an empty list, because the loop stops once a level is empty.
    """
    l_1 = find_frequent_1_itemsets(dataset, min_sup)
    l = [l_1]
    k = 2

    # A row's item set does not depend on the candidate being counted, so
    # build every transaction once instead of once per candidate per row.
    transactions = [
        set(dataset.columns[dataset.iloc[i] == 1])
        for i in range(len(dataset))
    ]

    while len(l[k - 2]) > 0:
        c_k = generate_candidate(l[k - 2])
        frequent_k = []

        for candidate in c_k:
            candidate_items = set(candidate)
            count = sum(
                1 for transaction in transactions
                if candidate_items.issubset(transaction)
            )

            if count >= min_sup:
                frequent_k.append(candidate)

        l.append(frequent_k)
        k += 1

    return l

In [None]:
def _has_infrequent_subset(candidate, frequent_lookup):
    """Return True if some (k-1)-subset of ``candidate`` is missing from ``frequent_lookup`` (a set of tuples)."""
    return any(
        tuple(candidate[:i] + candidate[i + 1:]) not in frequent_lookup
        for i in range(len(candidate))
    )


def generate_candidate(l_k_minus_1):
    """Build the candidate k-itemsets from the frequent (k-1)-itemsets.

    Two itemsets are joined when they share their first k-2 items and the
    last item of the first is smaller than the last item of the second.
    Candidates with any infrequent (k-1)-subset are pruned.

    The join only finds every pair when the itemsets are in sorted order, so
    the input is sorted here first (items within each itemset, then the
    itemsets themselves). For input that is already sorted the result is
    unchanged.

    Returns a list of candidate itemsets, each a sorted list.
    """
    ordered = sorted(sorted(itemset) for itemset in l_k_minus_1)
    # Build the lookup once rather than once per candidate.
    frequent_lookup = {tuple(itemset) for itemset in ordered}

    c_k = []
    for i, l1 in enumerate(ordered):
        for l2 in ordered[i + 1:]:
            if l1[:-1] == l2[:-1] and l1[-1] < l2[-1]:
                c = l1 + [l2[-1]]
                if not _has_infrequent_subset(c, frequent_lookup):
                    c_k.append(c)
    return c_k


def has_infrequent_subset(candidate, l_k_minus_1):
    """Return True if any subset of ``candidate`` with one item removed is not in ``l_k_minus_1``.

    Subsets are compared as tuples, so item order inside each itemset must
    match the order used in ``candidate``.
    """
    l_k_minus_1_set = {tuple(itemset) for itemset in l_k_minus_1}
    return _has_infrequent_subset(candidate, l_k_minus_1_set)

In [None]:
def generate_subsets(itemset):
    """Enumerate every non-empty proper subset of ``itemset``.

    Subsets are produced in bitmask order: mask ``m`` selects the positions
    whose bit is set in ``m``, for ``m`` from 1 up to (but excluding) the
    all-ones mask. Itemsets with fewer than two items yield an empty list.
    """
    size = len(itemset)
    full_mask = (1 << size) - 1
    return [
        [itemset[pos] for pos in range(size) if mask & (1 << pos)]
        for mask in range(1, full_mask)
    ]

def calculate_support(itemset, dataset):
    """Return the fraction of rows in ``dataset`` where every item of ``itemset`` equals 1.

    Parameters
    ----------
    itemset : sequence
        Column labels that must all be 1 in a row.
    dataset : pandas.DataFrame
        One-hot frame containing those columns.

    Returns
    -------
    float
        Relative support, or 0.0 when ``dataset`` has no rows.
    """
    total_rows = len(dataset)
    if total_rows == 0:
        # Avoid dividing by zero on an empty frame.
        return 0.0

    # Vectorised equivalent of testing each row with iterrows().
    contains_all = (dataset[list(itemset)] == 1).all(axis=1)
    return int(contains_all.sum()) / total_rows

def generate_association_rules(frequent_itemsets, dataset, min_conf=0.5):
    """Derive association rules from frequent itemsets.

    For every frequent itemset with at least two items, each non-empty proper
    subset is tried as the antecedent and the remaining items as the
    consequent. A rule is kept when its confidence reaches ``min_conf``.

    Parameters
    ----------
    frequent_itemsets : list of list of list
        Itemsets grouped by size, as returned by ``apriori``.
    dataset : pandas.DataFrame
        One-hot frame; its first column is dropped before supports are computed.
    min_conf : float, default 0.5
        Minimum confidence for a rule to be reported.

    Returns
    -------
    list of dict
        One dict per rule with keys ``rule``, ``support``, ``confidence`` and
        ``lift``, the numeric values rounded to four decimals.
    """
    rules = []

    # Drop the first column (the identifier column in the frames this
    # notebook builds) so only item columns remain.
    dataset = dataset.drop(dataset.columns[0], axis=1)

    # The same antecedents and consequents recur across many itemsets, so
    # each support is computed only once.
    support_cache = {}

    def cached_support(items):
        key = tuple(items)
        if key not in support_cache:
            support_cache[key] = calculate_support(items, dataset)
        return support_cache[key]

    for k_itemset in frequent_itemsets:
        for itemset in k_itemset:
            if len(itemset) < 2:
                continue
            itemset_support = cached_support(itemset)

            for subset in generate_subsets(itemset):
                remaining = [item for item in itemset if item not in subset]
                if not remaining:
                    continue

                subset_support = cached_support(subset)
                if subset_support == 0:
                    continue
                remaining_support = cached_support(remaining)

                # Named rule_confidence so it does not shadow the notebook's
                # confidence() function.
                rule_confidence = itemset_support / subset_support
                lift = rule_confidence / remaining_support if remaining_support > 0 else 0

                if rule_confidence >= min_conf:
                    rules.append({
                        'rule': f"{subset} => {remaining}",
                        'support': round(itemset_support, 4),
                        'confidence': round(rule_confidence, 4),
                        'lift': round(lift, 4)
                    })

    return rules

def print_rules(rules):
    """Print a tab-separated table of rules: a header line, then one line per rule.

    Each rule is a dict with the keys ``rule``, ``support``, ``confidence``
    and ``lift``.
    """
    header = "RULE\t\t\tSUPPORT\tCONFIDENCE\tLIFT"
    print(header)
    for entry in rules:
        fields = (entry['rule'], entry['support'], entry['confidence'], entry['lift'])
        print("\t".join(str(field) for field in fields))


In [None]:
def solve(preprocessed_dataset, min_sup, min_conf):
    """Run the full pipeline on a one-hot frame and print the resulting rules.

    Frequent itemsets are mined with ``apriori`` using ``min_sup``, rules are
    derived with ``generate_association_rules`` using ``min_conf``, and the
    rules are written to stdout by ``print_rules``.
    """
    print_rules(
        generate_association_rules(
            apriori(preprocessed_dataset, min_sup),
            preprocessed_dataset,
            min_conf,
        )
    )

In [None]:
# Groceries dataset: one-hot encode it, save the encoded frame, and preview it.
transactions_df = preprocess_dataset(
    file_path='groceries.csv',
    id_col='Transaction ID',
    items_col='Items',
)
transactions_df.to_csv('transactions_onehot.csv', index=False)
print(transactions_df.head())

  Transaction ID  banana  biscuit  bread  butter  cereal  cheese  coffee  \
0           T001       0        0      1       1       0       0       0   
1           T002       0        0      1       0       0       0       0   
2           T003       0        0      0       1       0       0       0   
3           T004       0        0      1       0       0       0       0   
4           T005       0        0      0       0       1       0       0   

   crackers  eggs  jam  milk  sugar  
0         0     0    0     1      0  
1         0     1    0     0      0  
2         0     0    0     1      0  
3         0     0    1     0      0  
4         0     0    0     1      0  


In [None]:
# Every item that occurs in at least one grocery transaction.
print(find_frequent_1_itemsets(transactions_df, min_sup=1))

[['banana'], ['biscuit'], ['bread'], ['butter'], ['cereal'], ['cheese'], ['coffee'], ['crackers'], ['eggs'], ['jam'], ['milk'], ['sugar']]


In [None]:
# Groceries transactions: itemsets seen in at least 2 rows, rules with confidence >= 0.5.
solve(transactions_df, min_sup=2, min_conf=0.5)

RULE			SUPPORT	CONFIDENCE	LIFT
['banana'] => ['bread']	0.0645	0.5	1.0333
['banana'] => ['cereal']	0.0645	0.5	1.9375
['banana'] => ['milk']	0.0645	0.5	1.2917
['butter'] => ['bread']	0.1935	0.6	1.24
['eggs'] => ['bread']	0.129	0.5714	1.181
['jam'] => ['bread']	0.0645	1.0	2.0667
['butter'] => ['milk']	0.1613	0.5	1.2917
['cereal'] => ['milk']	0.129	0.5	1.2917
['bread', 'eggs'] => ['butter']	0.0645	0.5	1.55
['butter', 'eggs'] => ['bread']	0.0645	0.6667	1.3778
['bread', 'butter'] => ['milk']	0.0968	0.5	1.2917
['bread', 'milk'] => ['butter']	0.0968	0.6	1.86
['butter', 'milk'] => ['bread']	0.0968	0.6	1.24


In [None]:
# Clickstream dataset: itemsets seen in at least 3 sessions, rules with confidence >= 0.5.
clickstream_df = preprocess_dataset(
    file_path='clickstream.csv',
    id_col='Session ID',
    items_col='Viewed Products',
)
solve(clickstream_df, min_sup=3, min_conf=0.5)

RULE			SUPPORT	CONFIDENCE	LIFT
['P001'] => ['P002']	0.2333	0.5	1.0714
['P002'] => ['P001']	0.2333	0.5	1.0714


In [None]:
# MovieRatings dataset
import pandas as pd

def preprocess_movie_ratings(file_path, id_col, movie_col, rating_col, min_rating=4):
    """Load a ratings CSV and list, per user, the movies rated at least ``min_rating``.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    id_col, movie_col, rating_col : str
        Names of the user, movie and rating columns.
    min_rating : number, default 4
        Ratings below this value are discarded.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id_col`` and ``'Movie List'`` (a Python list per user).
        Users with no rating at or above the threshold do not appear.
    """
    ratings = pd.read_csv(file_path)
    liked = ratings.loc[ratings[rating_col] >= min_rating]

    movies_per_user = liked.groupby(id_col)[movie_col].apply(list)
    grouped_movies = movies_per_user.reset_index()
    grouped_movies.columns = [id_col, 'Movie List']

    return grouped_movies

movie_df = preprocess_movie_ratings('moviesrating.csv', 'User ID', 'Movie ID', 'Rating', min_rating=4)

# Sort the movie IDs so the column order is the same on every run. Iterating
# a set of strings is not stable across interpreter sessions, and
# generate_candidate() depends on item order.
all_movies = sorted({movie for movies in movie_df['Movie List'] for movie in movies})

# Build the one-hot frame in one go from plain records. This avoids growing a
# DataFrame cell by cell and the object-dtype fillna(0) downcast warning.
records = []
for user_id, movies in zip(movie_df['User ID'], movie_df['Movie List']):
    liked = set(movies)
    record = {'User ID': user_id}
    record.update({movie: int(movie in liked) for movie in all_movies})
    records.append(record)

movie_df_onehot = pd.DataFrame(records, columns=['User ID'] + all_movies)

print(movie_df_onehot.head())


solve(movie_df_onehot, 1, 0.5)


  User ID  M003  M007  M005  M002  M001  M006  M004
0    U001     0     0     0     1     1     0     0
1    U002     1     0     0     0     0     0     1
2    U003     1     0     0     0     0     0     0
3    U004     0     0     1     1     0     0     0
4    U005     0     0     0     1     1     0     0
RULE			SUPPORT	CONFIDENCE	LIFT
['M004'] => ['M003']	0.0667	0.5	1.875
['M004'] => ['M001']	0.0667	0.5	1.25


  movie_df_onehot = movie_df_onehot.fillna(0)


In [None]:
# SupermarketTransactions dataset: itemsets seen in at least 1 row, rules with confidence >= 0.7.
supermarket_df = preprocess_dataset(
    file_path='marketTransaction.csv',
    id_col='Transaction ID',
    items_col='Item List',
)
solve(supermarket_df, min_sup=1, min_conf=0.7)

RULE			SUPPORT	CONFIDENCE	LIFT
['cheese', 'eggs'] => ['bread']	0.0333	1.0	2.0
['cheese', 'jam'] => ['milk']	0.0333	1.0	2.1429


In [None]:
# BookstorePurchases dataset: itemsets seen in at least 3 rows, rules with confidence >= 0.6.
bookstore_df = preprocess_dataset(
    file_path='bookstorepurchases.csv',
    id_col='Customer ID',
    items_col='Books Purchased',
)
solve(bookstore_df, min_sup=3, min_conf=0.6)

RULE			SUPPORT	CONFIDENCE	LIFT
['Mystery'] => ['Fiction']	0.2667	0.6667	1.1111
['Non-Fiction'] => ['Romance']	0.2667	0.6667	1.1111


In [None]:
# SocialMediaEngagement dataset: itemsets seen in at least 3 rows, rules with confidence >= 0.4.
social_df = preprocess_dataset(
    file_path='socialmediaengagement.csv',
    id_col='User ID',
    items_col='Post Type',
)
solve(social_df, min_sup=3, min_conf=0.4)

RULE			SUPPORT	CONFIDENCE	LIFT
['memes'] => ['news']	0.2667	0.4706	0.7843
['news'] => ['memes']	0.2667	0.4444	0.7843
['memes'] => ['videos']	0.3333	0.5882	0.9804
['videos'] => ['memes']	0.3333	0.5556	0.9804
['news'] => ['videos']	0.2667	0.4444	0.7407
['videos'] => ['news']	0.2667	0.4444	0.7407


In [None]:
# RestaurantOrders dataset: rules with confidence >= 0.7.
# NOTE(review): find_frequent_1_itemsets() and apriori() compare min_sup with
# absolute row counts, so 0.05 admits every itemset that occurs at least once.
# If "5% of orders" was intended, pass the equivalent count instead - confirm.
restaurant_df = preprocess_dataset(
    file_path='restaurantOrders.csv',
    id_col='Order ID',
    items_col='Items Ordered',
)
solve(restaurant_df, min_sup=0.05, min_conf=0.7)

RULE			SUPPORT	CONFIDENCE	LIFT
['Fries'] => ['Burger']	0.2258	0.7	1.8083
['Garlic Bread'] => ['Pasta']	0.0968	0.75	3.3214
['Mashed Potatoes'] => ['Steak']	0.0645	1.0	6.2
['Soup'] => ['Salad']	0.0323	1.0	5.1667
['Fries', 'Milkshake'] => ['Burger']	0.0323	1.0	2.5833
['Garlic Bread', 'Salad'] => ['Pasta']	0.0323	1.0	4.4286
['Garlic Bread', 'Soda'] => ['Pasta']	0.0323	1.0	4.4286
['Pasta', 'Soda'] => ['Garlic Bread']	0.0323	1.0	7.75
['Pasta', 'Water'] => ['Salad']	0.0323	1.0	5.1667
