## Headers

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import random
from sklearn.model_selection import train_test_split

## Functions Implemented

In [None]:
def support(movies, transactions=None):
    """Return the fraction of users whose movie collection contains every movie in `movies`.

    Parameters
    ----------
    movies : iterable of hashable
        Movie ids forming the itemset. An empty itemset is contained in every
        user's collection, so its support is 1.0 whenever there is at least one user.
    transactions : mapping, optional
        Maps each user to an iterable of movie ids. When omitted, the
        notebook-level ``train_set`` is looked up at call time, so existing
        ``support(movies)`` calls behave as before.

    Returns
    -------
    float
        Number of matching users divided by the total number of users, or 0.0
        when there are no users at all (previously a ZeroDivisionError).
    """
    if transactions is None:
        transactions = train_set
    if not transactions:
        return 0.0

    wanted = set(movies)
    # issubset accepts any iterable, so per-user lists work without conversion by the caller.
    matching_users = sum(1 for watched in transactions.values() if wanted.issubset(watched))
    return matching_users / len(transactions)
def generate_frequent_itemsets(itemset_size, minsup,item_sets):
    """Generate candidate itemsets from `item_sets` and keep those with support >= `minsup`.

    Parameters
    ----------
    itemset_size : int
        When equal to 2, `item_sets` is treated as a flat list of single movies
        and every pair ``(x, y)`` with ``x < y`` becomes a candidate. For any
        other value, `item_sets` is treated as a list of itemsets (tuples or
        lists) and two itemsets are joined when they hold the same elements
        apart from their last one.
    minsup : float
        Minimum support (as computed by ``support``) a candidate needs to be kept.
    item_sets : list
        The frequent movies (size 2) or the frequent itemsets of the previous level.

    Returns
    -------
    list
        The frequent candidates in generation order. Pairs are tuples; joined
        candidates are lists made of the first itemset followed by the last
        element of the second.
    """
    if itemset_size == 2:
        # Use the list handed in by the caller rather than the notebook-level
        # l1_movies, so the function does not depend on hidden global state.
        candidates = [(x, y) for x in item_sets for y in item_sets if x < y]
    else:
        # Sort every prefix once up front instead of re-sorting inside the pair loop.
        prefixes = [sorted(item_set[:-1]) for item_set in item_sets]
        candidates = []
        for i in range(len(item_sets)):
            for j in range(i + 1, len(item_sets)):
                if prefixes[i] == prefixes[j]:
                    # Shared prefix + both last elements == first itemset + last of second.
                    candidates.append(list(item_sets[i]) + [item_sets[j][-1]])

    return [candidate for candidate in candidates if support(candidate) >= minsup]
def generate_association_rules(item_list, minconf, rules=None):
    """Derive single-consequent association rules from each itemset in `item_list`.

    For every itemset, each element in turn becomes the consequent and the
    remaining elements form the antecedent. A rule is kept when
    ``support(itemset) / support(antecedent) >= minconf``.

    Parameters
    ----------
    item_list : list
        Itemsets (tuples or lists of movie ids).
    minconf : float
        Minimum confidence a rule needs to be kept.
    rules : list, optional
        List the rules are appended to. When omitted, the notebook-level
        ``ass_rules`` list is used (looked up at call time), which is what
        existing two-argument calls rely on.

    Returns
    -------
    list
        The list that was appended to. Each rule is
        ``[antecedent, [consequent], support, confidence]``.
    """
    if rules is None:
        # Backward-compatible default: accumulate into the notebook-level list.
        rules = ass_rules

    for x in item_list:
        s = support(x)
        for i in range(len(x)):
            antecedent = [item for j, item in enumerate(x) if j != i]
            antecedent_support = support(antecedent)
            if antecedent_support == 0:
                # Confidence is undefined for an antecedent nobody has; skip
                # instead of raising ZeroDivisionError.
                continue
            conf = s / antecedent_support
            if conf >= minconf:
                rules.append([antecedent, [x[i]], s, conf])

    return rules

## Data preprocessing

In [None]:
# Load the raw ratings and keep only rows rated strictly above 2.
df = pd.read_csv('ratings.csv')
df = df.loc[df['rating'] > 2]

# Keep only users with more than 10 remaining rated movies.
user_counts = df.groupby('userId')['movieId'].count()
valid_users = user_counts.loc[user_counts > 10].index
df = df.loc[df['userId'].isin(valid_users)]

# Row-wise 80/20 split with a fixed seed; a user's ratings can land in both parts.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# userId -> list of movieIds, one dictionary per split.
train_set = {
    user: list(group['movieId'])
    for user, group in train_data.groupby('userId')
}
test_set = {
    user: list(group['movieId'])
    for user, group in test_data.groupby('userId')
}

# Every training movie occurrence, then the distinct movies.
movies_train = [movie for watched in train_set.values() for movie in watched]
movies_train_unique = list(set(movies_train))


### Part-1


In [None]:
minsup = 0.09
minconf = 0.1

# Compute each movie's support exactly once (every call scans the whole training
# set), then keep the movies that reach the minimum support.
movie_supports = {movie: support([movie]) for movie in movies_train_unique}
asr_movies_sup = {movie: sup for movie, sup in movie_supports.items() if sup >= minsup}

movies_stminsup = list(asr_movies_sup.keys())

l1_movies = movies_stminsup

# Level-wise frequent itemsets: each level is built from the previous one.
l2_movies = generate_frequent_itemsets(2, minsup,l1_movies)
l3_movies = generate_frequent_itemsets(3, minsup,l2_movies)
# NOTE(review): l4_movies is computed but no rules are generated from it below — confirm intended.
l4_movies = generate_frequent_itemsets(4, minsup,l3_movies)

# generate_association_rules appends to this notebook-level list; resetting it
# here keeps the cell idempotent when it is re-run.
ass_rules = []


generate_association_rules(l2_movies, minconf)
generate_association_rules(l3_movies, minconf)


### Part-2

In [None]:
# Each rule is [antecedent, consequent, support, confidence]; rank by support and by confidence.
sup_rules = sorted(ass_rules, key=lambda rule: rule[2], reverse=True)
conf_rules = sorted(ass_rules, key=lambda rule: rule[3], reverse=True)

top_100_sup_rules = sup_rules[:100]
top_100_conf_rules = conf_rules[:100]

# Rules that appear in both top-100 lists, in confidence order.
common_sup_conf_rules = [rule for rule in top_100_conf_rules if rule in top_100_sup_rules]
print(common_sup_conf_rules)

# Write only the top-100 rules: the files are named "top100...", but the full
# sorted lists were being written before.
with open('10_top100RulesBySup.txt', 'w') as sup_file:
    for rule in top_100_sup_rules:
        sup_file.write(','.join(map(str, rule)) + '\n')

# NOTE(review): this file name has no '.txt' extension and a different numeric
# prefix than the one above — left unchanged; confirm the expected name.
with open('4_top100RulesByConf', 'w') as conf_file:
    for rule in top_100_conf_rules:
        conf_file.write(','.join(map(str, rule)) + '\n')


### Part-3

In [None]:
# Evaluate the rule-based recommender for k = 1..10 rules per training movie,
# averaged over every user that has both training and held-out test movies.

# Group rule consequents by the first antecedent movie, in conf_rules order, so
# "the first k matching rules" becomes a slice instead of a scan over all rules
# for every (k, user, movie) combination.
# NOTE(review): as before, a rule is matched on its first antecedent item only,
# so rules with multi-item antecedents fire even if the user lacks the other
# items — confirm this is intended.
consequents_by_movie = {}
for asr in conf_rules:
    consequents_by_movie.setdefault(int(asr[0][0]), []).append(asr[1])

eval_users = [user for user in train_set if user in test_set]
k_values = list(range(1, 11))

precision_avgs = []
recall_avgs = []
for k in k_values:
    # Reset the accumulators once per k. Resetting them per user (as before)
    # left only the last user's scores in the "sum".
    recall_sum = 0
    precision_sum = 0

    for user in eval_users:
        test = set(test_set[user])

        recommendation = []
        for movie in train_set[user]:
            for consequent in consequents_by_movie.get(int(movie), [])[:k]:
                recommendation.extend(consequent)
        # Several training movies can recommend the same movie; count each
        # recommended movie once so recall cannot exceed 1.
        recommendation = list(dict.fromkeys(recommendation))

        hitset = [m for m in recommendation if m in test]
        recall_sum += len(hitset) / len(test)
        precision_sum += len(hitset) / len(recommendation) if recommendation else 0

    # Average over the users that were actually evaluated.
    num_eval = len(eval_users)
    recall_avgs.append(recall_sum / num_eval if num_eval else 0)
    precision_avgs.append(precision_sum / num_eval if num_eval else 0)

print(precision_avgs)
print(recall_avgs)

plt.plot(k_values, precision_avgs, label='precision')
plt.plot(k_values, recall_avgs, label='recall')
plt.xlabel('k')
plt.ylabel('Score')
plt.title('Precision and Recall vs. k')
plt.legend()
plt.show()


Decreasing Precision: On increasing the number of rules (k), the average precision is likely to decrease. This is because with more rules, we are recommending a larger set of items, and some of those recommendations may not be relevant to the user. As a result, the precision, which measures how many of the recommended items are relevant, tends to decrease as k increases.

Increasing Recall: On the other hand, on increasing the number of rules (k), the average recall is likely to increase. This is because with more rules, we are recommending a larger set of items, which is more likely to include some of the relevant items from the test set. Recall measures how many of the relevant items are included in the recommendations, and as we provide more recommendations, we are more likely to cover a larger portion of the relevant items.

Trade-off Between Precision and Recall: The graph will likely show a trade-off between precision and recall. When we have fewer rules (lower k), we have a higher precision but a lower recall, and when we have more rules (higher k), we have a higher recall but a lower precision.

### Part-4

In [None]:
import random
import matplotlib.pyplot as plt
SAMPLE_SEED = 42  # fixed seed so the sampled users are the same on every run

# Only users with both training and test movies can be evaluated. Sampling from
# test_set alone could pick a user that is missing from train_set (KeyError).
eligible_users = [user for user in test_set if user in train_set]
# Clamp so random.sample does not raise when fewer than 20 users are eligible.
sample_size = min(20, len(eligible_users))
sample_users = random.Random(SAMPLE_SEED).sample(eligible_users, sample_size)

# Group rule consequents by the first antecedent movie, in conf_rules order, so
# the first k matching rules for a movie are a slice rather than a full scan.
# NOTE(review): rules are matched on their first antecedent item only, as before.
consequents_by_movie = {}
for asr in conf_rules:
    consequents_by_movie.setdefault(int(asr[0][0]), []).append(asr[1])

k_values = list(range(1, 11))
precision_avgs = []
recall_avgs = []

for k in k_values:
    recall_sum = 0
    precision_sum = 0

    for user in sample_users:
        test = set(test_set[user])

        recommendation = []
        for movie in train_set[user]:
            for consequent in consequents_by_movie.get(int(movie), [])[:k]:
                recommendation.extend(consequent)
        # Count each recommended movie once so recall cannot exceed 1.
        recommendation = list(dict.fromkeys(recommendation))

        hit_set = [m for m in recommendation if m in test]

        recall_sum += len(hit_set) / len(test)
        precision_sum += len(hit_set) / len(recommendation) if recommendation else 0

    recall_avgs.append(recall_sum / sample_size if sample_size else 0)
    precision_avgs.append(precision_sum / sample_size if sample_size else 0)

plt.plot(k_values, precision_avgs, label='precision')
plt.plot(k_values, recall_avgs, label='recall')
plt.legend()
plt.xlabel('k')
plt.ylabel('Score')
plt.title('Precision and Recall vs. k')
plt.show()

In [None]:
# One precision/recall-vs-k panel per sampled user, laid out five panels per figure row.
users_per_row = 5
num_users = len(sample_users)
num_rows = (num_users + users_per_row - 1) // users_per_row
k_values = list(range(1, 11))

# Group rule consequents by the first antecedent movie, in conf_rules order, so
# the first k matching rules for a movie are a slice rather than a full scan.
# NOTE(review): rules are matched on their first antecedent item only, as before.
consequents_by_movie = {}
for asr in conf_rules:
    consequents_by_movie.setdefault(int(asr[0][0]), []).append(asr[1])

for row in range(num_rows):
    plt.figure(figsize=(25, 5))

    row_users = sample_users[row * users_per_row:(row + 1) * users_per_row]
    for col, user in enumerate(row_users):
        # .get keeps a user that is missing from either split from raising;
        # such a user simply scores 0 for every k.
        train = train_set.get(user, [])
        test = set(test_set.get(user, []))

        # Per-user series under their own names, so the aggregate
        # precision_avgs / recall_avgs from the previous cell are not overwritten.
        user_precisions = []
        user_recalls = []

        for k in k_values:
            recommendation = []
            for movie in train:
                for consequent in consequents_by_movie.get(int(movie), [])[:k]:
                    recommendation.extend(consequent)
            # Count each recommended movie once so recall cannot exceed 1.
            recommendation = list(dict.fromkeys(recommendation))

            hit_set = [m for m in recommendation if m in test]

            user_recalls.append(len(hit_set) / len(test) if test else 0)
            user_precisions.append(len(hit_set) / len(recommendation) if recommendation else 0)

        plt.subplot(1, users_per_row, col + 1)
        plt.plot(k_values, user_precisions, label='precision')
        plt.plot(k_values, user_recalls, label='recall')
        plt.xlabel('k')
        plt.ylabel('Score')
        plt.title(f'User {user}')
        plt.legend()

    plt.show()
