## Discovering Most Frequently Appearing Courses using Apriori Algorithm

In [1]:
import json
import hashlib
import numpy as np
import pandas as pd

In [2]:
# Generating a personalized random seed
username = "prit.kanadiya"
seed = int(hashlib.sha256(username.encode()).hexdigest(), 16) % (2**32)
print(seed)

1379217023


In [3]:
df = pd.read_csv("../assets/data/student_courses.csv")
with open("../assets/data/course_mapping.json", 'r') as file:
    course_mapping = json.load(file)

In [4]:
print(df.head())
print(df.iloc[0, 1].split(","))
print(course_mapping["12"])

   student_id     course_list
0           0        49,65,76
1           1              42
2           2  17,22,52,74,87
3           3           21,68
4           4      1,34,57,64
['49', '65', '76']
CL3.411-NLP for Healthcare


In [59]:
data = {}

# We convert our data to dictionary as accessing course list for each student will be easier
for idx, row in df.iterrows():
    courses = courses = row.iloc[1].split(",")
    for j in range(len(courses)):
        courses[j] = int(courses[j])

        if courses[j] == 21:
            courses[j] = 20
        if courses[j] == 87:
            courses[j] = 86
    
    data[int(row.iloc[0])] = courses

In [60]:
def itemset_support(itemset: list[int], data):
    count = 0
    total_transactions = len(data.keys())    # Total number of students

    for key in data.keys():
        courses = data[key]

        # Checking if itemset is present in the given transaction
        if (len(courses) < len(itemset)):    # Itemset is big than the transaction
            continue
        
        match = 0
        for i in range(len(courses)):
            if (courses[i] in itemset):
                match += 1
        
        if (match == len(itemset)):    # If all items of itemset are found in transactions
            count += 1
        
    return count / total_transactions

In [61]:
# Testing the itemset support function
smai_support = itemset_support([20], data)
print(smai_support)

# SMAI is one of the more popular courses. This means 12.26% of students (~250 students) took SMAI course.

0.12259371833839919


In [64]:
def confidence(lhs_itemset: list[int], rhs_itemset: list[int], data):
    # This is for {lhs_itemset} -> {rhs_itemset}
    numerator = itemset_support(lhs_itemset + rhs_itemset, data)
    denominator = itemset_support(lhs_itemset, data)

    return numerator / denominator

In [65]:
# Testing the confidence function
smai_to_dip_mr_confidence = confidence([20], [52, 68], data)
print(smai_to_dip_mr_confidence)

dip_mr_to_smai_confidence = confidence([52, 68], [20], data)
print(dip_mr_to_smai_confidence)

# This is confidence of {SMAI} -> {DIP, MR} and vice versa
# We can see that confidence is directional
# Also, given a student takes DIP and MR course, he is very likely to take SMAI
# The inverse is true only for a very small fraction of students

0.012396694214876033
0.27272727272727276


In [66]:
def lift(lhs_itemset: list[int], rhs_itemset: list[int], data):
    # This is for {lhs_itemset} -> {rhs_itemset}
    numerator = confidence(lhs_itemset, rhs_itemset, data)
    denominator = itemset_support(rhs_itemset, data)

    return numerator / denominator

In [67]:
# Testing the lift function
smai_to_dip_mr_lift = lift([20], [52, 68], data)
print(smai_to_dip_mr_lift)

dip_mr_to_smai_lift = lift([52, 68], [20], data)
print(dip_mr_to_smai_lift)

# This is lift of {SMAI} -> {DIP, MR} and vice versa
# We can see that lift is non-directional
# The lift shows the association between {SMAI} and {DIP, MR} is significant

2.224643125469572
2.224643125469572


In [70]:
# Computing 1-itemset supports
itemset_1 = {}
for key in course_mapping.keys():
    key_support = itemset_support([int(key)], data)

    itemset_1[key] = key_support

In [71]:
# A class can be considered as popular if there are atleast 50 students in the course
support_threshold = 0.025

itemset_1_filtered = []
for key in itemset_1.keys():
    if (itemset_1[key] > support_threshold):
        itemset_1_filtered.append(key)

print(len(itemset_1_filtered))

40


In [72]:
# Now we will generate rules and compute lift and confidence for each rule
rules = []

for i in range(len(itemset_1_filtered)):
    for j in range(len(itemset_1_filtered)):
        if (i == j):
            continue

        else:
            rules.append((itemset_1_filtered[i], itemset_1_filtered[j]))

print(len(rules))
print(40*39)

1560
1560


In [73]:
confidence_dict = {}
lift_dict = {}

for rule in rules:
    confidence_dict[rule] = confidence(rule[0], rule[1], data)
    lift_dict[rule] = confidence(rule[0], rule[1], data)

TypeError: 'in <string>' requires string as left operand, not int

In [48]:
print(itemset_1["22"])
print(itemset_1["1"])

0.034447821681864235
0.20364741641337386
