## Discovering Most Frequently Appearing Courses using Apriori Algorithm

In [165]:
import re
import json
import hashlib
import numpy as np
import pandas as pd

In [166]:
# Generating a personalized random seed
username = "prit.kanadiya"
seed = int(hashlib.sha256(username.encode()).hexdigest(), 16) % (2**32)
print(seed)

1379217023


In [167]:
df = pd.read_csv("../assets/data/student_courses.csv")
with open("../assets/data/course_mapping.json", 'r') as file:
    course_mapping = json.load(file)

In [168]:
print(df.head())
print(df.iloc[0, 1].split(","))
print(course_mapping["12"])

   student_id     course_list
0           0        49,65,76
1           1              42
2           2  17,22,52,74,87
3           3           21,68
4           4      1,34,57,64
['49', '65', '76']
CL3.411-NLP for Healthcare


In [169]:
data = {}

# We convert our data to dictionary as accessing course list for each student will be easier
for idx, row in df.iterrows():
    courses = courses = row.iloc[1].split(",")
    for j in range(len(courses)):
        courses[j] = int(courses[j])

        if courses[j] == 21:
            courses[j] = 20
        if courses[j] == 87:
            courses[j] = 86
    
    data[int(row.iloc[0])] = courses

In [170]:
class Apriori():
    def __init__(self, data: dict[int, list[int]], course_mapping: dict[int, str], support_threshold: float, confidence_threshold: float):
        self.data = data
        self.course_mapping = course_mapping
        self.total_transactions = len(data.keys())    # Total number of students
        self.support_threshold = support_threshold
        self.confidence_threshold = confidence_threshold
    
    def get_itemset_support(self, itemset):
        """
        calculate support for an itemset
        """
        count = 0
        for key in self.data.keys():
            courses = self.data[key]
            if (len(courses) < len(itemset)):    # Itemset is big than the transaction
                continue
            
            match = 0
            for i in range(len(courses)):    # Checking if itemset is present in the given transaction
                if (courses[i] in itemset):
                    match += 1
    
            if (match == len(itemset)):    # If all items of itemset are found in transactions
                count += 1
            
        return count / self.total_transactions

    def get_confidence(self, lhs_itemset: list[int], rhs_itemset: list[int]):
        """
        calculate confidence for {lhs_itemset} -> {rhs_itemset}
        """
        numerator = self.get_itemset_support(lhs_itemset + rhs_itemset)
        denominator = self.get_itemset_support(lhs_itemset)

        return numerator / denominator

    def get_lift(self, lhs_itemset: list[int], rhs_itemset: list[int]):
        """
        calculate lift for {lhs_itemset} -> {rhs_itemset}
        """
        numerator = self.get_confidence(lhs_itemset, rhs_itemset)
        denominator = self.get_itemset_support(rhs_itemset)

        return numerator / denominator
    
    def check_candidate(self, candidate, prev_itemset):
        """
        checking if all k-1 subsets of candidate are in prev_itemset
        """
        k = len(candidate)
        for i in range(k-1):
            # Skip the i-th item and test for all others
            test = []
            for j in range(k):
                if (j == i):
                    continue
                else:
                    test.append(candidate[j])    # This will naturally be sorted
            
            if (test in prev_itemset):
                continue
            else:
                return False
        
        return True

    
    def build_1st_itemset(self):
        """  
        first itemset is built separately from other itemsets
        """
        itemset_1 = []
        print(f"Building 1-itemset")
        for key in course_mapping.keys():
            if(self.get_itemset_support([int(key)]) >= self.support_threshold):    # We need to pass an list of integer to function
                itemset_1.append([int(key)])
        
        print(f"1-itemset consists of {len(itemset_1)} objects.")
        return itemset_1

    def build_kth_itemset(self, prev_itemset):
        """  
        kth itemset is built from (k-1)th itemset, where k > 1
        """
        curr_itemset = set()
        k = len(prev_itemset[0]) + 1
        print(f"Building {k}-itemset")
        for i in range(len(prev_itemset)):
            for j in range(i+1, len(prev_itemset)):
                
                # We need to first check if both can be combined to make a new itemset or not
                if (prev_itemset[i][:-1] == prev_itemset[j][:-1]):     # It should be a candidate for kth itemset
                    candidate = prev_itemset[i] + [prev_itemset[j][-1]]    # Because our 1-itemset is always sorted, candidate is sorted
                    if(self.check_candidate(candidate, prev_itemset)):
                        if(self.get_itemset_support(candidate) >= self.support_threshold):    # Checking itemset support
                            candidate = tuple(sorted(candidate))    # Because we want to add it to set
                            curr_itemset.add(candidate)

        curr_itemset = [list(c) for c in curr_itemset]
        print(f"{k}-itemset consists of {len(curr_itemset)} objects.")
        return curr_itemset

    def generate_rules_from_itemset(self, itemset: list[int]):
        """
        for k - itemset, we can generate ((2^k) - 2) rules
        """
        rules = []
        num_rules = 2**(len(itemset))    # 2^k
        for i in range(1, num_rules - 1):    # ((2^k) - 2)
            binary_mask =  list(format(i, f'0{len(itemset)}b'))    # This creates binary masks to choose LHS and RHS

            lhs = [v for v, m in zip(itemset, binary_mask) if m == "1"]
            rhs = [v for v, m in zip(itemset, binary_mask) if m == "0"]
            rules.append((lhs, rhs))
        
        return rules
    
    def discover_rules(self):
        prev_itemset = self.build_1st_itemset()
        discovered_rules = []

        # Till we are able to extract new itemsets
        while(True):
            curr_itemset = self.build_kth_itemset(prev_itemset)
            if (len(curr_itemset) == 0):
                break
            
            for itemset in curr_itemset:
                candidate_rules = self.generate_rules_from_itemset(itemset)
                
                for rule in candidate_rules:
                    rule_confidence = self.get_confidence(rule[0], rule[1])
                    rule_lift = self.get_lift(rule[0], rule[1])

                    if (rule_confidence >= self.confidence_threshold and rule_lift > 1):
                        discovered_rules.append(rule)

            prev_itemset = curr_itemset
        
        return discovered_rules



In [171]:
# We can assume that courses which have >= 50 students are popular courses
# 50 students mean support_threshold = 0.025 since total number of students is 2000

apriori = Apriori(data, course_mapping, support_threshold=0.025, confidence_threshold=0.95)
rules = apriori.discover_rules()

Building 1-itemset
1-itemset consists of 40 objects.
Building 2-itemset
2-itemset consists of 38 objects.
Building 3-itemset
3-itemset consists of 16 objects.
Building 4-itemset
4-itemset consists of 0 objects.


In [172]:
print(len(rules))

68


In [173]:
# print(rules)
for rule in rules:
    lhs = []
    rhs = []

    for item in rule[0]:
        lhs.append(course_mapping[str(item)])
    
    for item in rule[1]:
        rhs.append(course_mapping[str(item)])

    print(f"{lhs} -> {rhs}")

['MA4.101-Real Analysis'] -> ['Computer Programming']
['EC5.201-Signal Processing'] -> ['EC5.202-Systems Thinking']
['EC5.202-Systems Thinking'] -> ['EC5.201-Signal Processing']
['EC5.101-Networks Signals and Systems'] -> ['MA4.101-Real Analysis']
['MA5.101-Discrete Structures'] -> ['Computer Programming']
['EC5.201-Signal Processing'] -> ['SC1.110a-Science I']
['EC2.201-VLSI Design'] -> ['SC1.110a-Science I']
['CS4.301-Data and Applications'] -> ['Algorithm Analysis and Design']
['Algorithm Analysis and Design'] -> ['CS4.301-Data and Applications']
['CS1.304-Data Structures & Algorithms for Problem Solving'] -> ['CS3.304-Advanced Operating Systems']
['CS3.304-Advanced Operating Systems'] -> ['CS1.304-Data Structures & Algorithms for Problem Solving']
['CS3.304-Advanced Operating Systems'] -> ['MA6.302-MCS 2-Linear Algebra']
['MA6.302-MCS 2-Linear Algebra'] -> ['CS3.304-Advanced Operating Systems']
['EC3.202-Embedded Systems Workshop'] -> ['CS4.301-Data and Applications']
['EC3.202-Emb

All the results that we see here are courses which are compulsory for certain segment of students, and hence the associations have very high confidence and lift.

For example,
1) `['MA4.101-Real Analysis'] -> ['Computer Programming']` is a setting that is compulsory for `B.Tech I year I Semester - CSE&CSD`
2) `['CS3.304-Advanced Operating Systems'] -> ['CS1.304-Data Structures & Algorithms for Problem Solving']` is a setting that is compulsory for `M.Tech I year I Semester - CSE & CSIS`

However, these results dominate the elective choices. Students from `M.S.`, `PhD` and `B. Tech (after II year)` are allowed to choose electives. Thus to explore the associations in electives, we will remove all compulsory courses. Generally, electives for `M.S` and `PhD` are of `400 and above` level.

In [174]:
# Retaining all courses where level >= 400 
allowed_levels = {'.4', '.5', '.6'}

filtered_course_mapping = {}
for key, course in course_mapping.items():
    level = re.search(r'\.(\d)', course)
    if level and f".{level.group(1)}" in allowed_levels:
        filtered_course_mapping[key] = course

print(len(filtered_course_mapping))
print(filtered_course_mapping)


48
{'3': 'CS4.405-Data Analytics I', '5': 'EC5.413-Quantum Error-Correction Codes', '6': 'CS3.405-Blockchain and Web3 Development', '7': 'CS3.401-Distributed Systems', '9': 'CS9.428-Environmental Science & Technology', '10': 'CS9.429-Design for Social Innovation', '11': 'EC5.406-Signal Detection and Estimation Theory', '12': 'CL3.411-NLP for Healthcare', '13': 'CL2.405-Speech Analysis and Linguistics', '14': 'CS7.506-Causal Inference', '15': 'GS3.403-Geospatial Technology for Disaster Risk Modelling', '16': 'PD2.421-Business Fundamentals', '19': 'HS8.401-Rethinking Corruption in India', '20': 'CS7.403-Statistical Methods in AI', '21': 'CS7.403b-Statistical Methods in AI', '22': 'EC2.408-Digital VLSI Design', '23': 'CS8.501-Research in Information Security', '28': 'PD2.402-Early Stage Funding for a Startup', '30': 'HS1.401-Readings from Hindi Literature', '31': 'HS2.401-Work, Entrepreneurship and Technology in Contemporary Societies', '32': 'CS9.501-User Research Methods', '35': 'SC2.40

In [190]:
modified_data = {}
non_electives = ["16", "80", "54", "47", "63", "32"]    # These are some special case which are above 400, but not electives
# We convert our data to dictionary as accessing course list for each student will be easier
for idx, row in df.iterrows():
    courses = row.iloc[1].split(",")
    modified_courses = []
    for j in range(len(courses)):

        if (courses[j] in filtered_course_mapping.keys() and not(courses[j] in non_electives)):
            modified_courses.append(int(courses[j]))

            if modified_courses[-1] == "21":
                modified_courses[j] = 20
            if modified_courses[-1] == "87":
                modified_courses[j] = 86
    
    if (len(modified_courses) > 0):
        modified_data[int(row.iloc[0])] = modified_courses

In [191]:
print(len(modified_data))    # 1001 students out of ~2000 students choose atleast one elective

960


In [204]:
# For electives, class size vary a lot, hence a lower support value is preffered
# 10 students mean support_threshold = 0.01 since total number of students is ~1000

apriori = Apriori(modified_data, filtered_course_mapping, support_threshold=0.01, confidence_threshold=0.7)
rules = apriori.discover_rules()

Building 1-itemset
1-itemset consists of 40 objects.
Building 2-itemset
2-itemset consists of 43 objects.
Building 3-itemset
3-itemset consists of 2 objects.
Building 4-itemset
4-itemset consists of 0 objects.


In [205]:
print(len(rules))

4


In [206]:
# print(rules)
for rule in rules:
    lhs = []
    rhs = []

    for item in rule[0]:
        lhs.append(course_mapping[str(item)])
    
    for item in rule[1]:
        rhs.append(course_mapping[str(item)])

    print(f"{lhs} -> {rhs}")

['CE1.621-Retrofit of Existing Infrastructure'] -> ['CE1.610-Advanced Design of Steel Structures']
['CE1.610-Advanced Design of Steel Structures'] -> ['CE1.621-Retrofit of Existing Infrastructure']
['CS3.401-Distributed Systems', 'CG1.402-Introduction to Cognitive Science'] -> ['CS3.402-Advanced Computer Networks']
['CS3.401-Distributed Systems', 'CS8.501-Research in Information Security'] -> ['CS3.402-Advanced Computer Networks']
