In [77]:
import random
from math import exp
from math import log
from collections import defaultdict
import numpy as np
import time
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import kagglehub
from sklearn.preprocessing import OneHotEncoder
import torch


In [78]:


class KNN(object):

    def __init__(self, numUsers, numItems,batchSize  ,lamI = 6e-2, lamJ = 6e-3, learningRate = 0.1, epochs=1):
        self._numUsers = numUsers
        self._numItems = numItems
        self._lamI = lamI
        self._lamJ = lamJ
        self._learningRate = learningRate
        self._users = set()
        self._items = set()
        self._Iu = defaultdict(set)
        self.C = torch.rand(numItems, numItems)  # Random correlation matrix
        self._batchSize = batchSize
        self._epochs = epochs

        
    def predict(self, u, i):
        # Predict the score for user u and item i
        return torch.matmul(self.C[i], self.C.T[:, u])

    
    
    def sigmoid(self, x):
        return 1/(1+np.exp(-x))

    def train(self, trainData):
        
        # correlation matrix
        self.C =np.random.rand(self._numItems, self._numItems)  
        for l in range(self._numItems):
            self.C[l][l] = 0
            for n in range(l + 1, self._numItems):
                self.C[l][n] = self.C[n][l]


        print("for loop for items*2 is done.")
        # change batch_size to min(batch-size, len(train))
        if len(trainData) < self._batchSize:
            sys.stderr.write("WARNING: Batch size is greater than number of training samples, switching to a batch size of %s\n" % str(len(trainData)))
            self._batchSize = len(trainData)
                  
        self._trainDict, self._users, self._items = self._dataPretreatment(trainData)
        N = len(trainData) * self._epochs
        users, pItems, nItems = self._sampling(N)
        itr = 0
        t2 = t0 = time.time()
        while (itr+1)*self._batchSize < N:
            # print("iter : " ,  itr , " batch size : "  , self._batchSize)
      
            self._mbgd(
                
                users[itr*self._batchSize: (itr+1)*self._batchSize],
                pItems[itr*self._batchSize: (itr+1)*self._batchSize],
                nItems[itr*self._batchSize: (itr+1)*self._batchSize]
            )
            
            itr += 1
            t2 = time.time()
            sys.stderr.write("\rProcessed %s ( %.3f%% ) in %.1f seconds" %(str(itr*self._batchSize), 100.0 * float(itr*self._batchSize)/N, t2 - t0))
            sys.stderr.flush()
        if N > 0:
            sys.stderr.write("\nTotal training time %.2f seconds; %.2f samples per second\n" % (t2 - t0, N*1.0/(t2 - t0)))
            sys.stderr.flush()
            
            
    def _mbgd(self, users, pItems, nItems):
        
        prev = -2**10
        for _ in range(30):
            
            gradientC = defaultdict(float)
            obj = 0

            for ind in range(len(users)):
                u, i, j = users[ind], pItems[ind], nItems[ind]
                x_ui = sum([self.C[i][l] for l in self._Iu[u] if i != l])
                x_uj = sum([self.C[j][l] for l in self._Iu[u]])
                x_uij =  x_ui - x_uj
                
                for l in self._Iu[u]:
                    if l != i:
                        gradientC[(i,l)] += (1-self.sigmoid(x_uij)) + self._lamI * self.C[i][l]**2
                        gradientC[(l,i)] += (1-self.sigmoid(x_uij)) + self._lamI * self.C[l][i]**2
                    gradientC[(j,l)] += -(1-self.sigmoid(x_uij)) + self._lamJ * self.C[j][l]**2
                    gradientC[(l,j)] += -(1-self.sigmoid(x_uij)) + self._lamJ * self.C[l][j]**2
                    
                    obj -= 2*self._lamI * self.C[i][l]**2 + 2*self._lamJ * self.C[j][l]**2
                    
                obj += log(self.sigmoid(x_uij))
            
            #print 'OBJ: ', obj
            if prev > obj: 
                break
            prev = obj
            
            for a,b in gradientC:
                self.C[a][b] += self._learningRate * gradientC[(a,b)]
            
        #print _, '\n'
        
    def _sampling(self, N):
        print(f"Generating {N} random training samples")
        userList = list(self._users)
        
        # Generate all random numbers at once
        userIndex = np.random.randint(0, len(self._users), N)
        pItems = []
        nItems = []
        
        # Process in smaller chunks
        chunk_size = 1000
        for chunk_start in range(0, N, chunk_size):
            chunk_end = min(chunk_start + chunk_size, N)
            chunk_indices = userIndex[chunk_start:chunk_end]
            
            for index in chunk_indices:
                u = userList[index]
                # Get positive item
                i = self._trainDict[u][np.random.randint(len(self._Iu[u]))]
                pItems.append(i)
                
                # Get negative item more efficiently
                j = np.random.randint(self._numItems)
                while j in self._Iu[u]:
                    j = np.random.randint(self._numItems)
                nItems.append(j)
            
            print(f"Generated {chunk_end}/{N} samples ({(chunk_end/N)*100:.1f}%)")
        
        print("Sampling completed!")
        return userIndex, pItems, nItems
    def predictionsKNN(self, K, u):
        # Convert the correlation matrix to a PyTorch tensor if it's not already
        if not isinstance(self.C, torch.Tensor):
            self.C = torch.tensor(self.C, dtype=torch.float32)

        # Check if the user has any items
        if not self._Iu[u]:
            print(f"User {u} has no items.")
            return torch.zeros(self._numItems)  # Return a zero tensor or handle as needed

        if K >= len(self._Iu[u]):
            # Use PyTorch sum and indexing
            res = torch.sum(torch.stack([self.C[:, l] for l in self._Iu[u]]), dim=0)
        else:
            res = []
            for i in range(self._numItems):
                # Use PyTorch operations for sorting and summing
                item_scores = torch.tensor([self.C[i][l] for l in self._Iu[u]], dtype=torch.float32)
                top_k_scores = torch.topk(item_scores, K).values
                res.append(torch.sum(top_k_scores).item())
        return res
    ...

    def prediction(self, u, i):
        scores = self.predictionsAll(u)
        # Fix the index calculation
        return scores[i] > sorted(scores)[int(self._numItems * 0.8)]
    

    def prediction(self, u, i):
        
        scores = self.predictionsAll(u)
        return scores[i] > sorted(scores)[self._numItem*0.8]

    def _dataPretreatment(self, data):
        # print(" control is here")
        dataDict = defaultdict(list)
        items = set()
        ind = 0
        for index , rows in data.iterrows():
            ind+=1
            u = rows['userid_DI']

            i = rows['course_id']
            # print("user : " , u , "item : " , i)
            self._Iu[u].add(i)
            dataDict[u].append(i)
            items.add(i)

        # print("data dicts are : " , dataDict , dataDict.keys , items)
        return dataDict, set(dataDict.keys()), items

In [79]:
# path2 = kagglehub.dataset_download('thedevastator/online-course-student-engagement-metrics', path='Courses.csv')
# print("path 2 :  " , path2)

path2 = "Courses.csv"
df = pd.read_csv(path2)  # Windows-1252 encoding
print(df.head())

   index                   course_id       userid_DI  registered  viewed  \
0      0  HarvardX/CB22x/2013_Spring  MHxPC130442623           1       0   
1      1         HarvardX/CS50x/2012  MHxPC130442623           1       1   
2      2  HarvardX/CB22x/2013_Spring  MHxPC130275857           1       0   
3      3         HarvardX/CS50x/2012  MHxPC130275857           1       0   
4      4  HarvardX/ER22x/2013_Spring  MHxPC130275857           1       0   

   explored  certified final_cc_cname_DI LoE_DI  YoB  ... grade start_time_DI  \
0         0          0     United States    NaN  NaN  ...     0    2012-12-19   
1         0          0     United States    NaN  NaN  ...     0    2012-10-15   
2         0          0     United States    NaN  NaN  ...     0    2013-02-08   
3         0          0     United States    NaN  NaN  ...     0    2012-09-17   
4         0          0     United States    NaN  NaN  ...     0    2012-12-19   

  last_event_DI nevents  ndays_act  nplay_video  nchapte

In [80]:
df2 = df[["course_id"  , "userid_DI"]]
print(df2.head())
print(df2.shape)

                    course_id       userid_DI
0  HarvardX/CB22x/2013_Spring  MHxPC130442623
1         HarvardX/CS50x/2012  MHxPC130442623
2  HarvardX/CB22x/2013_Spring  MHxPC130275857
3         HarvardX/CS50x/2012  MHxPC130275857
4  HarvardX/ER22x/2013_Spring  MHxPC130275857
(641138, 2)


In [81]:
#  Create mapping dictionaries
course_id_map = {id_: idx for idx, id_ in enumerate(df2['course_id'].unique())}
user_id_map = {id_: idx for idx, id_ in enumerate(df2['userid_DI'].unique())}

# Create new dataframe with mapped values
df2_numeric = df2.copy()
df2_numeric['course_id'] = df2['course_id'].map(course_id_map)
df2_numeric['userid_DI'] = df2['userid_DI'].map(user_id_map)

# Print first 15 rows of numeric data
i = 0
for index, rows in df2_numeric.iterrows():
    i += 1
    if(i > 15):
        break
    print(rows['course_id'], rows['userid_DI'])


user_course_counts = df2_numeric.groupby('userid_DI')['course_id'].nunique()

# Print users with their course counts, sorted by count
print("\nUsers and their course counts:")
print(user_course_counts.sort_values().head(10))  # Show 10 users with least courses
print("\nSummary statistics:")
print(f"Average courses per user: {user_course_counts.mean():.2f}")
print(f"Median courses per user: {user_course_counts.median():.2f}")
print(f"Max courses by a user: {user_course_counts.max()}")
print(f"Min courses by a user: {user_course_counts.min()}")
# Create train/test splits following leave-one-out evaluation scheme
def create_train_test_split(df):
    print("creating train-test split")
    
    # Get random indices for test set - one per user
    test_indices = df.groupby('userid_DI').apply(
        lambda x: x.sample(n=1).index[0]
    ).values
    
    # Create test and train dataframes using boolean indexing
    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    print("done with test-train split")
    
    return train_df, test_df

# Create evaluation pairs E(u) for each user
def create_evaluation_pairs(train_df, test_df, num_items, sample_size=100):

    print("creating evaluation pairs")
    evaluation_pairs = {}
    
    # Pre-compute test user data
    test_user_items = test_df.set_index('userid_DI')['course_id']
    train_user_items = train_df.groupby('userid_DI')['course_id'].apply(set).to_dict()
    all_items = set(range(num_items))
    
    # Process each user more effidef predictions_to_courses(predictions):
#     predictions_int = [int(round(p)) for p in predictions]
#     return [course_id_map[i] for i in predictions_int]

# print("predictions to courses:", predictions_to_courses(predictions))ciently
    for user in test_df['userid_DI'].unique():
        positive_item = test_user_items[user]
        train_items = train_user_items.get(user, set())
        
        # Use set operations for faster filtering
        negative_items = list(all_items - train_items - {positive_item})
        
        # Optimize random sampling
        if len(negative_items) > sample_size:
            # Use numpy's efficient random sampling
            negative_items = np.random.choice(
                negative_items, 
                size=sample_size,
                replace=False
            )
            
        evaluation_pairs[user] = {
            'positive_item': positive_item,
            'negative_items': negative_items
        }

    print("done with evaluation pairs")
    
    return evaluation_pairs

# Calculate AUC using PyTorch
def calculate_auc_pytorch(model, eval_pairs, device):
    auc_sum = 0.0
    user_count = 0

    for user, items in eval_pairs.items():
        positive_item = items['positive_item']
        negative_items = items['negative_items']
        
        # Move data to GPU
        user_tensor = torch.tensor([user], device=device)
        positive_item_tensor = torch.tensor([positive_item], device=device)
        negative_items_tensor = torch.tensor(negative_items, device=device)

        # Predict scores
        x_ui = model.predict(user_tensor, positive_item_tensor)
        x_uj = model.predict(user_tensor.repeat(len(negative_items)), negative_items_tensor)

        # Vectorized comparison
        auc_user = torch.sum(x_ui > x_uj).item() / len(negative_items)
        auc_sum += auc_user
        user_count += 1

    return auc_sum / user_count if user_count > 0 else 0.0


# # splitting into training and testing data
# train_df, test_df = create_train_test_split(df2_numeric)

# # Creating evaluation pairs for calculation of auc
# eval_pairs = create_evaluation_pairs(train_df, test_df, num_items=16, sample_size=100)



0 0
1 0
0 1
1 1
2 1
3 1
4 1
0 2
0 3
1 3
2 3
2 4
0 5
1 5
2 5

Users and their course counts:
userid_DI
476518    1
476519    1
476520    1
476521    1
476522    1
476523    1
476524    1
476525    1
476528    1
12        1
Name: course_id, dtype: int64

Summary statistics:
Average courses per user: 1.35
Median courses per user: 1.00
Max courses by a user: 16
Min courses by a user: 1


In [82]:
# Initialize the KNN model
# no of UI pairs = 641138

# Initialize the KNN model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model with smaller parameters
epochs = 1
batch_size =(int)( 641138/10)  # Smaller batch size
print("batch size:", batch_size)




# Significantly reduce the data size for testing
sample_size = (int) (641138)  # Take only 10k samples from the dataset
df_sample = df2_numeric.sample(n=sample_size, random_state=42)
print("total samples:", sample_size)
num_users = 476532
num_items = 16
bpr = KNN(num_users, num_items , epochs, batch_size)

user = 476518
print(f"User {user} took {len(df_sample[df_sample['userid_DI'] == user])} courses")

try:
    bpr.train(df_sample)
except Exception as e:
    print(f"Error during training: {e}")

# Make predictions
K = 3
user = 476518


predictions = bpr.predictionsKNN(K, user)

print("predictions:", predictions)

# def predictions_to_courses(predictions):
#     predictions_int = [int(round(p)) for p in predictions]
#     return [course_id_map[i] for i in predictions_int]

# print("predictions to courses:", predictions_to_courses(predictions))

batch size: 64113
total samples: 641138
User 476518 took 1 courses
for loop for items*2 is done.
Generating 641138 random training samples
Generated 1000/641138 samples (0.2%)
Generated 2000/641138 samples (0.3%)
Generated 3000/641138 samples (0.5%)
Generated 4000/641138 samples (0.6%)
Generated 5000/641138 samples (0.8%)
Generated 6000/641138 samples (0.9%)
