In [4]:
import os 
import pandas as pd

df = pd.read_csv('ml-32m-20241011T132018Z-001/ml-32m/ratings.csv')

In [5]:
df.drop( columns = ['rating' , 'timestamp'] , inplace = True)

df.head()

Unnamed: 0,userId,movieId
0,1,17
1,1,25
2,1,29
3,1,30
4,1,32


In [6]:
df.describe()

Unnamed: 0,userId,movieId
count,32000200.0,32000200.0
mean,100278.5,29318.61
std,57949.05,50958.16
min,1.0,1.0
25%,50053.0,1233.0
50%,100297.0,3452.0
75%,150451.0,44199.0
max,200948.0,292757.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   int64
 1   movieId  int64
dtypes: int64(2)
memory usage: 488.3 MB


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming df is already defined
# Shorten the length of the data to 50%
df = df.sample(frac=0.5, random_state=42)

# Now splitting the data into training and testing split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save the splits to CSV files
train_df.to_csv('train_data.txt', index=False)
test_df.to_csv('test_data.txt', index=False)

# Display information about the splits
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12800081 entries, 10128624 to 11717299
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   int64
 1   movieId  int64
dtypes: int64(2)
memory usage: 293.0 MB
<class 'pandas.core.frame.DataFrame'>
Index: 3200021 entries, 20566633 to 19454882
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   int64
 1   movieId  int64
dtypes: int64(2)
memory usage: 73.2 MB


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12800081 entries, 10128624 to 11717299
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   int64
 1   movieId  int64
dtypes: int64(2)
memory usage: 293.0 MB


In [10]:
import numpy as np
import math

def topK_scores(test, predict, topk, user_count, item_count):
    PrecisionSum = np.zeros(topk+1)
    RecallSum = np.zeros(topk+1)
    F1Sum = np.zeros(topk+1)
    NDCGSum = np.zeros(topk+1)
    OneCallSum = np.zeros(topk+1)
    MRRSum = 0
    MAPSum = 0
    total_test_data_count = 0
    
    # Precompute DCGbest for efficiency
    DCGbest = np.zeros(topk+1)
    for k in range(1, topk+1):
        DCGbest[k] = DCGbest[k - 1] + 1.0 / math.log2(k + 1)
    
    # Loop over each user
    for i in range(user_count):
        user_test = test[i * item_count:(i + 1) * item_count]
        user_predict = predict[i * item_count:(i + 1) * item_count]

        # Get test data size for this user
        test_data_size = np.sum(user_test)
        if test_data_size == 0:
            continue
        total_test_data_count += 1

        # Get top-k item indices for this user
        top_k_indices = np.argsort(user_predict)[-topk:][::-1]  # Sort in descending order
        
        hit_sum = 0
        DCG = np.zeros(topk + 1)
        for k in range(1, topk + 1):
            item_id = top_k_indices[k - 1]
            if user_test[item_id] == 1:
                hit_sum += 1
                DCG[k] = DCG[k - 1] + 1 / math.log2(k + 1)
            else:
                DCG[k] = DCG[k - 1]
            
            prec = hit_sum / k
            rec = hit_sum / test_data_size
            f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0

            PrecisionSum[k] += prec
            RecallSum[k] += rec
            F1Sum[k] += f1
            NDCGSum[k] += DCG[k] / DCGbest[k]
            OneCallSum[k] += 1 if hit_sum > 0 else 0

        # Compute MRR
        for rank, idx in enumerate(top_k_indices, start=1):
            if user_test[idx] == 1:
                MRRSum += 1 / rank
                break

        # Compute MAP
        AP = 0
        hit_count = 0
        for rank, idx in enumerate(top_k_indices, start=1):
            if user_test[idx] == 1:
                hit_count += 1
                AP += hit_count / rank
        MAPSum += AP / test_data_size
    
    # Normalize metrics
    total_test_data_count = max(1, total_test_data_count)  # Avoid division by 0
    print('MAP:', MAPSum / total_test_data_count)
    print('MRR:', MRRSum / total_test_data_count)
    print('Prec@5:', PrecisionSum[5] / total_test_data_count)
    print('Rec@5:', RecallSum[5] / total_test_data_count)
    print('F1@5:', F1Sum[5] / total_test_data_count)
    print('NDCG@5:', NDCGSum[5] / total_test_data_count)
    print('1-call@5:', OneCallSum[5] / total_test_data_count)

In [11]:
import pandas as pd
import os
from collections import defaultdict


# Convert the dataframes to lists of tuples
train_user_item_pairs = list(train_df.itertuples(index=False, name=None))  # [(userid, movieid), ...]
test_user_item_pairs = list(test_df.itertuples(index=False, name=None))  # [(userid, movieid), ...]

# Define the filtering function
def filter_data(user_item_pairs, min_interactions=10):
    # Step 1: Count interactions
    user_counts = defaultdict(set)
    item_counts = defaultdict(set)
    
    for user, item in user_item_pairs:
        user_counts[user].add(item)
        item_counts[item].add(user)

    # Step 2: Filter out users with less than `min_interactions` items
    filtered_users = {user: items for user, items in user_counts.items() if len(items) >= min_interactions}
    
    # Step 3: Filter out items with less than `min_interactions` users
    filtered_items = {item: users for item, users in item_counts.items() if len(users) >= min_interactions}

    # Step 4: Remove any remaining users/items that no longer meet the conditions
    while True:
        # Remove items from users that don't exist in filtered items
        new_filtered_users = {user: {item for item in items if item in filtered_items} for user, items in filtered_users.items()}
        # Remove users that now have fewer than `min_interactions` items
        new_filtered_users = {user: items for user, items in new_filtered_users.items() if len(items) >= min_interactions}
        
        # Remove users from items that don't exist in filtered users
        new_filtered_items = {item: {user for user in users if user in new_filtered_users} for item, users in filtered_items.items()}
        # Remove items that now have fewer than `min_interactions` users
        new_filtered_items = {item: users for item, users in new_filtered_items.items() if len(users) >= min_interactions}

        # Check if the filtering stabilized
        if new_filtered_users == filtered_users and new_filtered_items == filtered_items:
            break
        
        filtered_users, filtered_items = new_filtered_users, new_filtered_items

    # Convert filtered data back to a list of duplets
    filtered_user_item_pairs = [(user, item) for user, items in filtered_users.items() for item in items]
    
    return filtered_user_item_pairs

# Apply filtering to train and test data
filtered_train_pairs = filter_data(train_user_item_pairs)
filtered_test_pairs = filter_data(test_user_item_pairs)

# Convert the filtered pairs back to DataFrames
filtered_train_df = pd.DataFrame(filtered_train_pairs, columns=['userid', 'movieid'])
filtered_test_df = pd.DataFrame(filtered_test_pairs, columns=['userid', 'movieid'])

# Overwrite the filtered DataFrames to their respective text files
filtered_train_df.to_csv("train_data_filtered.txt", index=False, header=False)
filtered_test_df.to_csv("test_data_filtered.txt", index=False, header=False)

print("Filtered train and test data saved.")

Filtered train and test data saved.


In [12]:
remaining_users = filtered_train_df['userid'].nunique()  # Count unique users
remaining_items = filtered_train_df['movieid'].nunique()  # Count unique items

print(f"Remaining users: {remaining_users}")
print(f"Remaining items: {remaining_items}")

Remaining users: 181686
Remaining items: 21570


In [None]:

import random
from collections import defaultdict
import numpy as np
from sklearn.metrics import roc_auc_score
import scores

class BPR:
    user_count = 943
    item_count = 1682
    latent_factors = 20
    lr = 0.01
    reg = 0.01
    train_count = 1000
    train_data_path = 'train.txt'
    test_data_path = 'test.txt'
    size_u_i = user_count * item_count
    # latent_factors of U & V
    U = np.random.rand(user_count, latent_factors) * 0.01
    V = np.random.rand(item_count, latent_factors) * 0.01
    biasV = np.random.rand(item_count) * 0.01
    test_data = np.zeros((user_count, item_count))
    test = np.zeros(size_u_i)
    predict_ = np.zeros(size_u_i)

    def load_data(self, path):
        user_ratings = defaultdict(set)
        max_u_id = -1
        max_i_id = -1
        with open(path, 'r') as f:
            for line in f.readlines():
                u, i = line.split(" ")
                u = int(u)
                i = int(i)
                user_ratings[u].add(i)  
                max_u_id = max(u, max_u_id)
                max_i_id = max(i, max_i_id)
        return user_ratings

    def load_test_data(self, path):
        file = open(path, 'r')
        for line in file:
            line = line.split(' ')
            user = int(line[0])
            item = int(line[1])
            self.test_data[user - 1][item - 1] = 1

    def train(self, user_ratings_train):
        for user in range(self.user_count):
            # sample a user
            u = random.randint(1, self.user_count)
            if u not in user_ratings_train.keys():
                continue
            # sample a positive item from the observed items
            i = random.sample(user_ratings_train[u], 1)[0] #sample(user,1) 的作用是从user_rating_train[u]中随机选取一个元素
            # sample a negative item from the unobserved items
            j = random.randint(1, self.item_count)
            while j in user_ratings_train[u]:
                j = random.randint(1, self.item_count)
            u -= 1
            i -= 1
            j -= 1
            r_ui = np.dot(self.U[u], self.V[i].T) + self.biasV[i]
            r_uj = np.dot(self.U[u], self.V[j].T) + self.biasV[j]
            r_uij = r_ui - r_uj
            loss_func = -1.0 / (1 + np.exp(r_uij))
            # update U and V
            self.U[u] += -self.lr * (loss_func * (self.V[i] - self.V[j]) + self.reg * self.U[u])
            self.V[i] += -self.lr * (loss_func * self.U[u] + self.reg * self.V[i])
            self.V[j] += -self.lr * (loss_func * (-self.U[u]) + self.reg * self.V[j])
            # update biasV
            self.biasV[i] += -self.lr * (loss_func + self.reg * self.biasV[i])
            self.biasV[j] += -self.lr * (-loss_func + self.reg * self.biasV[j])

    def predict(self, user, item):
        predict = np.mat(user) * np.mat(item.T)
        return predict

    def main(self):
        user_ratings_train = self.load_data(self.train_data_path)
        self.load_test_data(self.test_data_path)
        for u in range(self.user_count):
            for item in range(self.item_count):
                if int(self.test_data[u][item]) == 1:
                    self.test[u * self.item_count + item] = 1
                else:
                    self.test[u * self.item_count + item] = 0
        # training
        for i in range(self.train_count):
            self.train(user_ratings_train)
        predict_matrix = self.predict(self.U, self.V)
        # prediction
        # getA()使得矩阵转换为数组narry，这样才可以取出其中的元素，负责会造成指针越界，而reshape(-1)则是把数组变成一行
        self.predict_ = predict_matrix.getA().reshape(-1)
        self.predict_ = pre_handel(user_ratings_train, self.predict_, self.item_count)
        auc_score = roc_auc_score(self.test, self.predict_)
        print('AUC:', auc_score)# auc=(area under curve)
        # Top-K evaluation
        str(scores.topK_scores(self.test, self.predict_, 5, self.user_count, self.item_count))

def pre_handel(set, predict, item_count):
    # Ensure the recommendation cannot be positive items in the training set.
    for u in set.keys():
        for j in set[u]:
            predict[(u - 1) * item_count + j - 1] = 0
    return predict

if __name__ == '__main__':
    bpr = BPR()
    bpr.main()

In [14]:
import random
from collections import defaultdict
import numpy as np  # Import NumPy for CPU support
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

class BPR:
    def __init__(self, filtered_user_count, unique_item_count, latent_factors, lr, reg, filtered_train_df, filtered_test_df):
        self.user_count = filtered_user_count
        self.item_count = unique_item_count  # Unique item count
        self.latent_factors = latent_factors
        self.lr = lr
        self.reg = reg
        self.filtered_train_df = filtered_train_df  # Using filtered train DataFrame
        self.filtered_test_df = filtered_test_df  # Using filtered test DataFrame

        # Initialize latent factors for users and items
        self.U = np.random.rand(self.user_count, self.latent_factors).astype(np.float32) * 0.01
        self.V = np.random.rand(self.item_count, self.latent_factors).astype(np.float32) * 0.01
        self.biasV = np.random.rand(self.item_count).astype(np.float32) * 0.01
        
        # Initialize test data using a dictionary for sparse representation
        self.test_data = defaultdict(dict)  # Maps user index to item index with values
        self.predict_ = np.zeros(self.user_count * self.item_count, dtype=np.float32)

        # Initialize user and item mappings
        self.user_mapping = {}
        self.item_mapping = {}
        self.reverse_user_mapping = {}
        self.reverse_item_mapping = {}

        # Create user and item mappings from filtered DataFrames
        self.create_mappings()

    def create_mappings(self):
        user_id_set = self.filtered_train_df['userid'].unique()  # Extract unique user IDs
        item_id_set = self.filtered_train_df['movieid'].unique()  # Extract unique item IDs

        # Create a mapping for user IDs
        for index, user_id in enumerate(user_id_set):
            self.user_mapping[user_id] = index
            self.reverse_user_mapping[index] = user_id

        # Create a mapping for item IDs
        for index, item_id in enumerate(item_id_set):
            self.item_mapping[item_id] = index
            self.reverse_item_mapping[index] = item_id

    def load_test_data(self):
        for _, row in self.filtered_test_df.iterrows():
            user = int(row['userid'])
            item = int(row['movieid'])

            if user in self.user_mapping:  # Only process if user is in mapping
                # Get the mapped index
                mapped_user_index = self.user_mapping[user]
                
                if item in self.item_mapping:  # Check if the item is mapped
                    mapped_item_index = self.item_mapping[item]
                    # Use dictionary for test data
                    self.test_data[mapped_user_index][mapped_item_index] = 1.0  # Mark interaction

    def train(self, user_ratings_train, bootstrap=True):
            
            for _ in range(self.user_count):  # Number of iterations
                try: 
                    # Bootstrap sample a user
                    u_original = random.choice(list(user_ratings_train.keys())) if bootstrap else random.randint(1, self.user_count)
                    if u_original not in user_ratings_train:
                        continue

                    # Map to the new user index
                    u = self.user_mapping[u_original]

                    # Sample a positive item (i) from user's interactions
                    i = random.sample(user_ratings_train[u_original], 1)[0]  # Convert to list for sampling

                    # Sample a negative item (j) not interacted by user
                    j = random.randint(1, self.item_count)
                    while j in user_ratings_train[u_original]:
                        j = random.randint(1, self.item_count)

                    # Decrement to make zero-indexed
                    i = self.item_mapping[i]  # Map to new item index
                    j = self.item_mapping[j]  # Map to new item index

                    # Compute predictions for positive (i) and negative (j) items
                    r_ui = np.dot(self.U[u], self.V[i]) + self.biasV[i]
                    r_uj = np.dot(self.U[u], self.V[j]) + self.biasV[j]
                    r_uij = r_ui - r_uj

                    # BPR optimization: Compute the gradient and update using log-sigmoid
                    loss_func = -1.0 / (1 + np.exp(r_uij))  # Logistic function for BPR loss

                    # Update latent factors for user and items
                    self.U[u] += self.lr * (loss_func * (self.V[i] - self.V[j]) - self.reg * self.U[u])
                    self.V[i] += self.lr * (loss_func * self.U[u] - self.reg * self.V[i])
                    self.V[j] += self.lr * (-loss_func * self.U[u] - self.reg * self.V[j])

                    # Update biases for items
                    self.biasV[i] += self.lr * (loss_func - self.reg * self.biasV[i])
                    self.biasV[j] += self.lr * (-loss_func - self.reg * self.biasV[j])
                except:
                    continue

    def predict(self):
        # Prediction matrix for user-item interactions
        predict_matrix = np.dot(self.U, self.V.T) + self.biasV
        return predict_matrix

    def evaluate(self):
        # Evaluate using AUC score
        predictions = self.predict()
        # Flatten the test data using a comprehension since it's a dict
        test_flat = np.array([self.test_data[u].get(i, 0) for u in range(self.user_count) for i in range(self.item_count)], dtype=np.float32)
        auc_score = roc_auc_score(test_flat, predictions.flatten())  # Convert to 1D array
        print('AUC:', auc_score)

    def main(self):
        user_ratings_train = defaultdict(set)
        
        # Prepare user ratings from filtered_train_df
        for _, row in self.filtered_train_df.iterrows():
            user = int(row['userid'])
            item = int(row['movieid'])
            user_ratings_train[user].add(item)

        self.load_test_data()

        # Train model
        for _ in range(1000):  # Number of training steps
            self.train(user_ratings_train)
        
        # Evaluate model
        self.evaluate()

def cross_validate_bpr(filtered_user_count, unique_item_count, latent_factors, lr, reg, filtered_train_df, filtered_test_df, n_splits=5):
    kf = KFold(n_splits=n_splits)
    auc_scores = []

    for train_index, test_index in kf.split(filtered_train_df):
        train_df = filtered_train_df.iloc[train_index]
        test_df = filtered_train_df.iloc[test_index]

        bpr = BPR(filtered_user_count, unique_item_count, latent_factors, lr, reg, train_df, test_df)
        bpr.main()
        auc_scores.append(bpr.evaluate())

    print(f'Mean AUC: {np.mean(auc_scores)}')
    print(f'Standard Deviation of AUC: {np.std(auc_scores)}')

# Running the BPR model with cross-validation
if __name__ == '__main__':
    filtered_user_count = 181686  # Updated number of unique users after filtering
    unique_item_count = 21570  # Updated number of unique items after filtering
    latent_factors = 10  # Example value
    lr = 0.01  # Learning rate
    reg = 0.01  # Regularization term

    cross_validate_bpr(filtered_user_count, unique_item_count, latent_factors, lr, reg, filtered_train_df, filtered_test_df)






since Python 3.9 and will be removed in a subsequent version.
  i = random.sample(user_ratings_train[u_original], 1)[0]  # Convert to list for sampling
  loss_func = -1.0 / (1 + np.exp(r_uij))  # Logistic function for BPR loss
  r_uij = r_ui - r_uj
  r_uij = r_ui - r_uj


: 

In [2]:
%pip install theano_bpr

Note: you may need to restart the kernel to use updated packages.


In [1]:
from theano_bpr import BPR

bpr = BPR(10, 181686, 21570)
bpr.train(filtered_train_df, epochs=50)
bpr.test(filtered_test_df)

ModuleNotFoundError: No module named 'bpr'

In [2]:
%pip install recommenders

Collecting recommenders
  Using cached recommenders-1.2.0-py3-none-any.whl (356 kB)
Collecting transformers<5,>=4.27.0
  Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightfm<2,>=1.17
  Using cached lightfm-1.17.tar.gz (316 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting numba<1,>=0.57.0
  Using cached numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
Collecting lightgbm<5,>=4.0.0
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting pandera[strategies]>=0.15.0
  Downloading pandera-0.20.4-py3-none-any.whl (259 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.6/259.6 KB[0m [31m88.2 MB/s[0m eta [36m

In [3]:
%pip install cornac
%pip install numpy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import sys
import cornac
import pandas as pd

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.utils.notebook_utils import store_metadata

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# top k items to recommend
TOP_K = 10

# Model parameters
NUM_FACTORS = 200
NUM_EPOCHS = 500


train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

bpr = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

with Timer() as t:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(t))


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/iiitd/bayesian_machine_learning/virtual/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/iiitd/bayesian_machine_learning/virtual/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()

ImportError: numpy.core.multiarray failed to import (auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; use '<void>numpy._import_array' to disable if you are certain you don't need it).