## Recommender Systems

# Recommender Using GRU4Rec
Expected Runtime:

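The estimates below are for ml-100k (100,000 interactions) with 10 epochs and a batch size of 128. Note that the code cells in this notebook are configured for `ml-20m` with a batch size of 256, so their actual runtime will differ from these estimates: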


*   On a GPU (e.g., NVIDIA Tesla T4): It may take about 3–8 minutes.
*   On a CPU (e.g., 8-core): It could range from 10–20 minutes
*   GPU (Google Colab) T4 GPU: ~2–5 minutes
*   GPU (Google Colab) L4 GPU: ~1.5–4 minutes
*   GPU (Google Colab) A100 GPU: ~1–2 minutes






**Model 1: Using GRU4Rec**

It's a sequential model; dataset selection is in progress.

Source: https://grouplens.org/datasets/movielens/100k/


This data set consists of:

*   100,000 ratings (1-5) from 943 users on 1682 movies.
*   Each user has rated at least 20 movies.
*   Simple demographic info for the users (age, gender, occupation, zip).

In [None]:
import os
import time
import torch
from recbole.config import Config
from recbole.data.utils import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
import matplotlib.pyplot as plt
import warnings

# Silence every Python warning for the remainder of the session
warnings.filterwarnings('ignore')

# Expose only CUDA device 0 to this process so that a single GPU is used
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Settings for the GPU run of GRU4Rec on MovieLens 20M.
# NOTE(review): the runtime estimates in the notebook intro refer to ml-100k
# with a batch size of 128, while this cell uses ml-20m with 256 — confirm
# which one is intended.
config_dict = dict(
    # --- what to train, and on which data ---
    model='GRU4Rec',
    dataset='ml-20m',
    data_path='dataset/',  # directory that holds (or will receive) the dataset
    # --- optimisation ---
    epochs=10,
    train_batch_size=256,
    eval_batch_size=256,
    learning_rate=0.001,
    # --- evaluation ---
    topk=[5, 10],  # cut-offs, i.e. metrics are reported @5 and @10
    metrics=['Recall', 'Precision', 'NDCG'],
    valid_metric='recall@10',  # metric that decides which epoch is "best"
    # --- logging / checkpoints ---
    log_level='none',  # intended to silence logging — NOTE(review): confirm RecBole honours this key
    checkpoint_dir='./saved_models/',
    # --- sequence construction ---
    USER_INTER_ORDER=True,  # presumably orders each user's interactions by time — TODO confirm
    MAX_ITEM_LIST_LENGTH=50,  # longest interaction history kept per user
    train_neg_sample_args=None,  # no negative sampling during training
    # --- hardware ---
    use_gpu=True,
)

# Reference point for the total-runtime report printed after evaluation
start_time = time.time()

# Build the RecBole configuration from the dictionary above
config = Config(
    model=config_dict['model'],
    dataset=config_dict['dataset'],
    config_dict=config_dict,
)

# Seed RecBole with 42 (second argument: reproducibility flag) and set up logging
init_seed(42, True)
init_logger(config)

# Report the device RecBole resolved; "cuda" is expected when a GPU is usable
print("Using device:", config['device'])

# Build the dataset object, then derive the train / validation / test loaders
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

# Instantiate GRU4Rec on the configured device and hand it to a Trainer
model = GRU4Rec(config, train_data.dataset).to(config['device'])
trainer = Trainer(config, model)

# Per-epoch validation metrics, collected for the plot at the end of the cell
epochs = []
recall_at_10 = []
precision_at_10 = []
ndcg_at_10 = []

# Train exactly one epoch per loop iteration.
#
# Trainer.fit() runs the *whole* schedule (config['epochs'] epochs) on every
# call and returns the best validation result seen so far. Calling it inside
# this loop would therefore train epochs x epochs times and record "best"
# metrics instead of per-epoch ones. _train_epoch() / _valid_epoch() are the
# single-epoch steps that fit() is built from.
best_valid_score = float('-inf')
best_valid_result = None
for epoch in range(config['epochs']):
    print(f"Epoch {epoch + 1} / {config['epochs']}")

    # One pass over the training data, then one validation pass
    trainer._train_epoch(train_data, epoch)
    valid_score, valid_result = trainer._valid_epoch(valid_data)

    # Record this epoch's metrics. NaN (not a string) is the fallback so the
    # lists stay numeric and can always be plotted.
    recall_at_10.append(valid_result.get('recall@10', float('nan')))
    precision_at_10.append(valid_result.get('precision@10', float('nan')))
    ndcg_at_10.append(valid_result.get('ndcg@10', float('nan')))
    epochs.append(epoch + 1)

    # Display the metrics for the current epoch
    print(f"Epoch {epoch + 1}: recall@10={recall_at_10[-1]}, precision@10={precision_at_10[-1]}, ndcg@10={ndcg_at_10[-1]}")

    # Keep the best epoch on disk. valid_score is the configured valid_metric
    # (recall@10), for which larger is better.
    if valid_score > best_valid_score:
        best_valid_score, best_valid_result = valid_score, valid_result
        trainer.best_valid_score = valid_score
        trainer.best_valid_result = valid_result
        trainer._save_checkpoint(epoch)  # writes to trainer.saved_model_file

# Evaluate on the test split using the best checkpoint written above.
# The Trainer reloads its own saved_model_file, so no manual path handling is
# needed. If no epoch ran (and so nothing was saved), evaluate the current
# weights instead.
if best_valid_result is None:
    print("Best model checkpoint not found; using the current model.")
test_result = trainer.evaluate(test_data, load_best_model=best_valid_result is not None)
print("Test results:", test_result)

# Report how long the whole cell took, measured from start_time
end_time = time.time()
total_time = end_time - start_time
print(f"Total runtime: {total_time:.2f} seconds")

# Draw one curve per validation metric, but only if every series has data
metric_curves = (
    ('Recall@10', recall_at_10),
    ('Precision@10', precision_at_10),
    ('NDCG@10', ndcg_at_10),
)
if all(values for _, values in metric_curves):
    plt.figure(figsize=(10, 6))
    for label, values in metric_curves:
        plt.plot(epochs, values, label=label)
    plt.xlabel('Epochs')
    plt.ylabel('Metric Value')
    plt.title('Epoch-wise Performance Metrics for GRU4Rec on MovieLens 20M')
    plt.legend()
    plt.grid(True)
    plt.show()
else:
    print("Metrics not available for plotting.")

In [None]:
import os
import time
from recbole.config import Config
from recbole.data.utils import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
import matplotlib.pyplot as plt
import warnings

# Silence every Python warning for the remainder of the session
warnings.filterwarnings('ignore')

# Hide all CUDA devices from this process so the run stays on the CPU
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Settings for the CPU run of GRU4Rec on MovieLens 20M.
# NOTE(review): the runtime estimates in the notebook intro refer to ml-100k
# with a batch size of 128, while this cell uses ml-20m with 256 — confirm
# which one is intended.
config_dict = dict(
    # --- what to train, and on which data ---
    model='GRU4Rec',
    dataset='ml-20m',
    data_path='dataset/',  # directory that holds (or will receive) the dataset
    # --- optimisation ---
    epochs=10,
    train_batch_size=256,
    eval_batch_size=256,
    learning_rate=0.001,
    # --- evaluation ---
    topk=[5, 10],  # cut-offs, i.e. metrics are reported @5 and @10
    metrics=['Recall', 'Precision', 'NDCG'],
    valid_metric='recall@10',  # metric that decides which epoch is "best"
    # --- logging / checkpoints ---
    log_level='none',  # intended to silence logging — NOTE(review): confirm RecBole honours this key
    checkpoint_dir='./saved_models/',
    # --- sequence construction ---
    USER_INTER_ORDER=True,  # presumably orders each user's interactions by time — TODO confirm
    MAX_ITEM_LIST_LENGTH=50,  # longest interaction history kept per user
    train_neg_sample_args=None,  # no negative sampling during training
    # --- hardware ---
    use_gpu=False,
)

# Reference point for the total-runtime report printed after evaluation
start_time = time.time()

# Build the RecBole configuration from the dictionary above
config = Config(
    model=config_dict['model'],
    dataset=config_dict['dataset'],
    config_dict=config_dict,
)

# Seed RecBole with 42 (second argument: reproducibility flag) and set up logging
init_seed(42, True)
init_logger(config)

# Build the dataset object, then derive the train / validation / test loaders
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

# Instantiate GRU4Rec (no explicit device move in the CPU run) and its Trainer
model = GRU4Rec(config, train_data.dataset)
trainer = Trainer(config, model)

# Per-epoch validation metrics, collected for the plot at the end of the cell
epochs = []
recall_at_10 = []
precision_at_10 = []
ndcg_at_10 = []

# Train exactly one epoch per loop iteration.
#
# Trainer.fit() runs the *whole* schedule (config['epochs'] epochs) on every
# call and returns the best validation result seen so far. Calling it inside
# this loop would therefore train epochs x epochs times and record "best"
# metrics instead of per-epoch ones. _train_epoch() / _valid_epoch() are the
# single-epoch steps that fit() is built from.
best_valid_score = float('-inf')
best_valid_result = None
for epoch in range(config['epochs']):
    print(f"Epoch {epoch + 1} / {config['epochs']}")

    # One pass over the training data, then one validation pass
    trainer._train_epoch(train_data, epoch)
    valid_score, valid_result = trainer._valid_epoch(valid_data)

    # Record this epoch's metrics. NaN (not a string) is the fallback so the
    # lists stay numeric and can always be plotted.
    recall_at_10.append(valid_result.get('recall@10', float('nan')))
    precision_at_10.append(valid_result.get('precision@10', float('nan')))
    ndcg_at_10.append(valid_result.get('ndcg@10', float('nan')))
    epochs.append(epoch + 1)

    # Display the metrics for the current epoch
    print(f"Epoch {epoch + 1}: recall@10={recall_at_10[-1]}, precision@10={precision_at_10[-1]}, ndcg@10={ndcg_at_10[-1]}")

    # Keep the best epoch on disk. valid_score is the configured valid_metric
    # (recall@10), for which larger is better.
    if valid_score > best_valid_score:
        best_valid_score, best_valid_result = valid_score, valid_result
        trainer.best_valid_score = valid_score
        trainer.best_valid_result = valid_result
        trainer._save_checkpoint(epoch)  # writes to trainer.saved_model_file

# Evaluate on the test split using the best checkpoint written above.
# The Trainer reloads its own saved_model_file, so no manual path handling is
# needed. If no epoch ran (and so nothing was saved), evaluate the current
# weights instead.
if best_valid_result is None:
    print("Best model checkpoint not found; using the current model.")
test_result = trainer.evaluate(test_data, load_best_model=best_valid_result is not None)
print("Test results:", test_result)

# Report how long the whole cell took, measured from start_time
end_time = time.time()
total_time = end_time - start_time
print(f"Total runtime: {total_time:.2f} seconds")

# Draw one curve per validation metric, but only if every series has data
metric_curves = (
    ('Recall@10', recall_at_10),
    ('Precision@10', precision_at_10),
    ('NDCG@10', ndcg_at_10),
)
if all(values for _, values in metric_curves):
    plt.figure(figsize=(10, 6))
    for label, values in metric_curves:
        plt.plot(epochs, values, label=label)
    plt.xlabel('Epochs')
    plt.ylabel('Metric Value')
    plt.title('Epoch-wise Performance Metrics for GRU4Rec on MovieLens 20M')
    plt.legend()
    plt.grid(True)
    plt.show()
else:
    print("Metrics not available for plotting.")

In [2]:
!pip install recbole
!pip install ray
!pip install kmeans_pytorch
import recbole.model.sequential_recommender as seq_recommenders
print(dir(seq_recommenders))


Collecting recbole
  Downloading recbole-1.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting colorama==0.4.4 (from recbole)
  Downloading colorama-0.4.4-py2.py3-none-any.whl.metadata (14 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting texttable>=0.9.0 (from recbole)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading recbole-1.2.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: texttable, colorlog, 