# KuaiRec 2.0 Recommender System

This repository implements a two-stage recommender system pipeline for the KuaiRec 2.0 dataset, comparing a baseline collaborative filtering model with a hybrid model that incorporates side features.




In [5]:
# Import standard libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import lightfm
import os
import multiprocessing as mp


# Import local modules
from loaddata import load_interaction_data, load_item_categories, load_user_features, print_dataset_info
from preprocess import (
    derive_implicit_labels, filter_interactions, create_user_item_maps,
    leave_n_out_split, prepare_item_features, prepare_user_features
)
from evaluation import evaluate_model, plot_learning_curves
from main import train_baseline_model, train_hybrid_model, run_pipeline

# Constants
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Set up data directory
SCRIPT_DIR = Path(os.path.dirname(os.path.abspath("__file__")))
DATA_DIR = SCRIPT_DIR / "KuaiRec2.0" / "data"

NUM_THREADS = mp.cpu_count()
print(f"Using {NUM_THREADS} threads for computation")
print("Run good")


Using 12 threads for computation
Run good


## Data Loading and Preprocessing

Let's start by loading the KuaiRec 2.0 dataset and examining its structure.

In [6]:
# Configurable parameters for faster execution
FAST_MODE = True  # Set to False for more accurate but slower results
MAX_USERS = 500 if FAST_MODE else None  # Limit number of users for faster processing
TEST_NEG_RATIO = 20  # Default: 20, Lower for faster execution, Higher for better evaluation
EPOCHS = 50 if FAST_MODE else 200  # Using optimized settings from main.py
EVAL_EVERY = 10  # Evaluate every N epochs to save time (optimized)
PATIENCE = 5  # Number of evaluations with no improvement before early stopping (optimized)

print(f"Running in {'FAST' if FAST_MODE else 'STANDARD'} mode")
print(f"Test negative ratio: {TEST_NEG_RATIO}, Epochs: {EPOCHS}, Eval frequency: {EVAL_EVERY}")
print(f"Early stopping patience: {PATIENCE} evaluations")


Running in FAST mode
Test negative ratio: 20, Epochs: 50, Eval frequency: 10
Early stopping patience: 5 evaluations


In [7]:
# Define file paths
matrix_file = "small_matrix.csv"  # Update with actual filename
item_categories_file = "item_categories.csv"  # Update with actual filename
user_features_file = "user_features.csv"  # Update with actual filename

# Check if files exist
matrix_path = DATA_DIR / matrix_file
item_categories_path = DATA_DIR / item_categories_file
user_features_path = DATA_DIR / user_features_file

file_check = {
    "Interaction data": matrix_path.exists(),
    "Item categories": item_categories_path.exists(),
    "User features": user_features_path.exists()
}
print("File availability:")
for name, exists in file_check.items():
    print(f"  {name}: {'✓' if exists else '✗'} ({name} {'exists' if exists else 'not found'})")


File availability:
  Interaction data: ✓ (Interaction data exists)
  Item categories: ✓ (Item categories exists)
  User features: ✓ (User features exists)


In [8]:
# Load data if files exist
if all(file_check.values()):
    print(f"Loading data from {matrix_file}...")
    interactions_df = load_interaction_data(matrix_path)
    
    print(f"Loading item categories...")
    item_categories_df = load_item_categories(item_categories_path)
    
    print(f"Loading user features...")
    user_features_df = load_user_features(user_features_path)
    
    # Display basic information about the datasets
    print_dataset_info(interactions_df, "Interaction Data")
    print_dataset_info(item_categories_df, "Item Categories")
    print_dataset_info(user_features_df, "User Features")


Loading data from small_matrix.csv...
Loading item categories...
Loading user features...

--------------------------------------------------
Dataset: Interaction Data
--------------------------------------------------
Shape: (4676570, 8) (4676570 rows, 8 columns)

Columns: user_id, video_id, play_duration, video_duration, time, date, timestamp, watch_ratio

Sample data:
   user_id  video_id  play_duration  video_duration                     time  \
0       14       148           4381            6067  2020-07-05 05:27:48.378   
1       14       183          11635            6100  2020-07-05 05:28:00.057   
2       14      3649          22422           10867  2020-07-05 05:29:09.479   
3       14      5262           4479            7908  2020-07-05 05:30:43.285   
4       14      8234           4602           11000  2020-07-05 05:35:43.459   

         date     timestamp  watch_ratio  
0  20200705.0  1.593898e+09     0.722103  
1  20200705.0  1.593898e+09     1.907377  
2  20200705.0  1

## Data Preprocessing

Now we'll prepare the data for training by deriving implicit labels, filtering interactions, and creating train-test splits.

In [9]:
# Process data if loaded successfully
if 'interactions_df' in locals():
    # Derive implicit labels
    print("\nDeriving implicit labels (watch_ratio >= 0.8)...")
    interactions_df = derive_implicit_labels(interactions_df)
    positive_ratio = interactions_df['label'].mean()
    print(f"Positive interactions ratio: {positive_ratio:.4f}")
    
    # Filter users and items
    print("\nFiltering users and items with >= 3 positive interactions...")
    filtered_df, valid_users, valid_items = filter_interactions(interactions_df)
    
    if FAST_MODE and MAX_USERS and len(valid_users) > MAX_USERS:
        sampled_users = np.random.choice(valid_users, MAX_USERS, replace=False)
        filtered_df = filtered_df[filtered_df['user_id'].isin(sampled_users)]
        valid_users = sampled_users
        print(f"Fast mode: Sampled down to {len(valid_users)} users")
    
    # Create ID mappings
    user_to_idx, idx_to_user, item_to_idx, idx_to_item = create_user_item_maps(valid_users, valid_items)
    
    # Split data into train and test sets
    print("\nSplitting data (leave-n-out)...")
    split_data = leave_n_out_split(
        filtered_df, 
        user_to_idx, 
        item_to_idx, 
        test_ratio=0.2, 
        neg_ratio=4, 
        test_neg_ratio=TEST_NEG_RATIO, 
        random_state=RANDOM_SEED
    )



Deriving implicit labels (watch_ratio >= 0.8)...
Positive interactions ratio: 0.4744

Filtering users and items with >= 3 positive interactions...
Counting positive interactions per user and item...
Applying filtering...
Original interactions: 4676570
Filtered interactions: 4621597
Unique users: 1411
Unique items: 3288
Fast mode: Sampled down to 500 users
Creating user and item mappings...

Splitting data (leave-n-out)...
Building user interaction dictionaries...


Processing interactions: 100%|██████████| 1637715/1637715 [01:07<00:00, 24286.81it/s]


Creating train-test split...


Splitting users: 100%|██████████| 500/500 [00:01<00:00, 420.84it/s]


Creating DataFrames and matrices...
Building sparse matrices...
Training interactions: 628311
Testing interactions: 3244990


## Model Training

We'll train both a baseline collaborative filtering model and a hybrid model that incorporates side features.

In [10]:
# Train baseline model if data is prepared
if 'split_data' in locals():
    # Define baseline model with optimized hyperparameters
    baseline_model_params = {
        'loss': "warp",
        'no_components': 128,
        'learning_rate': 0.05,
        'user_alpha': 0.0001,
        'item_alpha': 0.0001,
        'max_sampled': 100,
        'random_state': RANDOM_SEED
    }
    
    print(f"\nBaseline model parameters: {baseline_model_params}")
    
    # Train baseline model
    baseline_model, baseline_train_metrics, baseline_test_metrics, baseline_epochs, baseline_time = train_baseline_model(
        split_data, 
        split_data, 
        epochs=EPOCHS, 
        eval_every=EVAL_EVERY,
        patience=PATIENCE
    )



Baseline model parameters: {'loss': 'warp', 'no_components': 128, 'learning_rate': 0.05, 'user_alpha': 0.0001, 'item_alpha': 0.0001, 'max_sampled': 100, 'random_state': 42}

Training Baseline Model (LightFM with WARP loss)


Evaluating at epoch 50: 100%|██████████| 50/50 [02:21<00:00,  2.84s/it, train_f1@5=0.0084, test_f1@5=0.0311, best_f1@5=0.0311, no_improv=3]


Total training time: 141.96 seconds

Final metrics:
  Train: {'precision@5': 1.0, 'recall@5': 0.004230615143342552, 'f1@5': 0.008423464728080646, 'ndcg@5': 1.0, 'precision@10': 1.0, 'recall@10': 0.008461230286685105, 'f1@10': 0.016772113452877144, 'ndcg@10': 1.0, 'item_coverage@10': 0.7077250608272506, 'diversity@10': 0.8916979254962264}
  Test:  {'precision@5': 0.9432, 'recall@5': 0.01582891530650134, 'f1@5': 0.031106626162144446, 'ndcg@5': 0.9525570003249053, 'precision@10': 0.8914000000000001, 'recall@10': 0.029845874197428204, 'f1@10': 0.05765501113723962, 'ndcg@10': 0.9137497101295551, 'item_coverage@10': 0.7025547445255474, 'diversity@10': 0.8912410368312784}





In [None]:
# Prepare item and user features and train hybrid model
if 'split_data' in locals() and 'item_categories_df' in locals() and 'user_features_df' in locals():
    print("\nPreparing item and user features...")
    item_features_mat = prepare_item_features(item_categories_df, item_to_idx)
    user_features_mat = prepare_user_features(user_features_df, user_to_idx)
    
    # Define hybrid model with optimized hyperparameters
    hybrid_model_params = {
        'loss': "warp",
        'no_components': 128,
        'learning_rate': 0.02,
        'user_alpha': 0.0001,
        'item_alpha': 0.0001,
        'max_sampled': 50,
        'random_state': RANDOM_SEED
    }
    
    print(f"\nHybrid model parameters: {hybrid_model_params}")
    
    # Train hybrid model
    hybrid_model, hybrid_train_metrics, hybrid_test_metrics, hybrid_epochs, hybrid_time = train_hybrid_model(
        split_data, 
        split_data, 
        user_features_mat,
        item_features_mat,
        epochs=EPOCHS, 
        eval_every=EVAL_EVERY,
        patience=PATIENCE
    )



Preparing item and user features...
Preparing item features...
Extracting categories...


Processing categories: 100%|██████████| 3288/3288 [00:00<00:00, 1195774.87it/s]


Total unique categories: 108


Building feature matrix: 100%|██████████| 3288/3288 [00:00<00:00, 21259.47it/s]


Creating sparse feature matrix...
Item feature matrix shape: (3288, 108)
Matrix sparsity: 0.9872
Preparing user features...
Using numerical features: ['follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days']
Using categorical features: ['user_active_degree', 'follow_user_num_range', 'fans_user_num_range', 'friend_user_num_range', 'register_days_range']
User feature matrix shape: (500, 29)
Matrix sparsity: 0.6897

Hybrid model parameters: {'loss': 'warp', 'no_components': 128, 'learning_rate': 0.02, 'user_alpha': 0.0001, 'item_alpha': 0.0001, 'max_sampled': 50, 'random_state': 42}

Training Hybrid Model (LightFM with user and item features)


Evaluating at epoch 10:  32%|███▏      | 16/50 [02:08<04:06,  7.26s/it, train_f1@5=0.0083, test_f1@5=0.0278, best_f1@5=0.0278, no_improv=0]

## Results Visualization

Let's visualize the learning curves and compare the performance of the baseline and hybrid models.

In [None]:
# Plot learning curves if both models were trained
if 'baseline_model' in locals() and 'hybrid_model' in locals():
    metrics_to_plot = ['precision@5', 'recall@5', 'f1@5', 'ndcg@10', 'item_coverage@10']
    
    # Plot baseline model learning curves
    plot_learning_curves(
        baseline_train_metrics, 
        baseline_test_metrics, 
        metrics_to_plot, 
        baseline_epochs, 
        "Baseline Model"
    )
    
    # Plot hybrid model learning curves
    plot_learning_curves(
        hybrid_train_metrics, 
        hybrid_test_metrics, 
        metrics_to_plot, 
        hybrid_epochs, 
        "Hybrid Model"
    )
    
    # Compare final metrics
    baseline_final = baseline_test_metrics[-1]
    hybrid_final = hybrid_test_metrics[-1]
    
    # Create comparison dataframe with safety check for keys
    comparison_data = {}
    for metric in metrics_to_plot:
        if metric in baseline_final and metric in hybrid_final:
            comparison_data[metric] = [baseline_final[metric], hybrid_final[metric]]
    
    comparison = pd.DataFrame(comparison_data, index=['Baseline', 'Hybrid']).T
    comparison['Improvement (%)'] = ((comparison['Hybrid'] - comparison['Baseline']) / comparison['Baseline'] * 100).round(2)
    
    print("\nModel Comparison:")
    display(comparison)
    
    # Visualize comparison
    ax = comparison[['Baseline', 'Hybrid']].plot(kind='bar', figsize=(10, 6))
    ax.set_ylabel('Score')
    ax.set_title('Baseline vs Hybrid Model Performance')
    ax.legend()
    plt.tight_layout()
    plt.show()


## Hyperparameter Tuning

Let's experiment with different hyperparameters to improve our hybrid model performance.

In [12]:
from sklearn.model_selection import ParameterGrid

# Define hyperparameter grid using optimized values as reference
param_grid = {
    'no_components': [64, 128, 256],
    'learning_rate': [0.01, 0.02, 0.05],
    'item_alpha': [0.0, 0.0001, 0.001],
    'user_alpha': [0.0, 0.0001, 0.001],
    'loss': ['warp', 'warp-kos'],
    'max_sampled': [30, 50, 100]
}

# For demonstration, we'll use a smaller grid with our optimized values
small_grid = {
    'no_components': [128],
    'learning_rate': [0.02],
    'item_alpha': [0.0001],
    'user_alpha': [0.0001],
    'loss': ['warp'],
    'max_sampled': [50]
}


In [13]:
# Function to train and evaluate a model with given parameters
def train_evaluate_model(params, train_data, test_data, user_features=None, item_features=None, epochs=20, patience=3):
    model = lightfm.LightFM(
        loss=params['loss'],
        no_components=params['no_components'],
        learning_rate=params['learning_rate'],
        item_alpha=params['item_alpha'],
        user_alpha=params['user_alpha'],
        max_sampled=params.get('max_sampled', 50),
        random_state=RANDOM_SEED
    )
    
    # Train model with early stopping
    best_score = -float('inf')
    best_model = None
    no_improvement = 0
    
    for epoch in range(1, epochs+1):
        model.fit_partial(
            train_data['train_interactions'],
            user_features=user_features,
            item_features=item_features,
            epochs=1,
            num_threads=4  # Lower thread count for notebook environment
        )
        
        # Check performance every 5 epochs
        if epoch % 5 == 0:
            # Evaluate on test data
            metrics = evaluate_model(
                model, 
                test_data['test_df'], 
                test_data['n_users'], 
                test_data['n_items'],
                user_features,
                item_features
            )
            
            current_score = metrics['f1@5']  # Metric to monitor
            
            if current_score > best_score:
                best_score = current_score
                best_model = model.get_params()
                no_improvement = 0
            else:
                no_improvement += 1
                
            if no_improvement >= patience:
                print(f"Early stopping at epoch {epoch}")
                break
    
    # Restore best model if needed
    if best_model is not None and no_improvement >= patience:
        model = lightfm.LightFM(**best_model)
    
    # Final evaluation
    metrics = evaluate_model(
        model, 
        test_data['test_df'], 
        test_data['n_users'], 
        test_data['n_items'],
        user_features,
        item_features
    )
    
    return model, metrics


In [None]:
# Run a mini grid search if data is available
if 'split_data' in locals() and 'user_features_mat' in locals() and 'item_features_mat' in locals():
    results = []
    
    # Test a few configurations from the small grid
    for params in list(ParameterGrid(small_grid))[:2]:  # Just try 2 configs for demonstration
        print(f"\nTraining with parameters: {params}")
        
        model, metrics = train_evaluate_model(
            params, 
            split_data, 
            split_data, 
            user_features_mat, 
            item_features_mat,
            epochs=20,  # Reduced for notebook demonstration
            patience=PATIENCE
        )
        
        results.append({
            'params': params,
            'metrics': metrics
        })
        
        print(f"Results:\n{metrics}")
    
    # Format results into a DataFrame for easy comparison
    result_df = pd.DataFrame([
        {
            'components': r['params']['no_components'],
            'learning_rate': r['params']['learning_rate'],
            'loss': r['params']['loss'],
            'max_sampled': r['params'].get('max_sampled', 50),
            'user_alpha': r['params']['user_alpha'],
            'item_alpha': r['params']['item_alpha'],
            'precision@5': r['metrics']['precision@5'],
            'recall@5': r['metrics']['recall@5'],
            'f1@5': r['metrics']['f1@5'],
            'ndcg@10': r['metrics']['ndcg@10'],
            'diversity@10': r['metrics'].get('diversity@10', 0),
            'coverage@10': r['metrics'].get('item_coverage@10', 0)
        } for r in results
    ])
    
    print("\nHyperparameter Tuning Results:")
    display(result_df)


In [None]:
# Compare with previous models if tuning was successful
if 'result_df' in locals() and len(result_df) > 0 and 'baseline_final' in locals() and 'hybrid_final' in locals():
    # Find best model from tuning
    best_idx = result_df['f1@5'].idxmax()
    best_params = results[best_idx]['params']
    best_metrics = results[best_idx]['metrics']
    
    print(f"Best configuration: {best_params}")
    
    # Create comparison dataframe
    comparison = pd.DataFrame({
        'Baseline': [baseline_final[m] for m in metrics_to_plot],
        'Hybrid (initial)': [hybrid_final[m] for m in metrics_to_plot],
        'Hybrid (optimized)': [best_metrics[m] for m in metrics_to_plot]
    }, index=metrics_to_plot)
    
    # Calculate improvement percentages
    improvement = pd.DataFrame({
        'Baseline → Hybrid': [(hybrid_final[m] - baseline_final[m]) / baseline_final[m] * 100 for m in metrics_to_plot],
        'Hybrid → Optimized': [(best_metrics[m] - hybrid_final[m]) / hybrid_final[m] * 100 for m in metrics_to_plot],
        'Baseline → Optimized': [(best_metrics[m] - baseline_final[m]) / baseline_final[m] * 100 for m in metrics_to_plot]
    }, index=metrics_to_plot)
    
    print("\nModel Performance Comparison:")
    display(comparison)
    
    print("\nImprovement Percentages:")
    display(improvement)
    
    # Visualize the comparison
    ax = comparison.plot(kind='bar', figsize=(12, 6))
    ax.set_ylabel('Score')
    ax.set_title('Performance Comparison Across Models')
    ax.legend()
    plt.tight_layout()
    plt.show()


## Conclusions and Recommendations

Based on our experimentation, we can draw several conclusions about our recommender system:

1. **Model Performance**: The optimized hybrid model using WARP loss and proper regularization demonstrates improved metrics compared to the baseline approach.

2. **Feature Importance**: Side features (user demographics and item categories) can enhance recommendation quality when properly normalized and incorporated.

3. **Hyperparameters**: The most influential hyperparameters are:
   - Loss function: WARP outperforms BPR for ranking tasks
   - Number of components: Higher dimensionality (128) captures more complex patterns
   - Learning rate: Lower values (0.02) provide more stable convergence for hybrid model
   - Regularization: Light regularization (0.0001) prevents overfitting
   - Negative sampling: Tuned max_sampled parameter improves training efficiency

4. **Tradeoffs**: There's a tradeoff between precision and recall that should be considered based on the specific application requirements.

5. **Future Work**: Potential improvements include:
   - Incorporating temporal information
   - Using more advanced feature engineering
   - Exploring ensemble approaches combining multiple models
   - Implementing online learning for dynamic updates