In [11]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)
from parquet_data_reader import ParquetDataReader
from models.ring_buffer_baseline import RingBufferBaseline


import polars as pl
pl.Config.set_tbl_cols(-1)
import numpy as np
parquet_reader = ParquetDataReader()

In [12]:
import polars as pl
import numpy as np

# Read the parquet data.
train_behavior_df = parquet_reader.read_data("../../data/train/behaviors.parquet")
test_behaviours_df = parquet_reader.read_data('../../data/validation/behaviors.parquet')

# Process the training data.
processed_train_behavior_df = train_behavior_df.explode("article_ids_clicked")
processed_train_behavior_df = processed_train_behavior_df.filter(pl.col("article_ids_clicked").is_not_null())
processed_train_behavior_df = processed_train_behavior_df.filter(pl.col("article_id").is_not_null())  # TODO: IMPORTANT

# Process the testing data.
processed_test_behaviours_df = test_behaviours_df.explode("article_ids_clicked")
processed_test_behaviours_df = processed_test_behaviours_df.filter(pl.col("article_ids_clicked").is_not_null())
processed_test_behaviours_df = processed_test_behaviours_df.filter(pl.col("article_id").is_not_null())

# Concatenate processed training and testing data.
combined_df = pl.concat([processed_train_behavior_df, processed_test_behaviours_df])

# ----- Method 1: Random Split -----
n = combined_df.height  # Total number of rows
test_mask = np.random.rand(n) < 0.30  # 30% test, 70% train

# Apply the mask.
test_random = combined_df.filter(test_mask)
train_random = combined_df.filter(~test_mask)

print("Random Split:")
print("Train shape:", train_random.shape)
print("Test shape:", test_random.shape)


# ----- Method 2: Time-based Split -----
# Sort the combined DataFrame by impression_time and a secondary key (impression_id)
combined_df_time = combined_df.sort(["impression_time", "impression_id"])

# Compute the number of rows for the test set (30% of the data).
n_total = combined_df_time.height
n_test = int(n_total * 0.30)

# Split the DataFrame based on time:
# - Test: the oldest 30% interactions.
# - Train: the newest 70% interactions.
test_time = combined_df_time.head(n_test)
train_time = combined_df_time.tail(n_total - n_test)

print("\nTime-based Split:")
print("Train shape:", train_time.shape)
print("Test shape:", test_time.shape)


Random Split:
Train shape: (99739, 17)
Test shape: (42417, 17)

Time-based Split:
Train shape: (99510, 17)
Test shape: (42646, 17)


In [13]:
recommender = RingBufferBaseline(behaviors=train_random)
recommender.fit()

user_id_test = 151570
recommendations = recommender.recommend(user_id=user_id_test, n=5)
# Prints the recommended articles and the user id
# Recommendations for user X
print(f"Recommendations for user {user_id_test}:")
print(recommendations)


Recommendations for user 151570:
[9770989, 9771042, 9770882, 9769650, 9771042]


In [14]:
recommender2 = RingBufferBaseline(behaviors=train_time)
recommender2.fit()

user_id_test2 = 151570
recommendations2 = recommender.recommend(user_id=user_id_test2, n=5)
# Prints the recommended articles and the user id
# Recommendations for user X
print(f"Recommendations for user {user_id_test2}:")
print(recommendations2)

Recommendations for user 151570:
[9770989, 9771042, 9770882, 9769650, 9771042]


In [15]:
# Evaluate the recommender using the same data as test data.
metrics = recommender.evaluate(test_data=test_random, k=5)
print("\nEvaluation metrics (precision and recall at k):")
print(metrics)


# Evaluate the recommender using the same data as test data.
metrics2 = recommender2.evaluate(test_data=test_time, k=5)
print("\nEvaluation metrics (precision and recall at k):")
print(metrics2)



Evaluation metrics (precision and recall at k):
{'precision': np.float64(0.0015333333333333332), 'recall': np.float64(0.0015454534325243274), 'fpr': np.float64(0.002176436721272252)}

Evaluation metrics (precision and recall at k):
{'precision': np.float64(0.014185981569394024), 'recall': np.float64(0.022623360242550565), 'fpr': np.float64(0.00430274981628095)}
