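## Demonstration of the user-item bias baseline model

This notebook loads the behavior and article data, fits `UserItemBiasRecommender`, shows example recommendations and predictions, and evaluates Precision@K and NDCG@K on the validation behaviors.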


In [39]:
import sys
import os

# Make the parent of the current working directory importable so the
# `utils`, `models` and `parquet_data_reader` imports below can be resolved.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not add duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from utils.process_data import user_item_interaction_scores
from utils.data_exploration import perform_eda
from parquet_data_reader import ParquetDataReader
from models.baseline import UserItemBiasRecommender

import polars as pl
import numpy as np
# Shared reader instance; the data-loading cell below calls its read_data() for every parquet file.
parquet_reader = ParquetDataReader()

### Reading data

In [40]:
# Load the four parquet inputs in a fixed order: training behaviors, document
# vectors, articles, and the validation behaviors (used later as test data).
train_behavior_df, embeddings_df, article_df, test_behavior_df = [
    parquet_reader.read_data(parquet_path)
    for parquet_path in (
        "../../data/train/behaviors.parquet",
        "../../data/document_vector.parquet",
        "../../data/articles.parquet",
        "../../data/validation/behaviors.parquet",
    )
]

# Turn the raw training behaviors into one scored interaction per row
# (user_id, article_id, impression_time, score), using the articles table as context.
processed_behavior_df = user_item_interaction_scores(
    train_behavior_df,
    article=article_df,
)

In [41]:
# Exploratory data analysis of the articles dataframe.
# As the saved output below shows, this prints the schema and the describe() summary.
perform_eda(article_df, name="Articles Dataframe")

=== Articles Dataframe Schema ===
Schema([('article_id', Int32), ('title', String), ('subtitle', String), ('last_modified_time', Datetime(time_unit='us', time_zone=None)), ('premium', Boolean), ('body', String), ('published_time', Datetime(time_unit='us', time_zone=None)), ('image_ids', List(Int64)), ('article_type', String), ('url', String), ('ner_clusters', List(String)), ('entity_groups', List(String)), ('topics', List(String)), ('category', Int16), ('subcategory', List(Int16)), ('category_str', String), ('total_inviews', Int32), ('total_pageviews', Int32), ('total_read_time', Float32), ('sentiment_score', Float32), ('sentiment_label', String)])

=== Articles Dataframe describe() ===
shape: (9, 22)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ article_i ┆ title     ┆ subtitle  ┆ … ┆ total_pag ┆ total_rea ┆ sentiment ┆ sentimen │
│ ---       ┆ d         ┆ ---       ┆ ---       ┆   ┆ eviews    ┆ d_time    ┆ _score   

In [42]:
# Exploratory data analysis of the processed behavior data (the model's training input).
# As the saved output below shows, this prints the schema and the describe() summary.
# NOTE(review): the saved describe() output reports 1345 null `score` values out of
# 53349 rows — confirm that the recommender handles (or drops) null scores.
perform_eda(processed_behavior_df, name="Processed Behavior Dataframe")

=== Processed Behavior Dataframe Schema ===
Schema([('user_id', UInt32), ('article_id', Int32), ('impression_time', Datetime(time_unit='us', time_zone=None)), ('score', Float64)])

=== Processed Behavior Dataframe describe() ===
shape: (9, 5)
┌────────────┬───────────────┬───────────────┬────────────────────────────┬──────────┐
│ statistic  ┆ user_id       ┆ article_id    ┆ impression_time            ┆ score    │
│ ---        ┆ ---           ┆ ---           ┆ ---                        ┆ ---      │
│ str        ┆ f64           ┆ f64           ┆ str                        ┆ f64      │
╞════════════╪═══════════════╪═══════════════╪════════════════════════════╪══════════╡
│ count      ┆ 53349.0       ┆ 53349.0       ┆ 53349                      ┆ 52004.0  │
│ null_count ┆ 0.0           ┆ 0.0           ┆ 0                          ┆ 1345.0   │
│ mean       ┆ 1.3239e6      ┆ 9.7647e6      ┆ 2023-05-21 18:14:12.956868 ┆ 0.627594 │
│ std        ┆ 733898.681736 ┆ 135439.146612 ┆ null          

### Create and fit the model

In [43]:
# Build the baseline recommender from the processed interaction scores.
model = UserItemBiasRecommender(processed_behavior_df)
# fit() is called without arguments, so it presumably trains on the dataframe
# passed to the constructor — TODO confirm against models/baseline.
model.fit()


### Recommendations

In [44]:
# Recommended article ids for a single example user.
example_user_id = 2423448
model.recommend(example_user_id)

[9514727, 9667501, 9714376, 9419945, 9761391]

In [45]:
# Predicted score for one (user id, article id) pair: the same example user as in
# the recommendation cell, and an article taken from that cell's saved output.
example_user_id = 2423448
example_article_id = 9714376
model.predict(example_user_id, example_article_id)

0.841934084892273

### Evaluation

In [46]:
# Evaluation settings kept together so they are easy to tune.
# NOTE(review): `user_sample` presumably limits evaluation to a sample of users and
# `n_jobs` presumably sets the number of parallel workers — confirm in the model code.
eval_settings = dict(k=5, n_jobs=4, user_sample=1000)

# Evaluate the fitted model on the validation behaviors.
# NOTE(review): the saved output reports Precision@K = 0.0 and NDCG@K = 0.0. Metrics of
# exactly zero usually point to an evaluation problem (e.g. mismatched ids or an
# unexpected test-data format) rather than a merely weak model — investigate before
# relying on these numbers.
results = model.evaluate_recommender(test_data=test_behavior_df, **eval_settings)
print("Results")
results

Results


{'Precision@K': np.float64(0.0), 'NDCG@K': np.float64(0.0)}