In [1]:
import polars as pl
import numpy as np
# from utils.process_data import process_item_data, explode_user_interactions #TODO see if I need this
from parquet_data_reader import ParquetDataReader
from models.user_based import CollaborativeRecommender

pl.Config.set_tbl_cols(-1)

polars.config.Config

## Data import and EDA

In [2]:
data_reader = ParquetDataReader()
articles_df = data_reader.read_data('../data/articles.parquet')
train_behaviors_df = data_reader.read_data('../data/train/behaviors.parquet')
train_history_df = data_reader.read_data('../data/train/history.parquet')
document_vectors_df = data_reader.read_data('../data/document_vector.parquet')

We check the size of the data. From the sizes we learn that:
<ol>
  <li>We have 20738 unique articles</li>
  <li>We have 15143 unique users</li>
  <li>We have 232887 interactions in the testset</li>
</ol> 

In [3]:
print("articles_df has the size:         ", articles_df.shape)
print("train_behaviors_df has the size:  ", train_behaviors_df.shape)
print("train_history_df has the size:    ", train_history_df.shape)
print("document_vectors_df has the size: ", document_vectors_df.shape)

articles_df has the size:          (20738, 21)
train_behaviors_df has the size:   (232887, 17)
train_history_df has the size:     (15143, 5)
document_vectors_df has the size:  (125541, 2)


### Validation set

In [4]:
test_behaviours_df = data_reader.read_data('../data/validation/behaviors.parquet')
test_behaviours_df.head()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
96791,,2023-05-28 04:21:24,9.0,,2,"[9783865, 9784591, … 9784710]",[9784696],22548,False,,,,False,142,72.0,100.0
96798,,2023-05-28 04:31:48,46.0,,2,"[9782884, 9783865, … 9784648]",[9784281],22548,False,,,,False,143,16.0,28.0
96801,,2023-05-28 04:30:17,14.0,,2,"[9784648, 7184889, … 9781983]",[9784444],22548,False,,,,False,143,12.0,24.0
96808,,2023-05-28 04:27:19,22.0,,2,"[9784607, 9695098, … 9781983]",[9781983],22548,False,,,,False,142,125.0,80.0
96810,,2023-05-28 04:29:47,23.0,,2,"[9781983, 7184889, … 9781520]",[9784642],22548,False,,,,False,142,,


In [5]:
# Combine train and test behaviors
combined_df = pl.concat([train_behaviors_df, test_behaviours_df])

# Generate a random mask for splitting
n = combined_df.height  # Total number of rows
test_mask = np.random.rand(n) < 0.30  # 30% test, 70% train

# Apply the mask
test_behaviors_df = combined_df.filter(test_mask)
train_behaviors_df = combined_df.filter(~test_mask)

# Verify the split
print(f"Train size: {train_behaviors_df.shape[0]}, Test size: {test_behaviors_df.shape[0]}")

Train size: 333982, Test size: 143552


### Table contents

The information on news articles. As we are going to perform user-user CF, this table is not neccesary

In [6]:
articles_df.head()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var ikke den første""","""Politiet frygter nu, at Natasc…",2023-06-29 06:20:33,False,"""Sagen om den østriske Natascha…",2006-08-31 08:06:45,[3150850],"""article_default""","""https://ekstrabladet.dk/krimi/…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars tjente mere""","""Biografgængerne strømmer ind f…",2023-06-29 06:20:35,False,"""Vatikanet har opfordret til at…",2006-05-21 16:57:00,[3006712],"""article_default""","""https://ekstrabladet.dk/underh…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""
3012771,"""Morten Bruun fyret i Sønderjys…","""FODBOLD: Morten Bruun fyret me…",2023-06-29 06:20:39,False,"""Kemien mellem spillerne i Supe…",2006-05-01 14:28:40,[3177953],"""article_default""","""https://ekstrabladet.dk/sport/…",[],[],"[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",,,,0.8241,"""Negative"""
3023463,"""Luderne flytter på landet""","""I landets tyndest befolkede om…",2023-06-29 06:20:43,False,"""Det frække erhverv rykker på l…",2007-03-24 08:27:59,[3184029],"""article_default""","""https://ekstrabladet.dk/nyhede…",[],[],"[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",,,,0.7053,"""Neutral"""
3032577,"""Cybersex: Hvornår er man utro?""","""En flirtende sms til den flott…",2023-06-29 06:20:46,False,"""De fleste af os mener, at et t…",2007-01-18 10:30:37,[3030463],"""article_default""","""https://ekstrabladet.dk/sex_og…",[],[],"[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",,,,0.9307,"""Neutral"""


Each file consists of seven days of impression logs. The train_behaviors_df table contains all information about interactions between users and items, and can be used as a basis for user-user CF. <strong>Therefore we only need this table</strong>.

In [7]:
train_behaviors_df.head()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, … 9778682]",[9778623],143471,False,,,,False,1240,287.0,100.0
153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, … 9778682]",[9778669],151570,False,,,,False,1976,45.0,100.0
153070,9777492.0,2023-05-24 07:13:14,26.0,100.0,1,"[9020783, 9778444, … 9778628]",[9778628],151570,False,,,,False,1976,4.0,18.0
153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, … 9775990]",[9777492],151570,False,,,,False,1976,26.0,100.0
153075,9777492.0,2023-05-24 07:13:58,26.0,100.0,1,"[9778500, 9776420, … 9020783]",[9777034],151570,False,,,,False,1976,7.0,16.0


Each file consists of users' click histories collected over 21 days period. This table does contain the same values as the train_behaviours_df, but as that table is easier to work with we will use train_behaviours_df over this one

In [8]:
train_history_df.head()

user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[datetime[μs]],list[f32],list[i32],list[f32]
13538,"[2023-04-27 10:17:43, 2023-04-27 10:18:01, … 2023-05-17 20:36:34]","[100.0, 35.0, … 100.0]","[9738663, 9738569, … 9769366]","[17.0, 12.0, … 16.0]"
14241,"[2023-04-27 09:40:18, 2023-04-27 09:40:33, … 2023-05-17 17:08:41]","[100.0, 46.0, … 100.0]","[9738557, 9738528, … 9767852]","[8.0, 9.0, … 12.0]"
20396,"[2023-04-27 12:30:44, 2023-04-27 12:31:34, … 2023-05-17 10:59:44]","[100.0, 59.0, … 13.0]","[9738760, 9738355, … 9769679]","[49.0, 34.0, … 4.0]"
34912,"[2023-04-29 07:12:49, 2023-04-29 13:01:18, … 2023-05-18 05:06:40]","[100.0, 35.0, … 27.0]","[9741802, 9741804, … 9770882]","[153.0, 7.0, … 5.0]"
37953,"[2023-04-27 19:17:10, 2023-04-27 19:17:27, … 2023-05-17 21:29:22]","[14.0, 28.0, … 18.0]","[9739205, 9739202, … 9769306]","[4.0, 16.0, … 6.0]"


List of vectors for each article. This is used to describe the items. It could be used for item-item CF, but is not relevant to user-user CF.  <strong>This table will therefore not be used</strong>

In [9]:
document_vectors_df.head()

article_id,document_vector
i32,list[f32]
3000022,"[0.065424, -0.047425, … 0.035706]"
3000063,"[0.028815, -0.000166, … 0.027167]"
3000613,"[0.037971, 0.033923, … 0.063961]"
3000700,"[0.046524, 0.002913, … 0.023423]"
3000840,"[0.014737, 0.024068, … 0.045991]"


From the analasys we see that we only need train_behaviour_df to perform user-user CF

## Preprocessing

### Remove non-needed values

We see that we have several items that are not required for performing user-user CF

In [10]:
train_behaviors_df.head()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, … 9778682]",[9778623],143471,False,,,,False,1240,287.0,100.0
153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, … 9778682]",[9778669],151570,False,,,,False,1976,45.0,100.0
153070,9777492.0,2023-05-24 07:13:14,26.0,100.0,1,"[9020783, 9778444, … 9778628]",[9778628],151570,False,,,,False,1976,4.0,18.0
153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, … 9775990]",[9777492],151570,False,,,,False,1976,26.0,100.0
153075,9777492.0,2023-05-24 07:13:58,26.0,100.0,1,"[9778500, 9776420, … 9020783]",[9777034],151570,False,,,,False,1976,7.0,16.0


All information that does not describe a user, or a user-item interaction can therefore be removed

In [11]:
irelevant_columns = ["impression_time", "device_type", "article_ids_inview", "article_ids_clicked", "session_id", "next_read_time", "next_scroll_percentage"]
train_behaviors_df = train_behaviors_df.drop(irelevant_columns)
train_behaviors_df.head()

impression_id,article_id,read_time,scroll_percentage,user_id,is_sso_user,gender,postcode,age,is_subscriber
u32,i32,f32,f32,u32,bool,i8,i8,i8,bool
150528,,25.0,,143471,False,,,,False
153068,9778682.0,78.0,100.0,151570,False,,,,False
153070,9777492.0,26.0,100.0,151570,False,,,,False
153071,9778623.0,125.0,100.0,151570,False,,,,False
153075,9777492.0,26.0,100.0,151570,False,,,,False


The remaining items are the ones that can be used. But already here we see that we have several features with lacking information. We should therefore treat this

### Account for missing values

We see here that alot of the behaviours contain missing values. The therefore have to either remove or replace the values

In [12]:
print(train_behaviors_df.shape)
train_behaviors_df.null_count()

(333982, 10)


impression_id,article_id,read_time,scroll_percentage,user_id,is_sso_user,gender,postcode,age,is_subscriber
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,234815,0,236316,0,0,311382,327583,325266,0


In [13]:
train_behaviors_df = train_behaviors_df.filter(train_behaviors_df["article_id"].is_not_null())
print(train_behaviors_df.shape)
train_behaviors_df.null_count()

(99167, 10)


impression_id,article_id,read_time,scroll_percentage,user_id,is_sso_user,gender,postcode,age,is_subscriber
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,2531,0,0,92409,96914,95809,0


We see that of 70421, are there between 65-68000 missing values for gender, postcode and age. We therefore remove these as there is no use subsidizing them

In [14]:
train_behaviors_df = train_behaviors_df.drop(["gender", "postcode", "age"])
print(train_behaviors_df.shape)
train_behaviors_df.null_count()

(99167, 7)


impression_id,article_id,read_time,scroll_percentage,user_id,is_sso_user,is_subscriber
u32,u32,u32,u32,u32,u32,u32
0,0,0,2531,0,0,0


We still see that 1780/70421 rows are missing a scroll percentage. As this is very low (<3%) we can easily replace this. Intitially we just set scroll to 0

In [15]:
train_behaviors_df = train_behaviors_df.fill_null(strategy="zero")

### Account for multiple instances of the same article and user

By checking rows where the user_id and article_id are the same we see that we have 9855 instances where the user has read the same article multiple times

In [16]:
duplicates = train_behaviors_df.group_by(["article_id", "user_id"]).count().filter(pl.col("count") > 1)

print(duplicates)

shape: (12_373, 3)
┌────────────┬─────────┬───────┐
│ article_id ┆ user_id ┆ count │
│ ---        ┆ ---     ┆ ---   │
│ i32        ┆ u32     ┆ u32   │
╞════════════╪═════════╪═══════╡
│ 9787524    ┆ 1171451 ┆ 2     │
│ 9778945    ┆ 1943992 ┆ 2     │
│ 9786172    ┆ 1509061 ┆ 2     │
│ 9771127    ┆ 956441  ┆ 2     │
│ 9783237    ┆ 1929291 ┆ 2     │
│ …          ┆ …       ┆ …     │
│ 9759544    ┆ 1017436 ┆ 2     │
│ 9785668    ┆ 1907236 ┆ 3     │
│ 9769380    ┆ 2063711 ┆ 4     │
│ 9788497    ┆ 946450  ┆ 2     │
│ 9783850    ┆ 119106  ┆ 12    │
└────────────┴─────────┴───────┘


  duplicates = train_behaviors_df.group_by(["article_id", "user_id"]).count().filter(pl.col("count") > 1)


We see that we need to combine these duplicate rows. We therefore propose that for multiple instances of the same article and user, we combine the readtime and select the largest scroll percentage. This way we can preserve the data without having duplicates

In [17]:
train_behaviors_df = (
    train_behaviors_df
    .group_by(["article_id", "user_id"])
    .agg(
        pl.col("read_time").product().alias("total_readtime"),  # Multiply all readtime values
        pl.col("scroll_percentage").max().alias("max_scroll")  # Select the largest scroll percentage
    )
)

print(train_behaviors_df)

shape: (79_395, 4)
┌────────────┬─────────┬────────────────┬────────────┐
│ article_id ┆ user_id ┆ total_readtime ┆ max_scroll │
│ ---        ┆ ---     ┆ ---            ┆ ---        │
│ i32        ┆ u32     ┆ f32            ┆ f32        │
╞════════════╪═════════╪════════════════╪════════════╡
│ 9778302    ┆ 1566629 ┆ 69.0           ┆ 100.0      │
│ 9790942    ┆ 201350  ┆ 7.0            ┆ 100.0      │
│ 9783641    ┆ 1678250 ┆ 53.0           ┆ 100.0      │
│ 9790942    ┆ 1407646 ┆ 7623.0         ┆ 100.0      │
│ 9778939    ┆ 2246396 ┆ 34.0           ┆ 100.0      │
│ …          ┆ …       ┆ …              ┆ …          │
│ 9787264    ┆ 1300255 ┆ 47.0           ┆ 100.0      │
│ 9772963    ┆ 2124713 ┆ 178.0          ┆ 100.0      │
│ 9790272    ┆ 1022414 ┆ 229.0          ┆ 78.0       │
│ 9786268    ┆ 1939404 ┆ 357048.0       ┆ 100.0      │
│ 9776369    ┆ 1938976 ┆ 57.0           ┆ 100.0      │
└────────────┴─────────┴────────────────┴────────────┘


## Model Fit

In [18]:
recommender = CollaborativeRecommender(train_behaviors_df)
recommender.fit()

{1566629: [(207620, np.float64(0.9968324191215926)),
  (157768, np.float64(0.9968324191215926)),
  (63392, np.float64(0.975233720194176)),
  (773943, np.float64(0.8925199497591783)),
  (1019260, np.float64(0.8297078387113909)),
  (2325549, np.float64(0.787036580905828)),
  (1747113, np.float64(0.6913405132981456)),
  (694059, np.float64(0.6048152900700846)),
  (2164366, np.float64(0.6036084239608379)),
  (1600660, np.float64(0.5764819706247039))],
 201350: [(176267, np.float64(0.9999782438468509)),
  (1907419, np.float64(0.9999782438468509)),
  (1173066, np.float64(0.9999782438468509)),
  (2265375, np.float64(0.9999782438468507)),
  (14241, np.float64(0.9999782438468507)),
  (2317323, np.float64(0.9999782438468507)),
  (2051126, np.float64(0.9999782438468507)),
  (34748, np.float64(0.9999782438468507)),
  (1281654, np.float64(0.9999782438468507)),
  (1875404, np.float64(0.9999782436342861))],
 1678250: [(1425157, np.float64(0.9369954244134808)),
  (82351, np.float64(0.9369954244134808)

Of the original 15143 users, only 9194 can be accounted for with the current solution. This should be changed in the future

## Model presentation

In [19]:
for user in [630220, 620796, 1067393, 1726258, 17205]:
    print("reccomended for user ", user, ": ", recommender.recommend_n_articles(user_id=user, n=5, allow_read=True))

reccomended for user  630220 :  [9782722, 9784506, 9776406, 9778035, 9785209]
reccomended for user  620796 :  [9783865, 9771995, 9765753, 9769580, 9775978]
reccomended for user  1067393 :  [9787098, 9786906, 9776508, 9773341, 9773887]
reccomended for user  1726258 :  [9771224, 9786209, 9766752, 9780406, 9773464]
reccomended for user  17205 :  [9780325, 9754241, 9780860, 9765941, 9788352]


In [20]:
results = recommender.evaluate_recommender(test_behaviours_df, k=100, n_jobs=4, user_sample=200, allow_read=False)
print("Results")
results

Results


{'MAP@K': np.float64(0.0005952380952380953),
 'NDCG@K': np.float64(0.0019411719648461262)}

In [21]:
results = recommender.evaluate_recommender(test_behaviours_df, k=100, n_jobs=4, user_sample=200, allow_read=True)
print("Results")
results

Results


{'MAP@K': np.float64(0.00963855421686747),
 'NDCG@K': np.float64(0.21365806551523156)}

## Model Experimentation

In [27]:
test_user_id = 630220

predictions = recommender.recommend_n_articles(user_id=test_user_id, n=1000)
results = set(test_behaviours_df.filter(pl.col("user_id") == test_user_id)["article_id"])

print(results)
print(predictions)

for prediction in predictions:
    if prediction in results:
        print("Yes")

{9786243, 9787524, 9781902, 9784591, 9783824, 9786111, 9776916, 9779615, 9788705, 9789473, 9428643, 9783334, 9782315, 9756075, 9787441, 9782722, 9786821, 9782726, 9786566, 9789896, 9787465, 9788362, 9791049, None, 9782092, 9780815, 9783509, 9772508, 9786718, 9786719, 9787487, 9790942, 9783655, 9786351, 9780849, 9781875, 9788661, 9781878, 9787510, 9786618, 9673979, 9780348, 9781887}
[9784506, 9776406, 9778035, 9785209, 9777320, 9786378, 9786932, 9785668, 9779723, 9783137, 9782468, 9788677, 9783317, 9780962, 9770288, 9790052, 9785009, 9778328, 9789703, 9775697, 9780284, 9754241, 9788134, 9782131, 9790272, 9789754, 9785260, 9777296, 9781520, 9779227, 9790987, 9779577, 9781785, 9789997, 9761784, 9789623, 9787863, 9778845, 9773574, 9776040, 9777941, 9772903, 9781389, 9788666, 9777299, 9776148, 9779242, 9787586, 9777319, 9781057, 9776287, 9781624, 9788108, 9777910, 9784444, 9777345, 9788621, 9772453, 9778375, 9778804, 9782993, 9779269, 9782695, 9786359, 9788524, 9782407, 9789711, 9778915, 97

In [28]:
test_user_id = 630220

predictions = recommender.recommend_n_articles(user_id=test_user_id, n=1000, allow_read=True)
results = set(test_behaviours_df.filter(pl.col("user_id") == test_user_id)["article_id"])

print(results)
print(predictions)

for prediction in predictions:
    if prediction in results:
        print("Yes")

{9786243, 9787524, 9781902, 9784591, 9783824, 9786111, 9776916, 9779615, 9788705, 9789473, 9428643, 9783334, 9782315, 9756075, 9787441, 9782722, 9786821, 9782726, 9786566, 9789896, 9787465, 9788362, 9791049, None, 9782092, 9780815, 9783509, 9772508, 9786718, 9786719, 9787487, 9790942, 9783655, 9786351, 9780849, 9781875, 9788661, 9781878, 9787510, 9786618, 9673979, 9780348, 9781887}
[9782722, 9784506, 9776406, 9778035, 9785209, 9777320, 9786378, 9786932, 9785668, 9779723, 9783137, 9782468, 9783509, 9788677, 9783317, 9780962, 9770288, 9790052, 9785009, 9776862, 9778328, 9789703, 9775697, 9780284, 9754241, 9788134, 9782131, 9790272, 9789754, 9785260, 9777296, 9781520, 9779227, 9790987, 9779577, 9781785, 9789997, 9761784, 9789623, 9787863, 9778845, 9780020, 9773574, 9776040, 9777941, 9772903, 9781389, 9788666, 9777299, 9779777, 9779242, 9776148, 9787586, 9777319, 9781057, 9776287, 9781624, 9777910, 9788108, 9784444, 9777345, 9784591, 9788621, 9790942, 9772453, 9778804, 9778375, 9782993, 97