In [1]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

import polars as pl
import numpy as np

from parquet_data_reader import ParquetDataReader
from models.item_based import ItemBasedCollaborativeRecommender

pl.Config.set_tbl_cols(-1)

polars.config.Config

## Data import and EDA

In [2]:
data_reader = ParquetDataReader()
articles_df = data_reader.read_data('../../data/articles.parquet')
train_behaviors_df = data_reader.read_data('../../data/train/behaviors.parquet')
train_history_df = data_reader.read_data('../../data/train/history.parquet')
document_vectors_df = data_reader.read_data('../../data/document_vector.parquet')

We check the size of the data. From the sizes we learn that:
<ol>
  <li>We have 20738 unique articles</li>
  <li>We have 15143 unique users</li>
  <li>We have 232887 interactions in the testset</li>
</ol> 

In [3]:
print("articles_df has the size:         ", articles_df.shape)
print("train_behaviors_df has the size:  ", train_behaviors_df.shape)
print("train_history_df has the size:    ", train_history_df.shape)
print("document_vectors_df has the size: ", document_vectors_df.shape)

articles_df has the size:          (20738, 21)
train_behaviors_df has the size:   (232887, 17)
train_history_df has the size:     (15143, 5)
document_vectors_df has the size:  (125541, 2)


### Validation set

In [4]:
test_behaviours_df = data_reader.read_data('../../data/validation/behaviors.parquet')
test_behaviours_df.head()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
96791,,2023-05-28 04:21:24,9.0,,2,"[9783865, 9784591, … 9784710]",[9784696],22548,False,,,,False,142,72.0,100.0
96798,,2023-05-28 04:31:48,46.0,,2,"[9782884, 9783865, … 9784648]",[9784281],22548,False,,,,False,143,16.0,28.0
96801,,2023-05-28 04:30:17,14.0,,2,"[9784648, 7184889, … 9781983]",[9784444],22548,False,,,,False,143,12.0,24.0
96808,,2023-05-28 04:27:19,22.0,,2,"[9784607, 9695098, … 9781983]",[9781983],22548,False,,,,False,142,125.0,80.0
96810,,2023-05-28 04:29:47,23.0,,2,"[9781983, 7184889, … 9781520]",[9784642],22548,False,,,,False,142,,


In [5]:
# Combine train and test behaviors
combined_df = pl.concat([train_behaviors_df, test_behaviours_df])

# Generate a random mask for splitting
n = combined_df.height  # Total number of rows
test_mask = np.random.rand(n) < 0.30  # 30% test, 70% train

# Apply the mask
test_behaviors_df = combined_df.filter(test_mask)
train_behaviors_df = combined_df.filter(~test_mask)

# Verify the split
print(f"Train size: {train_behaviors_df.shape[0]}, Test size: {test_behaviors_df.shape[0]}")

Train size: 334227, Test size: 143307


### Table contents

The information on news articles. It therefore is <strong>very important</strong> as it contains information to compare items by. 

In [6]:
articles_df.head()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var ikke den første""","""Politiet frygter nu, at Natasc…",2023-06-29 06:20:33,False,"""Sagen om den østriske Natascha…",2006-08-31 08:06:45,[3150850],"""article_default""","""https://ekstrabladet.dk/krimi/…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars tjente mere""","""Biografgængerne strømmer ind f…",2023-06-29 06:20:35,False,"""Vatikanet har opfordret til at…",2006-05-21 16:57:00,[3006712],"""article_default""","""https://ekstrabladet.dk/underh…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""
3012771,"""Morten Bruun fyret i Sønderjys…","""FODBOLD: Morten Bruun fyret me…",2023-06-29 06:20:39,False,"""Kemien mellem spillerne i Supe…",2006-05-01 14:28:40,[3177953],"""article_default""","""https://ekstrabladet.dk/sport/…",[],[],"[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",,,,0.8241,"""Negative"""
3023463,"""Luderne flytter på landet""","""I landets tyndest befolkede om…",2023-06-29 06:20:43,False,"""Det frække erhverv rykker på l…",2007-03-24 08:27:59,[3184029],"""article_default""","""https://ekstrabladet.dk/nyhede…",[],[],"[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",,,,0.7053,"""Neutral"""
3032577,"""Cybersex: Hvornår er man utro?""","""En flirtende sms til den flott…",2023-06-29 06:20:46,False,"""De fleste af os mener, at et t…",2007-01-18 10:30:37,[3030463],"""article_default""","""https://ekstrabladet.dk/sex_og…",[],[],"[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",,,,0.9307,"""Neutral"""


Each file consists of seven days of impression logs. The train_behaviors_df table contains all interactions between users and items and is therefore important for our item-item CF.

In [7]:
train_behaviors_df.head()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, … 9778728]",[9778657],139836,False,,,,False,759,7.0,22.0
150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, … 9778682]",[9778623],143471,False,,,,False,1240,287.0,100.0
153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, … 9778682]",[9778669],151570,False,,,,False,1976,45.0,100.0
153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, … 9775990]",[9777492],151570,False,,,,False,1976,26.0,100.0
153078,9777492.0,2023-05-24 07:13:46,7.0,100.0,1,"[9778021, 9778627, … 7213923]",[9778226],151570,False,,,,False,1976,4.0,21.0


Each file consists of users' click histories collected over 21 days period. This table does contain the same values as the train_behaviours_df, but as that table is easier to work with we will use train_behaviours_df over this one

In [8]:
train_history_df.head()

user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[datetime[μs]],list[f32],list[i32],list[f32]
13538,"[2023-04-27 10:17:43, 2023-04-27 10:18:01, … 2023-05-17 20:36:34]","[100.0, 35.0, … 100.0]","[9738663, 9738569, … 9769366]","[17.0, 12.0, … 16.0]"
14241,"[2023-04-27 09:40:18, 2023-04-27 09:40:33, … 2023-05-17 17:08:41]","[100.0, 46.0, … 100.0]","[9738557, 9738528, … 9767852]","[8.0, 9.0, … 12.0]"
20396,"[2023-04-27 12:30:44, 2023-04-27 12:31:34, … 2023-05-17 10:59:44]","[100.0, 59.0, … 13.0]","[9738760, 9738355, … 9769679]","[49.0, 34.0, … 4.0]"
34912,"[2023-04-29 07:12:49, 2023-04-29 13:01:18, … 2023-05-18 05:06:40]","[100.0, 35.0, … 27.0]","[9741802, 9741804, … 9770882]","[153.0, 7.0, … 5.0]"
37953,"[2023-04-27 19:17:10, 2023-04-27 19:17:27, … 2023-05-17 21:29:22]","[14.0, 28.0, … 18.0]","[9739205, 9739202, … 9769306]","[4.0, 16.0, … 6.0]"


List of vectors for each article. This is used to describe the items and therefore importaint for our item-item CF. But we need to change the format of the document_vector as now it isn't very useful

In [9]:
document_vectors_df.head()

article_id,document_vector
i32,list[f32]
3000022,"[0.065424, -0.047425, … 0.035706]"
3000063,"[0.028815, -0.000166, … 0.027167]"
3000613,"[0.037971, 0.033923, … 0.063961]"
3000700,"[0.046524, 0.002913, … 0.023423]"
3000840,"[0.014737, 0.024068, … 0.045991]"


From the analasys we see that we only need train_behaviour_df to perform user-user CF

## Preprocessing

### User-Item Interactions

We see that we have several items that are not required for performing user-user CF. There is both excess interaction information as well as user information that we do not need

In [10]:
train_behaviors_df.head()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, … 9778728]",[9778657],139836,False,,,,False,759,7.0,22.0
150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, … 9778682]",[9778623],143471,False,,,,False,1240,287.0,100.0
153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, … 9778682]",[9778669],151570,False,,,,False,1976,45.0,100.0
153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, … 9775990]",[9777492],151570,False,,,,False,1976,26.0,100.0
153078,9777492.0,2023-05-24 07:13:46,7.0,100.0,1,"[9778021, 9778627, … 7213923]",[9778226],151570,False,,,,False,1976,4.0,21.0


All information that does not describe a user, or a user-item interaction can therefore be removed

In [11]:
relevant_columns = ["impression_id", "article_id", "user_id", "scroll_percentage", "read_time"]
train_behaviors_df = train_behaviors_df.select(relevant_columns) 
train_behaviors_df.head()

impression_id,article_id,user_id,scroll_percentage,read_time
u32,i32,u32,f32,f32
149474,,139836,,13.0
150528,,143471,,25.0
153068,9778682.0,151570,100.0,78.0
153071,9778623.0,151570,100.0,125.0
153078,9777492.0,151570,100.0,7.0


The remaining items are the ones that can be used. But already here we see that we have several features with lacking information. We should therefore treat this.

In [12]:
print(train_behaviors_df.shape)
train_behaviors_df.null_count()

(334227, 5)


impression_id,article_id,user_id,scroll_percentage,read_time
u32,u32,u32,u32,u32
0,234919,0,236407,0


In [13]:
train_behaviors_df = train_behaviors_df.filter(train_behaviors_df["article_id"].is_not_null())
print(train_behaviors_df.shape)
train_behaviors_df.null_count()

(99308, 5)


impression_id,article_id,user_id,scroll_percentage,read_time
u32,u32,u32,u32,u32
0,0,0,2528,0


We still see that 2523/99283 rows are missing a scroll percentage. As this is very low (<3%) we can easily replace this. Intitially we just set scroll to 0

In [14]:
train_behaviors_df = train_behaviors_df.fill_null(strategy="zero")
train_behaviors_df.head()

impression_id,article_id,user_id,scroll_percentage,read_time
u32,i32,u32,f32,f32
153068,9778682,151570,100.0,78.0
153071,9778623,151570,100.0,125.0
153078,9777492,151570,100.0,7.0
155587,9778627,161621,100.0,50.0
155589,9778375,161621,100.0,119.0


#### Article information

We have alot of information about articles that can be removed and combined to create one big matrix

In [15]:
articles_df.head()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var ikke den første""","""Politiet frygter nu, at Natasc…",2023-06-29 06:20:33,False,"""Sagen om den østriske Natascha…",2006-08-31 08:06:45,[3150850],"""article_default""","""https://ekstrabladet.dk/krimi/…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars tjente mere""","""Biografgængerne strømmer ind f…",2023-06-29 06:20:35,False,"""Vatikanet har opfordret til at…",2006-05-21 16:57:00,[3006712],"""article_default""","""https://ekstrabladet.dk/underh…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""
3012771,"""Morten Bruun fyret i Sønderjys…","""FODBOLD: Morten Bruun fyret me…",2023-06-29 06:20:39,False,"""Kemien mellem spillerne i Supe…",2006-05-01 14:28:40,[3177953],"""article_default""","""https://ekstrabladet.dk/sport/…",[],[],"[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",,,,0.8241,"""Negative"""
3023463,"""Luderne flytter på landet""","""I landets tyndest befolkede om…",2023-06-29 06:20:43,False,"""Det frække erhverv rykker på l…",2007-03-24 08:27:59,[3184029],"""article_default""","""https://ekstrabladet.dk/nyhede…",[],[],"[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",,,,0.7053,"""Neutral"""
3032577,"""Cybersex: Hvornår er man utro?""","""En flirtende sms til den flott…",2023-06-29 06:20:46,False,"""De fleste af os mener, at et t…",2007-01-18 10:30:37,[3030463],"""article_default""","""https://ekstrabladet.dk/sex_og…",[],[],"[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",,,,0.9307,"""Neutral"""


In [16]:
articles_df = articles_df.drop(["title", "subtitle", "last_modified_time", "body", "published_time", "image_ids", "url"])
articles_df.head()

article_id,premium,article_type,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,bool,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,False,"""article_default""",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,False,"""article_default""",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""
3012771,False,"""article_default""",[],[],"[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",,,,0.8241,"""Negative"""
3023463,False,"""article_default""",[],[],"[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",,,,0.7053,"""Neutral"""
3032577,False,"""article_default""",[],[],"[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",,,,0.9307,"""Neutral"""


We also see that total_inviews, total_pageviews and total_read_time contain alot of null values. We should fix this

In [17]:
print("Total articles:", articles_df.__len__())
articles_df.null_count()

Total articles: 20738


article_id,premium,article_type,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,10770,10882,10882,0,0


We see that around half of the articles have missing values for total_inviews, total_pageviews and total_read_time. We therefore remove these values

In [18]:
articles_df = articles_df.drop(["total_inviews", "total_pageviews", "total_read_time"])
articles_df.head()

article_id,premium,article_type,ner_clusters,entity_groups,topics,category,subcategory,category_str,sentiment_score,sentiment_label
i32,bool,str,list[str],list[str],list[str],i16,list[i16],str,f32,str
3001353,False,"""article_default""",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",0.9955,"""Negative"""
3003065,False,"""article_default""",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",0.846,"""Positive"""
3012771,False,"""article_default""",[],[],"[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",0.8241,"""Negative"""
3023463,False,"""article_default""",[],[],"[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",0.7053,"""Neutral"""
3032577,False,"""article_default""",[],[],"[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",0.9307,"""Neutral"""


As ner_clusters and entity_groups are quite empty, we remove them

In [19]:
articles_df = articles_df.drop(["ner_clusters", "entity_groups"])
articles_df.head()

article_id,premium,article_type,topics,category,subcategory,category_str,sentiment_score,sentiment_label
i32,bool,str,list[str],i16,list[i16],str,f32,str
3001353,False,"""article_default""","[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",0.9955,"""Negative"""
3003065,False,"""article_default""","[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",0.846,"""Positive"""
3012771,False,"""article_default""","[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",0.8241,"""Negative"""
3023463,False,"""article_default""","[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",0.7053,"""Neutral"""
3032577,False,"""article_default""","[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",0.9307,"""Neutral"""


We also have to expand the topics and subcategories, as we would rather have seperate columns for the different topics ad subcategories rather than working with the arrays. 

The problem here is that we have >100 subcategories. So expanding our list for both topic and subcategory would result in >260 columns. Therefore for this solution I have decided to remove subcategory for simplicity and only focus on topics and category

In [20]:
exploded_topics = articles_df.explode("topics")

topics_wide = (
    exploded_topics
    .with_columns(pl.lit(1).alias("value"))
    .pivot(index="article_id", columns="topics", values="value", aggregate_function="first")
    .fill_null(0)
)

# Rename topic columns to have "topic_" prefix
topics_wide = topics_wide.rename({col: f"topic_{col}" for col in topics_wide.columns if col != "article_id"})

# ---- Join everything back ----
articles_df = (
    articles_df.drop(["topics", "subcategory", "category_str"])
    .join(topics_wide, on="article_id")
)

articles_df.head()

  .pivot(index="article_id", columns="topics", values="value", aggregate_function="first")


article_id,premium,article_type,category,sentiment_score,sentiment_label,topic_Kriminalitet,topic_Personfarlig kriminalitet,topic_Underholdning,topic_Film og tv,topic_Økonomi,topic_Erhverv,topic_Kendt,topic_Sport,topic_Fodbold,topic_Ansættelsesforhold,topic_Livsstil,topic_Erotik,topic_Partnerskab,topic_Kultur,topic_Mad og drikke,topic_Privat virksomhed,topic_Offentlig instans,topic_Politik,topic_National politik,topic_Transportmiddel,topic_Begivenhed,topic_Personlig begivenhed,topic_Bolig,topic_Sportsbegivenhed,topic_Makro,topic_Køb og salg,topic_Bil,topic_Mikro,topic_Underholdningsbegivenhed,topic_Samfund,topic_Værdier,topic_Konflikt og krig,topic_Religion,topic_Litteratur,topic_Katastrofe,topic_Mindre ulykke,topic_Større transportmiddel,topic_Bandekriminalitet,topic_Håndbold,topic_Familieliv,topic_Krop og velvære,topic_Sundhed,topic_Sygdom og behandling,topic_Offentlig transport,topic_International politik,topic_Musik og lyd,topic_Reality,topic_Byliv,topic_Rejse,topic_Uddannelse,topic_Ungdomsuddannelse,topic_Grundskole,topic_Videnskab,topic_Samfundsvidenskab og humaniora,topic_Cykling,topic_Videregående uddannelse,topic_Dyr,topic_Kosmetisk behandling,topic_Teknologi,topic_Bedrageri,topic_Fritid,topic_Museum og seværdighed,topic_Naturvidenskab,topic_Renovering og indretning,topic_Udlejning,topic_Bæredygtighed og klima,topic_Tendenser,topic_Væbnet konflikt,topic_Mindre transportmiddel,topic_Vejr,topic_Ketcher- og batsport,topic_Motorsport,topic_null,topic_Kunstig intelligens og software,topic_Kunst,topic_Terror,topic_Større katastrofe,topic_Forbrugerelektronik,topic_Mærkedag
i32,bool,str,i16,f32,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
3001353,False,"""article_default""",140,0.9955,"""Negative""",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3003065,False,"""article_default""",414,0.846,"""Positive""",0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3012771,False,"""article_default""",142,0.8241,"""Negative""",0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3023463,False,"""article_default""",118,0.7053,"""Neutral""",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3032577,False,"""article_default""",565,0.9307,"""Neutral""",0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


As we can't work with strings, we remove article_type and sentiment_label. These are already described by other columns

In [21]:
articles_df = articles_df.drop(["article_type", "sentiment_label"])
articles_df.head()

article_id,premium,category,sentiment_score,topic_Kriminalitet,topic_Personfarlig kriminalitet,topic_Underholdning,topic_Film og tv,topic_Økonomi,topic_Erhverv,topic_Kendt,topic_Sport,topic_Fodbold,topic_Ansættelsesforhold,topic_Livsstil,topic_Erotik,topic_Partnerskab,topic_Kultur,topic_Mad og drikke,topic_Privat virksomhed,topic_Offentlig instans,topic_Politik,topic_National politik,topic_Transportmiddel,topic_Begivenhed,topic_Personlig begivenhed,topic_Bolig,topic_Sportsbegivenhed,topic_Makro,topic_Køb og salg,topic_Bil,topic_Mikro,topic_Underholdningsbegivenhed,topic_Samfund,topic_Værdier,topic_Konflikt og krig,topic_Religion,topic_Litteratur,topic_Katastrofe,topic_Mindre ulykke,topic_Større transportmiddel,topic_Bandekriminalitet,topic_Håndbold,topic_Familieliv,topic_Krop og velvære,topic_Sundhed,topic_Sygdom og behandling,topic_Offentlig transport,topic_International politik,topic_Musik og lyd,topic_Reality,topic_Byliv,topic_Rejse,topic_Uddannelse,topic_Ungdomsuddannelse,topic_Grundskole,topic_Videnskab,topic_Samfundsvidenskab og humaniora,topic_Cykling,topic_Videregående uddannelse,topic_Dyr,topic_Kosmetisk behandling,topic_Teknologi,topic_Bedrageri,topic_Fritid,topic_Museum og seværdighed,topic_Naturvidenskab,topic_Renovering og indretning,topic_Udlejning,topic_Bæredygtighed og klima,topic_Tendenser,topic_Væbnet konflikt,topic_Mindre transportmiddel,topic_Vejr,topic_Ketcher- og batsport,topic_Motorsport,topic_null,topic_Kunstig intelligens og software,topic_Kunst,topic_Terror,topic_Større katastrofe,topic_Forbrugerelektronik,topic_Mærkedag
i32,bool,i16,f32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
3001353,False,140,0.9955,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3003065,False,414,0.846,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3012771,False,142,0.8241,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3023463,False,118,0.7053,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3032577,False,565,0.9307,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Lastl, we change premium from a boolean to 0 and 1

In [22]:
articles_df = articles_df.with_columns(articles_df["premium"].cast(pl.Int32))
articles_df.head()

article_id,premium,category,sentiment_score,topic_Kriminalitet,topic_Personfarlig kriminalitet,topic_Underholdning,topic_Film og tv,topic_Økonomi,topic_Erhverv,topic_Kendt,topic_Sport,topic_Fodbold,topic_Ansættelsesforhold,topic_Livsstil,topic_Erotik,topic_Partnerskab,topic_Kultur,topic_Mad og drikke,topic_Privat virksomhed,topic_Offentlig instans,topic_Politik,topic_National politik,topic_Transportmiddel,topic_Begivenhed,topic_Personlig begivenhed,topic_Bolig,topic_Sportsbegivenhed,topic_Makro,topic_Køb og salg,topic_Bil,topic_Mikro,topic_Underholdningsbegivenhed,topic_Samfund,topic_Værdier,topic_Konflikt og krig,topic_Religion,topic_Litteratur,topic_Katastrofe,topic_Mindre ulykke,topic_Større transportmiddel,topic_Bandekriminalitet,topic_Håndbold,topic_Familieliv,topic_Krop og velvære,topic_Sundhed,topic_Sygdom og behandling,topic_Offentlig transport,topic_International politik,topic_Musik og lyd,topic_Reality,topic_Byliv,topic_Rejse,topic_Uddannelse,topic_Ungdomsuddannelse,topic_Grundskole,topic_Videnskab,topic_Samfundsvidenskab og humaniora,topic_Cykling,topic_Videregående uddannelse,topic_Dyr,topic_Kosmetisk behandling,topic_Teknologi,topic_Bedrageri,topic_Fritid,topic_Museum og seværdighed,topic_Naturvidenskab,topic_Renovering og indretning,topic_Udlejning,topic_Bæredygtighed og klima,topic_Tendenser,topic_Væbnet konflikt,topic_Mindre transportmiddel,topic_Vejr,topic_Ketcher- og batsport,topic_Motorsport,topic_null,topic_Kunstig intelligens og software,topic_Kunst,topic_Terror,topic_Større katastrofe,topic_Forbrugerelektronik,topic_Mærkedag
i32,i32,i16,f32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
3001353,0,140,0.9955,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3003065,0,414,0.846,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3012771,0,142,0.8241,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3023463,0,118,0.7053,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3032577,0,565,0.9307,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


This list, while big, can now be used to represent our articles

### Account for multiple instances of the same article and user

By checking rows where the user_id and article_id are the same we see that we have 9855 instances where the user has read the same article multiple times

In [23]:
duplicates = train_behaviors_df.group_by(["article_id", "user_id"]).count().filter(pl.col("count") > 1)

print(duplicates)

shape: (12_373, 3)
┌────────────┬─────────┬───────┐
│ article_id ┆ user_id ┆ count │
│ ---        ┆ ---     ┆ ---   │
│ i32        ┆ u32     ┆ u32   │
╞════════════╪═════════╪═══════╡
│ 9785020    ┆ 62566   ┆ 2     │
│ 9779289    ┆ 1224501 ┆ 2     │
│ 9771253    ┆ 228737  ┆ 9     │
│ 9777190    ┆ 21552   ┆ 2     │
│ 9784702    ┆ 77529   ┆ 2     │
│ …          ┆ …       ┆ …     │
│ 9790572    ┆ 1285306 ┆ 2     │
│ 9784875    ┆ 1289830 ┆ 2     │
│ 9771125    ┆ 1970700 ┆ 2     │
│ 9771237    ┆ 644542  ┆ 3     │
│ 9772882    ┆ 1513457 ┆ 3     │
└────────────┴─────────┴───────┘


  duplicates = train_behaviors_df.group_by(["article_id", "user_id"]).count().filter(pl.col("count") > 1)


We see that we need to combine these duplicate rows. We therefore propose that for multiple instances of the same article and user, we combine the readtime and select the largest scroll percentage. This way we can preserve the data without having duplicates

In [24]:
train_behaviors_df = (
    train_behaviors_df
    .group_by(["article_id", "user_id"])
    .agg(
        pl.col("read_time").product().alias("total_readtime"),  # Multiply all readtime values
        pl.col("scroll_percentage").max().alias("max_scroll")  # Select the largest scroll percentage
    )
)

print(train_behaviors_df)

shape: (79_474, 4)
┌────────────┬─────────┬────────────────┬────────────┐
│ article_id ┆ user_id ┆ total_readtime ┆ max_scroll │
│ ---        ┆ ---     ┆ ---            ┆ ---        │
│ i32        ┆ u32     ┆ f32            ┆ f32        │
╞════════════╪═════════╪════════════════╪════════════╡
│ 9785668    ┆ 419331  ┆ 48.0           ┆ 100.0      │
│ 9774065    ┆ 1744238 ┆ 20.0           ┆ 100.0      │
│ 9784273    ┆ 157385  ┆ 136.0          ┆ 100.0      │
│ 9785019    ┆ 1520070 ┆ 22.0           ┆ 100.0      │
│ 9772601    ┆ 198686  ┆ 104.0          ┆ 100.0      │
│ …          ┆ …       ┆ …              ┆ …          │
│ 9772050    ┆ 1763407 ┆ 17.0           ┆ 100.0      │
│ 9775673    ┆ 1949315 ┆ 92.0           ┆ 100.0      │
│ 9779659    ┆ 814683  ┆ 131.0          ┆ 100.0      │
│ 9772543    ┆ 274053  ┆ 55.0           ┆ 0.0        │
│ 9785205    ┆ 76452   ┆ 3969.0         ┆ 100.0      │
└────────────┴─────────┴────────────────┴────────────┘


## Model Fit

This first model uses readtime and read percentage interactions to compare the user interactions 

In [25]:
recommender = ItemBasedCollaborativeRecommender(interactions=train_behaviors_df, items=articles_df)
recommender.fit()

{3001353: [(3036444, np.float64(0.9999999999999998)),
  (3056457, np.float64(0.9999999999999998)),
  (3056458, np.float64(0.9999999999999998)),
  (3058102, np.float64(0.9999999999999998)),
  (3059391, np.float64(0.9999999999999998)),
  (3060234, np.float64(0.9999999999999998)),
  (3077944, np.float64(0.9999999999999998)),
  (3083788, np.float64(0.9999999999999998)),
  (3088141, np.float64(0.9999999999999998)),
  (3147552, np.float64(0.9999999999999998))],
 3003065: [(9302456, np.float64(1.0)),
  (9370044, np.float64(1.0)),
  (9373083, np.float64(1.0)),
  (9425354, np.float64(1.0)),
  (9534036, np.float64(1.0)),
  (9713719, np.float64(1.0)),
  (9765153, np.float64(1.0)),
  (4127820, np.float64(0.8660254037844387)),
  (6974930, np.float64(0.8660254037844387)),
  (7923741, np.float64(0.8660254037844387))],
 3012771: [(3245287, np.float64(0.9999999999999998)),
  (4079351, np.float64(0.9999999999999998)),
  (4094513, np.float64(0.9999999999999998)),
  (4181100, np.float64(0.9999999999999998

This first model just compares all artilces read by users when comparing users

In [26]:
#binary_recommender = ItemBasedCollaborativeRecommender(interactions=train_behaviors_df, items=articles_df, binary_model=True)
#binary_recommender.fit()

Of the original 15143 users, only 9194 can be accounted for with the current solution. This should be changed in the future

## Model presentation

### Article Reccomendation

In [27]:
for user in [630220, 620796, 1067393, 1726258, 17205]:
    print("reccomended for user ", user, ": ", recommender.recommend_n_articles(user_id=user, n=5, allow_read_articles=True))

reccomended for user  630220 :  [9397136, 9436386, 9439767, 9501278, 9612165]
reccomended for user  620796 :  [3987251, 4038009, 4129979, 4236680, 4267981]
reccomended for user  1067393 :  [3248633, 4209547, 4296140, 4439831, 4613172]
reccomended for user  1726258 :  []
reccomended for user  17205 :  [9710488, 9735094, 9753048, 9771125, 8907011]


In [28]:
#for user in [630220, 620796, 1067393, 1726258, 17205]:
#    print("reccomended for user ", user, ": ", binary_recommender.recommend_n_articles(user_id=user, n=5, allow_read_articles=True))

### Evaluation Scores

#### Without ability to reccomend read articles

The complex model only reccomending articles the user has not yet read

In [29]:
results = recommender.evaluate_recommender(test_behaviours_df, k=100, n_jobs=4, user_sample=200, allow_read_articles=False)
results

{'MAP@K': np.float64(0.00011627906976744187),
 'NDCG@K': np.float64(0.00035450990193563005)}

The binary reccomender model only reccomending articles the user has not yet read

In [30]:
#results = binary_recommender.evaluate_recommender(test_behaviours_df, k=100, n_jobs=4, user_sample=200, allow_read_articles=False)
#results

#### With ability to reccomend previously read articles

The complex model reccomending articles the user, even if they have read them before

In [31]:
results = recommender.evaluate_recommender(test_behaviours_df, k=100, n_jobs=4, user_sample=200, allow_read_articles=True)
results

{'MAP@K': np.float64(0.0037640449438202245),
 'NDCG@K': np.float64(0.01808144123376919)}

The binary reccomender model reccomending articles the user, even if they have read them before

In [32]:
#results = binary_recommender.evaluate_recommender(test_behaviours_df, k=100, n_jobs=4, user_sample=200, allow_read_articles=True)
#results

## Model Experimentation

In [33]:
test_user_id = 630220

predictions = recommender.recommend_n_articles(user_id=test_user_id, n=1000, allow_read_articles=True)
results = set(test_behaviours_df.filter(pl.col("user_id") == test_user_id)["article_id"])

print(results)
print(predictions)

for prediction in predictions:
    if prediction in results:
        print("Yes")

{9786243, 9787524, 9781902, 9784591, 9783824, 9786111, 9776916, 9779615, 9788705, 9789473, 9428643, 9783334, 9782315, 9756075, 9787441, 9782722, 9786821, 9782726, 9786566, 9789896, 9787465, 9788362, 9791049, None, 9782092, 9780815, 9783509, 9772508, 9786718, 9786719, 9787487, 9790942, 9783655, 9786351, 9780849, 9781875, 9788661, 9781878, 9787510, 9786618, 9673979, 9780348, 9781887}
[9397136, 9436386, 9439767, 9501278, 9612165, 9612347, 9670132, 9676512, 9704792, 9708579, 3984408, 4142372, 4200067, 4257445, 4270671, 4434693, 4605951, 4710499, 4745422, 4756973, 6842422, 7960047, 8582542, 8586253, 8692855, 9022250, 9096458, 9097165, 9101181, 9131032, 4079307, 4079353, 4096785, 4097362, 4139668, 4146642, 4160922, 4234038, 4258311, 4299766, 4357728, 4407575, 4442898, 7034691, 7081601, 7711674, 7726465, 8606959, 8616013, 8811560, 4467642, 4653087, 4750604, 4790718, 4352918, 7014210, 8494332, 8499893, 8902811, 8919219, 8922722, 9086225, 9268309, 9276554, 5864927, 5866876, 6395266, 6438755, 65

In [34]:
test_user_id = 630220

predictions = recommender.recommend_n_articles(user_id=test_user_id, n=1000, allow_read_articles=True)
results = set(test_behaviours_df.filter(pl.col("user_id") == test_user_id)["article_id"])

print(results)
print(predictions)

for prediction in predictions:
    if prediction in results:
        print("Yes")

{9786243, 9787524, 9781902, 9784591, 9783824, 9786111, 9776916, 9779615, 9788705, 9789473, 9428643, 9783334, 9782315, 9756075, 9787441, 9782722, 9786821, 9782726, 9786566, 9789896, 9787465, 9788362, 9791049, None, 9782092, 9780815, 9783509, 9772508, 9786718, 9786719, 9787487, 9790942, 9783655, 9786351, 9780849, 9781875, 9788661, 9781878, 9787510, 9786618, 9673979, 9780348, 9781887}
[9397136, 9436386, 9439767, 9501278, 9612165, 9612347, 9670132, 9676512, 9704792, 9708579, 3984408, 4142372, 4200067, 4257445, 4270671, 4434693, 4605951, 4710499, 4745422, 4756973, 6842422, 7960047, 8582542, 8586253, 8692855, 9022250, 9096458, 9097165, 9101181, 9131032, 4079307, 4079353, 4096785, 4097362, 4139668, 4146642, 4160922, 4234038, 4258311, 4299766, 4357728, 4407575, 4442898, 7034691, 7081601, 7711674, 7726465, 8606959, 8616013, 8811560, 4467642, 4653087, 4750604, 4790718, 4352918, 7014210, 8494332, 8499893, 8902811, 8919219, 8922722, 9086225, 9268309, 9276554, 5864927, 5866876, 6395266, 6438755, 65