In [24]:
# When True, the project sources and the data directory are resolved relative
# to the path stored in the $BLACKHOLE environment variable; otherwise the
# data is read from ~/ebnerd_data.
BLACKHOLE = False


import os
import sys
from pathlib import Path


if not BLACKHOLE:
    # Default layout: the data directory sits directly under the home directory.
    DATAPATH = Path("~/ebnerd_data").expanduser()
else:
    workspace_path = os.path.expandvars('$BLACKHOLE')
    # Make the project's `src` directory importable from this notebook.
    sys.path.append(f"{workspace_path}/DeepLearning/02456_news_project/src")
    DATAPATH = Path(f"{workspace_path}/DeepLearning/ebnerd_data").expanduser()

# Dataset variant to load (alternative: "ebnerd_small").
DATASET = "ebnerd_demo"

Required packages:
- torch (PyTorch)
- transformers (Hugging Face)

In [25]:
import torch

# Report which PyTorch build the kernel is using.
print("torch version:", torch.__version__)

# Pick the compute device: CUDA when it is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Quick manual GPU check, kept disabled:
#   print(torch.zeros(1).cuda())

torch version: 2.5.1+cu124
cuda


TODO: from here on down

In [39]:
from utils.data_handler import NewsDataset
import from_ebrec._constants as cs

# Settings shared by the data preparation and, further down, by the model.
SEED = 42
HISTORY_SIZE = 50

# Columns handed to `NewsDataset.setup_df` via its `columns` argument.
COLS = [
    cs.DEFAULT_USER_COL,
    cs.DEFAULT_IMPRESSION_ID_COL,
    cs.DEFAULT_IMPRESSION_TIMESTAMP_COL,
    cs.DEFAULT_HISTORY_ARTICLE_ID_COL,
    cs.DEFAULT_CLICKED_ARTICLES_COL,
    cs.DEFAULT_INVIEW_ARTICLES_COL,
    cs.DEFAULT_ARTICLE_ID_COL,
    cs.DEFAULT_AGE_COL,
    cs.DEFAULT_READ_TIME_COL,
]

# Value passed as `fraction` to the dataset setup (other values tried: 0.1, 1).
FRACTION = 0.01

dataset = NewsDataset()
dataset.setup_df(
    dataset_path=DATAPATH,
    datasplit=DATASET,
    history_size=HISTORY_SIZE,
    columns=COLS,
    fraction=FRACTION,
    seed=SEED,
)


In [32]:
import transformers as huggingface
from from_ebrec._nlp import get_transformers_word_embeddings
from from_ebrec._polars import concat_str_columns
from from_ebrec._articles import convert_text2encoding_with_transformers
from from_ebrec._articles import create_article_id_to_value_mapping

# Load the article table for the selected dataset variant.
dataset.setup_articles_data(dataset_path=DATAPATH / DATASET)
df_articles = dataset.df_articles

# Text-encoding configuration.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [cs.DEFAULT_SUBTITLE_COL, cs.DEFAULT_TITLE_COL]
# Used as `max_length` for the tokenised text; note the text is the
# concatenation of the columns above, not the title alone.
MAX_TITLE_LENGTH = 30

# Pretrained Hugging Face model and its matching tokenizer.
transformer_model = huggingface.AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = huggingface.AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Embeddings taken from the transformer; passed to NRMSModel as `word2vec_embedding`.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)

# 1) merge the text columns, 2) tokenise the merged column,
# 3) build the article-id -> token-encoding lookup used by the dataloaders.
df_articles, cat_cal = concat_str_columns(
    df_articles,
    columns=TEXT_COLUMNS_TO_USE,
)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles,
    transformer_tokenizer,
    cat_cal,
    max_length=MAX_TITLE_LENGTH,
)
article_mapping = create_article_id_to_value_mapping(
    df=df_articles,
    value_col=token_col_title,
)



In [33]:
from dataloader import NRMSDataLoader

BATCH_SIZE = 64

# Keyword arguments common to the training and validation loaders; the two
# loaders differ only in the behaviors frame they read from.
_shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=cs.DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=BATCH_SIZE,
)

train_dataloader = NRMSDataLoader(behaviors=dataset.df_train, **_shared_loader_kwargs)
val_dataloader = NRMSDataLoader(behaviors=dataset.df_validation, **_shared_loader_kwargs)


In [34]:
from nrms import NRMSModel
from hyperparameters import hparams_nrms

# Start from the project's default NRMS hyperparameters and override the
# values this run needs.
hparams = hparams_nrms()

_HPARAM_OVERRIDES = {
    # Input sizes: must agree with the tokenisation and history settings above.
    "title_size": MAX_TITLE_LENGTH,
    "history_size": HISTORY_SIZE,
    # Model architecture.
    "head_num": 20,
    "head_dim": 20,
    "attention_hidden_dim": 200,
    "linear_hidden_dim": 400,
    # Optimisation.
    "optimizer": "adam",
    "loss": "cross_entropy_loss",
    "dropout": 0.2,
    "learning_rate": 1e-4,
}
for _name, _value in _HPARAM_OVERRIDES.items():
    setattr(hparams, _name, _value)

model = NRMSModel(
    hparams=hparams,
    word2vec_embedding=word2vec_embedding,
    seed=SEED,
    debug=False,
)

print(model)


NRMSModel(
  (news_encoder): NewsEncoder(
    (embedding): Embedding(250002, 768)
    (dropout): Dropout(p=0.2, inplace=False)
    (self_attention): SelfAttention()
    (dense_layers): Sequential(
      (0): Linear(in_features=400, out_features=400, bias=True)
      (1): ReLU()
      (2): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=400, out_features=400, bias=True)
      (5): ReLU()
      (6): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=400, out_features=400, bias=True)
      (9): ReLU()
      (10): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (11): Dropout(p=0.2, inplace=False)
    )
    (att_layer): AttLayer2()
  )
  (user_encoder): UserEncoder(
    (title_encoder): NewsEncoder(
      (embedding): Embedding(250002, 768)
      (dropout): Dropout(p=0.2, inplace=False)
      (self_attention): SelfAttention()
    

In [35]:
import torch.nn as nn
import torch.optim as optim

# Define the loss function and optimizer.
# Both are selected by name from the configured `hparams` instance; an
# unsupported name raises ValueError instead of silently using a default.
if hparams.loss == "cross_entropy_loss":
    criterion = nn.CrossEntropyLoss()
elif hparams.loss == "mse_loss":
    criterion = nn.MSELoss()
else:
    raise ValueError(f"Loss function {hparams.loss} not supported")

if hparams.optimizer == "adam":
    # Read the learning rate from the `hparams` instance that was configured
    # for this run. Reading it from the `hparams_nrms` class would pick up
    # the class-level default and ignore the value assigned on the instance.
    optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
else:
    raise ValueError(f"Optimizer {hparams.optimizer} not supported")

In [36]:
from tqdm import tqdm  # for progress bars
# Train the model 

# Number of full passes over the training dataloader.
EPOCHS = 10

# Move the model to the selected device (GPU if available, otherwise CPU).
model.to(device)

# Per-epoch mean losses, one entry appended per epoch.
train_loss_history, val_loss_history = [], []

for epoch in range(EPOCHS):
    model.train()  # Set the model to training mode
    train_loss = 0.0  # running sum of the batch losses for this epoch

    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{EPOCHS}"):
        # Each batch is an (inputs, labels) pair; inputs is itself a pair of
        # (history titles, candidate titles).
        (inputs, labels) = batch
        his_input_title, pred_input_title = inputs

        # Move data to the same device as the model
        his_input_title = his_input_title.to(device)
        pred_input_title = pred_input_title.to(device)

        labels = labels.to(device)

        # Forward pass. Note the argument order: candidates first, history second.
        outputs = model(pred_input_title, his_input_title)  
        loss = criterion(outputs, labels)

        # Backward pass: clear old gradients, backpropagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass: evaluation mode, no gradient tracking, no weight updates.
    model.eval()
    val_loss = 0.0  # running sum of the batch losses for this epoch
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}/{EPOCHS}"):
            (inputs, labels) = batch
            his_input_title, pred_input_title = inputs

            his_input_title = his_input_title.to(device)
            pred_input_title = pred_input_title.to(device)
            labels = labels.to(device)

            outputs = model(pred_input_title, his_input_title)
            loss = criterion(outputs, labels)

            # To debug, print outputs.tolist() and labels.tolist() here.

            val_loss += loss.item()

    # Turn the summed batch losses into per-batch means.
    # NOTE(review): assumes len(dataloader) is the number of batches iterated -- confirm.
    train_loss /= len(train_dataloader)
    val_loss /= len(val_dataloader)
    train_loss_history.append(train_loss)
    val_loss_history.append(val_loss)

    print(f"Epoch {epoch + 1}/{EPOCHS}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")


Training Epoch 1/10: 100%|██████████| 336/336 [02:06<00:00,  2.65it/s]
Validation Epoch 1/10: 100%|██████████| 54/54 [00:02<00:00, 19.22it/s]


Epoch 1/10: Train Loss = 1.6161, Val Loss = 1.5986


Training Epoch 2/10: 100%|██████████| 336/336 [02:05<00:00,  2.68it/s]
Validation Epoch 2/10: 100%|██████████| 54/54 [00:02<00:00, 19.76it/s]


Epoch 2/10: Train Loss = 1.5665, Val Loss = 1.6001


Training Epoch 3/10: 100%|██████████| 336/336 [02:05<00:00,  2.68it/s]
Validation Epoch 3/10: 100%|██████████| 54/54 [00:02<00:00, 19.58it/s]


Epoch 3/10: Train Loss = 1.5387, Val Loss = 1.6067


Training Epoch 4/10: 100%|██████████| 336/336 [02:14<00:00,  2.50it/s]
Validation Epoch 4/10: 100%|██████████| 54/54 [00:06<00:00,  8.86it/s]


Epoch 4/10: Train Loss = 1.5225, Val Loss = 1.5976


Training Epoch 5/10: 100%|██████████| 336/336 [02:31<00:00,  2.21it/s]
Validation Epoch 5/10: 100%|██████████| 54/54 [00:02<00:00, 19.69it/s]


Epoch 5/10: Train Loss = 1.5119, Val Loss = 1.6055


Training Epoch 6/10: 100%|██████████| 336/336 [02:05<00:00,  2.67it/s]
Validation Epoch 6/10: 100%|██████████| 54/54 [00:02<00:00, 20.15it/s]


Epoch 6/10: Train Loss = 1.5061, Val Loss = 1.6076


Training Epoch 7/10: 100%|██████████| 336/336 [02:05<00:00,  2.67it/s]
Validation Epoch 7/10: 100%|██████████| 54/54 [00:02<00:00, 19.19it/s]


Epoch 7/10: Train Loss = 1.4940, Val Loss = 1.6037


Training Epoch 8/10: 100%|██████████| 336/336 [02:06<00:00,  2.66it/s]
Validation Epoch 8/10: 100%|██████████| 54/54 [00:02<00:00, 19.82it/s]


Epoch 8/10: Train Loss = 1.4913, Val Loss = 1.5907


Training Epoch 9/10: 100%|██████████| 336/336 [02:09<00:00,  2.59it/s]
Validation Epoch 9/10: 100%|██████████| 54/54 [00:02<00:00, 19.36it/s]


Epoch 9/10: Train Loss = 1.4848, Val Loss = 1.6060


Training Epoch 10/10: 100%|██████████| 336/336 [02:10<00:00,  2.58it/s]
Validation Epoch 10/10: 100%|██████████| 54/54 [00:02<00:00, 19.52it/s]


Epoch 10/10: Train Loss = 1.4771, Val Loss = 1.6015


In [40]:
# Evaluate the model
BATCH_SIZE_TEST = 1  # Currently only supports batch size 1

# Prepare the test split with the same settings used for the training data.
dataset.setup_test_data(
    dataset_path=DATAPATH,
    datasplit=DATASET,
    history_size=HISTORY_SIZE,
    columns=COLS,
    fraction=FRACTION,
    seed=SEED,
)

test_dataloader = NRMSDataLoader(
    behaviors=dataset.df_test,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=cs.DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,  # Is true in EBREC, but then it does not work
    batch_size=BATCH_SIZE_TEST,
)

def convert_to_list(l):
    """Flatten one level of nesting, e.g. [[a], [b], [c]] -> [a, b, c]."""
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened
# Inference over the test loader: evaluation mode, no gradient tracking.
model.eval()

pred_test = []       # one flattened list of model outputs per batch
article_titles = []  # one flattened list of candidate-title encodings per batch
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Test"):
        inputs, labels = batch

        # Move both title tensors (history, candidates) and the labels to the device.
        his_input_title, pred_input_title = (tensor.to(device) for tensor in inputs)
        labels = labels.to(device)

        outputs = model(pred_input_title, his_input_title)

        # Drop one level of nesting before storing the outputs.
        bob = convert_to_list(outputs.tolist())

        article_titles.append(convert_to_list(pred_input_title.tolist()))
        pred_test.append(bob)

from from_ebrec.evaluation import MetricEvaluator
from from_ebrec.evaluation import AucScore, MrrScore, NdcgScore
# Score the collected predictions against the `labels` column of the test split.
metrics = MetricEvaluator(
    labels=dataset.df_test["labels"].to_list(),
    predictions=pred_test,
    metric_functions=[
        AucScore(),
        MrrScore(),
        NdcgScore(k=5),
        NdcgScore(k=10),
    ],
)
metrics.evaluate()

Test: 100%|██████████| 253/253 [00:02<00:00, 99.43it/s] 
AUC: 100%|██████████████████████████████████| 253/253 [00:00<00:00, 1873.80it/s]
AUC: 100%|█████████████████████████████████| 253/253 [00:00<00:00, 84346.15it/s]
AUC: 100%|█████████████████████████████████| 253/253 [00:00<00:00, 36143.01it/s]
AUC: 100%|█████████████████████████████████| 253/253 [00:00<00:00, 42087.77it/s]


<MetricEvaluator class>: 
 {
    "auc": 0.5514161423075251,
    "mrr": 0.34352461244264965,
    "ndcg@5": 0.3842280498816002,
    "ndcg@10": 0.46110656588116095
}

In [None]:
# Print the first few test rows as "prediction vs label" pairs for a quick
# visual sanity check of the model output.
number_to_print = 20
labels = dataset.df_test["labels"].to_list()

# Never index past the end: a smaller test split (e.g. a lower FRACTION) can
# contain fewer rows than `number_to_print`.
rows_to_show = min(number_to_print, len(pred_test), len(labels))

print("Top %d predictions vs labels:" % rows_to_show)
for i in range(rows_to_show):
    print(f"Article {i}")
    for j, score in enumerate(pred_test[i]):
        print(f"{score:.3f} vs {labels[i][j]:.3f}")
    print("")





Top 20 predictions vs labels:
Article 0: [[6121.0, 1803.0, 33.0, 171.0, 99.0, 207745.0, 149.0, 82217.0, 53630.0, 80430.0, 145187.0, 71395.0, 42.0, 2918.0, 16471.0, 107.0, 588.0, 99.0, 11504.0, 37453.0, 52454.0, 4.0, 13613.0, 40308.0, 265.0, 1687.0, 2949.0, 77199.0, 71395.0, 42.0], [118405.0, 1409.0, 44.0, 73903.0, 175870.0, 116.0, 58.0, 1823.0, 545.0, 139.0, 99.0, 216861.0, 15198.0, 39406.0, 933.0, 4200.0, 7.0, 60.0, 47208.0, 5342.0, 50825.0, 1953.0, 52658.0, 14.0, 6889.0, 57513.0, 46466.0, 118405.0, 429.0, 588.0], [51032.0, 170.0, 115.0, 4.0, 57687.0, 32.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [126867.0, 4288.0, 14978.0, 72.0, 111059.0, 18.0, 645.0, 99.0, 308.0, 372.0, 112923.0, 104.0, 25.0, 33135.0, 109.0, 168.0, 25531.0, 119103.0, 1442.0, 20.0, 1731.0, 72.0, 20714.0, 100.0, 27194.0, 139.0, 149.0, 7822.0, 18.0, 1.0], [1913.0, 53752.0, 72.0, 22.0, 19389.0, 24507.0, 112.0, 12729.0, 2320.0, 4.0, 182.0, 8