In [1]:
# Set to True when the project and data live under the directory named by the
# $BLACKHOLE environment variable; False uses the home-directory data layout.
BLACKHOLE = False


import os
import sys
from pathlib import Path


if BLACKHOLE:
    # os.path.expandvars('$BLACKHOLE') returns the literal string '$BLACKHOLE'
    # when the variable is unset, which silently produces bogus paths. Read the
    # environment directly and fail loudly instead.
    workspace_path = os.environ.get('BLACKHOLE')
    if not workspace_path:
        raise EnvironmentError(
            "BLACKHOLE is True but the $BLACKHOLE environment variable is not set"
        )
    # Make the project's source directory importable from this notebook.
    sys.path.append(workspace_path+'/DeepLearning/02456_news_project/src')
    DATAPATH = Path(workspace_path+"/DeepLearning/ebnerd_data").expanduser()
else:
    DATAPATH = Path("~/ebnerd_data").expanduser()

# Name of the dataset directory to use below DATAPATH.
DATASET = "ebnerd_demo"
#DATASET = "ebnerd_small"

Packages:
- torch (PyTorch)
- transformers (Hugging Face)

In [2]:
import torch

print("torch version:", torch.__version__)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Optional manual check that a tensor can actually be placed on the GPU:
#print(torch.zeros(1).cuda())

torch version: 2.5.1+cu124
cuda


TODO: everything from here on down

In [3]:
from utils.data_handler import NewsDataset
import from_ebrec._constants as cs

# Values forwarded to the dataset setup as `seed` and `history_size`.
SEED = 42
HISTORY_SIZE = 20

# Share of the data to load, forwarded as `fraction`.
#FRACTION = 0.01
FRACTION = 0.1
#FRACTION = 1

# Columns requested from the dataset.
COLS = [
    cs.DEFAULT_USER_COL,
    cs.DEFAULT_IMPRESSION_ID_COL,
    cs.DEFAULT_IMPRESSION_TIMESTAMP_COL,
    cs.DEFAULT_HISTORY_ARTICLE_ID_COL,
    cs.DEFAULT_CLICKED_ARTICLES_COL,
    cs.DEFAULT_INVIEW_ARTICLES_COL,
]

# Build the dataset object and populate its dataframes.
dataset = NewsDataset()
dataset.setup_df(
    dataset_path=DATAPATH,
    datasplit=DATASET,
    history_size=HISTORY_SIZE,
    columns=COLS,
    fraction=FRACTION,
    seed=SEED,
)


In [4]:
import transformers as huggingface
from from_ebrec._nlp import get_transformers_word_embeddings
from from_ebrec._polars import concat_str_columns
from from_ebrec._articles import convert_text2encoding_with_transformers
from from_ebrec._articles import create_article_id_to_value_mapping

# Text-encoding configuration.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [cs.DEFAULT_SUBTITLE_COL, cs.DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# Load the article table for the selected dataset directory.
dataset.setup_articles_data(dataset_path=DATAPATH.joinpath(DATASET))
df_articles = dataset.df_articles

# Pretrained Hugging Face model and its matching tokenizer.
transformer_model = huggingface.AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = huggingface.AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Presumably the transformer's word-embedding matrix; confirm in from_ebrec._nlp.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)

# Concatenate the chosen text columns, tokenize the result and build the
# article-id -> token mapping used by the dataloaders.
df_articles, cat_cal = concat_str_columns(
    df_articles,
    columns=TEXT_COLUMNS_TO_USE,
)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles,
    transformer_tokenizer,
    cat_cal,
    max_length=MAX_TITLE_LENGTH,
)
article_mapping = create_article_id_to_value_mapping(
    df=df_articles,
    value_col=token_col_title,
)



In [5]:
from dataloader import NRMSDataLoader

BATCH_SIZE = 64

# Settings shared by the training and validation loaders; only the behaviour
# dataframe differs between the two.
_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=cs.DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=BATCH_SIZE,
)

train_dataloader = NRMSDataLoader(behaviors=dataset.df_train, **_loader_kwargs)
val_dataloader = NRMSDataLoader(behaviors=dataset.df_validation, **_loader_kwargs)


In [6]:
from nrms import NRMSModel
from hyperparameters import hparams_nrms

hparams = hparams_nrms()

# Overrides applied on top of the defaults, in the same order as before.
_hparam_overrides = {
    # PARAMETERS
    "title_size": MAX_TITLE_LENGTH,
    "history_size": HISTORY_SIZE,
    # MODEL ARCHITECTURE
    "head_num": 20,
    "head_dim": 20,
    "attention_hidden_dim": 200,
    # MODEL OPTIMIZER
    "optimizer": "adam",
    "loss": "cross_entropy_loss",
    "dropout": 0.2,
    "learning_rate": 1e-4,
}
for _name, _value in _hparam_overrides.items():
    setattr(hparams, _name, _value)

# Build the NRMS model on top of the pretrained word embeddings.
model = NRMSModel(
    hparams=hparams,
    word2vec_embedding=word2vec_embedding,
    seed=SEED,
    debug=False,
)

print(model)


NRMSModel(
  (news_encoder): NewsEncoder(
    (embedding): Embedding(250002, 768)
    (dropout): Dropout(p=0.2, inplace=False)
    (self_attention): SelfAttention()
    (dense_layers): Sequential(
      (0): Linear(in_features=400, out_features=400, bias=True)
      (1): ReLU()
      (2): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=400, out_features=400, bias=True)
      (5): ReLU()
      (6): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=400, out_features=400, bias=True)
      (9): ReLU()
      (10): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (11): Dropout(p=0.2, inplace=False)
    )
    (att_layer): AttLayer2()
  )
  (user_encoder): UserEncoder(
    (title_encoder): NewsEncoder(
      (embedding): Embedding(250002, 768)
      (dropout): Dropout(p=0.2, inplace=False)
      (self_attention): SelfAttention()
    

In [7]:
import torch.nn as nn
import torch.optim as optim

# Define the loss function and optimizer from the configured hyperparameters.
#
# nn.CrossEntropyLoss applies log-softmax internally and expects unnormalized
# scores (logits).
# NOTE(review): if NRMSModel already returns softmax probabilities, the loss
# would normalize twice and flatten the gradients - confirm against the model.
if hparams.loss == "cross_entropy_loss":
    criterion = nn.CrossEntropyLoss()
elif hparams.loss == "mse_loss":
    criterion = nn.MSELoss()
else:
    raise ValueError(f"Loss function {hparams.loss} not supported")

if hparams.optimizer == "adam":
    # Read the learning rate from the configured `hparams` instance; reading it
    # from the `hparams_nrms` class would ignore any value assigned on the
    # instance and fall back to the class-level default.
    optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
else:
    raise ValueError(f"Optimizer {hparams.optimizer} not supported")

In [8]:
from tqdm import tqdm  # for progress bars
# Train the model

EPOCHS = 10


def _run_epoch(model, dataloader, criterion, device, desc, optimizer=None):
    """Run one pass over ``dataloader`` and return the mean per-batch loss.

    Args:
        model: Network called as ``model(pred_input_title, his_input_title)``.
            The caller is responsible for putting it in train or eval mode.
        dataloader: Iterable of ``((his_input_title, pred_input_title), labels)``
            batches that also supports ``len()``.
        criterion: Loss callable applied to ``(outputs, labels)``.
        device: Device the batch tensors are moved to before the forward pass.
        desc: Label shown on the tqdm progress bar.
        optimizer: When given, a gradient step is taken after every batch.
            When ``None`` the pass runs with gradient tracking disabled.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    training = optimizer is not None
    total_loss = 0.0

    # Gradient tracking is only needed when an optimizer step follows.
    with torch.set_grad_enabled(training):
        for (his_input_title, pred_input_title), labels in tqdm(dataloader, desc=desc):
            # Move data to device
            his_input_title = his_input_title.to(device)
            pred_input_title = pred_input_title.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(pred_input_title, his_input_title)
            loss = criterion(outputs, labels)

            if training:
                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            total_loss += loss.item()

    return total_loss / len(dataloader)


# Move model to GPU if available
model.to(device)

# Mean loss per epoch, one entry appended after every epoch.
train_loss_history, val_loss_history = [], []

for epoch in range(EPOCHS):
    epoch_tag = f"Epoch {epoch + 1}/{EPOCHS}"

    model.train()  # Set the model to training mode
    train_loss = _run_epoch(
        model, train_dataloader, criterion, device, f"Training {epoch_tag}", optimizer
    )

    model.eval()  # Set the model to evaluation mode for validation
    val_loss = _run_epoch(
        model, val_dataloader, criterion, device, f"Validation {epoch_tag}"
    )

    train_loss_history.append(train_loss)
    val_loss_history.append(val_loss)

    print(f"{epoch_tag}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")


Training Epoch 1/10: 100%|██████████| 34/34 [00:12<00:00,  2.63it/s]
Validation Epoch 1/10: 100%|██████████| 6/6 [00:00<00:00, 22.26it/s]


Epoch 1/10: Train Loss = 1.6459, Val Loss = 1.6479


Training Epoch 2/10: 100%|██████████| 34/34 [00:12<00:00,  2.67it/s]
Validation Epoch 2/10: 100%|██████████| 6/6 [00:00<00:00, 22.06it/s]


Epoch 2/10: Train Loss = 1.6229, Val Loss = 1.6532


Training Epoch 3/10: 100%|██████████| 34/34 [00:12<00:00,  2.71it/s]
Validation Epoch 3/10: 100%|██████████| 6/6 [00:00<00:00, 22.30it/s]


Epoch 3/10: Train Loss = 1.5979, Val Loss = 1.6527


Training Epoch 4/10: 100%|██████████| 34/34 [00:15<00:00,  2.20it/s]
Validation Epoch 4/10: 100%|██████████| 6/6 [00:00<00:00,  7.96it/s]


Epoch 4/10: Train Loss = 1.5795, Val Loss = 1.6136


Training Epoch 5/10: 100%|██████████| 34/34 [00:16<00:00,  2.10it/s]
Validation Epoch 5/10: 100%|██████████| 6/6 [00:00<00:00, 22.79it/s]


Epoch 5/10: Train Loss = 1.5574, Val Loss = 1.6244


Training Epoch 6/10: 100%|██████████| 34/34 [00:12<00:00,  2.70it/s]
Validation Epoch 6/10: 100%|██████████| 6/6 [00:00<00:00, 21.97it/s]


Epoch 6/10: Train Loss = 1.5495, Val Loss = 1.6447


Training Epoch 7/10: 100%|██████████| 34/34 [00:12<00:00,  2.70it/s]
Validation Epoch 7/10: 100%|██████████| 6/6 [00:00<00:00, 20.29it/s]


Epoch 7/10: Train Loss = 1.5401, Val Loss = 1.6139


Training Epoch 8/10: 100%|██████████| 34/34 [00:12<00:00,  2.68it/s]
Validation Epoch 8/10: 100%|██████████| 6/6 [00:00<00:00, 21.76it/s]


Epoch 8/10: Train Loss = 1.5129, Val Loss = 1.6324


Training Epoch 9/10: 100%|██████████| 34/34 [00:12<00:00,  2.62it/s]
Validation Epoch 9/10: 100%|██████████| 6/6 [00:00<00:00, 22.73it/s]


Epoch 9/10: Train Loss = 1.5064, Val Loss = 1.6502


Training Epoch 10/10: 100%|██████████| 34/34 [00:13<00:00,  2.60it/s]
Validation Epoch 10/10: 100%|██████████| 6/6 [00:00<00:00, 22.73it/s]

Epoch 10/10: Train Loss = 1.4944, Val Loss = 1.6380





In [11]:
# Evaluate the model
# The prediction loop currently only supports batch size 1.
BATCH_SIZE_TEST = 1

dataset.setup_test_data(
    dataset_path=DATAPATH,
    datasplit=DATASET,
    history_size=HISTORY_SIZE,
    columns=COLS,
    fraction=FRACTION,
    seed=SEED,
)

test_dataloader = NRMSDataLoader(
    behaviors=dataset.df_test,
    article_dict=article_mapping,
    history_column=cs.DEFAULT_HISTORY_ARTICLE_ID_COL,
    unknown_representation="zeros",
    # EBREC sets eval_mode=True here, but that does not work with this loader.
    eval_mode=False,
    batch_size=BATCH_SIZE_TEST,
)

# go from [[a], [b], [c]] to [a, b, c]
def convert_to_list(l):
    """Flatten one nesting level: ``[[a], [b, c]]`` becomes ``[a, b, c]``."""
    flattened = []
    for inner in l:
        for element in inner:
            flattened.append(element)
    return flattened
model.eval()  # evaluation mode for inference

# One flat list of scores per test batch.
pred_test = []
with torch.no_grad():
    # The batch labels are not needed here; metrics use the dataframe labels.
    for (his_input_title, pred_input_title), _ in tqdm(test_dataloader, desc="Test"):
        his_input_title = his_input_title.to(device)
        pred_input_title = pred_input_title.to(device)

        outputs = model(pred_input_title, his_input_title)

        # outputs.tolist() is nested per batch row; flatten it to one list.
        pred_test.append(convert_to_list(outputs.tolist()))

labels_test = dataset.df_test["labels"].to_list()

# The metrics pair predictions and labels by position, so the counts must match.
# A batch size other than 1 would merge several rows into one prediction list.
assert len(pred_test) == len(labels_test), (
    f"{len(pred_test)} prediction lists for {len(labels_test)} label lists"
)

# Show only a small preview; printing every prediction floods the output.
N_PREVIEW = 5
print("Predictions")
print(pred_test[:N_PREVIEW])
print("Labels")
print(labels_test[:N_PREVIEW])

from from_ebrec.evaluation import MetricEvaluator
from from_ebrec.evaluation import AucScore, MrrScore, NdcgScore
metrics = MetricEvaluator(
    labels=labels_test,
    predictions=pred_test,
    metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
)
metrics.evaluate()


Test: 100%|██████████| 2535/2535 [00:21<00:00, 118.19it/s]


Predictions
[[8.428500564150454e-07, 0.0019464489305391908, 0.998052716255188, 4.298214074157508e-13, 0.0], [2.9248479194166066e-08, 4.8601360650093757e-23, 0.00037708610761910677, 1.2276160088475763e-09, 9.879107081001148e-13, 1.0715022646934358e-08, 1.3291647784578059e-14, 4.221552862108081e-34, 6.967043261751904e-12, 9.15016240554678e-09, 0.9996229410171509, 1.0307917186480609e-08], [0.00018660719797480851, 3.4665597922867164e-05, 9.20871645604393e-09, 4.48620238180375e-11, 1.7120624988820055e-06, 0.9997380375862122, 3.8977963413344696e-05], [1.7010411283990834e-06, 6.282137136881527e-12, 0.9993605017662048, 1.2189379283711332e-08, 1.5577424905544035e-09, 1.6473568342778577e-10, 1.2645266906474717e-05, 0.0, 5.9107708949568405e-08, 4.549776713247411e-05, 1.8355814646042745e-09, 3.09828851641214e-06, 1.3905949006332702e-16, 8.202105308960025e-14, 9.051547067429055e-07, 2.571865707068355e-06, 2.1786268415042862e-16, 2.0550719983728735e-16, 4.543470501565531e-17, 6.444853467091685e-16, 

AUC: 100%|████████████████████████████████| 2535/2535 [00:01<00:00, 1838.60it/s]
AUC: 100%|███████████████████████████████| 2535/2535 [00:00<00:00, 71096.55it/s]
AUC: 100%|███████████████████████████████| 2535/2535 [00:00<00:00, 35851.77it/s]
AUC: 100%|███████████████████████████████| 2535/2535 [00:00<00:00, 35705.01it/s]


<MetricEvaluator class>: 
 {
    "auc": 0.5403360713475353,
    "mrr": 0.33113487966983657,
    "ndcg@5": 0.3723142547414879,
    "ndcg@10": 0.45113241827422235
}