## Random

In [2]:
import pandas as pd
import os
import numpy as np
import recommenders
from sklearn.metrics import ndcg_score
import json
from tqdm import tqdm

# Comprobamos acceso a grafica
import tensorflow as tf
print("GPUs detectadas:", tf.config.list_physical_devices('GPU'))
print("cuDNN versión:", tf.sysconfig.get_build_info().get("cudnn_version", "no detectado"))

GPUs detectadas: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
cuDNN versión: 64_8


In [3]:
# ------------------------------
# 1. RUTAS DEL DATASET
# ------------------------------
train_path = "./mind_small/MINDsmall_train"
dev_path = "./mind_small/MINDsmall_dev"

behaviors_train = os.path.join(train_path, "behaviors.tsv")
news_train = os.path.join(train_path, "news.tsv")

behaviors_dev = os.path.join(dev_path, "behaviors.tsv")
news_dev = os.path.join(dev_path, "news.tsv")

# ------------------------------
# 2. CARGA DE DATOS
# ------------------------------
train_beh = pd.read_csv(behaviors_train, sep="\t", header=None,
                        names=["imp_id", "user", "time", "history", "impressions"])
dev_beh = pd.read_csv(behaviors_dev, sep="\t", header=None,
                      names=["imp_id", "user", "time", "history", "impressions"])
train_news = pd.read_csv(news_train, sep="\t", header=None,
                         names=["news_id", "category", "subcategory", "title", "abstract",
                                "url", "title_entities", "abstract_entities"])
dev_news = pd.read_csv(news_train, sep="\t", header=None,
                         names=["news_id", "category", "subcategory", "title", "abstract",
                                "url", "title_entities", "abstract_entities"])

all_news_ids = train_news["news_id"].tolist()

In [4]:
dev_news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [None]:
# ----------------------------------------------------------
# FUNCIÓN: ordenación aleatoria
# ----------------------------------------------------------

def sort_impressions_random(candidate_ids):
    """
    Devuelve una permutación aleatoria de los índices de candidate_ids.
    Ej.: ['N1','N2','N3'] -> [1,0,2]
    """
    n = len(candidate_ids)
    return np.random.permutation(n).tolist()

In [29]:
# ------------------------------
# 1. NDCG PROMEDIO SOBRE N USUARIOS
# ------------------------------
k = 10
N = 100  # número de usuarios aleatorios para evaluar
ndcgs = []

for _ in range(N):
    example = dev_beh.sample(n=1).iloc[0]
    imps = example["impressions"].split()
    candidate_ids = [imp.split("-")[0] for imp in imps]  # todas las noticias mostradas
    clicked = [imp.split("-")[0] for imp in imps if imp.endswith("-1")]  # clics reales

    if len(clicked) == 0:
        continue  # saltar usuarios sin clics

    # Predicción Random dentro de los candidates
    pred = sort_impressions_random(candidate_ids)[:k]

    # Vector de relevancia real (1 si clicada, 0 si no)
    y_true = np.array([1 if nid in clicked else 0 for nid in candidate_ids])

    # Vector de score para predicción (1 si predicha, 0 si no)
    y_score = np.array([1 if nid in pred else 0 for nid in candidate_ids])

    ndcgs.append(ndcg_score([y_true], [y_score], k=k))

# ------------------------------
# 2. RESULTADOS
# ------------------------------
print(f"NDCG@{k} promedio Random sobre {len(ndcgs)} usuarios:", np.mean(ndcgs))

NDCG@10 promedio Random sobre 100 usuarios: 0.3263093698858336


In [17]:
# ----------------------------------------------------------
# GENERAR ARCHIVO random_pred_small.json
# ----------------------------------------------------------
with open('./mind_small/recommendations/random_pred_small.json', 'w') as f:
    for impr_index, (_, row) in tqdm(enumerate(dev_beh.iterrows())):

        imps = row["impressions"].split()
        candidate_ids = [imp.split("-")[0] for imp in imps]

        # ranking aleatorio (lista de índices)
        pred_rank = sort_impressions_random(candidate_ids)

        # construcción del JSON
        obj = {
            "impr_index": int(impr_index + 1),
            "pred_rank": pred_rank
        }

        # escritura: una línea por objeto
        f.write(json.dumps(obj) + "\n")

73152it [00:02, 27262.97it/s]


## Most popular

In [1]:
import pandas as pd
import os
import numpy as np
import recommenders
from sklearn.metrics import ndcg_score
import json
from tqdm import tqdm

# Comprobamos acceso a grafica
import tensorflow as tf
print("GPUs detectadas:", tf.config.list_physical_devices('GPU'))
print("cuDNN versión:", tf.sysconfig.get_build_info().get("cudnn_version", "no detectado"))

GPUs detectadas: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
cuDNN versión: 64_8


In [2]:
# ------------------------------
# 1. RUTAS DEL DATASET
# ------------------------------
train_path = "./mind_small/MINDsmall_train"
dev_path = "./mind_small/MINDsmall_dev"

behaviors_train = os.path.join(train_path, "behaviors.tsv")
news_train = os.path.join(train_path, "news.tsv")

behaviors_dev = os.path.join(dev_path, "behaviors.tsv")
news_dev = os.path.join(dev_path, "news.tsv")

# ------------------------------
# 2. CARGA DE DATOS
# ------------------------------
train_beh = pd.read_csv(behaviors_train, sep="\t", header=None,
                        names=["imp_id", "user", "time", "history", "impressions"])
dev_beh = pd.read_csv(behaviors_dev, sep="\t", header=None,
                      names=["imp_id", "user", "time", "history", "impressions"])
train_news = pd.read_csv(news_train, sep="\t", header=None,
                         names=["news_id", "category", "subcategory", "title", "abstract",
                                "url", "title_entities", "abstract_entities"])
dev_news = pd.read_csv(news_train, sep="\t", header=None,
                         names=["news_id", "category", "subcategory", "title", "abstract",
                                "url", "title_entities", "abstract_entities"])

all_news_ids = train_news["news_id"].tolist()

In [3]:
# ------------------------------
# 1. CALCULAR POPULARIDAD DE LAS NOTICIAS
# ------------------------------

def ranking_most_popular(behaviors):
    # asegurarse de no tener NaNs que rompan el split
    imprs = behaviors["impressions"].fillna("")
    
    all_clicked_ids = []   # lista con repeticiones: un elemento por cada "-1"
    all_shown_ids = set()  # conjunto con todos los ids que se han mostrado alguna vez
    
    for cell in imprs:
        if not cell:
            continue
        for token in cell.split():
            if not token:
                continue
            try:
                nid, flag = token.rsplit("-", 1)
            except ValueError:
                continue
            all_shown_ids.add(nid)
            if flag == "1":
                all_clicked_ids.append(nid)
    
    # contar clics
    popularity = pd.Series(all_clicked_ids).value_counts()  # Serie: ID → clicks
    
    clicked_ranking = popularity.index.tolist()      # IDs con clics, ordenados
    clicked_counts  = popularity.values.tolist()     # nº de clics en el mismo orden
    
    # IDs que nunca tuvieron clic
    never_clicked = list(all_shown_ids - set(clicked_ranking))
    never_clicked_counts = [0] * len(never_clicked)
    
    # combinar
    most_popular_ranking = clicked_ranking + never_clicked
    click_counts = clicked_counts + never_clicked_counts
    rank_pos = {nid: i for i, nid in enumerate(most_popular_ranking)}
    
    return most_popular_ranking, click_counts, rank_pos

# ----------------------------------------------------------
# 2. FUNCIÓN: ordenar por popularidad
# ----------------------------------------------------------

def sort_impressions_popularity(candidate_ids, rank_pos):
    """
    Devuelve una lista de índices que ordena candidate_ids
    según la popularidad global (clics). Las no clicadas van al final.
    
    candidate_ids: lista de IDs mostrados en una impresión.
    return: lista de índices ordenados.
    """
    
    # sort basado en la posición del ranking global
    sorted_indices = sorted(
        range(len(candidate_ids)),
        key=lambda i: rank_pos.get(candidate_ids[i], 10**6)  # num enorme → fondo
    )
    # convertimos a índices empezando en 1
    sorted_one_based = [i+1 for i in sorted_indices]
    
    return sorted_one_based

In [4]:
# rank_pos es diccionario → posición en el ranking
ranking, click_counts, rank_pos = ranking_most_popular(train_beh)

In [7]:
# ------------------------------
# 1. NDCG PROMEDIO SOBRE N USUARIOS
# ------------------------------
k = 10
N = 100
ndcgs = []

for _ in range(N):
    example = dev_beh.sample(n=1).iloc[0]
    imps = example["impressions"].split()
    candidate_ids = [imp.split("-")[0] for imp in imps]
    clicked = [imp.split("-")[0] for imp in imps if imp.endswith("-1")]

    if len(clicked) == 0:
        continue

    # Predicción Most Popular dentro de los candidates
    pred_pop = sort_impressions_popularity(candidate_ids, rank_pos)[:k]

    # Vector de relevancia real
    y_true = np.array([1 if nid in clicked else 0 for nid in candidate_ids])

    # Vector de score para predicción
    y_score_pop = np.array([1 if nid in pred_pop else 0 for nid in candidate_ids])

    ndcgs.append(ndcg_score([y_true], [y_score_pop], k=k))

# ------------------------------
# 2. RESULTADOS
# ------------------------------
print(f"NDCG@{k} promedio Most Popular sobre {len(ndcgs)} usuarios:", np.mean(ndcgs))

NDCG@10 promedio Most Popular sobre 100 usuarios: 0.2820263548761111


In [9]:
# ----------------------------------------------------------
# GENERAR ARCHIVO most_popular_pred_small.json
# ----------------------------------------------------------

output_file = "./mind_small/recommendations/pop_pred_small.json"

with open(output_file, "w") as f:
    for impr_index, (_, row) in tqdm(enumerate(dev_beh.iterrows()), total=len(dev_beh)):
        
        imps = row["impressions"].split()
        candidate_ids = [imp.split("-")[0] for imp in imps]

        # ranking por popularidad
        pred_rank = sort_impressions_popularity(candidate_ids, rank_pos)

        # construcción del JSON
        obj = {
            "impr_index": int(impr_index + 1),
            "pred_rank": pred_rank
        }

        f.write(json.dumps(obj) + "\n")

100%|█████████████████████████████████████████████████████████████████████████| 73152/73152 [00:03<00:00, 24281.63it/s]


## LSTUR: Neural News Recommendation with Long- and Short-term User Representations

- LSTUR captures both the user’s long-term preferences and short-term interests.
- It uses user ID embeddings to learn long-term representations.
- It uses the news recently read by the user, processed through a GRU network, to learn short-term representations.

In [71]:
import os
import sys
import numpy as np
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.lstur import LSTURModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.9.25 (main, Nov  3 2025, 22:44:01) [MSC v.1929 64 bit (AMD64)]
Tensorflow version: 2.10.0


In [72]:
epochs = 4
seed = 42
batch_size = 32

In [73]:
# Carpeta local donde tienes los datasets
data_path = './mind_small'

# Archivos de entrenamiento
train_news_file = os.path.join(data_path, 'MINDsmall_train', 'news.tsv')
train_behaviors_file = os.path.join(data_path, 'MINDsmall_train', 'behaviors.tsv')

# Archivos de validación
valid_news_file = os.path.join(data_path, 'MINDsmall_dev', 'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'MINDsmall_dev', 'behaviors.tsv')

# Archivos de utilidades (embedding, diccionarios, yaml)
utils_path = os.path.join(data_path, 'utils')
wordDict_file = os.path.join(utils_path, "word_dict.pkl")
wordEmb_file = os.path.join(utils_path, "embedding.npy")
userDict_file = os.path.join(utils_path, "uid2index.pkl")
yaml_file = os.path.join(utils_path, 'lstur.yaml')

# Verificamos que existan los archivos
for f in [train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,
          wordEmb_file, userDict_file, wordDict_file, yaml_file]:
    if not os.path.exists(f):
        raise FileNotFoundError(f"Archivo no encontrado: {f}")

In [13]:
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 4, 'batch_size': 32, 'show_step': 100000, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'lstur', 'loss': 'cross_entropy_loss', 'wordEmb_file': './mind_small\\utils\\embedding.npy', 'wordDict_file': './mind_small\\utils\\word_dict.pkl', 'userDict_file': './mind_small\\utils\\uid2index.pkl'}


In [14]:
iterator = MINDIterator

In [15]:
model = LSTURModel(hparams, iterator, seed=seed)

Tensor("conv1d/Relu:0", shape=(None, 30, 400), dtype=float32)
Tensor("att_layer2/Sum_1:0", shape=(None, 400), dtype=float32)


  super().__init__(name, **kwargs)


In [16]:
%%time
# Valores previos a entrenarse
print(model.run_eval(valid_news_file, valid_behaviors_file))

  updates=self.state_updates,
1326it [00:06, 214.17it/s] 
2286it [01:18, 29.22it/s]
73152it [00:04, 14745.24it/s]


{'group_auc': 0.4802, 'mean_mrr': 0.206, 'ndcg@5': 0.2113, 'ndcg@10': 0.2711}
CPU times: total: 1min 46s
Wall time: 2min 14s


In [17]:
%%time
# Entreno
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

7386it [24:44,  4.97it/s]
1326it [00:01, 1126.87it/s]
2286it [01:13, 30.99it/s]
73152it [00:04, 15421.84it/s]


at epoch 1
train info: logloss loss:1.3824320930292573
eval info: group_auc:0.6498, mean_mrr:0.3049, ndcg@10:0.398, ndcg@5:0.334
at epoch 1 , train time: 1484.8 eval time: 123.9


7386it [24:31,  5.02it/s]
1326it [00:01, 1150.74it/s]
2286it [01:13, 31.09it/s]
73152it [00:04, 15230.79it/s]


at epoch 2
train info: logloss loss:1.29995469326512
eval info: group_auc:0.661, mean_mrr:0.3138, ndcg@10:0.4074, ndcg@5:0.344
at epoch 2 , train time: 1471.1 eval time: 124.0


7386it [24:30,  5.02it/s]
1326it [00:01, 1136.28it/s]
2286it [01:13, 30.98it/s]
73152it [00:05, 14462.67it/s]


at epoch 3
train info: logloss loss:1.2024294150463133
eval info: group_auc:0.6569, mean_mrr:0.3115, ndcg@10:0.4043, ndcg@5:0.3418
at epoch 3 , train time: 1470.7 eval time: 124.1


7386it [24:31,  5.02it/s]
1326it [00:01, 1172.85it/s]
2286it [01:13, 31.31it/s]
73152it [00:04, 15168.09it/s]


at epoch 4
train info: logloss loss:1.1086378012719014
eval info: group_auc:0.646, mean_mrr:0.303, ndcg@10:0.3943, ndcg@5:0.3299
at epoch 4 , train time: 1471.6 eval time: 123.4
CPU times: total: 2h 15min 9s
Wall time: 1h 46min 33s


<recommenders.models.newsrec.models.lstur.LSTURModel at 0x1fd8f46e6d0>

In [18]:
%%time
# Evaluamos post entreno
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

1326it [00:01, 812.50it/s] 
2286it [01:13, 31.15it/s]
73152it [00:04, 14907.21it/s]


{'group_auc': 0.646, 'mean_mrr': 0.303, 'ndcg@5': 0.3299, 'ndcg@10': 0.3943}
CPU times: total: 1min 37s
Wall time: 2min 3s


In [19]:
# Guardamos el modelo
model_path = os.path.join(data_path, "models/lstur")
os.makedirs(model_path, exist_ok=True)

model.model.save_weights(os.path.join(model_path, "lstur"))

In [74]:
# Leemos el modelo guardado
epochs = 4
seed = 42
batch_size = 32

yaml_file = os.path.join(utils_path, 'lstur.yaml')

# 1. Cargar hparams desde YAML
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs)

iterator = MINDIterator

# 2. Crear el modelo vacío
model = LSTURModel(hparams, iterator, seed=seed)

# 3. Cargar los pesos
model.model.load_weights("mind_small/models/lstur/lstur")

print("Pesos cargados correctamente.")

Tensor("conv1d/Relu:0", shape=(None, 30, 400), dtype=float32)
Tensor("att_layer2/Sum_1:0", shape=(None, 400), dtype=float32)


  super().__init__(name, **kwargs)


Pesos cargados correctamente.


In [53]:
%%time
# Evaluamos post importacion
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

1326it [00:02, 475.13it/s]
2286it [02:54, 13.10it/s]
73152it [00:08, 9021.01it/s] 


{'group_auc': 0.646, 'mean_mrr': 0.303, 'ndcg@5': 0.3299, 'ndcg@10': 0.3943}
CPU times: total: 8min 18s
Wall time: 3min 48s


In [76]:
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

  updates=self.state_updates,
1326it [00:06, 196.98it/s]
2286it [02:53, 13.19it/s]
73152it [00:05, 14269.97it/s]


In [79]:
output_file = os.path.join(data_path, 'recommendations/lstur_pred_small.json')

with open(output_file, 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        impr_index += 1

        # Calcular el ranking
        pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()

        # Crear estructura JSON
        obj = {
            "impr_index": int(impr_index),
            "pred_rank": pred_rank
        }

        # Escribir como JSON en una línea
        f.write(json.dumps(obj) + "\n")

73152it [00:00, 88152.72it/s]


## NRMS: Neural News Recommendation with Multi-Head Self-Attention

- NRMS is a **content-based** news recommendation approach.  
- It uses **multi-head self-attention** to learn news representations by modeling interactions between words, and to learn user representations by capturing relationships among the news articles they have read.  
- It employs **additive attention** to select the most important words and news articles, generating more informative representations.

In [80]:
import os
import sys
import numpy as np
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.9.25 (main, Nov  3 2025, 22:44:01) [MSC v.1929 64 bit (AMD64)]
Tensorflow version: 2.10.0


In [81]:
epochs = 4
seed = 42
batch_size = 32

In [82]:
# Carpeta local donde tienes los datasets
data_path = './mind_small'

# Archivos de entrenamiento
train_news_file = os.path.join(data_path, 'MINDsmall_train', 'news.tsv')
train_behaviors_file = os.path.join(data_path, 'MINDsmall_train', 'behaviors.tsv')

# Archivos de validación
valid_news_file = os.path.join(data_path, 'MINDsmall_dev', 'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'MINDsmall_dev', 'behaviors.tsv')

# Archivos de utilidades (embedding, diccionarios, yaml)
utils_path = os.path.join(data_path, 'utils')
wordDict_file = os.path.join(utils_path, "word_dict.pkl")
wordEmb_file = os.path.join(utils_path, "embedding.npy")
userDict_file = os.path.join(utils_path, "uid2index.pkl")
yaml_file = os.path.join(utils_path, 'nrms.yaml')

# Verificamos que existan los archivos
for f in [train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,
          wordEmb_file, userDict_file, wordDict_file, yaml_file]:
    if not os.path.exists(f):
        raise FileNotFoundError(f"Archivo no encontrado: {f}")

In [23]:
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs,
                          show_step=10)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 4, 'batch_size': 32, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': './mind_small\\utils\\embedding.npy', 'wordDict_file': './mind_small\\utils\\word_dict.pkl', 'userDict_file': './mind_small\\utils\\uid2index.pkl'}


In [24]:
iterator = MINDIterator

In [25]:
model = NRMSModel(hparams, iterator, seed=seed)

  super().__init__(name, **kwargs)


In [26]:
%%time
# Valores previos al entreno
print(model.run_eval(valid_news_file, valid_behaviors_file))

  updates=self.state_updates,
1326it [00:02, 538.19it/s] 
2286it [00:52, 43.82it/s]
73152it [00:04, 15207.94it/s]


{'group_auc': 0.4976, 'mean_mrr': 0.2183, 'ndcg@5': 0.2219, 'ndcg@10': 0.2855}
CPU times: total: 2min 28s
Wall time: 1min 43s


In [27]:
%%time
# Entreno
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

step 7380 , total_loss: 1.3963, data_loss: 1.3808: : 7386it [20:16,  6.07it/s]
1326it [00:01, 1116.15it/s]
2286it [00:50, 45.38it/s]
73152it [00:04, 15125.53it/s]


at epoch 1
train info: logloss loss:1.3963061434174178
eval info: group_auc:0.6366, mean_mrr:0.2949, ndcg@10:0.3874, ndcg@5:0.3216
at epoch 1 , train time: 1216.6 eval time: 100.8


step 7380 , total_loss: 1.3061, data_loss: 1.3983: : 7386it [20:10,  6.10it/s]
1326it [00:01, 1113.70it/s]
2286it [00:50, 45.42it/s]
73152it [00:04, 14978.70it/s]


at epoch 2
train info: logloss loss:1.3061103470242188
eval info: group_auc:0.6464, mean_mrr:0.3028, ndcg@10:0.3967, ndcg@5:0.3313
at epoch 2 , train time: 1210.7 eval time: 100.9


step 7380 , total_loss: 1.2720, data_loss: 1.2002: : 7386it [20:10,  6.10it/s]
1326it [00:01, 1110.42it/s]
2286it [00:50, 45.41it/s]
73152it [00:04, 15375.22it/s]


at epoch 3
train info: logloss loss:1.2719256742324023
eval info: group_auc:0.6526, mean_mrr:0.3078, ndcg@10:0.4025, ndcg@5:0.336
at epoch 3 , train time: 1210.8 eval time: 100.5


step 7380 , total_loss: 1.2488, data_loss: 1.3256: : 7386it [20:11,  6.10it/s]
1326it [00:01, 1085.80it/s]
2286it [00:50, 45.39it/s]
73152it [00:05, 14552.40it/s]


at epoch 4
train info: logloss loss:1.2487543693022647
eval info: group_auc:0.6482, mean_mrr:0.3072, ndcg@10:0.4006, ndcg@5:0.3355
at epoch 4 , train time: 1211.4 eval time: 100.9
CPU times: total: 1h 33min 30s
Wall time: 1h 27min 32s


<recommenders.models.newsrec.models.nrms.NRMSModel at 0x1ff87cd9fd0>

In [28]:
%%time
# Evaluamos post entreno
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

1326it [00:01, 866.26it/s] 
2286it [00:50, 45.43it/s]
73152it [00:04, 15039.04it/s]


{'group_auc': 0.6482, 'mean_mrr': 0.3072, 'ndcg@5': 0.3355, 'ndcg@10': 0.4006}
CPU times: total: 2min 26s
Wall time: 1min 40s


In [29]:
# Guardamos el modelo
model_path = os.path.join(data_path, "models/nrms")
os.makedirs(model_path, exist_ok=True)

model.model.save_weights(os.path.join(model_path, "nrms"))

In [83]:
# Leemos el modelo guardado
epochs = 4
seed = 42
batch_size = 32

yaml_file = os.path.join(utils_path, 'nrms.yaml')

# 1. Cargar hparams desde YAML
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs,
                          show_step=10)

iterator = MINDIterator

# 2. Crear el modelo vacío
model = NRMSModel(hparams, iterator, seed=seed)

# 3. Cargar los pesos
model.model.load_weights("mind_small/models/nrms/nrms")

print("Pesos cargados correctamente.")

Pesos cargados correctamente.


In [84]:
%%time
# Evaluamos post importacion
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

1326it [00:02, 530.81it/s]
2286it [00:55, 41.50it/s]
73152it [00:05, 14306.03it/s]


{'group_auc': 0.6482, 'mean_mrr': 0.3072, 'ndcg@5': 0.3355, 'ndcg@10': 0.4006}
CPU times: total: 2min 37s
Wall time: 1min 47s


In [85]:
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

1326it [00:01, 736.23it/s]
2286it [00:52, 43.14it/s]
73152it [00:05, 14105.24it/s]


In [86]:
output_file = os.path.join(data_path, 'recommendations/nrms_pred_small.json')

with open(output_file, 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        impr_index += 1

        # Calcular el ranking
        pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()

        # Crear estructura JSON
        obj = {
            "impr_index": int(impr_index),
            "pred_rank": pred_rank
        }

        # Escribir como JSON en una línea
        f.write(json.dumps(obj) + "\n")

73152it [00:00, 80838.10it/s]


## NAML: Neural News Recommendation with Attentive Multi-View Learning

- **NAML is a neural, multi-view news recommendation approach.**
- **It uses the news title, body, category, and subcategory** to obtain the news representation.  
  It also uses the user’s behavior history to learn the user representation.
- **NAML employs additive attention** to learn informative representations by selecting the most relevant words and news articles.
- **Due to legal issues**, the MIND dataset does not release the full news bodies.  
  Therefore, **abstracts** are used instead.

In [8]:
import os
import sys
import numpy as np
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.naml import NAMLModel
from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.9.25 (main, Nov  3 2025, 22:44:01) [MSC v.1929 64 bit (AMD64)]
Tensorflow version: 2.10.0


In [9]:
epochs = 4
seed = 42
batch_size = 32

In [10]:
# Carpeta local donde tienes los datasets
data_path = './mind_small'

# Archivos de entrenamiento
train_news_file = os.path.join(data_path, 'MINDsmall_train', 'news.tsv')
train_behaviors_file = os.path.join(data_path, 'MINDsmall_train', 'behaviors.tsv')

# Archivos de validación
valid_news_file = os.path.join(data_path, 'MINDsmall_dev', 'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'MINDsmall_dev', 'behaviors.tsv')

# Archivos de utilidades (embedding, diccionarios, yaml)
utils_path = os.path.join(data_path, 'utils')
wordDict_file = os.path.join(utils_path, "word_dict_all.pkl")
wordEmb_file = os.path.join(utils_path, "embedding_all.npy")
userDict_file = os.path.join(utils_path, "uid2index.pkl")
vertDict_file = os.path.join(utils_path, "vert_dict.pkl")
subvertDict_file = os.path.join(utils_path, "subvert_dict.pkl")
yaml_file = os.path.join(utils_path, 'naml.yaml')

# Verificamos que existan los archivos
for f in [train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,
          wordEmb_file, userDict_file, wordDict_file, yaml_file]:
    if not os.path.exists(f):
        raise FileNotFoundError(f"Archivo no encontrado: {f}")

In [11]:
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          vertDict_file=vertDict_file, 
                          subvertDict_file=subvertDict_file,
                          batch_size=batch_size,
                          epochs=epochs)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 4, 'batch_size': 32, 'show_step': 100000, 'title_size': 30, 'body_size': 50, 'his_size': 50, 'vert_num': 17, 'subvert_num': 264, 'data_format': 'naml', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'naml', 'dense_activation': 'relu', 'loss': 'cross_entropy_loss', 'wordEmb_file': './mind_small\\utils\\embedding_all.npy', 'wordDict_file': './mind_small\\utils\\word_dict_all.pkl', 'userDict_file': './mind_small\\utils\\uid2index.pkl', 'vertDict_file': './mind_small\\utils\\vert_dict.pkl', 'subvertDict_file': './mind_small\\utils\\subvert_dict.pkl'}


In [12]:
iterator = MINDAllIterator

In [13]:
model = NAMLModel(hparams, iterator, seed=seed)

In [36]:
%%time
# Valores previos al entreno
print(model.run_eval(valid_news_file, valid_behaviors_file))

  updates=self.state_updates,
42386it [01:24, 498.72it/s]
73121it [02:51, 427.27it/s]
73152it [00:04, 14785.24it/s]


{'group_auc': 0.4888, 'mean_mrr': 0.2074, 'ndcg@5': 0.2098, 'ndcg@10': 0.2753}
CPU times: total: 8min 43s
Wall time: 5min 5s


In [37]:
%%time
# Entreno
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

7385it [39:41,  3.10it/s]
42386it [01:23, 508.07it/s]
73121it [02:50, 428.95it/s]
73152it [00:05, 14358.35it/s]


at epoch 1
train info: logloss loss:1.3725701858889192
eval info: group_auc:0.6467, mean_mrr:0.3023, ndcg@10:0.3953, ndcg@5:0.3341
at epoch 1 , train time: 2381.6 eval time: 303.3


7385it [39:32,  3.11it/s]
42386it [01:23, 506.17it/s]
73121it [02:49, 432.42it/s]
73152it [00:05, 14135.76it/s]


at epoch 2
train info: logloss loss:1.3085053350446025
eval info: group_auc:0.6546, mean_mrr:0.3062, ndcg@10:0.4019, ndcg@5:0.3413
at epoch 2 , train time: 2372.4 eval time: 302.2


7385it [39:33,  3.11it/s]
42386it [01:23, 510.22it/s]
73121it [02:48, 433.05it/s]
73152it [00:05, 14312.25it/s]


at epoch 3
train info: logloss loss:1.2834909335212603
eval info: group_auc:0.6599, mean_mrr:0.3129, ndcg@10:0.4081, ndcg@5:0.3476
at epoch 3 , train time: 2373.9 eval time: 301.3


7385it [39:32,  3.11it/s]
42386it [01:23, 507.42it/s]
73121it [02:49, 430.55it/s]
73152it [00:05, 14280.96it/s]


at epoch 4
train info: logloss loss:1.265041159187206
eval info: group_auc:0.661, mean_mrr:0.313, ndcg@10:0.408, ndcg@5:0.3455
at epoch 4 , train time: 2372.9 eval time: 302.8
CPU times: total: 3h 19min 34s
Wall time: 2h 58min 30s


<recommenders.models.newsrec.models.naml.NAMLModel at 0x200a80ce670>

In [38]:
%%time
# Evaluamos post entreno
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

42386it [01:23, 506.33it/s]
73121it [02:49, 430.23it/s]
73152it [00:05, 14575.62it/s]


{'group_auc': 0.661, 'mean_mrr': 0.313, 'ndcg@5': 0.3455, 'ndcg@10': 0.408}
CPU times: total: 8min 38s
Wall time: 5min 3s


In [39]:
# Guardamos el modelo
model_path = os.path.join(data_path, "models/naml")
os.makedirs(model_path, exist_ok=True)

model.model.save_weights(os.path.join(model_path, "naml"))

In [90]:
# Leemos el modelo guardado
epochs = 4
seed = 42
batch_size = 32

yaml_file = os.path.join(utils_path, 'naml.yaml')

# 1. Cargar hparams desde YAML
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          vertDict_file=vertDict_file, 
                          subvertDict_file=subvertDict_file,
                          batch_size=batch_size,
                          epochs=epochs)

iterator = MINDAllIterator

# 2. Crear el modelo vacío
model = NAMLModel(hparams, iterator, seed=seed)

# 3. Cargar los pesos
model.model.load_weights("mind_small/models/naml/naml")

print("Pesos cargados correctamente.")

  super().__init__(name, **kwargs)


Pesos cargados correctamente.


In [57]:
%%time
# Evaluamos post importacion
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

  updates=self.state_updates,
42386it [02:00, 352.74it/s]
73121it [03:57, 307.37it/s]
73152it [00:05, 13330.16it/s]


{'group_auc': 0.661, 'mean_mrr': 0.313, 'ndcg@5': 0.3455, 'ndcg@10': 0.408}
CPU times: total: 12min 50s
Wall time: 6min 47s


In [91]:
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

  updates=self.state_updates,
42386it [01:59, 355.10it/s]
73121it [03:44, 325.82it/s]
73152it [00:04, 14638.09it/s]


In [92]:
output_file = os.path.join(data_path, 'recommendations/naml_pred_small.json')

with open(output_file, 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        impr_index += 1

        # Calcular el ranking
        pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()

        # Crear estructura JSON
        obj = {
            "impr_index": int(impr_index),
            "pred_rank": pred_rank
        }

        # Escribir como JSON en una línea
        f.write(json.dumps(obj) + "\n")

73152it [00:00, 89884.71it/s]


## NPA: Neural News Recommendation with Personalized Attention

- **NPA is a content-based news recommendation method.**
- **It uses a CNN** to learn news representations and learns user representations from the news articles they have clicked.
- **Personalized attention is applied at the word level** so that the model highlights important words according to the user.
- **Personalized attention is applied at the news level** so that the model highlights the most relevant historical news articles according to the user.

In [1]:
import os
import sys
import numpy as np
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.npa import NPAModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.9.25 (main, Nov  3 2025, 22:44:01) [MSC v.1929 64 bit (AMD64)]
Tensorflow version: 2.10.0


In [2]:
epochs = 4
seed = 42
batch_size = 32

In [3]:
# Carpeta local donde tienes los datasets
data_path = './mind_small'

# Archivos de entrenamiento
train_news_file = os.path.join(data_path, 'MINDsmall_train', 'news.tsv')
train_behaviors_file = os.path.join(data_path, 'MINDsmall_train', 'behaviors.tsv')

# Archivos de validación
valid_news_file = os.path.join(data_path, 'MINDsmall_dev', 'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'MINDsmall_dev', 'behaviors.tsv')

# Archivos de utilidades (embedding, diccionarios, yaml)
utils_path = os.path.join(data_path, 'utils')
wordDict_file = os.path.join(utils_path, "word_dict.pkl")
wordEmb_file = os.path.join(utils_path, "embedding.npy")
userDict_file = os.path.join(utils_path, "uid2index.pkl")
yaml_file = os.path.join(utils_path, 'npa.yaml')

# Verificamos que existan los archivos
for f in [train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,
          wordEmb_file, userDict_file, wordDict_file, yaml_file]:
    if not os.path.exists(f):
        raise FileNotFoundError(f"Archivo no encontrado: {f}")

In [4]:
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs)
print(hparams)

HParams object with values {'support_quick_scoring': False, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 100, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 4, 'batch_size': 32, 'show_step': 100000, 'title_size': 10, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'npa', 'loss': 'cross_entropy_loss', 'wordEmb_file': './mind_small\\utils\\embedding.npy', 'wordDict_file': './mind_small\\utils\\word_dict.pkl', 'userDict_file': './mind_small\\utils\\uid2index.pkl'}


In [5]:
iterator = MINDIterator

In [6]:
model = NPAModel(hparams, iterator, seed=seed)

  super().__init__(name, **kwargs)


In [46]:
%%time
# Evaluamos pre entreno
print(model.run_eval(valid_news_file, valid_behaviors_file))

  updates=self.state_updates,
85657it [06:09, 231.81it/s]


{'group_auc': 0.5048, 'mean_mrr': 0.2248, 'ndcg@5': 0.233, 'ndcg@10': 0.2956}
CPU times: total: 9min 36s
Wall time: 6min 57s


In [47]:
%%time
# Entreno
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

7386it [09:36, 12.81it/s]
85657it [06:05, 234.65it/s]


at epoch 1
train info: logloss loss:1.4270442090414095
eval info: group_auc:0.6, mean_mrr:0.2679, ndcg@10:0.3543, ndcg@5:0.2925
at epoch 1 , train time: 576.4 eval time: 413.1


7386it [09:29, 12.97it/s]
85657it [06:05, 234.46it/s]


at epoch 2
train info: logloss loss:1.3440484522062364
eval info: group_auc:0.6038, mean_mrr:0.2671, ndcg@10:0.3561, ndcg@5:0.2923
at epoch 2 , train time: 569.7 eval time: 413.4


7386it [09:29, 12.96it/s]
85657it [06:06, 233.70it/s]


at epoch 3
train info: logloss loss:1.1953910842130349
eval info: group_auc:0.5984, mean_mrr:0.265, ndcg@10:0.3517, ndcg@5:0.2863
at epoch 3 , train time: 569.7 eval time: 414.4


7386it [09:30, 12.96it/s]
85657it [06:07, 233.25it/s]


at epoch 4
train info: logloss loss:1.0214095594451706
eval info: group_auc:0.5617, mean_mrr:0.2511, ndcg@10:0.3306, ndcg@5:0.2636
at epoch 4 , train time: 570.1 eval time: 415.2
CPU times: total: 1h 20min 29s
Wall time: 1h 5min 42s


<recommenders.models.newsrec.models.npa.NPAModel at 0x200f76a0ee0>

In [48]:
%%time
# Evaluamos post entreno
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

85657it [06:07, 233.15it/s]


{'group_auc': 0.5617, 'mean_mrr': 0.2511, 'ndcg@5': 0.2636, 'ndcg@10': 0.3306}
CPU times: total: 9min 31s
Wall time: 6min 56s


In [49]:
# Guardamos el modelo
model_path = os.path.join(data_path, "models/npa")
os.makedirs(model_path, exist_ok=True)

model.model.save_weights(os.path.join(model_path, "npa"))

In [96]:
# Leemos el modelo guardado
epochs = 4
seed = 42
batch_size = 32

yaml_file = os.path.join(utils_path, 'npa.yaml')

# 1. Cargar hparams desde YAML
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs)

iterator = MINDIterator

# 2. Crear el modelo vacío
model = NPAModel(hparams, iterator, seed=seed)

# 3. Cargar los pesos
model.model.load_weights("mind_small/models/npa/npa")

print("Pesos cargados correctamente.")

Pesos cargados correctamente.


In [59]:
%%time
# Evaluamos post importacion
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

  updates=self.state_updates,
85657it [07:40, 185.82it/s]


{'group_auc': 0.5617, 'mean_mrr': 0.2511, 'ndcg@5': 0.2636, 'ndcg@10': 0.3306}
CPU times: total: 12min 22s
Wall time: 8min 30s


In [97]:
group_impr_indexes, group_labels, group_preds = model.run_slow_eval(valid_news_file, valid_behaviors_file)

85657it [07:18, 195.49it/s]


In [98]:
output_file = os.path.join(data_path, 'recommendations/npa_pred_small.json')

with open(output_file, 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        impr_index += 1

        # Calcular el ranking
        pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()

        # Crear estructura JSON
        obj = {
            "impr_index": int(impr_index),
            "pred_rank": pred_rank
        }

        # Escribir como JSON en una línea
        f.write(json.dumps(obj) + "\n")

73152it [00:00, 73873.28it/s]
