## CODE TO GENERATE word_dict.pkl

In [1]:
import os
import pandas as pd
import re
import pickle
from collections import Counter

# --- Paths ---
# Everything is resolved relative to a single dataset root.
base_path = "./mind_small"

# Folder that receives the generated artifacts; created if it is missing.
utils_path = os.path.join(base_path, "utils")
os.makedirs(utils_path, exist_ok=True)

# Input: training news file. Output: pickled word -> index dictionary.
train_news_path = os.path.join(base_path, "MINDsmall_train", "news.tsv")
output_file = os.path.join(utils_path, "word_dict.pkl")

# --- Clean function ---
def clean_text(text):
    """Normalize a piece of text before whitespace tokenization.

    The text is lower-cased, every run of characters other than ``a-z``,
    ``0-9`` or a space is replaced by a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None``
            or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        return re.sub(r"[^a-z0-9 ]+", " ", lowered).strip()
    return ""

# --- Read news.tsv ---
# quoting=3 is csv.QUOTE_NONE. The free-text columns may contain literal
# double quotes; with pandas' default quoting an unbalanced quote can swallow
# the following tabs/newlines and silently merge fields or whole rows.
train_df = pd.read_csv(
    train_news_path,
    sep="\t",
    header=None,
    names=["id", "category", "subcategory", "title", "abstract", "url", "ents_title", "ents_abs"],
    quoting=3,
)

# --- Collect titles ---
# Missing values are replaced by "" BEFORE the string cast: astype(str) alone
# turns NaN into the literal text "nan", which would slip past the isinstance
# guard in clean_text() and end up as a bogus vocabulary word.
titles = train_df["title"].fillna("").astype(str).tolist()

# --- Clean and tokenize ---
tokens = []
for t in titles:
    tokens.extend(clean_text(t).split())

# --- Vocabulary and frequency ---
# Counter preserves first-occurrence order, so index assignment below is
# deterministic for a given input file.
counter = Counter(tokens)

# --- Make the dict word -> index ---
# Special tokens take the first two indices.
word_dict = {
    "<PAD>": 0,
    "<UNK>": 1
}

# Every observed word is added (no frequency threshold is applied).
for i, word in enumerate(counter, start=2):
    word_dict[word] = i

# --- Save ---
with open(output_file, "wb") as f:
    pickle.dump(word_dict, f)

print(f"Diccionario creado con {len(word_dict)} palabras.")
print(f"Guardado en: {output_file}")

Diccionario creado con 31005 palabras.
Guardado en: ./mind_small\utils\word_dict.pkl


## CODE TO GENERATE word_dict_all.pkl

In [2]:
import os
import pandas as pd
import re
import pickle
from collections import Counter

# --- Paths ---
# Dataset root; all other locations hang off it.
base_path = "./mind_small"

# Artifact folder, created on demand.
utils_path = os.path.join(base_path, "utils")
os.makedirs(utils_path, exist_ok=True)

# Input news file and the pickle this cell produces.
train_news_path = os.path.join(base_path, "MINDsmall_train", "news.tsv")
output_file = os.path.join(utils_path, "word_dict_all.pkl")

# --- Clean function ---
def clean_text(text):
    """Return a lower-cased, punctuation-free version of ``text``.

    Runs of characters outside ``a-z``, ``0-9`` and space are each replaced
    by one space; the result is stripped of surrounding whitespace.
    Non-string input yields the empty string.
    """
    normalized = text.lower() if isinstance(text, str) else None
    if normalized is None:
        return ""
    collapsed = re.sub(r"[^a-z0-9 ]+", " ", normalized)
    return collapsed.strip()

# --- Read news.tsv ---
# quoting=3 is csv.QUOTE_NONE. The free-text columns may contain literal
# double quotes; with pandas' default quoting an unbalanced quote can swallow
# the following tabs/newlines and silently merge fields or whole rows.
train_df = pd.read_csv(
    train_news_path,
    sep="\t",
    header=None,
    names=["id", "category", "subcategory", "title", "abstract", "url", "ents_title", "ents_abs"],
    quoting=3,
)

# --- Collect titles and abstracts ---
# Missing values are replaced by "" BEFORE the string cast: astype(str) alone
# turns NaN into the literal text "nan", which would slip past the isinstance
# guard in clean_text() and add a bogus "nan" word to the vocabulary.
titles = train_df["title"].fillna("").astype(str).tolist()
abstracts = train_df["abstract"].fillna("").astype(str).tolist()

# --- Clean and tokenize ---
# Titles are processed before abstracts so that word indices are assigned in
# the same order as before (first occurrence wins).
tokens = []
for text in titles + abstracts:
    tokens.extend(clean_text(text).split())

# --- Vocabulary and frequency ---
# Counter preserves first-occurrence order, so index assignment below is
# deterministic for a given input file.
counter = Counter(tokens)

# --- Make the dict word -> index ---
# Special tokens take the first two indices.
word_dict_all = {
    "<PAD>": 0,
    "<UNK>": 1
}

# Every observed word is added (no frequency threshold is applied).
for i, word in enumerate(counter, start=2):
    word_dict_all[word] = i

# --- Save ---
with open(output_file, "wb") as f:
    pickle.dump(word_dict_all, f)

print(f"Diccionario creado con {len(word_dict_all)} palabras.")
print(f"Guardado en: {output_file}")

Diccionario creado con 54914 palabras.
Guardado en: ./mind_small\utils\word_dict_all.pkl


## CODE TO GENERATE vert_dict.pkl

In [8]:
import os
import pickle

# Dataset location and the folder that receives generated artifacts
data_path = "./mind_small"
utils_path = os.path.join(data_path, "utils")
os.makedirs(utils_path, exist_ok=True)

# News file to scan
news_file = os.path.join(data_path, "MINDsmall_train/news.tsv")

# Gather every distinct value found in the second tab-separated column
# (the category); rows with fewer than two fields are ignored.
with open(news_file, "r", encoding="utf-8") as f:
    rows = (line.strip().split('\t') for line in f)
    vert_set = {fields[1] for fields in rows if len(fields) >= 2}

# Map each category to its rank in alphabetical order
ordered_verts = sorted(vert_set)
vert_dict = dict(zip(ordered_verts, range(len(ordered_verts))))

# Persist the mapping as a pickle
vert_dict_path = os.path.join(utils_path, "vert_dict.pkl")
with open(vert_dict_path, "wb") as f:
    pickle.dump(vert_dict, f)

print("Diccionario de categorías creado y guardado en:", vert_dict_path)
print("Categorías encontradas:", vert_dict)

Diccionario de categorías creado y guardado en: ./mind_small\utils\vert_dict.pkl
Categorías encontradas: {'autos': 0, 'entertainment': 1, 'finance': 2, 'foodanddrink': 3, 'health': 4, 'kids': 5, 'lifestyle': 6, 'middleeast': 7, 'movies': 8, 'music': 9, 'news': 10, 'northamerica': 11, 'sports': 12, 'travel': 13, 'tv': 14, 'video': 15, 'weather': 16}


## CODE TO GENERATE subvert_dict.pkl

In [9]:
import os
import pickle

# Dataset location and the folder that receives generated artifacts
data_path = "./mind_small"
utils_path = os.path.join(data_path, "utils")
os.makedirs(utils_path, exist_ok=True)

# News file to scan (the TRAIN split)
news_file = os.path.join(data_path, "MINDsmall_train/news.tsv")

# Gather every distinct value found in the third tab-separated column
# (the subcategory); rows with fewer than three fields are ignored.
with open(news_file, "r", encoding="utf-8") as f:
    rows = (line.strip().split('\t') for line in f)
    subvert_set = {fields[2] for fields in rows if len(fields) >= 3}

# Map each subcategory to its rank in alphabetical order
ordered_subverts = sorted(subvert_set)
subvert_dict = dict(zip(ordered_subverts, range(len(ordered_subverts))))

# Persist the mapping as a pickle
subvert_dict_path = os.path.join(utils_path, "subvert_dict.pkl")
with open(subvert_dict_path, "wb") as f:
    pickle.dump(subvert_dict, f)

print("Diccionario de subcategorías creado y guardado en:", subvert_dict_path)
print("Subcategorías encontradas:", subvert_dict)

Diccionario de subcategorías creado y guardado en: ./mind_small\utils\subvert_dict.pkl
Subcategorías encontradas: {'ads-latingrammys': 0, 'ads-lung-health': 1, 'advice': 2, 'animals': 3, 'autosbuying': 4, 'autoscartech': 5, 'autosclassics': 6, 'autoscompact': 7, 'autosenthusiasts': 8, 'autoshybrids': 9, 'autoslosangeles': 10, 'autosluxury': 11, 'autosmidsize': 12, 'autosmotorcycles': 13, 'autosnews': 14, 'autosownership': 15, 'autospassenger': 16, 'autosresearch': 17, 'autosresearchguides': 18, 'autosreview': 19, 'autossema': 20, 'autossports': 21, 'autossuvs': 22, 'autostokyo': 23, 'autostrucks': 24, 'autosvans': 25, 'autosvideonew': 26, 'autosvideos': 27, 'awards': 28, 'awardstyle': 29, 'baseball': 30, 'baseball_mlb': 31, 'baseball_mlb_videos': 32, 'basketball_nba': 33, 'basketball_nba_videos': 34, 'basketball_ncaa': 35, 'basketball_ncaa_videos': 36, 'basketball_wnba': 37, 'beverages': 38, 'boxing': 39, 'boxing-mma': 40, 'cardio': 41, 'career-news': 42, 'causes': 43, 'causes-animals'

## CODE TO GENERATE uid2index.pkl

In [4]:
import os
import pickle

# ------------------------------
# Paths
# ------------------------------
data_path = "./mind_small"

# Artifact folder (created if missing) and the pickle written by this cell
utils_path = os.path.join(data_path, "utils")
os.makedirs(utils_path, exist_ok=True)
output_file = os.path.join(utils_path, "uid2index.pkl")

# Behaviors file of the training split, scanned for user ids
train_behaviors = os.path.join(data_path, "MINDsmall_train", "behaviors.tsv")

# ------------------------------
# COLLECT ALL USER IDS
# ------------------------------
# Module-level accumulator filled in place by read_users().
user_ids = set()

def read_users(path):
    """Add the user ids found in a tab-separated file to ``user_ids``.

    Each line is stripped and split on tabs; the second field is taken as
    the user id. Lines with fewer than two fields are skipped.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        None. The module-level set ``user_ids`` is updated in place.
    """
    with open(path, "r", encoding="utf-8") as handle:
        rows = (raw.strip().split("\t") for raw in handle)
        user_ids.update(fields[1] for fields in rows if len(fields) > 1)

print("Leyendo usuarios de train...")
read_users(train_behaviors)

print(f"Usuarios únicos encontrados: {len(user_ids)}")

# ------------------------------
# ASSIGN AN INDEX TO EACH USER
# ------------------------------
# Users are numbered from 0 following the sorted order of their ids.
ordered_uids = sorted(user_ids)
uid2index = dict(zip(ordered_uids, range(len(ordered_uids))))

# ------------------------------
# SAVE FILE
# ------------------------------
with open(output_file, "wb") as f:
    pickle.dump(uid2index, f)

print(f"uid2index.pkl guardado en: {output_file}")

Leyendo usuarios de train...
Usuarios únicos encontrados: 50000
uid2index.pkl guardado en: ./mind_small\utils\uid2index.pkl


## CODE TO GENERATE embedding.npy

In [5]:
import os
import numpy as np
import pickle

# --------------------------
# Paths
# --------------------------
data_path = "./mind_small"
utils_path = os.path.join(data_path, "utils")

word_dict_path = os.path.join(utils_path, "word_dict.pkl")
embedding_output = os.path.join(utils_path, "embedding.npy")

# Location of the downloaded GloVe file
glove_path = "./mind_small/utils/glove.6B.300d.txt"

# Fixed seed so the randomly initialized rows (words missing from GloVe) are
# identical on every run and the saved matrix is reproducible.
SEED = 42

# --------------------------
# Load dictionary
# --------------------------
# NOTE: pickle.load executes arbitrary code from the file; only load
# dictionaries generated by this notebook.
with open(word_dict_path, "rb") as f:
    word_dict = pickle.load(f)

vocab_size = len(word_dict)
print("Tamaño del vocabulario:", vocab_size)

# --------------------------
# Load GloVe embeddings
# --------------------------
print("Cargando GloVe, esto puede tardar un poco...")

# Only vectors for words present in word_dict are kept, so the full GloVe
# table never has to be held in memory.
glove_embeddings = {}
embedding_dim = None
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if not parts:
            continue  # tolerate blank lines
        if embedding_dim is None:
            # Each line is "<word> v1 v2 ... vN"; N is the vector size.
            embedding_dim = len(parts) - 1
        word = parts[0]
        if word in word_dict:
            glove_embeddings[word] = np.asarray(parts[1:], dtype="float32")

if embedding_dim is None:
    raise ValueError(f"No embeddings found in {glove_path}")
print("Dimensión de los embeddings:", embedding_dim)

# --------------------------
# Build embedding matrix
# --------------------------
# Row i holds the vector of the word whose index in word_dict is i (indices
# are assumed to be the contiguous range 0..vocab_size-1). Words without a
# GloVe vector keep a small random initialization.
rng = np.random.default_rng(SEED)
embedding_matrix = rng.normal(
    loc=0.0, scale=0.1, size=(vocab_size, embedding_dim)
)

for word, vector in glove_embeddings.items():
    embedding_matrix[word_dict[word]] = vector

# Padding positions must not contribute signal: use an all-zero vector
# instead of random noise for the padding token.
if "<PAD>" in word_dict:
    embedding_matrix[word_dict["<PAD>"]] = 0.0

print("Embeddings asignados correctamente.")

# --------------------------
# Save matrix
# --------------------------
np.save(embedding_output, embedding_matrix)

print(f"embedding.npy guardado en: {embedding_output}")

Tamaño del vocabulario: 31005
Cargando GloVe, esto puede tardar un poco...
Dimensión de los embeddings: 300
Embeddings asignados correctamente.
embedding.npy guardado en: ./mind_small\utils\embedding.npy


## CODE TO GENERATE embedding_all.npy

In [3]:
import os
import numpy as np
import pickle

# --------------------------
# Paths
# --------------------------
data_path = "./mind_small"
utils_path = os.path.join(data_path, "utils")

word_dict_path = os.path.join(utils_path, "word_dict_all.pkl")
embedding_output = os.path.join(utils_path, "embedding_all.npy")

# Location of the downloaded GloVe file
glove_path = "./mind_small/utils/glove.6B.300d.txt"

# Fixed seed so the randomly initialized rows (words missing from GloVe) are
# identical on every run and the saved matrix is reproducible.
SEED = 42

# --------------------------
# Load dictionary
# --------------------------
# NOTE: pickle.load executes arbitrary code from the file; only load
# dictionaries generated by this notebook.
with open(word_dict_path, "rb") as f:
    word_dict = pickle.load(f)

vocab_size = len(word_dict)
print("Tamaño del vocabulario:", vocab_size)

# --------------------------
# Load GloVe embeddings
# --------------------------
print("Cargando GloVe, esto puede tardar un poco...")

# Only vectors for words present in word_dict are kept, so the full GloVe
# table never has to be held in memory.
glove_embeddings = {}
embedding_dim = None
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if not parts:
            continue  # tolerate blank lines
        if embedding_dim is None:
            # Each line is "<word> v1 v2 ... vN"; N is the vector size.
            embedding_dim = len(parts) - 1
        word = parts[0]
        if word in word_dict:
            glove_embeddings[word] = np.asarray(parts[1:], dtype="float32")

if embedding_dim is None:
    raise ValueError(f"No embeddings found in {glove_path}")
print("Dimensión de los embeddings:", embedding_dim)

# --------------------------
# Build embedding matrix
# --------------------------
# Row i holds the vector of the word whose index in word_dict is i (indices
# are assumed to be the contiguous range 0..vocab_size-1). Words without a
# GloVe vector keep a small random initialization.
rng = np.random.default_rng(SEED)
embedding_matrix = rng.normal(
    loc=0.0, scale=0.1, size=(vocab_size, embedding_dim)
)

for word, vector in glove_embeddings.items():
    embedding_matrix[word_dict[word]] = vector

# Padding positions must not contribute signal: use an all-zero vector
# instead of random noise for the padding token.
if "<PAD>" in word_dict:
    embedding_matrix[word_dict["<PAD>"]] = 0.0

print("Embeddings asignados correctamente.")

# --------------------------
# Save matrix
# --------------------------
np.save(embedding_output, embedding_matrix)

# The message now names the file that is actually written by this cell.
print(f"embedding_all.npy guardado en: {embedding_output}")

Tamaño del vocabulario: 54914
Cargando GloVe, esto puede tardar un poco...
Dimensión de los embeddings: 300
Embeddings asignados correctamente.
embedding.npy guardado en: ./mind_small\utils\embedding_all.npy


## CODE TO GENERATE lstur.yaml

In [4]:
import os
import yaml

# Destination of the generated configuration file
data_path = "./mind_small"
utils_path = os.path.join(data_path, "utils")
os.makedirs(utils_path, exist_ok=True)
yaml_file = os.path.join(utils_path, "lstur.yaml")

# The configuration is assembled section by section; insertion order is the
# order in which keys appear in the YAML file (sort_keys=False below).
data_section = dict(
    title_size=30,
    his_size=50,
    data_format="news",
    npratio=4,
)
info_section = dict(
    metrics=["group_auc", "mean_mrr", "ndcg@5;10"],
    show_step=100000,
)
model_section = dict(
    attention_hidden_dim=200,
    word_emb_dim=300,
    dropout=0.2,
    filter_num=400,
    window_size=3,
    cnn_activation="relu",
    gru_unit=400,
    type="ini",
    model_type="lstur",
)
train_section = dict(
    batch_size=32,
    epochs=4,
    learning_rate=0.0001,
    loss="cross_entropy_loss",
    optimizer="adam",
    support_quick_scoring=True,
)

config = {
    "data": data_section,
    "info": info_section,
    "model": model_section,
    "train": train_section,
}

with open(yaml_file, "w", encoding="utf-8") as f:
    yaml.dump(config, f, default_flow_style=False, sort_keys=False)

print(f"Archivo lstur.yaml creado en: {yaml_file}")

Archivo lstur.yaml creado en: ./mind_small\utils\lstur.yaml


## CODE TO GENERATE nrms.yaml

In [5]:
import yaml
import os

# Destination of the generated configuration file
data_path = "./mind_small"
utils_path = os.path.join(data_path, "utils")
os.makedirs(utils_path, exist_ok=True)
yaml_file = os.path.join(utils_path, "nrms.yaml")

# NRMS configuration, filled in one section at a time. Keys are written to
# the YAML file in insertion order (sort_keys=False below).
config = {}
config["data"] = {
    "title_size": 30,
    "his_size": 50,
    "data_format": "news",
    "npratio": 4,
}
config["info"] = {
    "metrics": ["group_auc", "mean_mrr", "ndcg@5;10"],
    "show_step": 100000,
}
config["model"] = {
    "attention_hidden_dim": 200,
    "word_emb_dim": 300,
    "dropout": 0.2,
    "head_num": 20,
    "head_dim": 20,
    "model_type": "nrms",
}
config["train"] = {
    "batch_size": 32,
    "epochs": 4,
    "learning_rate": 0.0001,
    "loss": "cross_entropy_loss",
    "optimizer": "adam",
    "support_quick_scoring": True,
}

with open(yaml_file, "w", encoding="utf-8") as f:
    yaml.dump(config, f, default_flow_style=False, sort_keys=False)

print(f"Archivo nrms.yaml creado en: {yaml_file}")

Archivo nrms.yaml creado en: ./mind_small\utils\nrms.yaml


## CODE TO GENERATE naml.yaml

In [11]:
import os
import yaml

# Path where the YAML file will be written
output_path = "./mind_small/utils/naml.yaml"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Configuration to save. Keys are written in insertion order
# (sort_keys=False below).
config = {
    "data": {
        "title_size": 30,
        "body_size": 50,
        "his_size": 50,
        # NOTE(review): vert_num / subvert_num are hard-coded; they presumably
        # have to match the number of entries in vert_dict.pkl and
        # subvert_dict.pkl — re-check them whenever those dictionaries are
        # regenerated.
        "vert_num": 17,
        "subvert_num": 264,
        "data_format": "naml",
        "npratio": 4
    },
    "info": {
        "metrics": ["group_auc", "mean_mrr", "ndcg@5;10"],
        "show_step": 100000
    },
    "model": {
        "attention_hidden_dim": 200,
        "word_emb_dim": 300,
        "vert_emb_dim": 100,
        "subvert_emb_dim": 100,
        "dropout": 0.2,
        "filter_num": 400,
        "window_size": 3,
        "cnn_activation": "relu",
        "model_type": "naml",
        "dense_activation": "relu"
    },
    "train": {
        "batch_size": 32,
        "epochs": 4,
        "learning_rate": 0.0001,
        "loss": "cross_entropy_loss",
        "optimizer": "adam",
        "support_quick_scoring": True
    }
}

# Save as YAML. The encoding is stated explicitly (as in the lstur/nrms
# cells) so the file does not depend on the platform's default encoding.
with open(output_path, "w", encoding="utf-8") as f:
    yaml.dump(config, f, default_flow_style=False, sort_keys=False)

print(f"Fichero YAML generado en: {output_path}")

Fichero YAML generado en: ./mind_small/utils/naml.yaml


## CODE TO GENERATE npa.yaml

In [10]:
import yaml
import os

# Path where the YAML file will be written
output_path = "./mind_small/utils/npa.yaml"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Configuration to save. Keys are written in insertion order
# (sort_keys=False below).
config = {
    "data": {
        "title_size": 10,
        "his_size": 50,
        "data_format": "news",
        "npratio": 4
    },
    "info": {
        "metrics": ["group_auc", "mean_mrr", "ndcg@5;10"],
        "show_step": 100000
    },
    "model": {
        "attention_hidden_dim": 200,
        "word_emb_dim": 300,
        "user_emb_dim": 100,
        "dropout": 0.2,
        "filter_num": 400,
        "window_size": 3,
        "cnn_activation": "relu",
        "model_type": "npa"
    },
    "train": {
        "batch_size": 32,
        "epochs": 4,
        "learning_rate": 0.0001,
        "loss": "cross_entropy_loss",
        "optimizer": "adam",
        "support_quick_scoring": False
    }
}

# Save as YAML. The encoding is stated explicitly (as in the lstur/nrms
# cells) so the file does not depend on the platform's default encoding.
with open(output_path, "w", encoding="utf-8") as f:
    yaml.dump(config, f, default_flow_style=False, sort_keys=False)

print(f"Fichero YAML generado en: {output_path}")

Fichero YAML generado en: ./mind_small/utils/npa.yaml
