In [None]:
# Cell 1 – Install Required Libraries
print("Installing libraries for embedding and model training …")
!pip install -q --upgrade transformers torch torchvision torchaudio sentence-transformers
print("✔️  Libraries installed")


Installing libraries for embedding and model training …
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m134.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m67.0 MB/s[0m eta [36

In [None]:
# Cell 2 – Import Libraries and Set-Up
print("Importing …")
import os, random
import numpy as np
import pandas as pd
import torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import roc_auc_score, roc_curve
from tqdm.auto import tqdm
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Importing …
Using device: cuda


In [None]:
# Cell 3 – Load News Data
news_path = '/content/news.tsv'
try:
    news_df = pd.read_csv(news_path, sep='\t', header=None, names=[
        'article_id','category','subcategory','title','abstract',
        'url','title_entities','abstract_entities'
    ])
    print(f"Loaded {len(news_df):,} news rows")
except FileNotFoundError:
    print("🔸 news.tsv not found – creating tiny dummy set")
    news_df = pd.DataFrame({
        'article_id':[f'N{i}' for i in range(1,101)],
        'category':['news']*100, 'subcategory':['misc']*100,
        'title':[f'Title {i}' for i in range(1,101)],
        'abstract':[f'Abstract {i}' for i in range(1,101)],
        'url':['']*100,'title_entities':['']*100,'abstract_entities':['']*100
    })


Loaded 51,282 news rows


In [None]:
# Cell 4 – Load Behaviors Data
behaviors_path = '/content/behaviors.tsv'
try:
    behaviors_df = pd.read_csv(behaviors_path, sep='\t', header=None, names=[
        'impression_id','user_id','time','click_history','impressions'
    ])
    print(f"Loaded {len(behaviors_df):,} behavior rows")
except FileNotFoundError:
    print("🔸 behaviors.tsv not found – creating dummy set")
    user_ids  = [f'U{i}' for i in range(1,51)]
    art_ids   = [f'N{i}' for i in range(1,101)]
    behaviors_df = pd.DataFrame({
        'impression_id':range(1,501),
        'user_id':np.random.choice(user_ids,500),
        'time':['']*500,
        'click_history':[' '.join(np.random.choice(art_ids, np.random.randint(1,6))) for _ in range(500)],
        'impressions':[' '.join(f'{aid}-{np.random.randint(2)}' for aid in np.random.choice(art_ids,10)) for _ in range(500)]
    })


Loaded 156,965 behavior rows


In [None]:
# Cell 5 – Generate Text Embeddings
print("Generating MiniLM embeddings …")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
sbert = SentenceTransformer(model_name, device=device)
texts = (news_df['title'].fillna('') + ' ' + news_df['abstract'].fillna('')).tolist()
article_text_embeddings = sbert.encode(texts, show_progress_bar=True)
article_ids_order = news_df['article_id'].values
article_id_to_idx = {aid:i for i,aid in enumerate(article_ids_order)}
print("Embeddings shape:", article_text_embeddings.shape)


Generating MiniLM embeddings …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1603 [00:00<?, ?it/s]

Embeddings shape: (51282, 384)


In [None]:
# Cell 6 – Prepare Pure-Text Article Embeddings  (no sentiment features)
article_embeddings = article_text_embeddings          # shape [N,384]
embedding_dim = article_embeddings.shape[1]           # 384
print("article_embeddings.shape:", article_embeddings.shape)

# ---- Build user embeddings (mean-pool of clicked articles) ----
print("Building user embeddings …")
user_embeddings = {}
skipped_users = skipped_clicks = 0
for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df)):
    uid = row['user_id']
    idxs = []
    if isinstance(row['click_history'], str) and row['click_history'].strip():
        for aid in row['click_history'].split():
            ii = article_id_to_idx.get(aid)
            if ii is not None: idxs.append(ii)
            else: skipped_clicks += 1
    if idxs:
        user_embeddings[uid] = np.mean(article_embeddings[idxs], axis=0)
    else:
        user_embeddings[uid] = np.zeros(embedding_dim, dtype=np.float32)
        skipped_users += 1
print(f"Users built: {len(user_embeddings):,}  | cold-start: {skipped_users:,}")


article_embeddings.shape: (51282, 384)
Building user embeddings …


  0%|          | 0/156965 [00:00<?, ?it/s]

Users built: 50,000  | cold-start: 3,238


In [None]:
# Cell 7 – Define TwinTowers Model
class TwinTowers(nn.Module):
    def __init__(self, in_dim, hidden=128, out=64):
        super().__init__()
        self.user_tower = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                        nn.Linear(hidden, out))
        self.art_tower  = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                        nn.Linear(hidden, out))
    def forward(self, u, a):
        return torch.sigmoid((self.user_tower(u.float()) *
                              self.art_tower(a.float())).sum(dim=1))
print("TwinTowers ready, input:", embedding_dim)


TwinTowers ready, input: 384


In [None]:
# Cell 8 – Prepare Train/Test Pairs  (identical logic)
print("Preparing pairs …")
train_pairs, test_pairs, session_dict = [], [], {}
pos_tr = neg_tr = pos_te = neg_te = 0
all_article_idxs = list(range(len(article_embeddings)))

train_df = behaviors_df.sample(frac=0.8, random_state=SEED).reset_index(drop=True)
test_df  = behaviors_df.drop(train_df.index).reset_index(drop=True)

def add_pairs(df, train=True):
    global pos_tr, neg_tr, pos_te, neg_te
    pairs = train_pairs if train else test_pairs
    for _, row in df.iterrows():
        uid = row['user_id']; if_emb = uid in user_embeddings
        if not if_emb: continue
        uemb = user_embeddings[uid]
        clicked = [article_id_to_idx[aid] for aid in row['click_history'].split()
                   if aid in article_id_to_idx] if isinstance(row['click_history'], str) else []
        # positive samples
        for idx in clicked[:3]:
            pairs.append((uemb, article_embeddings[idx], 1))
            if train: pos_tr += 1
            else: pos_te += 1
        # negatives
        impr_neg = [article_id_to_idx[aid] for aid,label in
                    (ip.split('-') for ip in row['impressions'].split())
                    if label=='0' and aid in article_id_to_idx] if isinstance(row['impressions'], str) else []
        neg_src = impr_neg[:3]
        need = 3 - len(neg_src)
        if need:                                  # random negatives
            pool = list(set(all_article_idxs) - set(clicked))
            neg_src += list(np.random.choice(pool, min(need,len(pool)), replace=False))
        for idx in neg_src:
            pairs.append((uemb, article_embeddings[idx], 0))
            if train: neg_tr += 1
            else: neg_te += 1
add_pairs(train_df, True);  add_pairs(test_df, False)
print(f"Train pairs: {len(train_pairs):,}  (pos {pos_tr:,}/neg {neg_tr:,})")
print(f"Test  pairs: {len(test_pairs):,}  (pos {pos_te:,}/neg {neg_te:,})")


Preparing pairs …
Train pairs: 739,311  (pos 362,595/neg 376,716)
Test  pairs: 184,573  (pos 90,394/neg 94,179)


In [None]:
# Cell 9 – Build DataLoaders
def to_tensor(arr): return torch.tensor(np.array(arr), dtype=torch.float32)
train_ds = TensorDataset(to_tensor([u for u,_,_ in train_pairs]),
                         to_tensor([a for _,a,_ in train_pairs]),
                         to_tensor([l for _,_,l in train_pairs]))
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)

test_users   = to_tensor([u for u,_,_ in test_pairs]).to(device)
test_arts    = to_tensor([a for _,a,_ in test_pairs]).to(device)
test_labels  = np.array([l for _,_,l in test_pairs])
print("Batches:", len(train_loader))


Batches: 5776


In [None]:
# Cell 10 – Train
model = TwinTowers(embedding_dim).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCELoss()
epochs = 5
for ep in range(1, epochs+1):
    model.train(); running = 0
    for ub, ab, lb in tqdm(train_loader, desc=f"Epoch {ep}/{epochs}"):
        ub, ab, lb = ub.to(device), ab.to(device), lb.to(device)
        pred = model(ub, ab)
        loss = loss_fn(pred, lb)
        opt.zero_grad(); loss.backward(); opt.step()
        running += loss.item()
    print(f"· ep{ep} mean loss {running/len(train_loader):.4f}")
print("✅ training complete")


Epoch 1/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep1 mean loss 0.1389


Epoch 2/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep2 mean loss 0.0769


Epoch 3/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep3 mean loss 0.0611


Epoch 4/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep4 mean loss 0.0515


Epoch 5/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep5 mean loss 0.0448
✅ training complete


In [None]:
# Cell 11 – Evaluation (AUC, plus optional metrics)
model.eval()

batch_size_eval = 512
pred_chunks = []

with torch.no_grad():
    for i in range(0, len(test_users), batch_size_eval):
        u_batch = test_users[i:i+batch_size_eval]
        a_batch = test_arts[i:i+batch_size_eval]
        pred_chunks.append(model(u_batch, a_batch).cpu().numpy())

preds = np.concatenate(pred_chunks).squeeze()   # shape [N_test]

auc = roc_auc_score(test_labels, preds)
print(f"AUC: {auc:.4f}")



AUC: 0.9973


In [None]:
# Cell 11 – Evaluation, ROC curve, MRR, Hit-Rate, Save
from sklearn.metrics import roc_auc_score, roc_curve
import json

model.eval()

# ---------- AUC & ROC ----------
batch_sz = 512
pred_chunks = []
with torch.no_grad():
    for i in range(0, len(test_users), batch_sz):
        ub = test_users[i:i+batch_sz]; ab = test_arts[i:i+batch_sz]
        pred_chunks.append(model(ub, ab).cpu().numpy())
preds = np.concatenate(pred_chunks).squeeze()
auc   = roc_auc_score(test_labels, preds)
fpr, tpr, thr = roc_curve(test_labels, preds)
print(f"AUC: {auc:.4f}")
print("First 5 ROC points:")
for i in range(min(5, len(fpr))):
    print(f"  FPR {fpr[i]:.4f} | TPR {tpr[i]:.4f} | threshold {thr[i]:.4f}")

# ---------- build session_dict (impression_id → list of tuples) ----------
session_dict = {}
for _, row in test_df.iterrows():
    uid = row['user_id']
    if uid not in user_embeddings: continue
    uemb = user_embeddings[uid]
    imp_id = row['impression_id']
    sess = []
    if isinstance(row['impressions'], str) and row['impressions'].strip():
        for ip in row['impressions'].split():
            aid, label = ip.split('-')
            if aid in article_id_to_idx:
                aemb = article_embeddings[article_id_to_idx[aid]]
                sess.append((uemb, aemb, int(label), aid))
    if sess: session_dict[imp_id] = sess

# ---------- MRR & Hit-Rate ----------
k_vals = [1,5,10]
hits = {k:0 for k in k_vals}
recip = []
with torch.no_grad():
    for sess in session_dict.values():
        u = torch.tensor([x[0] for x in sess], dtype=torch.float32).to(device)
        a = torch.tensor([x[1] for x in sess], dtype=torch.float32).to(device)
        labels = [x[2] for x in sess]
        aids   = [x[3] for x in sess]

        scores = model(u, a).cpu().numpy()
        order  = np.argsort(-scores)
        ranked = [aids[i] for i in order]

        clicked = {aid for aid,l in zip(aids,labels) if l==1}
        if not clicked: continue

        # MRR
        for rank, aid in enumerate(ranked, 1):
            if aid in clicked:
                recip.append(1/rank); break

        # Hit-Rate@k
        for k in k_vals:
            if any(aid in clicked for aid in ranked[:k]):
                hits[k] += 1

n_sessions = len(session_dict)
mrr = np.mean(recip) if recip else 0.0
print(f"\nMRR: {mrr:.4f}  (over {n_sessions} sessions)")
for k in k_vals:
    hr = hits[k] / n_sessions
    print(f"Hit-Rate@{k}: {hr:.4f}")

# ---------- Save model & metrics ----------
torch.save(model.state_dict(), "/content/twin_towers_model_no_sent.pth")

results = {
    "AUC": auc,
    "MRR": mrr,
    "sessions": n_sessions,
    **{f"HR@{k}": hits[k]/n_sessions for k in k_vals}
}
pd.DataFrame([results]).to_csv("/content/twin_towers_results.csv", index=False)

print("\nSaved model → /content/twin_towers_model_no_sent.pth")
print("Saved metrics → /content/twin_towers_results.csv")
print("\n=== Evaluation Complete ===")


AUC: 0.9973
First 5 ROC points:
  FPR 0.0000 | TPR 0.0000 | threshold inf
  FPR 0.0000 | TPR 0.0051 | threshold 1.0000
  FPR 0.0000 | TPR 0.0108 | threshold 1.0000
  FPR 0.0000 | TPR 0.0146 | threshold 1.0000
  FPR 0.0000 | TPR 0.0181 | threshold 1.0000


  u = torch.tensor([x[0] for x in sess], dtype=torch.float32).to(device)



MRR: 0.3129  (over 31393 sessions)
Hit-Rate@1: 0.1585
Hit-Rate@5: 0.4782
Hit-Rate@10: 0.6589

Saved model → /content/twin_towers_model_no_sent.pth
Saved metrics → /content/twin_towers_results.csv

=== Evaluation Complete ===


In [None]:
import os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import roc_auc_score, roc_curve
from tqdm.auto import tqdm

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


In [None]:
news_path = '/content/news.tsv'
try:
    news_df = pd.read_csv(news_path, sep='\t', header=None, names=[
        'article_id','category','subcategory','title','abstract',
        'url','title_entities','abstract_entities'
    ])
    print(f"news rows: {len(news_df):,}")
except FileNotFoundError:
    print("news.tsv not found – creating dummy set")
    news_df = pd.DataFrame({
        'article_id':[f'N{i}' for i in range(1,101)],
        'category':['news']*100,'subcategory':['misc']*100,
        'title':[f'Title {i}' for i in range(1,101)],
        'abstract':[f'Abstract {i}' for i in range(1,101)],
        'url':['']*100,'title_entities':['']*100,'abstract_entities':['']*100
    })


news rows: 51,282


In [None]:
beh_path = '/content/behaviors.tsv'
try:
    behaviors_df = pd.read_csv(beh_path, sep='\t', header=None, names=[
        'impression_id','user_id','time','click_history','impressions'
    ])
    print(f"behavior rows: {len(behaviors_df):,}")
except FileNotFoundError:
    print("behaviors.tsv not found – creating dummy behaviors")
    uids = [f'U{i}' for i in range(1,51)]
    aids = [f'N{i}' for i in range(1,101)]
    behaviors_df = pd.DataFrame({
        'impression_id':range(1,501),
        'user_id':np.random.choice(uids,500),
        'time':['']*500,
        'click_history':[' '.join(np.random.choice(aids, np.random.randint(1,6))) for _ in range(500)],
        'impressions':[' '.join(f'{aid}-{np.random.randint(2)}' for aid in np.random.choice(aids,10)) for _ in range(500)]
    })


behavior rows: 156,965


In [None]:
print("Encoding titles & abstracts …")
enc = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
texts = (news_df['title'].fillna('') + ' ' + news_df['abstract'].fillna('')).tolist()
text_emb = enc.encode(texts, show_progress_bar=True)           # [N,384]
article_ids = news_df['article_id'].values
aid2idx = {aid:i for i,aid in enumerate(article_ids)}
article_embeddings = text_emb
embedding_dim = article_embeddings.shape[1]
print("article_embeddings.shape:", article_embeddings.shape)


Encoding titles & abstracts …


Batches:   0%|          | 0/1603 [00:00<?, ?it/s]

article_embeddings.shape: (51282, 384)


In [None]:
# Cell 6 – split + user embeddings, with padding
print("Splitting sessions …")
train_df = behaviors_df.sample(frac=0.8, random_state=SEED).reset_index(drop=True)
test_df  = behaviors_df.drop(train_df.index).reset_index(drop=True)

clicked_by_user = {}
for _, row in train_df.iterrows():
    if isinstance(row['click_history'], str):
        for aid in row['click_history'].split():
            idx = aid2idx.get(aid)
            if idx is not None:
                clicked_by_user.setdefault(row['user_id'], []).append(idx)

user_embeddings = {}
for uid, idxs in clicked_by_user.items():
    user_embeddings[uid] = np.mean(article_embeddings[idxs], axis=0)

# --- pad EVERY train+test user with zero vec if still missing ---
all_users = set(train_df['user_id']) | set(test_df['user_id'])
for uid in all_users:
    if uid not in user_embeddings:
        user_embeddings[uid] = np.zeros(embedding_dim, dtype=np.float32)

print(f"Users with clicks in train: {len(clicked_by_user):,}")
print(f"Total user embeddings (after padding): {len(user_embeddings):,}")


Splitting sessions …
Users with clicks in train: 45,349
Total user embeddings (after padding): 47,101


In [None]:
# --- ensure EVERY user has an embedding, even if they clicked nothing ---
all_users = set(train_df['user_id']) | set(test_df['user_id'])
for uid in all_users:
    if uid not in user_embeddings:          # no clicks at all
        user_embeddings[uid] = np.zeros(embedding_dim, dtype=np.float32)
print("Total user embeddings after padding:", len(user_embeddings))


Total user embeddings after padding: 47101


In [None]:
def build_pairs(df):
    pairs = []
    max_pos = max_neg = 3
    all_idxs = list(range(len(article_embeddings)))
    for _, row in df.iterrows():
        uvec = user_embeddings[row['user_id']]
        # positives
        pos = [aid2idx[aid] for aid in (row['click_history'].split() if isinstance(row['click_history'],str) else [])
               if aid in aid2idx][:max_pos]
        for idx in pos:
            pairs.append((uvec, article_embeddings[idx], 1))
        # negatives
        neg = []
        if isinstance(row['impressions'], str):
            for ip in row['impressions'].split():
                aid, lab = ip.split('-')
                if lab=='0' and aid in aid2idx:
                    neg.append(aid2idx[aid])
                if len(neg) == max_neg: break
        need = max_neg - len(neg)
        if need:
            pool = list(set(all_idxs) - set(pos))
            neg += list(np.random.choice(pool, need, replace=False))
        for idx in neg:
            pairs.append((uvec, article_embeddings[idx], 0))
    return pairs

train_pairs = build_pairs(train_df)
test_pairs  = build_pairs(test_df)
print(f"train pairs: {len(train_pairs):,}  | test pairs: {len(test_pairs):,}")


train pairs: 739,311  | test pairs: 184,573


In [None]:
def to_tensor(arr): return torch.tensor(np.array(arr), dtype=torch.float32)
train_ds = TensorDataset(
    to_tensor([u for u,_,_ in train_pairs]),
    to_tensor([a for _,a,_ in train_pairs]),
    to_tensor([l for _,_,l in train_pairs])
)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)

test_users = to_tensor([u for u,_,_ in test_pairs]).to(device)
test_items = to_tensor([a for _,a,_ in test_pairs]).to(device)
test_labels = np.array([l for _,_,l in test_pairs])
print("train batches:", len(train_loader))


train batches: 5776


In [None]:
class TwinTowers(nn.Module):
    def __init__(self, in_dim, hid=128, out=64):
        super().__init__()
        self.u = nn.Sequential(nn.Linear(in_dim,hid), nn.ReLU(), nn.Linear(hid,out))
        self.i = nn.Sequential(nn.Linear(in_dim,hid), nn.ReLU(), nn.Linear(hid,out))
    def forward(self, u, v):
        return torch.sigmoid((self.u(u.float()) * self.i(v.float())).sum(1))

model = TwinTowers(embedding_dim).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCELoss()


In [None]:
epochs = 5
for ep in range(1, epochs+1):
    model.train(); total=0
    for ub, ib, lb in tqdm(train_loader, desc=f"Epoch {ep}/{epochs}"):
        ub, ib, lb = ub.to(device), ib.to(device), lb.to(device)
        pred = model(ub, ib)
        loss = loss_fn(pred, lb)
        opt.zero_grad(); loss.backward(); opt.step()
        total += loss.item()
    print(f"· ep{ep} mean loss {total/len(train_loader):.4f}")
print("✅ training done")


Epoch 1/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep1 mean loss 0.1378


Epoch 2/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep2 mean loss 0.0757


Epoch 3/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep3 mean loss 0.0606


Epoch 4/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep4 mean loss 0.0514


Epoch 5/5:   0%|          | 0/5776 [00:00<?, ?it/s]

· ep5 mean loss 0.0443
✅ training done


In [None]:
print("Evaluating …")
model.eval()

# ----- AUC -----
preds = []
bs = 512
with torch.no_grad():
    for i in range(0, len(test_users), bs):
        preds.append(model(test_users[i:i+bs], test_items[i:i+bs]).cpu().numpy())
preds = np.concatenate(preds).squeeze()
auc = roc_auc_score(test_labels, preds)
print(f"AUC: {auc:.4f}")

# ----- Build sessions for MRR / Hit-Rate -----
session_dict = {}
for _, row in test_df.iterrows():
    uid = row['user_id']
    if uid not in user_embeddings:
        continue
    uvec = user_embeddings[uid]
    ses = []
    if isinstance(row['impressions'], str):
        for ip in row['impressions'].split():
            aid, lab = ip.split('-')
            if aid in aid2idx:
                ses.append((uvec, article_embeddings[aid2idx[aid]], int(lab), aid))
    if ses:
        session_dict[row['impression_id']] = ses

# ----- Ranking metrics -----
k_vals = [1,5,10]
hits = {k:0 for k in k_vals}
recip = []
with torch.no_grad():
    for ses in session_dict.values():
        u = torch.tensor(np.array([x[0] for x in ses]), dtype=torch.float32).to(device)
        a = torch.tensor(np.array([x[1] for x in ses]), dtype=torch.float32).to(device)
        labs = [x[2] for x in ses]; aids = [x[3] for x in ses]
        scores = model(u,a).cpu().numpy()
        order = np.argsort(-scores)
        ranked = [aids[i] for i in order]
        clicked = {aid for aid,l in zip(aids,labs) if l==1}
        if not clicked:
            continue
        # MRR
        for r,aid in enumerate(ranked,1):
            if aid in clicked:
                recip.append(1/r); break
        # Hit-Rate
        for k in k_vals:
            if any(a in clicked for a in ranked[:k]):
                hits[k] += 1

n_sess = len(session_dict)
mrr = np.mean(recip) if recip else 0.0
print(f"MRR: {mrr:.4f}  (over {n_sess} sessions)")
for k in k_vals:
    print(f"Hit-Rate@{k}: {hits[k]/n_sess:.4f}")

# ----- Save -----
torch.save(model.state_dict(), "/content/twin_towers_model_text_only.pth")
pd.DataFrame([{'AUC':auc,'MRR':mrr,**{f'HR@{k}':hits[k]/n_sess for k in k_vals}}])\
  .to_csv("/content/twin_towers_results_text_only.csv", index=False)
print("Saved model & metrics.")


Evaluating …
AUC: 0.9969
MRR: 0.3018  (over 31393 sessions)
Hit-Rate@1: 0.1446
Hit-Rate@5: 0.4708
Hit-Rate@10: 0.6533
Saved model & metrics.


In [None]:
import random, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import roc_auc_score, roc_curve
from tqdm.auto import tqdm

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


In [None]:
TRAIN_DIR = '/content/MINDlarge_train'   # adjust if the folder name differs
DEV_DIR   = '/content/MINDlarge_dev'

train_news_path = f'{TRAIN_DIR}/news.tsv'
train_beh_path  = f'{TRAIN_DIR}/behaviors.tsv'
dev_beh_path    = f'{DEV_DIR}/behaviors.tsv'



In [None]:
print("Loading train news …")
news_df = pd.read_csv(train_news_path, sep='\t', header=None, names=[
        'article_id','category','subcategory','title','abstract',
        'url','title_entities','abstract_entities'
])
print(f"news rows: {len(news_df):,}")

print("Encoding titles & abstracts …")
enc = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
texts = (news_df['title'].fillna('') + ' ' + news_df['abstract'].fillna('')).tolist()
article_embeddings = enc.encode(texts, show_progress_bar=True)     # [N,384]
embedding_dim = article_embeddings.shape[1]
article_ids = news_df['article_id'].values
aid2idx = {aid:i for i,aid in enumerate(article_ids)}
print("article_embeddings.shape:", article_embeddings.shape)


Loading train news …
news rows: 51,282
Encoding titles & abstracts …


Batches:   0%|          | 0/1603 [00:00<?, ?it/s]

article_embeddings.shape: (51282, 384)


In [None]:
print("Loading behaviors …")
beh_train = pd.read_csv(train_beh_path, sep='\t', header=None, names=[
        'impression_id','user_id','time','click_history','impressions'
])
beh_dev = pd.read_csv(dev_beh_path, sep='\t', header=None, names=[
        'impression_id','user_id','time','click_history','impressions'
])
print(f"train sessions: {len(beh_train):,} | dev sessions: {len(beh_dev):,}")


Loading behaviors …
train sessions: 156,965 | dev sessions: 73,152


In [None]:
clicked_by_user = {}
for _, row in beh_train.iterrows():
    if isinstance(row['click_history'], str):
        for aid in row['click_history'].split():
            idx = aid2idx.get(aid)
            if idx is not None:
                clicked_by_user.setdefault(row['user_id'], []).append(idx)

user_embeddings = {u: np.mean(article_embeddings[idxs], axis=0)
                   for u, idxs in clicked_by_user.items()}

# cold-start pad for every user that appears in dev
for uid in beh_dev['user_id'].unique():
    if uid not in user_embeddings:
        user_embeddings[uid] = np.zeros(embedding_dim, dtype=np.float32)

print(f"Users with train clicks: {len(clicked_by_user):,}")
print(f"Total user embeddings:  {len(user_embeddings):,}")


Users with train clicks: 49,108
Total user embeddings:  93,342


In [None]:
def build_pairs(df, positives=3, negatives=3):
    pairs, all_idxs = [], list(range(len(article_embeddings)))
    for _, row in df.iterrows():
        uvec = user_embeddings[row['user_id']]
        # positives
        pos = [aid2idx[aid] for aid in (row['click_history'].split() if isinstance(row['click_history'], str) else [])
               if aid in aid2idx][:positives]
        for idx in pos:
            pairs.append((uvec, article_embeddings[idx], 1))
        # negatives
        neg = []
        if isinstance(row['impressions'], str):
            for ip in row['impressions'].split():
                aid, lab = ip.split('-')
                if lab == '0' and aid in aid2idx:
                    neg.append(aid2idx[aid])
                if len(neg) == negatives: break
        need = negatives - len(neg)
        if need:
            pool = list(set(all_idxs) - set(pos))
            neg += list(np.random.choice(pool, need, replace=False))
        for idx in neg:
            pairs.append((uvec, article_embeddings[idx], 0))
    return pairs

train_pairs = build_pairs(beh_train)
dev_pairs   = build_pairs(beh_dev)
print(f"train pairs: {len(train_pairs):,} | dev pairs: {len(dev_pairs):,}")


KeyError: 'U33207'