In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

import lightgbm as lgb
from sklearn.metrics import ndcg_score

# Project root: taken as the parent of the current working directory, i.e. the
# notebook is assumed to be launched from a direct sub-directory of the repo.
project_root = Path.cwd().parent

# Put the project root on sys.path so project packages can be imported.
# Guarded so that re-running this cell does not append the same entry again.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Input directory with the preprocessed tables, and output directory for models.
PRE_DIR = project_root / "data" / "preprocessed"
MODEL_DIR = project_root / "models"
# parents=True: create missing intermediate directories instead of failing.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("Project root:", project_root)


Project root: /Users/jackcao/Documents/github/literature-recommendation-system


In [3]:
# Read the three preprocessed CSV tables from PRE_DIR, in the order
# papers -> users -> interactions.
papers, users, inter = (
    pd.read_csv(PRE_DIR / csv_name)
    for csv_name in (
        "papers_preprocessed.csv",
        "users_preprocessed.csv",
        "interactions_preprocessed.csv",
    )
)

# Embedding matrix stored next to the tables as a NumPy array file.
embeddings = np.load(PRE_DIR / "paper_embeddings.npy")

# Preview the first rows of the paper and interaction tables.
papers.head(), inter.head()

(   paper_id                                    title  \
 0         0              Military staff happy event.   
 1         1  Possible standard former whether smile.   
 2         2               Shake evidence yeah cover.   
 3         3       Customer lay politics sure pretty.   
 4         4        Write animal forward dark tax if.   
 
                                             abstract  year topic_primary  \
 0  Senior nor ahead consider. Success light capit...  2005  MentalHealth   
 1  Maintain hair general let. Character material ...  2001           HIV   
 2  Season education easy space argue. Stage inter...  2001           MCH   
 3  Detail herself easy miss red. Nor arm line for...  2011           CVD   
 4  Health memory budget matter simply set. None c...  2015           NCD   
 
    paper_recency  
 0             20  
 1             24  
 2             24  
 3             14  
 4             10  ,
    user_id  paper_id     event            timestamp  label
 0       60

In [4]:
import sys

# The project root has to be on sys.path before `src` can be imported.
# Only add it when it is missing: the setup cell at the top of the notebook
# already appends it, and re-running this cell should not pile up duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.models.candidate_gen import CandidateGenerator

# Candidate generator pointed at the preprocessed data directory.
cg = CandidateGenerator(pre_dir=PRE_DIR)


/Users/jackcao/Documents/github/literature-recommendation-system
[DEBUG] Looking for preprocessed data in: /Users/jackcao/Documents/github/literature-recommendation-system/data/preprocessed


In [5]:
# Number of candidates requested from the candidate generator per user.
N_CANDIDATES = 200

# Everything that does not depend on the current user is prepared once, so the
# loop below does not re-scan `users` and `inter` on every iteration.
paper_meta = papers[["paper_id", "paper_recency", "topic_primary"]]
# First row per user_id (same row that a filter + .iloc[0] would pick).
user_focus = users.drop_duplicates("user_id").set_index("user_id")
# user_id -> set of paper_ids that user has any interaction with.
interacted_by_user = inter.groupby("user_id")["paper_id"].apply(set).to_dict()

train_rows = []

for user_id in users["user_id"].tolist():

    # Get the top candidates for this user
    paper_ids, sims = cg.get_top_n(user_id, n=N_CANDIDATES)

    # Convert paper_id list -> DataFrame (the scalar user_id is broadcast to every row)
    df = pd.DataFrame({
        "user_id": user_id,
        "paper_id": paper_ids,
        "similarity": sims
    })

    # Merge with paper metadata; how="left" keeps every candidate row
    df = df.merge(paper_meta, on="paper_id", how="left")

    # Add topic match feature: 1 if the paper's primary topic equals either
    # of the user's two research focus fields
    focus = user_focus.loc[user_id]
    df["topic_match"] = (
        (df["topic_primary"] == focus["research_focus_1"]) |
        (df["topic_primary"] == focus["research_focus_2"])
    ).astype(int)

    # Label: 1 if user interacted with paper (users without interactions get all zeros)
    interacted = interacted_by_user.get(user_id, set())
    df["label"] = df["paper_id"].isin(interacted).astype(int)

    train_rows.append(df)

# One frame with each user's candidates stored as a contiguous block of rows
train_df = pd.concat(train_rows, ignore_index=True)
train_df.head()


Unnamed: 0,user_id,paper_id,similarity,paper_recency,topic_primary,topic_match,label
0,0,565,0.680768,5,Respiratory,1,0
1,0,972,0.664908,4,NCD,1,0
2,0,122,0.635828,9,Cancer,0,0
3,0,694,0.633624,15,MCH,0,0
4,0,92,0.625718,17,Cancer,0,0


In [6]:
# Columns fed to the ranker and the binary relevance target.
feature_cols = ["similarity", "paper_recency", "topic_match"]
label_col = "label"

X = train_df[feature_cols]
y = train_df[label_col]

# Group queries (each user is a group).
# LightGBM interprets `group` as the sizes of consecutive row blocks, so the
# sizes must be listed in the same order as the rows of train_df. sort=False
# keeps the groups in order of first appearance; the default (sort=True) orders
# them by user_id value, which only lines up when the rows happen to be sorted.
group = train_df.groupby("user_id", sort=False).size().tolist()

# Every training row must be covered by exactly one query group.
assert sum(group) == len(X), "group sizes do not cover all training rows"

X.shape, len(group)


((40000, 3), 200)

In [8]:
# Bundle features, labels and per-user query group sizes for LightGBM.
train_data = lgb.Dataset(X, label=y, group=group)

# LambdaRank objective with NDCG at cutoff 5 configured as the metric.
params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[5],
    learning_rate=0.05,
    num_leaves=32,
    max_depth=-1,
)

# Fit the ranker for a fixed number of boosting rounds.
ranker_model = lgb.train(params, train_data, num_boost_round=200)

# Write the trained model to MODEL_DIR/ranker.txt
model_path = MODEL_DIR / "ranker.txt"
ranker_model.save_model(str(model_path))

print("Saved trained ranking model to:", model_path)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 283
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 3
Saved trained ranking model to: /Users/jackcao/Documents/github/literature-recommendation-system/models/ranker.txt


In [9]:
# Simple evaluation: generate top 5 and compare to interactions

def evaluate_user(user_id, k=5, n_candidates=200):
    """Score one user's candidates with the trained ranker and compute Recall@k and NDCG@k.

    Candidates come from ``cg.get_top_n``; features are built the same way as
    for the training frame (similarity, paper_recency, topic_match) and scored
    with ``ranker_model``. Ground truth is the set of papers the user has any
    row for in ``inter``.

    NOTE(review): the interactions used as ground truth here are the same ones
    used to label the training frame in this notebook, so the reported numbers
    are in-sample, not held-out.

    Parameters
    ----------
    user_id : value from ``users["user_id"]``
        The user to evaluate.
    k : int, default 5
        Cut-off for both metrics.
    n_candidates : int, default 200
        Number of candidates requested from the candidate generator.

    Returns
    -------
    (recall, ndcg) : tuple of float
        Recall@k over the user's distinct interacted papers (0 when the user
        has none) and NDCG@k over the scored candidate list.

    Raises
    ------
    IndexError
        If ``user_id`` is not present in ``users``.
    """
    paper_ids, sims = cg.get_top_n(user_id, n=n_candidates)

    # how="left" matches the training frame: candidates without metadata are
    # kept (with missing features) instead of being silently dropped.
    df = pd.DataFrame({
        "paper_id": paper_ids,
        "similarity": sims,
    }).merge(
        papers[["paper_id", "paper_recency", "topic_primary"]],
        on="paper_id",
        how="left",
    )

    u = users[users.user_id == user_id].iloc[0]
    df["topic_match"] = (
        (df["topic_primary"] == u["research_focus_1"]) |
        (df["topic_primary"] == u["research_focus_2"])
    ).astype(int)

    df["score"] = ranker_model.predict(df[feature_cols])

    top_k = df.sort_values("score", ascending=False).head(k)["paper_id"].tolist()

    # Distinct papers only: a paper with several interaction rows must count
    # once, otherwise the recall denominator is inflated.
    interacted = set(inter.loc[inter.user_id == user_id, "paper_id"])

    recall = len(set(top_k) & interacted) / max(1, len(interacted))

    # NDCG@k using actual labels
    labels = df["paper_id"].isin(interacted).astype(int).tolist()
    if len(labels) > 1:
        ndcg = ndcg_score([labels], [df["score"].tolist()], k=k)
    else:
        # ndcg_score is not defined for fewer than two documents; a lone
        # relevant candidate is a perfect ranking, anything else scores 0.
        ndcg = float(sum(labels) > 0)

    return recall, ndcg

# Evaluate every user once, then split the (recall, ndcg) pairs into two lists.
per_user_metrics = [evaluate_user(uid) for uid in users["user_id"].tolist()]
recalls = [recall for recall, _ in per_user_metrics]
ndcgs = [ndcg for _, ndcg in per_user_metrics]

# Report the unweighted mean over users.
print("Recall@5:", np.mean(recalls))
print("NDCG@5:", np.mean(ndcgs))


Recall@5: 0.05007601422055757
NDCG@5: 0.27071279329466563
