In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Load Movie Metadata

In [3]:
# Read both metadata tables, then left-join the reviews onto the enriched rows
# so every enriched movie is kept even when it has no review record.
movies_enriched = pd.read_csv("movies_enriched.csv")
movies_reviews = pd.read_csv("movies_with_reviews.csv")

join_keys = ["movie_id", "title", "genres"]
movies = pd.merge(movies_enriched, movies_reviews, how="left", on=join_keys)

# Missing free-text fields become empty strings so they format cleanly later.
text_cols = ["plot_summary", "tagline", "cast_bios", "crew_bios", "reviews"]
movies = movies.assign(**{col: movies[col].fillna("") for col in text_cols})


**Build Input Text**

In [4]:
def build_movie_text(row):
    """Render one movie record as a single labelled text passage.

    `row` is any object indexable by the keys title, genres, plot_summary,
    tagline, cast_bios, crew_bios and reviews (for example a DataFrame row).
    Every field is written as "Label: value." and the sections are joined by
    a single space, always in the same order.
    """
    sections = (
        ("Title", "title"),
        ("Genres", "genres"),
        ("Plot", "plot_summary"),
        ("Tagline", "tagline"),
        ("Cast", "cast_bios"),
        ("Crew", "crew_bios"),
        ("Reviews", "reviews"),
    )
    return " ".join(f"{label}: {row[key]}." for label, key in sections)

# Build one concatenated text passage per movie by applying the formatter row-wise.
movies["text"] = movies.apply(build_movie_text, axis=1)


**Load Frozen BERT Encoder**

In [5]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load the encoder, move it to the chosen device and switch to eval mode.
bert = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

# Freeze every weight: the encoder is only used to extract features.
for weight in bert.parameters():
    weight.requires_grad_(False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

**Chunking and Encoding Functions**

In [8]:
def chunk_text(text, tokenizer, max_tokens=510):
    """Tokenize `text` and cut the token ids into consecutive windows.

    Args:
        text: Raw string to tokenize.
        tokenizer: Object exposing `encode(text, add_special_tokens=False)`
            that returns a list of token ids.
        max_tokens: Maximum number of ids per window. Special tokens are not
            included here; the default of 510 leaves room for the two ids
            that `encode_chunks` wraps around every window.

    Returns:
        A list of id lists, each at most `max_tokens` long, in original
        order. The list is empty when the text yields no tokens.

    Raises:
        ValueError: If `max_tokens` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step would silently yield no chunks at all, so reject both up front.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]


def encode_chunks(chunks, tokenizer, bert, device):
    """Encode each token-id chunk separately and collect its first-position vector.

    Every chunk (a list of ids without special tokens) is wrapped with the
    tokenizer's CLS and SEP ids, sent through `bert` as a batch of one with an
    all-ones attention mask, and the hidden state at position 0 is kept.

    Returns a tensor with one row per chunk, stacked in input order.
    """
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    cls_vectors = []

    # Inference only: no autograd bookkeeping for any of the forward passes.
    with torch.no_grad():
        for body_ids in chunks:
            # The nested list supplies the batch dimension directly.
            ids = torch.tensor([[cls_id] + body_ids + [sep_id]], dtype=torch.long).to(device)
            mask = torch.ones_like(ids)

            hidden = bert(input_ids=ids, attention_mask=mask).last_hidden_state
            cls_vectors.append(hidden[0, 0, :])

    return torch.stack(cls_vectors, dim=0)


def embed_long_text(text, tokenizer, bert, device):
    """Embed a text of arbitrary length as the mean of its per-chunk vectors.

    The text is split with `chunk_text`, each chunk is encoded with
    `encode_chunks`, and the resulting rows are averaged with equal weight
    per chunk (a short final chunk counts as much as a full one).

    Args:
        text: Raw string to embed.
        tokenizer: Tokenizer passed through to the chunking and encoding steps.
        bert: Encoder passed through to `encode_chunks`.
        device: Torch device the inputs are moved to.

    Returns:
        A 1-D tensor: the mean over chunk embeddings.
    """
    chunks = chunk_text(text, tokenizer)
    if not chunks:
        # A text that yields no tokens would leave encode_chunks with nothing
        # to stack. Encode one empty chunk (special tokens only) instead, so
        # the caller still receives a vector of the usual size.
        chunks = [[]]

    chunk_embs = encode_chunks(chunks, tokenizer, bert, device)
    return chunk_embs.mean(dim=0)


**Generate Embeddings**

In [9]:
# Encode every movie passage; each vector is moved to host memory as a NumPy array.
movie_embeddings = [
    embed_long_text(text, tokenizer, bert, device).cpu().numpy()
    for text in tqdm(movies["text"])
]

# Stack the per-movie vectors into a single matrix, one row per movie.
movie_content_embeddings = np.vstack(movie_embeddings)


100%|██████████| 3883/3883 [03:58<00:00, 16.25it/s]


In [10]:
# Sanity check: expect one embedding row per row of `movies`.
print(movie_content_embeddings.shape)  # (num_movies, 768)


(3883, 768)


**Align with LightGCN Item Index**

Additional

# STEP 2 — Per-User Train–Test Split

**2.1 Load Ratings**

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [13]:
# Read the raw ratings file. The "::" separator is longer than one character,
# so the Python parsing engine is requested explicitly. Column names are
# supplied here via `names=` rather than taken from the file.
ratings = pd.read_csv(
    "ratings.dat",
    sep="::",
    engine="python",
    names=["user_id", "movie_id", "rating", "timestamp"]
)


**2.2 Per-User Split Function**

In [14]:
def per_user_split(df, min_interactions=5, test_ratio=0.2, seed=42):
    """Split interactions into train and test sets independently for each user.

    For every user with strictly more than `min_interactions` rows,
    `int(n * test_ratio)` of that user's rows are drawn at random (without
    replacement) into the test set and the rest go to train. Users at or
    below the threshold contribute all of their rows to train.

    Args:
        df: Interaction frame with a "user_id" column. Rows are selected by
            index label, so labels are assumed to be unique.
        min_interactions: Users need more than this many rows to be split.
        test_ratio: Fraction of a split user's rows sent to the test set
            (rounded down).
        seed: Seed for the random generator; the same seed and input give the
            same split.

    Returns:
        (train_df, test_df), both with a fresh 0..n-1 index and the columns of
        `df`. Either frame may be empty (for example when no user exceeds the
        threshold, or when `df` itself is empty).
    """
    rng = np.random.RandomState(seed)

    train_parts = []
    test_parts = []

    for _, group in df.groupby("user_id"):
        if len(group) > min_interactions:
            idx = group.index.values
            test_size = int(len(idx) * test_ratio)
            test_idx = rng.choice(idx, size=test_size, replace=False)
            # setdiff1d returns the remaining labels sorted and de-duplicated.
            train_idx = np.setdiff1d(idx, test_idx)

            train_parts.append(df.loc[train_idx])
            test_parts.append(df.loc[test_idx])
        else:
            train_parts.append(group)

    def _combine(parts):
        # pd.concat rejects an empty list, so fall back to an empty frame that
        # keeps the input's columns and dtypes.
        combined = pd.concat(parts) if parts else df.iloc[0:0].copy()
        return combined.reset_index(drop=True)

    return _combine(train_parts), _combine(test_parts)


In [15]:
# Split with the function's defaults (threshold, test ratio and seed).
train_df, test_df = per_user_split(ratings)

# Report the size of each split.
print("Train interactions:", len(train_df))
print("Test interactions:", len(test_df))


Train interactions: 802553
Test interactions: 197656


In [16]:
# Leakage guard: no complete row may occur in both the train and the test split.
train_row_set = {tuple(record) for record in train_df.itertuples(index=False)}
test_row_set = {tuple(record) for record in test_df.itertuples(index=False)}
assert train_row_set.isdisjoint(test_row_set)

# STEP 3 — Prepare Implicit Feedback for LightGCN

**3.1 Convert to Implicit**

In [17]:
# Every observed rating counts as a positive interaction, whatever its value.
train_df["implicit"] = 1
test_df["implicit"] = 1

**3.2 Map IDs to Indices**

LightGCN requires contiguous, zero-based integer indices for users and items, so the raw IDs are remapped below.

In [18]:
# Index spaces are defined by the training split only, in order of first appearance.
all_users = train_df["user_id"].unique()
all_items = train_df["movie_id"].unique()

# Forward maps: raw id -> contiguous index.
user2idx = dict(zip(all_users, range(len(all_users))))
item2idx = dict(zip(all_items, range(len(all_items))))

# Reverse maps: contiguous index -> raw id.
idx2user = dict(enumerate(all_users))
idx2item = dict(enumerate(all_items))


In [19]:
# Attach the contiguous user/item indices to both splits. Ids with no entry in
# the maps come back as NaN from `.map`.
for split_df in (train_df, test_df):
    split_df["u_idx"] = split_df["user_id"].map(user2idx)
    split_df["i_idx"] = split_df["movie_id"].map(item2idx)


In [None]:
# Drop test rows whose user or item has no training index, then restore integer
# dtype on the index columns (NaN had forced them to float).
# NOTE(review): this cell has no execution count, and the sanity-check output
# further down still shows float i_idx values, so it appears not to have been
# run in the recorded session — make sure it runs before the interaction lists
# are built.
test_df = test_df.dropna(subset=["u_idx", "i_idx"]).astype({"u_idx": int, "i_idx": int})


**3.3 Build Interaction Lists**

In [20]:
# Sizes of the contiguous index spaces (defined by the training split).
num_users = len(user2idx)
num_items = len(item2idx)

# (user index, item index) pairs for training.
train_interactions = list(zip(train_df["u_idx"], train_df["i_idx"]))

# Test rows whose user or item never occurs in training have no index (NaN).
# Filter them here and cast to int so the pairs are always valid integer
# indices, even if an earlier clean-up cell was skipped. This is a no-op when
# the frame has already been cleaned.
test_known = test_df.dropna(subset=["u_idx", "i_idx"])
test_interactions = list(
    zip(test_known["u_idx"].astype(int), test_known["i_idx"].astype(int))
)


**Sanity Checks**

In [21]:
# Show the index-space sizes and the first rows of each split for a visual check.
print(num_users, num_items)
print(train_df.head())
print(test_df.head())


6040 3680
   user_id  movie_id  rating  timestamp  implicit  u_idx  i_idx
0        1      1193       5  978300760         1      0      0
1        1       661       3  978302109         1      0      1
2        1       914       3  978301968         1      0      2
3        1      2355       5  978824291         1      0      3
4        1      1287       5  978302039         1      0      4
   user_id  movie_id  rating  timestamp  implicit  u_idx  i_idx
0        1      2797       4  978302039         1      0  604.0
1        1      1961       5  978301590         1      0  472.0
2        1      1207       4  978300719         1      0  134.0
3        1      2398       4  978302281         1      0  654.0
4        1      2692       4  978301570         1      0  179.0


In [29]:
# Re-read the ratings and repeat the split to recover the item order used for
# the item index (order of first appearance in the training split).
# NOTE(review): this rebinds `ratings` and `train_df`. The new `train_df` comes
# straight from per_user_split, so it no longer has the implicit / u_idx /
# i_idx columns added earlier. The split uses the same default seed as before.
ratings = pd.read_csv(
    "ratings.dat",
    sep="::",
    engine="python",
    names=["user_id", "movie_id", "rating", "timestamp"]
)

# Use training split
train_df, _ = per_user_split(ratings)

# Get unique items in training in the order they appeared
item_ids_in_model = train_df["movie_id"].astype(int).unique()

# Number of distinct items seen in training.
print(len(item_ids_in_model))


3680


In [23]:
# Lookup from movie id to its row position in `movies`, which is also the row
# order of the content-embedding matrix.
movie_id_to_row = dict(zip(movies["movie_id"], range(len(movies))))


In [25]:
# Collaborative item embeddings; only their shape is used in this cell.
item_embs = np.load("item_embeddings.npy")

d_bert = movie_content_embeddings.shape[1]
d_gcn = item_embs.shape[1]

# Random linear projection from the content dimension down to the collaborative
# dimension. The generator is seeded so the projected features (and anything
# saved from them) are identical on every run.
W = np.random.RandomState(42).normal(scale=0.01, size=(d_bert, d_gcn))
bert_proj = movie_content_embeddings @ W


In [26]:
# One row per training item, in training-item order. Items without metadata
# keep their all-zero row.
final_bert = np.zeros((len(item_ids_in_model), d_gcn))

for idx, movie_id in enumerate(item_ids_in_model):
    row = movie_id_to_row.get(movie_id)
    if row is not None:
        final_bert[idx] = bert_proj[row]

In [46]:
# Persist the aligned content features.
# NOTE(review): this cell carries execution count 46 but sits before the cells
# that rebuild `final_bert`. In a top-to-bottom run it would save the matrix
# built in the cell directly above instead — confirm the intended position.
np.save("bert_item_embeddings_final.npy", final_bert)


In [28]:
# Compare the content-feature matrix with the collaborative embeddings.
# NOTE(review): the recorded output shows 3680 vs 3706 rows, i.e. they do NOT match.
print(final_bert.shape)   # (num_items, d_gcn)
print(item_embs.shape)   # should match


(3680, 64)
(3706, 64)


In [30]:
# Content features with the same shape and row indexing as `item_embs`.
final_bert = np.zeros_like(item_embs)

# `bert_proj` rows follow the row order of `movies`, so copying a leading slice
# would pair features with unrelated items. Place each feature row by movie id.
# assumes item_embeddings.npy rows follow ascending movie_id order over all
# rated movies — TODO confirm against the code that produced the file
sorted_movie_ids = np.unique(ratings["movie_id"])
assert len(sorted_movie_ids) == item_embs.shape[0], (
    "item index size does not match item_embs rows"
)
for item_idx, movie_id in enumerate(sorted_movie_ids):
    row = movie_id_to_row.get(movie_id)
    if row is not None:  # items without metadata keep a zero vector
        final_bert[item_idx] = bert_proj[row]


In [31]:
from sklearn.preprocessing import LabelEncoder

# Re-read the ratings to rebuild the item index with a label encoder.
# NOTE(review): this rebinds `ratings` with different column names
# (userId/movieId instead of user_id/movie_id), so earlier cells that read
# ratings["user_id"] will not work on this frame if re-run afterwards.
ratings = pd.read_csv(
    'ratings.dat', sep='::', engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp']
)

# Fit on every rated movie id (not just the training split) and store the
# encoded index alongside each rating.
item_encoder = LabelEncoder()
ratings['item'] = item_encoder.fit_transform(ratings['movieId'])

# item_idx -> movieId (position in classes_ is the encoded index)
idx_to_movie = dict(enumerate(item_encoder.classes_))


In [36]:
# Load the collaborative item embeddings (the same file is also loaded as `item_embs`).
item_emb_final = np.load("item_embeddings.npy")

In [37]:
print(len(idx_to_movie))        # should be 3706
print(item_emb_final.shape[0]) # should match

# Enforce the expectation instead of relying on a visual comparison of the two
# printed numbers.
assert len(idx_to_movie) == item_emb_final.shape[0], (
    "item index size does not match the number of embedding rows"
)


3706
3706


In [38]:
# Movie id -> row position in `movies` (same mapping as built earlier in the notebook).
movie_id_to_row = dict(zip(movies['movie_id'], range(len(movies))))


In [40]:
# Collaborative item embeddings; they define the target shape and row indexing.
item_embs = np.load("item_embeddings.npy")

d_bert = movie_content_embeddings.shape[1]
d_gcn = item_embs.shape[1]

# Random linear projection from the content dimension down to the collaborative
# dimension. The generator is seeded so the projected features (and the file
# saved from them) are identical on every run.
W = np.random.RandomState(42).normal(scale=0.01, size=(d_bert, d_gcn))
bert_proj = movie_content_embeddings @ W


In [41]:
# Start from zeros so items without metadata keep an all-zero feature row.
final_bert = np.zeros_like(item_embs)

missing = 0
for item_idx, movie_id in idx_to_movie.items():
    row = movie_id_to_row.get(movie_id)
    if row is None:
        missing += 1  # metadata missing
        continue
    # Place the projected content vector at the item's encoded index.
    final_bert[item_idx] = bert_proj[row]

print(f"Missing metadata for {missing} items")


Missing metadata for 0 items


In [42]:
# Both matrices should now have the same number of rows and columns.
print(final_bert.shape)
print(item_embs.shape)


(3706, 64)
(3706, 64)


In [43]:
# Shape check only: this confirms the matrices are the same size, not that each
# row refers to the same movie in both.
assert final_bert.shape == item_embs.shape
print("Alignment OK")


Alignment OK


In [44]:
# Check some random items
for i in np.random.choice(len(idx_to_movie), 5, replace=False):
    print(i, idx_to_movie[i], np.linalg.norm(final_bert[i]))


595 610 0.9303004
1950 2131 1.0170208
2769 2981 1.0384603
1523 1660 0.98901814
344 354 1.0103948


In [51]:
# Dump the first ten feature rows in full for inspection (produces a long output).
print(final_bert[0:10])

[[-3.71218193e-03  1.53530225e-01 -1.53252319e-01 -1.32888317e-01
   3.02849591e-01  2.03836307e-01 -1.32616952e-01 -6.61890507e-02
   8.99322852e-02 -1.62225753e-01  2.24300735e-02 -5.32949483e-03
  -5.38062379e-02  1.08198889e-01  2.47268647e-01  1.22694276e-01
   1.63998678e-01 -1.49259910e-01  4.46152464e-02 -4.91468096e-03
  -5.88112976e-03  1.11107826e-01  3.22930999e-02  2.58859068e-01
   2.43539289e-02  2.16327697e-01  6.47677332e-02  5.14685642e-03
  -4.55812700e-02  5.02546281e-02 -8.82900134e-02 -3.70853767e-02
   7.50600100e-02 -1.63785070e-01 -1.15384229e-01  1.89642087e-02
  -1.80702917e-02 -1.35372683e-01  8.17859322e-02  8.02030489e-02
   5.81909455e-02 -4.01284844e-02 -9.50864702e-02  1.90190986e-01
   6.21014759e-02  5.09071425e-02  1.14370780e-02  9.36541259e-02
  -3.80369723e-02  2.52319239e-02  1.06111102e-01 -1.03290603e-01
   7.22723156e-02 -8.34591836e-02 -6.91679819e-03 -6.70620203e-02
  -9.70630050e-02 -1.26089215e-01 -1.26153886e-01  8.15873817e-02
   1.72433