In [6]:
import pandas as pd
import re

# 读取 movies.csv
df = pd.read_csv("/root/autodl-tmp/LLM4RecWithQwen/data/movies.csv")

def clean_text(title, genres):
    # title: 去掉年份，全部小写
    title = re.sub(r"\(\d{4}\)", "", title).strip().lower()
    
    # genres: 用空格代替 |
    genres = genres.replace("|", " ").lower()
    
    # 拼在一起
    text = title + " " + genres
    return text

df["text"] = df.apply(lambda x: clean_text(x["title"], x["genres"]), axis=1)

df.head()
sentences = [row.split() for row in df["text"].tolist()]
print(sentences[:3])


[['toy', 'story', 'adventure', 'animation', 'children', 'comedy', 'fantasy'], ['jumanji', 'adventure', 'children', 'fantasy'], ['grumpier', 'old', 'men', 'comedy', 'romance']]


In [7]:
from gensim.models import Word2Vec

w2v_dim = 128   # embedding维度
w2v = Word2Vec(
    sentences,
    vector_size=w2v_dim,
    window=5,
    min_count=1,
    workers=4,
    sg=1  # 1=skip-gram，效果更好
)

print("Word2Vec 训练完成！")


Word2Vec 训练完成！


In [8]:
import numpy as np
from tqdm import tqdm

movie_embeddings = {}
missing_word = 0

for idx, row in df.iterrows():
    movie_id = row["movieId"]
    words = row["text"].split()

    vecs = []
    for w in words:
        if w in w2v.wv:
            vecs.append(w2v.wv[w])
        else:
            missing_word += 1

    # 平均 pooling
    if len(vecs) > 0:
        movie_embeddings[movie_id] = np.mean(vecs, axis=0)
    else:
        # 如果没有词，就用零向量
        movie_embeddings[movie_id] = np.zeros(w2v_dim)

print("缺失词数量:", missing_word)
print("电影 embedding 数量:", len(movie_embeddings))


缺失词数量: 0
电影 embedding 数量: 9742


In [10]:
import numpy as np

# 转成矩阵格式：按 movieId 排序
movie_ids = sorted(movie_embeddings.keys())
matrix = np.array([movie_embeddings[mid] for mid in movie_ids])

np.save("movie_embeddings.npy", matrix)
np.save("movie_ids.npy", np.array(movie_ids))

print("已保存 movie_embeddings.npy !")


已保存 movie_embeddings.npy !


In [12]:
import pandas as pd

ratings = pd.read_csv("/root/autodl-tmp/LLM4RecWithQwen/data/ratings.csv")  # userId,movieId,rating,timestamp
user_history = ratings.groupby("userId")["movieId"].apply(list)


In [16]:
import numpy as np

movie_emb = np.load("movie_embeddings.npy")  # shape: (num_movies, emb_dim)
movie_ids = np.load("movie_ids.npy")
movieId2index = {mid: i for i, mid in enumerate(movie_ids)}
emb_dim = movie_emb.shape[1]


In [17]:
import torch
import torch.nn as nn
from tqdm import tqdm
import numpy as np

# -----------------------------
# 自动获取 embedding 维度
# -----------------------------
emb_dim = movie_emb.shape[1]  # movie_emb 已经加载了 movie_embeddings.npy

# -----------------------------
# 用户塔模型
# -----------------------------
class UserTower(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, emb_dim)
        )
    def forward(self, movie_emb_list):
        return self.mlp(movie_emb_list.mean(dim=0))

device = "cuda" if torch.cuda.is_available() else "cpu"
user_tower = UserTower(emb_dim).to(device)

# -----------------------------
# 生成用户 embedding
# -----------------------------
user_embeddings_dict = {}
for uid, movies in tqdm(user_history.items()):
    valid_movies = [m for m in movies if m in movieId2index]
    if not valid_movies:
        user_embeddings_dict[uid] = np.zeros(emb_dim, dtype=np.float32)
        continue
    movie_vecs = torch.tensor([movie_emb[movieId2index[m]] for m in valid_movies],
                              dtype=torch.float32).to(device)
    with torch.no_grad():
        user_vec = user_tower(movie_vecs).cpu().numpy()
    user_embeddings_dict[uid] = user_vec

# -----------------------------
# 保存成 numpy 矩阵 + userIds
# -----------------------------
user_ids = sorted(user_embeddings_dict.keys())
matrix = np.array([user_embeddings_dict[uid] for uid in user_ids])
np.save("user_embeddings.npy", matrix)
np.save("user_ids.npy", np.array(user_ids))

print("用户 embedding 已生成并保存！")


  movie_vecs = torch.tensor([movie_emb[movieId2index[m]] for m in valid_movies],
610it [00:12, 48.29it/s]

用户 embedding 已生成并保存！



