In [2]:
import pandas as pd

In [3]:
from pathlib import Path

In [4]:
DATA_DIR = Path("../data/ml-1m")

In [5]:
ratings = pd.read_csv(
    DATA_DIR / "ratings.dat",
    sep="::", engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

In [6]:
movies = pd.read_csv(
    DATA_DIR / "movies.dat",
    sep="::", engine="python",
    names=["movieId", "title", "genres"],
    encoding="latin-1"
)

In [7]:

print(ratings.shape, movies.shape)
ratings.head(), movies.head()

(1000209, 4) (3883, 3)


(   userId  movieId  rating  timestamp
 0       1     1193       5  978300760
 1       1      661       3  978302109
 2       1      914       3  978301968
 3       1     3408       4  978300275
 4       1     2355       5  978824291,
    movieId                               title                        genres
 0        1                    Toy Story (1995)   Animation|Children's|Comedy
 1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
 2        3             Grumpier Old Men (1995)                Comedy|Romance
 3        4            Waiting to Exhale (1995)                  Comedy|Drama
 4        5  Father of the Bride Part II (1995)                        Comedy)

In [8]:
# 用surprise 构造数据集 & 切分

In [9]:
from surprise import Dataset, Reader

In [10]:
from surprise.model_selection import train_test_split
import itertools

In [11]:
reader = Reader(rating_scale=(1,5))

In [12]:
data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)

In [13]:
trainset,testset = train_test_split(data, test_size = 0.2, random_state = 42)

In [14]:
train_ratings_count = sum(1 for _ in trainset.all_ratings())
(trainset.all_ratings()),len(testset)

(<generator object Trainset.all_ratings at 0x124d00d60>, 200042)

In [15]:

print(f"Train size: {train_ratings_count}")
print(f"Test size: {len(testset)}")

Train size: 800167
Test size: 200042


In [16]:
#训练 svd

In [17]:
from surprise import SVD
from surprise.model_selection import cross_validate

# 交叉验证评估 SVD 模型
algo = SVD(n_factors=50, n_epochs=20, reg_all=0.02, random_state=42)
cv = cross_validate(algo, data, measures=['RMSE','MAE'], cv=3, verbose=True)
cv


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8822  0.8831  0.8839  0.8831  0.0007  
MAE (testset)     0.6942  0.6951  0.6950  0.6947  0.0004  
Fit time          3.35    3.16    3.45    3.32    0.12    
Test time         1.47    1.46    1.61    1.51    0.07    


{'test_rmse': array([0.88217523, 0.88310089, 0.88390952]),
 'test_mae': array([0.694161  , 0.69506549, 0.6949529 ]),
 'fit_time': (3.353254795074463, 3.1630747318267822, 3.4473540782928467),
 'test_time': (1.4732820987701416, 1.4607419967651367, 1.6050329208374023)}

In [18]:
# train/test evaluation

In [19]:
from surprise import accuracy

# 用全局 SVD 模型重新训练
algo = SVD(n_factors=50, n_epochs=20, reg_all=0.02, random_state=42)
algo.fit(trainset)

# 在测试集上预测
predictions = algo.test(testset)

# 计算 RMSE / MAE
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"Final Evaluation — RMSE: {rmse:.4f}, MAE: {mae:.4f}")

RMSE: 0.8706
MAE:  0.6839
Final Evaluation — RMSE: 0.8706, MAE: 0.6839


In [20]:
# 为指定用户生成 Top-N 推荐

In [21]:
import numpy as np
import pandas as pd

# 假设你已经有以下对象：
# - algo: 训练好的 SVD
# - trainset, testset: Surprise 的数据集
# - ratings, movies: 原始的 pandas DataFrame（在 Phase 2 读过）

def recommend_topn_for_user(algo, trainset, user_raw_id, N=10):
    """
    给原始用户ID（raw id，比如 1、2、3……）推荐 Top-N 电影（排除看过的）。
    返回 DataFrame: [movieId, est, title, genres]
    """
    # surprise 内部是 inner id，这里把 raw -> inner
    try:
        user_inner_id = trainset.to_inner_uid(user_raw_id)
    except ValueError:
        # 冷启动用户：训练集中没见过
        # 简单策略：推荐最热门或高评分-贝叶斯平均的电影（此处先给空）
        return pd.DataFrame(columns=["movieId", "est", "title", "genres"])

    # 该用户已看过的电影（inner ids）
    seen_inner_items = set(j for (j, _) in trainset.ur[user_inner_id])
    # 所有 inner items
    all_inner_items = set(range(trainset.n_items))
    # 待候选 = 未看过
    cand_inner_items = list(all_inner_items - seen_inner_items)

    # 预测每个候选电影的评分
    preds = []
    for inner_iid in cand_inner_items:
        raw_iid = trainset.to_raw_iid(inner_iid)  # 转回原始 movieId（字符串）
        est = algo.predict(user_raw_id, raw_iid).est
        preds.append((int(raw_iid), est))

    # Top-N
    preds.sort(key=lambda x: x[1], reverse=True)
    topn = preds[:N]

    # 拼电影信息
    topn_df = pd.DataFrame(topn, columns=["movieId", "est"])
    out = topn_df.merge(movies[["movieId", "title", "genres"]], on="movieId", how="left")
    # 排序美化
    out = out[["title", "genres", "movieId", "est"]].sort_values("est", ascending=False).reset_index(drop=True)
    return out

# 示例：给用户 1 推荐 Top-10
top10_u1 = recommend_topn_for_user(algo, trainset, user_raw_id=1, N=10)
top10_u1

Unnamed: 0,title,genres,movieId,est
0,"Shawshank Redemption, The (1994)",Drama,318,4.996461
1,"Silence of the Lambs, The (1991)",Drama|Thriller,593,4.912123
2,Sanjuro (1962),Action|Adventure,2905,4.902345
3,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama,2019,4.883954
4,"Third Man, The (1949)",Mystery|Thriller,1212,4.865285
5,"Bridge on the River Kwai, The (1957)",Drama|War,1250,4.847158
6,Rear Window (1954),Mystery|Thriller,904,4.826586
7,"Conformist, The (Il Conformista) (1970)",Drama,2925,4.82005
8,Pulp Fiction (1994),Crime|Drama,296,4.818425
9,Casablanca (1942),Drama|Romance|War,912,4.810864


In [22]:
import pandas as pd, os

# 确保模型文件夹存在
os.makedirs("../models", exist_ok=True)

# === 保存 SVD 模型 ===
import joblib
joblib.dump(algo, "../models/svd_model.pkl")
print("✅ Saved model to ../models/svd_model.pkl")

# === 生成贝叶斯平均回退表 ===
C = ratings["rating"].mean()
m = 50  # 最小投票阈值，可调
movie_stats = (
    ratings.groupby("movieId")["rating"]
           .agg(pop_count="size", mean_rating="mean")
           .reset_index()
)
movie_stats["bayes"] = (
    (movie_stats["pop_count"] * movie_stats["mean_rating"] + m * C)
    / (movie_stats["pop_count"] + m)
)

pop_fallback = (
    movie_stats.merge(movies[["movieId","title","genres"]], on="movieId", how="left")
               .sort_values("bayes", ascending=False)
)

pop_fallback.to_csv("../models/pop_fallback.csv", index=False)
print("✅ Saved bayesian fallback to ../models/pop_fallback.csv")

✅ Saved model to ../models/svd_model.pkl
✅ Saved bayesian fallback to ../models/pop_fallback.csv
