In [4]:
import pandas as pd

In [5]:
# Load the ml-1m ratings file. Column names are supplied here, so the file is
# read as header-less; the two-character "::" separator is parsed by the
# python engine.
ratings = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"],
)

In [10]:
# Load the ml-1m movies file (same "::"-separated, header-less layout as the
# ratings file). The text is decoded as latin-1.
movies = pd.read_csv(
    "../data/ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=["movieId", "title", "genres"],
    encoding="latin-1",
)

In [11]:
# Load the ml-1m users file; column names are supplied here because the file
# is read without a header row.
users = pd.read_csv(
    "../data/ml-1m/users.dat",
    sep="::",
    engine="python",
    names=["userId", "gender", "age", "occupation", "zip"],
)

In [12]:
#print(ratings.head())

   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [13]:
#print(movies.head())

   movieId                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [14]:
#print(users.head())

   userId gender  age  occupation    zip
0       1      F    1          10  48067
1       2      M   56          16  70072
2       3      M   25          15  55117
3       4      M   45           7  02460
4       5      M   25          20  55455


In [15]:
import pandas as pd

# 1.1 Popularity = number of ratings per movie (the most common and most
#     stable definition). Result is a Series indexed by movieId, largest first.
pop_count = (
    ratings.groupby("movieId")["rating"]
    .agg("count")
    .rename("popularity_count")
    .sort_values(ascending=False)
)
pop_count.head(10)

movieId
2858    3428
260     2991
1196    2990
1210    2883
480     2672
2028    2653
589     2649
2571    2590
1270    2583
593     2578
Name: popularity_count, dtype: int64

In [16]:
# 1.2 Popularity = sum of all rating values per movie (leans towards highly
#     rated movies, but is sensitive to the rating scale users apply).
pop_sum = (
    ratings.groupby("movieId")["rating"]
    .agg("sum")
    .rename("popularity_sum")
    .sort_values(ascending=False)
)
pop_sum.head(10)

movieId
2858    14800
260     13321
1196    12836
1210    11598
2028    11507
1198    11257
593     11219
2571    11178
2762    10835
589     10751
Name: popularity_sum, dtype: int64

In [17]:
def topn_by_count(n=10, min_votes=0):
    """Return the ``n`` most-rated movies that have at least ``min_votes`` ratings.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Args:
        n: Maximum number of rows to return.
        min_votes: Minimum number of ratings a movie needs in order to be kept.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``num_ratings`` and
        ``avg_rating``, ordered by ``num_ratings`` descending.
    """
    # One pass over the groups yields both the rating count and the mean.
    per_movie = ratings.groupby("movieId")["rating"].agg(
        num_ratings="size",
        avg_rating="mean",
    )
    # Drop rarely rated movies before ranking.
    enough_votes = per_movie[per_movie["num_ratings"] >= min_votes]
    ranked = enough_votes.sort_values("num_ratings", ascending=False).head(n)
    # Left join keeps every ranked movie even if it has no entry in `movies`.
    with_titles = ranked.reset_index().merge(movies, on="movieId", how="left")
    return with_titles[["movieId", "title", "num_ratings", "avg_rating"]]

# Show the 10 most-rated movies among those with at least 50 ratings.
topn_by_count(n=10, min_votes=50)

Unnamed: 0,movieId,title,num_ratings,avg_rating
0,2858,American Beauty (1999),3428,4.317386
1,260,Star Wars: Episode IV - A New Hope (1977),2991,4.453694
2,1196,Star Wars: Episode V - The Empire Strikes Back...,2990,4.292977
3,1210,Star Wars: Episode VI - Return of the Jedi (1983),2883,4.022893
4,480,Jurassic Park (1993),2672,3.763847
5,2028,Saving Private Ryan (1998),2653,4.337354
6,589,Terminator 2: Judgment Day (1991),2649,4.058513
7,2571,"Matrix, The (1999)",2590,4.31583
8,1270,Back to the Future (1985),2583,3.990321
9,593,"Silence of the Lambs, The (1991)",2578,4.351823


In [18]:
def topn_by_bayesian(n=10, m=50):
    """Return the top ``n`` movies ranked by a Bayesian (shrunk) average rating.

    The score blends each movie's own mean with the global mean so that
    movies with only a few, very high ratings do not dominate the list:

        score = (v / (v + m)) * R + (m / (v + m)) * C

    where ``v`` is the movie's number of ratings, ``R`` its mean rating,
    ``C`` the mean over all ratings and ``m`` the smoothing strength.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Args:
        n: Maximum number of rows to return.
        m: Smoothing strength; larger values pull scores harder towards ``C``.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``num_ratings``,
        ``avg_rating`` and ``bayes_score``, ordered by ``bayes_score``
        descending.
    """
    per_movie = ratings.groupby("movieId")["rating"].agg(
        num_ratings="size",
        avg_rating="mean",
    )
    global_mean = ratings["rating"].mean()

    # Weights of the movie's own mean and of the global mean.
    votes_plus_m = per_movie["num_ratings"] + m
    own_weight = per_movie["num_ratings"] / votes_plus_m
    prior_weight = m / votes_plus_m

    scored = per_movie.assign(
        bayes_score=own_weight * per_movie["avg_rating"] + prior_weight * global_mean
    )
    best = scored.sort_values("bayes_score", ascending=False).head(n)
    # Left join keeps every ranked movie even if it has no entry in `movies`.
    with_titles = best.reset_index().merge(movies, on="movieId", how="left")
    return with_titles[["movieId", "title", "num_ratings", "avg_rating", "bayes_score"]]

# Show the 10 best movies by Bayesian average with smoothing strength m=50.
topn_by_bayesian(n=10, m=50)

Unnamed: 0,movieId,title,num_ratings,avg_rating,bayes_score
0,318,"Shawshank Redemption, The (1994)",2227,4.554558,4.533192
1,858,"Godfather, The (1972)",2223,4.524966,4.504214
2,50,"Usual Suspects, The (1995)",1783,4.517106,4.491587
3,527,Schindler's List (1993),2304,4.510417,4.490687
4,2019,Seven Samurai (The Magnificent Seven) (Shichin...,628,4.56051,4.488316
5,1198,Raiders of the Lost Ark (1981),2514,4.477725,4.460249
6,1148,"Wrong Trousers, The (1993)",882,4.507937,4.458238
7,745,"Close Shave, A (1995)",657,4.520548,4.454142
8,260,Star Wars: Episode IV - A New Hope (1977),2991,4.453694,4.439355
9,904,Rear Window (1954),1050,4.47619,4.435526


In [19]:
# Fix the list length, build the global Bayesian Top-N table once, and keep
# its titles as a plain Python list for membership checks.
TOPN = 10
GLOBAL_TOPN = topn_by_bayesian(n=TOPN, m=50)
GLOBAL_TITLES = list(GLOBAL_TOPN["title"])
GLOBAL_TITLES

['Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Usual Suspects, The (1995)',
 "Schindler's List (1993)",
 'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)',
 'Raiders of the Lost Ark (1981)',
 'Wrong Trousers, The (1993)',
 'Close Shave, A (1995)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Rear Window (1954)']

In [20]:
import numpy as np

# Simple temporal hold-out: each user's most recent rating becomes the test
# row, every other rating goes to the training set.
ratings_sorted = ratings.sort_values("timestamp")
last_idx = ratings_sorted.groupby("userId")["timestamp"].idxmax()
test = ratings_sorted.loc[last_idx]
train = ratings_sorted.drop(last_idx)

# Rank movies using the training rows only. Scoring on the full `ratings`
# table would let the held-out rows influence the very list they are
# evaluated against (train/test leakage).
# Formula: score = (v/(v+m))*R + (m/(v+m))*C, with v = rating count,
# R = movie mean, C = global mean, m = smoothing strength.
smoothing_m = 50  # same value that GLOBAL_TOPN is built with
train_stats = train.groupby("movieId")["rating"].agg(v="size", R="mean")
train_global_mean = train["rating"].mean()
train_scores = (
    (train_stats["v"] / (train_stats["v"] + smoothing_m)) * train_stats["R"]
    + (smoothing_m / (train_stats["v"] + smoothing_m)) * train_global_mean
)
train_topn_ids = train_scores.sort_values(ascending=False).head(TOPN).index

# A "hit" means the user's held-out movie is in the train-based Top-N.
# Matching is done on movieId (the join key) rather than on the title text.
test_join = test.merge(movies[["movieId","title"]], on="movieId", how="left")
hits = test_join["movieId"].isin(train_topn_ids).sum()
# Guard against an empty test set instead of dividing by zero.
hit_rate = hits / len(test_join) if len(test_join) else float("nan")
print(f"Hit-Rate@{TOPN}: {hit_rate:.3f}  (越高越好；Baseline 只做个参考)")

Hit-Rate@10: 0.011  (越高越好；Baseline 只做个参考)


In [21]:
# Write the global Top-N table to CSV without the row index, then preview
# its first rows.
GLOBAL_TOPN.to_csv("../data/global_topn_bayes.csv", index=False)
GLOBAL_TOPN.head()

Unnamed: 0,movieId,title,num_ratings,avg_rating,bayes_score
0,318,"Shawshank Redemption, The (1994)",2227,4.554558,4.533192
1,858,"Godfather, The (1972)",2223,4.524966,4.504214
2,50,"Usual Suspects, The (1995)",1783,4.517106,4.491587
3,527,Schindler's List (1993),2304,4.510417,4.490687
4,2019,Seven Samurai (The Magnificent Seven) (Shichin...,628,4.56051,4.488316
