In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pathlib import Path
import os
DATA_DIR = Path('/content/drive/MyDrive/ml-25m')  # 修改成你在 Drive 中的路径
assert (DATA_DIR/'movies.csv').exists(), f"movies.csv not found under {DATA_DIR}"

In [4]:
import pandas as pd
import numpy as np

def load_csv(path, usecols=None, dtype=None):
    return pd.read_csv(path, usecols=usecols, dtype=dtype, engine='pyarrow')

movies = load_csv(
    DATA_DIR/'movies.csv',
    usecols=['movieId','title','genres'],
    dtype={'movieId':'int32','title':'string','genres':'string'}
)
ratings = load_csv(
    DATA_DIR/'ratings.csv',
    usecols=['userId','movieId','rating','timestamp'],
    dtype={'userId':'int32','movieId':'int32','rating':'float32','timestamp':'int64'}
)

genome_scores_path = DATA_DIR/'genome-scores.csv'
genome_tags_path   = DATA_DIR/'genome-tags.csv'
HAVE_GENOME = genome_scores_path.exists() and genome_tags_path.exists()

if HAVE_GENOME:
    genome_scores = load_csv(
        genome_scores_path,
        usecols=['movieId','tagId','relevance'],
        dtype={'movieId':'int32','tagId':'int32','relevance':'float32'}
    )
    genome_tags = load_csv(
        genome_tags_path,
        usecols=['tagId','tag'],
        dtype={'tagId':'int32','tag':'string'}
    )

### 物品内容向量 —— (A) genres 快速模式

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer
from scipy import sparse

def split_genres(s):
    if pd.isna(s) or s=='(no genres listed)': return []
    return str(s).split('|')

movies['genre_list'] = movies['genres'].apply(split_genres)
mlb = MultiLabelBinarizer(sparse_output=True)
X_genres = mlb.fit_transform(movies['genre_list'])
genre_feature_names = [f'genre::{g}' for g in mlb.classes_]


### 3) 物品内容向量 —— (B) genome-tags 增强模式（若可用）
Pivot 成 (n_movies x n_tags) 的稀疏矩阵，再与 genres 拼接

In [6]:
X_items = X_genres
feature_names = genre_feature_names

if HAVE_GENOME:
    # 只保留出现过的电影
    genome_scores = genome_scores[genome_scores['movieId'].isin(movies['movieId'])]
    # 构造稀疏矩阵
    from scipy.sparse import coo_matrix
    # 注意：tagId 并非连续编号，转为连续索引
    tag_ids = genome_scores['tagId'].astype('int32').values
    unique_tags, tag_index = np.unique(tag_ids, return_inverse=True)
    movie_id_to_row = {mid:i for i,mid in enumerate(movies['movieId'].values)}
    rows = genome_scores['movieId'].map(movie_id_to_row).values
    cols = tag_index
    data = genome_scores['relevance'].values
    X_genome = coo_matrix((data, (rows, cols)), shape=(len(movies), len(unique_tags))).tocsr()

    # 可选：对 genome 列按列做 L2 归一化或 TF-IDF（这里先简单 L2）
    from sklearn.preprocessing import normalize
    X_genome = normalize(X_genome, norm='l2', axis=0)

    # 特征名
    tag_id_to_name = dict(zip(genome_tags['tagId'].astype(int), genome_tags['tag'].astype(str)))
    genome_feature_names = [f'genome::{tag_id_to_name.get(int(tid), str(tid))}' for tid in unique_tags]

    # 拼接：genres + genome
    X_items = sparse.hstack([X_genres, X_genome], format='csr')
    feature_names = genre_feature_names + genome_feature_names

# 最终索引：行对应 movies 的行顺序
movie_index = movies.reset_index(drop=True)
movie_index['title_lc'] = movie_index['title'].str.lower()

### 4) 相似度函数（物品相似 & 用户画像相似）

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def find_movie_row(title_or_substring):
    """
    小助手：宽松匹配标题（不区分大小写；优先精确匹配，否则返回包含该子串的前若干条）
    """
    q = str(title_or_substring).lower().strip()
    exact = movie_index[movie_index['title_lc']==q]
    if len(exact)>0:
        return exact.index[0], exact
    # 子串匹配
    subset = movie_index[movie_index['title_lc'].str.contains(q, regex=False)]
    if len(subset)==0:
        return None, pd.DataFrame()
    return subset.index[0], subset

def recommend_similar_items(seed_title, topk=10, exclude_same=True):
    """
    给定一个电影标题，按内容向量找最相似的电影
    """
    row_id, cand = find_movie_row(seed_title)
    if row_id is None:
        print("找不到匹配的电影；试试更精确的标题或子串。候选示例：")
        print(movie_index.sample(10, random_state=42)['title'].to_list())
        return

    sims = cosine_similarity(X_items[row_id], X_items).ravel()
    order = np.argsort(-sims)
    recs = []
    for idx in order:
        if exclude_same and idx == row_id:
            continue
        recs.append((int(movie_index.loc[idx, 'movieId']), float(sims[idx]), movie_index.loc[idx, 'title']))
        if len(recs) >= topk: break
    print(f"[Seed] {movie_index.loc[row_id, 'title']}")
    for r in recs:
        print(f"{r[2]}   (cos={r[1]:.4f})")
    return recs

### 5) User-based: 用用户历史构建“用户画像”（内容向量加权平均）
只用该用户的历史评分（或最近观看）做加权 → 推荐未看过的相似电影

In [8]:
ratings_user = ratings.merge(movies[['movieId']], on='movieId', how='inner')

def user_profile_vector(user_id, min_rating=4.0, use_recency=False, half_life_days=180):
    """
    从用户历史构建内容画像：
    - 取评分 >= min_rating 的电影
    - 权重 = rating（可选乘上时间衰减）
    - 返回 L2 归一化后的稀疏向量
    """
    df = ratings_user[ratings_user['userId']==user_id].copy()
    if len(df)==0:
        return None, None

    # 只用高分正反馈
    df = df[df['rating'] >= min_rating]
    if len(df)==0:
        return None, None

    w = df['rating'].astype('float32').values

    if use_recency:
        # 时间衰减
        ts = pd.to_datetime(df['timestamp'], unit='s')
        age_days = (ts.max() - ts).dt.days.clip(lower=0).astype('float32').values
        decay = 0.5 ** (age_days / float(half_life_days))
        w = w * decay

    # 聚合成用户向量
    mid_to_row = {int(movie_index.loc[i,'movieId']): i for i in range(len(movie_index))}
    rows = [mid_to_row.get(int(mid)) for mid in df['movieId'].values]
    rows = [r for r in rows if r is not None]
    if len(rows)==0:
        return None, None

    # 稀疏加权求和
    user_vec = None
    import scipy.sparse as sp
    for r_i, wi in zip(rows, w[:len(rows)]):
        v = X_items[r_i]
        user_vec = v.multiply(wi) if user_vec is None else (user_vec + v.multiply(wi))
    # 归一化
    from sklearn.preprocessing import normalize
    user_vec = normalize(user_vec, norm='l2', axis=1)
    return user_vec, set(df['movieId'].astype(int).tolist())

In [9]:
def recommend_for_user(user_id, topk=10, min_rating=4.0, use_recency=False):
    """
    基于“用户画像”做召回，过滤掉已看过的电影
    """
    uvec, seen = user_profile_vector(user_id, min_rating=min_rating, use_recency=use_recency)
    if uvec is None:
        print("该用户历史不足（或没有达到 min_rating 的正反馈）。换个 userId 或调低 min_rating。")
        return
    sims = cosine_similarity(uvec, X_items).ravel()
    order = np.argsort(-sims)
    recs = []
    for idx in order:
        mid = int(movie_index.loc[idx, 'movieId'])
        if mid in seen:
            continue
        recs.append((mid, float(sims[idx]), movie_index.loc[idx, 'title']))
        if len(recs) >= topk:
            break
    print(f"[User {user_id}] top-{topk} 内容推荐（min_rating={min_rating}, use_recency={use_recency}）")
    for r in recs:
        print(f"{r[2]}   (cos={r[1]:.4f})")
    return recs


In [10]:
_ = recommend_similar_items("Toy Story", topk=10)

# 示例：给某个用户推荐（先查看 userId 范围）
print("userId范围：", int(ratings['userId'].min()), " ~ ", int(ratings['userId'].max()))
_ = recommend_for_user(user_id=1, topk=10, min_rating=4.0, use_recency=False)

[Seed] Toy Story (1995)
Toy Story 2 (1999)   (cos=0.9978)
Monsters, Inc. (2001)   (cos=0.9977)
The Good Dinosaur (2015)   (cos=0.9946)
Antz (1998)   (cos=0.9939)
Toy Story Toons: Hawaiian Vacation (2011)   (cos=0.9926)
Emperor's New Groove, The (2000)   (cos=0.9925)
Toy Story Toons: Small Fry (2011)   (cos=0.9917)
Moana (2016)   (cos=0.9907)
Turbo (2013)   (cos=0.9906)
DuckTales: The Movie - Treasure of the Lost Lamp (1990)   (cos=0.9886)
userId范围： 1  ~  162541
[User 1] top-10 内容推荐（min_rating=4.0, use_recency=False）
Day for Night (La Nuit Américaine) (1973)   (cos=0.8669)
Moonrise Kingdom (2012)   (cos=0.8667)
Garden State (2004)   (cos=0.8665)
Harold and Maude (1971)   (cos=0.8663)
Punch-Drunk Love (2002)   (cos=0.8662)
City Lights (1931)   (cos=0.8661)
Graduate, The (1967)   (cos=0.8660)
Manhattan (1979)   (cos=0.8652)
Hairdresser's Husband, The (Le mari de la coiffeuse) (1990)   (cos=0.8650)
(500) Days of Summer (2009)   (cos=0.8650)


In [13]:
_ = recommend_similar_items("Toy Story", topk=10)

# 示例：给某个用户推荐（先查看 userId 范围）
print("userId范围：", int(ratings['userId'].min()), " ~ ", int(ratings['userId'].max()))
_ = recommend_for_user(user_id=548, topk=10, min_rating=3.5, use_recency=False)

[Seed] Toy Story (1995)
Toy Story 2 (1999)   (cos=0.9978)
Monsters, Inc. (2001)   (cos=0.9977)
The Good Dinosaur (2015)   (cos=0.9946)
Antz (1998)   (cos=0.9939)
Toy Story Toons: Hawaiian Vacation (2011)   (cos=0.9926)
Emperor's New Groove, The (2000)   (cos=0.9925)
Toy Story Toons: Small Fry (2011)   (cos=0.9917)
Moana (2016)   (cos=0.9907)
Turbo (2013)   (cos=0.9906)
DuckTales: The Movie - Treasure of the Lost Lamp (1990)   (cos=0.9886)
userId范围： 1  ~  162541
[User 548] top-10 内容推荐（min_rating=3.5, use_recency=False）
Another 48 Hrs. (1990)   (cos=0.8799)
Business, The (2005)   (cos=0.8797)
Money Train (1995)   (cos=0.8795)
Metro (1997)   (cos=0.8792)
Bronson (2009)   (cos=0.8694)
Super Cops, The (1974)   (cos=0.8692)
Black Shampoo (1976)   (cos=0.8692)
Fifth Commandment, The (2008)   (cos=0.8692)
Into the Night (1985)   (cos=0.8683)
Ichi the Killer (Koroshiya 1) (2001)   (cos=0.8681)


In [11]:
USER_ID = 1

user_hist = (
    ratings[ratings['userId'] == USER_ID]
    .merge(movies, on='movieId', how='left')
    .copy()
)

# 时间信息
user_hist['datetime'] = pd.to_datetime(user_hist['timestamp'], unit='s')
user_hist['year'] = user_hist['datetime'].dt.year.astype('Int16')

# 按时间排序（最早→最晚）
user_hist = user_hist.sort_values(['datetime','movieId']).reset_index(drop=True)

# ========= 3) 摘要信息打印 =========
n = len(user_hist)
date_min = user_hist['datetime'].min() if n else None
date_max = user_hist['datetime'].max() if n else None
mean_rating = float(user_hist['rating'].mean()) if n else np.nan

print(f"[User {USER_ID}] 总交互数: {n}")
print(f"时间范围: {date_min}  →  {date_max}")
print(f"平均评分: {mean_rating:.3f}")

# Top 10 电影（按评分时间最近的10条）
print("\n— 最近 10 条记录 —")
print(user_hist[['datetime','movieId','title','genres','rating']].tail(100).to_string(index=False))

# Top 10 电影（按评分时间最早的10条）
print("\n— 最早 10 条记录 —")
print(user_hist[['datetime','movieId','title','genres','rating']].head(100).to_string(index=False))

# 评分分布
print("\n— 评分分布 —")
print(user_hist['rating'].value_counts().sort_index().to_string())

# 常见类型（把多类型拆分统计）
def split_genres(s):
    if pd.isna(s) or s=='(no genres listed)': return []
    return str(s).split('|')

g_counts = (
    user_hist.assign(genre_list=user_hist['genres'].apply(split_genres))
             .explode('genre_list')
             .dropna(subset=['genre_list'])
             .groupby('genre_list').size().sort_values(ascending=False)
)
print("\n— 类型统计（前 15）—")
print(g_counts.head(15).to_string())

# ========= 4) 可选：导出该用户历史到 CSV =========
OUT_PATH = DATA_DIR.parent / f'user_{USER_ID}_history.csv'  # 保存到上级目录
user_hist[['datetime','movieId','title','genres','rating']].to_csv(OUT_PATH, index=False)
print(f"\n已导出：{OUT_PATH}")

[User 1] 总交互数: 70
时间范围: 2006-05-17 12:14:13  →  2006-05-17 15:34:15
平均评分: 3.814

— 最近 10 条记录 —
           datetime  movieId                                                                                    title                                              genres  rating
2006-05-17 12:14:13     5952                                            Lord of the Rings: The Two Towers, The (2002)                                   Adventure|Fantasy     4.0
2006-05-17 12:14:28     2012                                                       Back to the Future Part III (1990)                     Adventure|Comedy|Sci-Fi|Western     2.5
2006-05-17 12:14:39     2011                                                        Back to the Future Part II (1989)                             Adventure|Comedy|Sci-Fi     2.5
2006-05-17 12:14:57     1653                                                                           Gattaca (1997)                               Drama|Sci-Fi|Thriller     4.0
2006-05-17 12:2