In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Directory holding the MovieLens CSV files. Change this one constant to point
# the notebook at a different copy of the dataset.
DATA_DIR = Path(r'D:\repos\recommendition\ml-32m')

# Raw tables: movie catalogue, free-text user tags, and user ratings.
movies = pd.read_csv(DATA_DIR / 'movies.csv')
tags = pd.read_csv(DATA_DIR / 'tags.csv')
ratings = pd.read_csv(DATA_DIR / 'ratings.csv')

# Content-Based Filtering

In [None]:
# Turn the pipe-delimited genre string into a list of genre names.
# Only plain strings are split: this cell overwrites `genres` in place, so on a
# second run the values are already lists, and they are passed through unchanged
# instead of being fed to a string method again. Missing values are left as-is.
movies['genres'] = movies['genres'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

In [None]:
# Drop rows without a tag and lower-case the rest. dropna/assign build a new
# frame, so no column is ever assigned on a filtered slice of the original.
tags = (
    tags
    .dropna(subset=['tag'])
    .assign(tag=lambda frame: frame['tag'].str.lower())
)

# Discard tags that consist only of digits.
tags = tags.loc[~tags['tag'].str.fullmatch(r'\d+')]

In [None]:
# One row per movie: all of its tags joined into a single ', '-separated string.
movie_tags = (
    tags
    .groupby('movieId')['tag']
    .agg(', '.join)
    .reset_index()
)

In [None]:
# Attach the joined tag string to every movie. The left join keeps movies that
# have no tags; their missing tag text becomes an empty string.
movies = (
    movies
    .merge(movie_tags, how='left', on='movieId')
    .assign(tag=lambda frame: frame['tag'].fillna(''))
)

In [None]:
# Shows `movies` without any row that contains a missing value.
# NOTE(review): the result is only displayed, never assigned, so `movies` itself
# is unchanged by this cell. Assign it if the rows are meant to be dropped.
movies.dropna(axis=0, how='any')

In [None]:
# Space-joined genre names; anything that is not a list (e.g. a missing value)
# contributes an empty string.
movies['genres_str'] = movies['genres'].map(
    lambda genre_list: ' '.join(genre_list) if isinstance(genre_list, list) else ''
)

# Text document per movie used for vectorisation: genres followed by tags.
movies['combined'] = movies['genres_str'] + ' ' + movies['tag']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser over the combined genre + tag text; stop_words='english'
# removes scikit-learn's built-in English stop-word list from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights, then return one sparse TF-IDF row per
# entry of movies['combined'] (row i corresponds to the i-th row of `movies`).
tfidf_matrix = tfidf.fit_transform(movies['combined'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Make the index purely positional so it lines up with the rows of tfidf_matrix.
# drop=True discards the old index instead of inserting it as an 'index' column,
# which also keeps this cell safe to re-run.
movies = movies.reset_index(drop=True)

# title -> row position. If a title occurs more than once, keep its first row so
# a lookup always yields a single position rather than a Series of positions.
title_to_index = pd.Series(movies.index, index=movies['title'])
title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]

In [None]:
def recommend_movies_sparse(title, top_n=5):
    """Recommend the movies whose genre/tag text is most similar to `title`.

    Uses the module-level `title_to_index`, `tfidf_matrix` and `movies` objects
    built in the preceding cells.

    Parameters
    ----------
    title : str
        Exact movie title as it appears in `movies['title']`.
    top_n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        Columns ['title', 'genres', 'score'], ordered from most to least
        similar and never containing the query movie itself. If `title` is not
        in `title_to_index`, a message string is returned instead.
    """
    if title not in title_to_index:
        return f"电影《{title}》不在数据集中。"

    idx = title_to_index[title]
    # A title that occurs several times makes the lookup return a Series of
    # positions; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity between this one movie and every movie (a single matrix row).
    cosine_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Rank by descending score and remove the query by position. Relying on the
    # query being the top-ranked entry is wrong when another movie ties with it
    # or when the query has an all-zero vector.
    ranked = np.argsort(-cosine_scores, kind='stable')
    ranked = ranked[ranked != idx]
    similar_indices = ranked[:max(int(top_n), 0)]

    # Copy before adding the score column so the result is an independent frame.
    similar_movies = movies.iloc[similar_indices][['title', 'genres']].copy()
    similar_movies['score'] = cosine_scores[similar_indices]

    return similar_movies

In [None]:
# Example: the five closest matches to Toy Story by genre/tag text.
recommend_movies_sparse('Toy Story (1995)', top_n=5)

# Collaborative Filtering

## Item-Based Filtering

In [28]:
# Reload the raw tables for this section: `movies` was modified in place by the
# content-based cells above (genres split into lists, tag columns merged in).
movies, ratings = map(pd.read_csv, (
    r'D:\repos\recommendition\ml-32m\movies.csv',
    r'D:\repos\recommendition\ml-32m\ratings.csv',
))

In [30]:
# (A pd.to_numeric(..., errors='coerce') pass over the three columns was sketched
# here earlier and is intentionally left disabled.)

# Drop incomplete rating rows, then pin the dtypes: integer ids, float ratings.
ratings = (
    ratings
    .dropna(subset=['movieId', 'userId', 'rating'])
    .astype({'movieId': int, 'userId': int, 'rating': float})
)

In [32]:
# Sanity check: value range of each id / rating column.
for label, column in (
    ("userId min:", 'userId'),
    ("movieId min:", 'movieId'),
    ("rating min:", 'rating'),
):
    print(label, ratings[column].min(), "max:", ratings[column].max())


userId min: 1 max: 200948
movieId min: 1 max: 292757
rating min: 0.5 max: 5.0


In [42]:
from scipy.sparse import csr_matrix

# Re-encode raw userId / movieId values as dense, zero-based matrix positions.
# pd.factorize numbers values in order of first appearance (the same numbering
# as enumerate(Series.unique())) and returns the per-row codes directly, so no
# per-row dictionary lookup over the whole ratings table is needed.
user_codes, user_ids = pd.factorize(ratings['userId'])
movie_codes, movie_ids = pd.factorize(ratings['movieId'])

# raw id -> matrix position
user_map = {raw_id: position for position, raw_id in enumerate(user_ids)}
movie_map = {raw_id: position for position, raw_id in enumerate(movie_ids)}

ratings['user_index'] = user_codes
ratings['movie_index'] = movie_codes

n_users = len(user_map)
n_movies = len(movie_map)

print("用户数:", n_users, "电影数:", n_movies)

# An empty ratings table would give a zero-sized matrix; fail loudly instead.
assert n_users > 0 and n_movies > 0, "用户数或电影数为负或零！"

# Sparse rating matrix: one row per user, one column per movie.
# NOTE(review): this constructor sums entries that share the same (row, column),
# so a user rating the same movie twice would be added up. Confirm the data has
# no such duplicates.
rating_matrix = csr_matrix(
    (ratings['rating'], (ratings['user_index'], ratings['movie_index'])),
    shape=(n_users, n_movies)
)

print("稀疏评分矩阵 shape:", rating_matrix.shape)

用户数: 200948 电影数: 84432
稀疏评分矩阵 shape: (200948, 84432)


In [44]:
# Transposed view of the rating matrix: one row per movie, one column per user.
movie_user_matrix = rating_matrix.transpose()

In [46]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of movie_user_matrix (one row per
# movie). dense_output=False keeps the result sparse when the input is sparse.
# NOTE(review): the result is n_movies x n_movies; confirm it fits in memory on
# the full dataset.
movie_sim = cosine_similarity(movie_user_matrix, dense_output=False) 

In [50]:
def recommend_similar_movies(movie_id, top_n=5):
    """Return the movies whose rating patterns are most similar to `movie_id`.

    Similarities come from the precomputed item-item matrix `movie_sim`;
    titles and genres come from the module-level `movies` frame loaded at the
    start of this section (the CSV is no longer re-read on every call).

    Parameters
    ----------
    movie_id : int
        Raw movieId; must be a key of `movie_map`, i.e. appear in `ratings`.
    top_n : int, default 5
        Maximum number of neighbours to return; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        Columns ['movieId', 'title', 'genres'], ordered from most to least
        similar. If `movie_id` is not in `movie_map`, a message string is
        returned instead.
    """
    # movie_map maps a raw movieId to its position in the similarity matrix.
    if movie_id not in movie_map:
        return f"电影ID {movie_id} 不在数据集中"

    movie_idx = movie_map[movie_id]
    sim_scores = movie_sim[movie_idx].toarray().flatten()

    # Push the query movie itself to the bottom of the ranking.
    sim_scores[movie_idx] = -1

    # Descending ranking. Slicing the reversed order (instead of `[-top_n:]`)
    # stops top_n == 0 from selecting every movie, and the explicit filter keeps
    # the query out even when top_n exceeds the number of movies.
    ranked = np.argsort(sim_scores)[::-1]
    ranked = ranked[ranked != movie_idx]
    top_indices = ranked[:max(int(top_n), 0)]

    # Map matrix positions back to raw movieIds, preserving similarity order.
    reverse_movie_map = {v: k for k, v in movie_map.items()}
    top_movie_ids = [reverse_movie_map[i] for i in top_indices]
    rank_of = {mid: rank for rank, mid in enumerate(top_movie_ids)}

    # isin() returns rows in catalogue order, which loses the ranking; re-sort
    # the matched rows by their similarity rank before returning.
    matches = movies.loc[
        movies['movieId'].isin(top_movie_ids), ['movieId', 'title', 'genres']
    ]
    order = matches['movieId'].map(rank_of).to_numpy().argsort(kind='stable')
    return matches.iloc[order]


In [None]:
# Example: neighbours of movieId 1, "Toy Story (1995)".
recommend_similar_movies(movie_id=1, top_n=5)

Unnamed: 0,movieId,title,genres
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
351,356,Forrest Gump (1994),Comedy|Drama|Romance|War
475,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
1237,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy


: 