In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer

# ==============================
# 1. Load the data
# ==============================
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

print("===> ratings.csv <===")
print(ratings.head(), "\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
ratings.info()
print()

print("===> movies.csv <===")
print(movies.head(), "\n")
movies.info()
print()

===> ratings.csv <===
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None 

===> movies.csv <===
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

             

In [4]:

# ==============================
# 2. User feature extraction
# ==============================
# Group the ratings by user once and reuse the grouped columns for every
# statistic computed below.
ratings_by_user = ratings.groupby('userId')
user_rating_values = ratings_by_user['rating']

# (1) Mean rating given by each user
user_avg_rating = user_rating_values.mean().rename('user_avg_rating')

# (2) Number of ratings given by each user
user_rating_count = user_rating_values.count().rename('user_rating_count')

# (3) Standard deviation of each user's ratings
user_rating_std = user_rating_values.std().rename('user_rating_std')

# (4) Highest and lowest rating given by each user
user_rating_max = user_rating_values.max().rename('user_rating_max')
user_rating_min = user_rating_values.min().rename('user_rating_min')

# (5) Timestamp of each user's most recent rating (the earliest one could be
#     taken the same way); useful for activity-related analysis
user_last_timestamp = ratings_by_user['timestamp'].max().rename('user_last_timestamp')

# More features could be derived here, e.g. the user's preference distribution
# over movie genres, but that requires joining the movies table (done further
# below in this cell).

# Assemble the per-user statistics side by side into one user_features table,
# with userId turned back into a regular column.
user_stat_columns = [
    user_avg_rating,
    user_rating_count,
    user_rating_std,
    user_rating_max,
    user_rating_min,
    user_last_timestamp,
]
user_features = pd.concat(user_stat_columns, axis=1).reset_index()

# The standard deviation is NaN for a user with a single rating; store 0.0
# for those users instead.
user_features = user_features.fillna({'user_rating_std': 0.0})

print("===> Basic user features <===")
print(user_features.head(), "\n")

# ------------------------------
# (Optional) User preference per movie genre
# ------------------------------
# Compute each user's mean rating per genre; this needs ratings joined with
# the movies table first.
ratings_movies = ratings.merge(movies, on='movieId', how='left')
# `genres` is a pipe-delimited string such as "Action|Adventure|Fantasy".
# Split it into a list; any non-string value becomes an empty list.
ratings_movies['genre_list'] = ratings_movies['genres'].map(
    lambda genres: genres.split('|') if isinstance(genres, str) else []
)

# Build a long table with one row per (rating, genre) pair: explode turns a
# row carrying several genres into several rows.
ratings_genres = ratings_movies.explode('genre_list')
# Drop the "(no genres listed)" placeholder rows.
has_real_genre = ratings_genres['genre_list'] != '(no genres listed)'
ratings_genres = ratings_genres[has_real_genre]

# Mean rating per (user, genre)
user_genre_rating = (
    ratings_genres
    .groupby(['userId', 'genre_list'])['rating']
    .mean()
    .reset_index()
    .rename(columns={'genre_list': 'genre', 'rating': 'genre_avg_rating'})
)

# Pivot the genres into columns (one row per user) for easier storage.
# A user who never rated a genre yields NaN there; it is filled with 0 here
# (the user's overall mean rating would be an alternative).
user_genre_pivot = (
    user_genre_rating
    .pivot(index='userId', columns='genre', values='genre_avg_rating')
    .fillna(0)
)

# Attach the genre columns to the basic user features; any NaN left after the
# left join (a user with no genre rows at all) is also set to 0.
user_features = (
    user_features
    .merge(user_genre_pivot, on='userId', how='left')
    .fillna(0)
)

print("===> User features after adding genre preference <===")
print(user_features.head(), "\n")


===> Basic user features <===
   userId  user_avg_rating  user_rating_count  user_rating_std  \
0       1         4.366379                232         0.800048   
1       2         3.948276                 29         0.805615   
2       3         2.435897                 39         2.090642   
3       4         3.555556                216         1.314204   
4       5         3.636364                 44         0.990441   

   user_rating_max  user_rating_min  user_last_timestamp  
0              5.0              1.0            965719662  
1              5.0              2.0           1445715340  
2              5.0              0.5           1306464293  
3              5.0              1.0           1007574542  
4              5.0              1.0            847435337   

===> User features after adding genre preference <===
   userId  user_avg_rating  user_rating_count  user_rating_std  \
0       1         4.366379                232         0.800048   
1       2         3.948276     

In [5]:


# ==============================
# 3. Movie feature extraction
# ==============================
# Group the rating values by movie once and reuse them for each statistic.
movie_rating_values = ratings.groupby('movieId')['rating']

# (1) Mean rating received by each movie
movie_avg_rating = movie_rating_values.mean().rename('movie_avg_rating')

# (2) Number of ratings received by each movie
movie_rating_count = movie_rating_values.count().rename('movie_rating_count')

# (3) Standard deviation of each movie's ratings
movie_rating_std = movie_rating_values.std().rename('movie_rating_std')

# (4) Extract the release year from the title
def extract_year(title):
    """Return the release year embedded in a movie title, or None.

    The year is taken from the LAST parenthesised four-digit group in the
    title, e.g. "Toy Story (1995)" -> 1995. Using the last group (rather than
    the first) keeps a parenthesised numeric alternate title from being
    mistaken for the year. A year-range suffix such as "(2006–2007)" yields
    its starting year.

    Args:
        title: The movie title. Non-string values are converted with str()
            first, so None/NaN simply produce no match.

    Returns:
        The year as an int, or None when the title contains no such group.
    """
    # Optional "-YYYY" / "–YYYY" tail lets range suffixes match; only the
    # first year of the range is captured.
    pattern = r"\((\d{4})(?:\s*[-\u2013]\s*\d{4})?\)"
    years = re.findall(pattern, str(title))
    if not years:
        return None
    return int(years[-1])

movies['year'] = movies['title'].apply(extract_year)

# (5) Multi-hot encode the movie genres
# Split the pipe-delimited `genres` string into a list first; any non-string
# value becomes an empty list.
movies['genre_list'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])
# MultiLabelBinarizer produces one 0/1 column per distinct genre label
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(movies['genre_list'])
# Reuse movies.index so the column-wise concat below lines the rows up one to
# one even if `movies` does not carry a default 0..n-1 index (concat with
# axis=1 aligns on index labels, not on position).
genre_encoded_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=movies.index)
# Put the genre indicator columns next to the basic movie columns
movies_extended = pd.concat([movies[['movieId','title','year','genres']], genre_encoded_df], axis=1)

# (6) Merge all movie features into the movie_features table
movie_features = pd.merge(movies_extended, movie_avg_rating, on='movieId', how='left')
movie_features = pd.merge(movie_features, movie_rating_count, on='movieId', how='left')
movie_features = pd.merge(movie_features, movie_rating_std, on='movieId', how='left')

# A movie that has no ratings is absent from the grouped statistics, so the
# left joins leave NaN for it. A missing count means "zero ratings": store 0
# and restore the integer dtype that the NaN had turned into float.
movie_features['movie_rating_count'] = movie_features['movie_rating_count'].fillna(0).astype('int64')

# The standard deviation is NaN for a movie with a single rating (and for a
# movie with none); store 0.0 in those cases.
movie_features['movie_rating_std'] = movie_features['movie_rating_std'].fillna(0.0)

print("===> movie_features <===")
print(movie_features.head(), "\n")
# DataFrame.info() prints its report itself and returns None, so call it
# directly instead of print()-ing the None it returns.
movie_features.info()
print()


===> movie_features <===
   movieId                               title    year  \
0        1                    Toy Story (1995)  1995.0   
1        2                      Jumanji (1995)  1995.0   
2        3             Grumpier Old Men (1995)  1995.0   
3        4            Waiting to Exhale (1995)  1995.0   
4        5  Father of the Bride Part II (1995)  1995.0   

                                        genres  (no genres listed)  Action  \
0  Adventure|Animation|Children|Comedy|Fantasy                   0       0   
1                   Adventure|Children|Fantasy                   0       0   
2                               Comedy|Romance                   0       0   
3                         Comedy|Drama|Romance                   0       0   
4                                       Comedy                   0       0   

   Adventure  Animation  Children  Comedy  ...  Musical  Mystery  Romance  \
0          1          1         1       1  ...        0        0        0   
1  

In [6]:


# ==============================
# 4. Save the user features and the movie features
# ==============================
# Write each feature table to its CSV file, without the index column.
for csv_name, feature_table in (
    ('user_features.csv', user_features),
    ('movie_features.csv', movie_features),
):
    feature_table.to_csv(csv_name, index=False, encoding='utf-8')
print("user_features.csv and movie_features.csv have been saved.\n")


user_features.csv and movie_features.csv have been saved.



In [7]:

# ==============================
# 5. Clean and save the ratings table (optional)
# ==============================
# This is the place for simple clean-up of the ratings, such as removing
# outliers or rows with timestamp=0.
# As an illustration only, no special cleaning is assumed to be needed here:
# the timestamp column is merely renamed to "ts".
ratings_cleaned = ratings.rename({'timestamp': 'ts'}, axis='columns')
ratings_cleaned.to_csv('ratings_cleaned.csv', index=False, encoding='utf-8')
print("ratings_cleaned.csv has been saved.")


ratings_cleaned.csv has been saved.
