In [1]:
import pandas as pd
import numpy as np
df1=pd.read_csv('dataset/tmdb_5000_credits.csv')
df2=pd.read_csv('dataset/tmdb_5000_movies.csv')

In [2]:
df1.columns = ['id','tittle','cast','crew']
df2= df2.merge(df1,on='id')

In [3]:
df2['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

# Data processing

## Process plot descriptions

In [4]:
# Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
df2['overview'] = df2['overview'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df2['overview'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(4803, 20978)

In [5]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
# Construct a reverse map of indices and movie titles
indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()

## Process Credits, Genres and Keywords

In [7]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)

In [8]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [9]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [10]:
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

In [11]:
df2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"


In [12]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [13]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)
    print(df2[feature].head(5))

0    [samworthington, zoesaldana, sigourneyweaver]
1       [johnnydepp, orlandobloom, keiraknightley]
2        [danielcraig, christophwaltz, léaseydoux]
3        [christianbale, michaelcaine, garyoldman]
4      [taylorkitsch, lynncollins, samanthamorton]
Name: cast, dtype: object
0       [cultureclash, future, spacewar]
1       [ocean, drugabuse, exoticisland]
2       [spy, basedonnovel, secretagent]
3    [dccomics, crimefighter, terrorist]
4        [basedonnovel, mars, medallion]
Name: keywords, dtype: object
0        jamescameron
1       goreverbinski
2           sammendes
3    christophernolan
4       andrewstanton
Name: director, dtype: object
0           [action, adventure, fantasy]
1           [adventure, fantasy, action]
2             [action, adventure, crime]
3                 [action, crime, drama]
4    [action, adventure, sciencefiction]
Name: genres, dtype: object


In [14]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df2['soup'] = df2.apply(create_soup, axis=1)

In [15]:
print(df2[['id', 'title', 'cast', 'director', 'keywords', 'genres', 'soup']].head())

       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                            cast          director  \
0  [samworthington, zoesaldana, sigourneyweaver]      jamescameron   
1     [johnnydepp, orlandobloom, keiraknightley]     goreverbinski   
2      [danielcraig, christophwaltz, léaseydoux]         sammendes   
3      [christianbale, michaelcaine, garyoldman]  christophernolan   
4    [taylorkitsch, lynncollins, samanthamorton]     andrewstanton   

                              keywords                               genres  \
0     [cultureclash, future, spacewar]         [action, adventure, fantasy]   
1     [ocean, drugabuse, exoticisland]         [adventure, fantasy, action]   
2     [spy,

In [16]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error

# 加载数据
ratings = pd.read_csv('dataset/ratings_small.csv')

# 创建用户-电影评分矩阵
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# 转换为矩阵形式
matrix = user_movie_matrix.values
print(f"User-Movie Matrix Shape: {matrix.shape}")


User-Movie Matrix Shape: (671, 9066)


In [17]:
# SVD 分解
u, sigma, vt = svds(matrix, k=50)  # k 是潜在因子数
sigma = np.diag(sigma)

# 重构评分矩阵
predicted_ratings = np.dot(np.dot(u, sigma), vt)

# 将预测评分矩阵转为 DataFrame
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)


In [18]:
def predict_rating(user_id, movie_id):
    # 检查是否存在用户和电影
    if user_id in predicted_ratings_df.index and movie_id in predicted_ratings_df.columns:
        return predicted_ratings_df.loc[user_id, movie_id]
    else:
        # 如果用户或电影不存在，返回默认值（例如 0 或平均值）
        return 0  # 或者可以返回全局平均评分


In [19]:
# 示例
user_id = 1
movie_id = 110
predicted_score = predict_rating(user_id, movie_id)
print(f"Predicted rating for User {user_id} on Movie {movie_id}: {predicted_score:.2f}")


Predicted rating for User 1 on Movie 110: 0.12


In [22]:
class HybridRecommender:
    def __init__(self, df, ratings_df, latent_dim=50):
        self.df = df
        self.ratings_df = ratings_df
        self.latent_dim = latent_dim
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # 初始化时进行SVD分解
        self._initialize_svd()
        # 初始化TF-IDF
        self._initialize_tfidf()
        
    def _initialize_svd(self):
        print("Initializing SVD...")
        # 创建用户-电影评分矩阵
        user_movie_matrix = self.ratings_df.pivot(
            index='userId', 
            columns='movieId', 
            values='rating'
        ).fillna(0)
        
        # 保存用户和电影的映射关系
        self.user_ids = user_movie_matrix.index
        self.movie_ids = user_movie_matrix.columns
        
        # SVD分解
        matrix = user_movie_matrix.values
        self.u, self.sigma, self.vt = svds(matrix, k=self.latent_dim)
        print("SVD initialization completed")
        
    def _initialize_tfidf(self):
        print("Initializing TF-IDF...")
        # 确保overview列存在且已处理缺失值
        if 'overview' not in self.df.columns:
            raise ValueError("'overview' column not found in movie dataframe")
            
        overviews = self.df['overview'].fillna('')
        
        # 创建和训练TF-IDF向量化器
        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(overviews)
        print("TF-IDF initialization completed")
    
    def _prepare_user_features(self):
        # 从ratings中提取用户行为特征
        user_stats = self.ratings_df.groupby('userId').agg({
            'rating': ['count', 'mean', 'std']
        }).fillna(0)
        
        # 添加来自SVD的用户潜在特征
        user_latent = pd.DataFrame(
            self.u, 
            index=self.user_ids,
            columns=[f'latent_{i}' for i in range(self.latent_dim)]
        )
        
        # 合并所有用户特征
        user_features = pd.concat([user_stats, user_latent], axis=1)
        return StandardScaler().fit_transform(user_features)
    
    def _prepare_movie_features(self):
        # 电影基本特征
        numeric_features = ['budget', 'popularity', 'vote_average', 'vote_count']
        movie_features = self.df[numeric_features].fillna(0)
        
        # 添加TF-IDF特征
        tfidf_features = pd.DataFrame(
            self.tfidf_matrix.toarray(),
            index=self.df.index
        )
        
        # 添加来自SVD的电影潜在特征
        movie_latent = pd.DataFrame(
            self.vt.T,
            index=self.movie_ids,
            columns=[f'latent_{i}' for i in range(self.latent_dim)]
        )
        
        # 合并所有电影特征
        movie_features = pd.concat(
            [movie_features, tfidf_features, movie_latent],
            axis=1
        )
        return StandardScaler().fit_transform(movie_features)
    
    def prepare_features(self):
        print("Preparing features...")
        # 准备用户特征
        user_features = self._prepare_user_features()
        
        # 准备电影特征
        movie_features = self._prepare_movie_features()
        
        print("Feature preparation completed")
        return user_features, movie_features
        
    def train(self, epochs=10, batch_size=64):
        print("Starting training...")
        # 准备训练数据
        user_features, movie_features = self.prepare_features()
        
        # 创建数据集和数据加载器
        dataset = MovieDataset(
            user_features, 
            movie_features,
            self.ratings_df['rating'].values / 5.0  # 归一化评分
        )
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        
        # 初始化模型
        self.model = HybridMLP(
            user_features.shape[1], 
            movie_features.shape[1]
        ).to(self.device)
        
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.model.parameters())
        
        # 训练循环
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch in train_loader:
                user_feat = batch['user_features'].to(self.device)
                movie_feat = batch['movie_features'].to(self.device)
                ratings = batch['rating'].to(self.device)
                
                # 前向传播
                predictions = self.model(user_feat, movie_feat)
                loss = criterion(predictions, ratings)
                
                # 反向传播
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
            
            avg_loss = total_loss / len(train_loader)
            print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

In [23]:
# 导入必要的库
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer

# 创建推荐系统实例
recommender = HybridRecommender(df2, ratings)

# 训练模型
recommender.train(epochs=10, batch_size=64)

Initializing SVD...
SVD initialization completed
Initializing TF-IDF...
TF-IDF initialization completed
Starting training...
Preparing features...


TypeError: Feature names are only supported if all input features have string names, but your input has ['str', 'tuple'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.