In [1]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Read the movie table and the ratings table from their CSV files.
movies_csv = '../../data/lens_tmdb/cleaned/df_all.csv'
ratings_csv = '../../data/lens_tmdb/ratings_small.csv'
movies, ratings = pd.read_csv(movies_csv), pd.read_csv(ratings_csv)

In [4]:
# Prepare data
movies = movies[['movieId', 'genre', 'director']]  # select features to use
# Attach the movie features to every rating row; pd.merge defaults to an inner
# join, so ratings whose movieId is absent from `movies` are dropped.
data = pd.merge(ratings, movies, on='movieId')
# .copy() gives an independent frame, so the column assignments in the loop
# below do not trigger pandas' SettingWithCopyWarning.
data = data[['userId', 'movieId', 'rating', 'genre', 'director']].copy()

# Label encoding
# Each categorical column is mapped to contiguous integer codes 0..n-1, which
# is the index range nn.Embedding expects. The fitted encoders are kept so the
# codes can be translated back with `encoders[col].inverse_transform(...)`.
encoders = {}
for col in ['userId', 'movieId', 'genre', 'director']:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# Train test split
# A fixed random_state makes the shuffle, and therefore the reported losses,
# reproducible across kernel restarts.
RANDOM_STATE = 42
train, test = train_test_split(data, random_state=RANDOM_STATE)

In [5]:
# Define model
class HybridModel(nn.Module):
    """Linear rating predictor over concatenated id embeddings.

    Each of the four categorical inputs (user, movie, genre, director) has its
    own embedding table. The four looked-up vectors are concatenated and fed
    through a single linear layer that emits one value per example.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        n_genres: Number of rows in the genre embedding table.
        n_directors: Number of rows in the director embedding table.
        emb_size: Width of every embedding vector.
    """

    def __init__(self, n_users, n_movies, n_genres, n_directors, emb_size):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, emb_size)
        self.movie_embedding = nn.Embedding(n_movies, emb_size)
        self.genre_embedding = nn.Embedding(n_genres, emb_size)
        self.director_embedding = nn.Embedding(n_directors, emb_size)
        # Four embeddings are concatenated, hence emb_size * 4 input features.
        self.fc = nn.Linear(emb_size * 4, 1)

    def forward(self, user, movie, genre, director):
        """Compute one prediction per (user, movie, genre, director) index tuple.

        Args:
            user, movie, genre, director: Integer index tensors that all share
                the same shape (scalar, 1-D batch, or higher-dimensional).

        Returns:
            Tensor of shape ``(*index_shape, 1)``.
        """
        embeddings = [
            self.user_embedding(user),
            self.movie_embedding(movie),
            self.genre_embedding(genre),
            self.director_embedding(director),
        ]
        # Concatenate on the feature (last) axis. For 1-D index batches this
        # is the same as dim=1, and it also works for scalar or N-D indices.
        x = torch.cat(embeddings, dim=-1)
        return self.fc(x)

In [6]:
# Create model
# Seed PyTorch's RNG first so the randomly initialised embedding and linear
# weights are identical on every Restart & Run All.
torch.manual_seed(42)

EMB_SIZE = 100  # size of the embedding vectors

# The id columns were label-encoded to contiguous codes 0..n-1, so the number
# of distinct values is exactly the number of rows each embedding table needs.
model = HybridModel(
    n_users=data['userId'].nunique(),
    n_movies=data['movieId'].nunique(),
    n_genres=data['genre'].nunique(),
    n_directors=data['director'].nunique(),
    emb_size=EMB_SIZE
)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
# Convert data to PyTorch tensors
def to_tensor(df):
    """Build one tensor per model column of ``df``.

    Returns:
        A 5-tuple ``(user, movie, genre, director, rating)``. Each element is
        created with ``torch.tensor`` from the column's ``.values``, so its
        dtype follows the column's dtype.
    """
    column_order = ('userId', 'movieId', 'genre', 'director', 'rating')
    return tuple(torch.tensor(df[name].values) for name in column_order)

train_data = to_tensor(train)
test_data = to_tensor(test)

# Training loop
N_EPOCHS = 500  # number of epochs

# The whole training set is fed as one batch, so the inputs never change
# between epochs: unpack them and cast the target a single time up front.
user, movie, genre, director, rating = train_data
target = rating.float()  # match the model's float32 output for MSELoss

model.train()
for epoch in range(N_EPOCHS):
    optimizer.zero_grad()
    # squeeze(-1) removes only the trailing size-1 output axis; a bare
    # squeeze() would also drop the batch axis when there is a single row.
    outputs = model(user, movie, genre, director).squeeze(-1)
    loss = criterion(outputs, target)
    loss.backward()
    optimizer.step()
    print('Epoch:', epoch, 'Loss:', loss.item())

Epoch: 0 Loss: 0.645596444606781
Epoch: 1 Loss: 0.6454698443412781
Epoch: 2 Loss: 0.6453450322151184
Epoch: 3 Loss: 0.6452221274375916
Epoch: 4 Loss: 0.6451009511947632
Epoch: 5 Loss: 0.6449815630912781
Epoch: 6 Loss: 0.6448639631271362
Epoch: 7 Loss: 0.6447480916976929
Epoch: 8 Loss: 0.6446338891983032
Epoch: 9 Loss: 0.6445212960243225
Epoch: 10 Loss: 0.6444104313850403
Epoch: 11 Loss: 0.644301176071167
Epoch: 12 Loss: 0.6441934704780579
Epoch: 13 Loss: 0.6440874338150024
Epoch: 14 Loss: 0.6439828276634216
Epoch: 15 Loss: 0.6438798308372498
Epoch: 16 Loss: 0.643778383731842
Epoch: 17 Loss: 0.6436783671379089
Epoch: 18 Loss: 0.6435797810554504
Epoch: 19 Loss: 0.6434826850891113
Epoch: 20 Loss: 0.6433870792388916
Epoch: 21 Loss: 0.6432927846908569
Epoch: 22 Loss: 0.6431999206542969
Epoch: 23 Loss: 0.6431083679199219
Epoch: 24 Loss: 0.6430181264877319
Epoch: 25 Loss: 0.6429293155670166
Epoch: 26 Loss: 0.6428418159484863
Epoch: 27 Loss: 0.6427554488182068
Epoch: 28 Loss: 0.642670452594757

In [11]:
# Evaluate
# Switch to inference mode and disable gradient tracking while scoring the
# held-out split with the same MSE criterion used for training.
model.eval()
with torch.no_grad():
    user, movie, genre, director, rating = test_data
    # squeeze(-1) removes only the trailing size-1 output axis, so the
    # prediction shape still matches `rating` when the split has one row.
    outputs = model(user, movie, genre, director).squeeze(-1)
    loss = criterion(outputs, rating.float())
    print('Test loss:', loss.item())

Test loss: 0.8459954857826233
