In [1]:
import torch
import pandas as pd
import numpy as np
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split

import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../../data/lens_tmdb/ratings_small.csv')

In [3]:
ratings_df = data[['userId', 'movieId', 'rating']]

In [4]:
ratings_df['rating'] = ratings_df['rating'].apply(lambda x: 1 if x >= 4 else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['rating'] = ratings_df['rating'].apply(lambda x: 1 if x >= 4 else 0)


In [5]:
user_enc = LabelEncoder()
ratings_df['userId'] = user_enc.fit_transform(ratings_df['userId'].values)

item_enc = LabelEncoder()
ratings_df['movieId'] = item_enc.fit_transform(ratings_df['movieId'].values)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['userId'] = user_enc.fit_transform(ratings_df['userId'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['movieId'] = item_enc.fit_transform(ratings_df['movieId'].values)


In [6]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)


In [7]:
class MovieLensDataset(Dataset):
    def __init__(self, ratings_df, num_users, num_items):
        self.users = ratings_df['userId'].values.astype(np.int64)
        self.items = ratings_df['movieId'].values.astype(np.int64)
        self.ratings = ratings_df['rating'].values.astype(np.float32)

        self.num_users = num_users
        self.num_items = num_items

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]


In [8]:
num_users = ratings_df['userId'].nunique()
num_items = ratings_df['movieId'].nunique()

train_dataset = MovieLensDataset(train_df, num_users, num_items)
test_dataset = MovieLensDataset(test_df, num_users, num_items)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024)


In [9]:
class AutoEncoder(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, hidden_dim):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.item_emb = nn.Embedding(num_items, embedding_dim)
        self.encoder = nn.Sequential(
            nn.Linear(embedding_dim*2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim*2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim*2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, users, items):
        user_emb = self.user_emb(users)
        item_emb = self.item_emb(items)
        x = torch.cat([user_emb, item_emb], dim=1)
        x = self.encoder(x)
        x = self.decoder(x)
        return torch.sigmoid(x).squeeze()


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoEncoder(num_users, num_items, embedding_dim=50, hidden_dim=100).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for users, items, ratings in train_loader:
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device)

        optimizer.zero_grad()
        outputs = model(users, items)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}')



Epoch 1/10, Train Loss: 0.6283373568631425
Epoch 2/10, Train Loss: 0.5580825684945795
Epoch 3/10, Train Loss: 0.5201122183588487
Epoch 4/10, Train Loss: 0.4897755573067484
Epoch 5/10, Train Loss: 0.4612781269640862
Epoch 6/10, Train Loss: 0.4274181572696831
Epoch 7/10, Train Loss: 0.3940652190129968
Epoch 8/10, Train Loss: 0.3555721459509451
Epoch 9/10, Train Loss: 0.31800510649439656
Epoch 10/10, Train Loss: 0.2836762883617908


In [12]:
def mae(y_true, y_pred):
    return torch.mean(torch.abs(y_true - y_pred))

def rmse(y_true, y_pred):
    return torch.sqrt(torch.mean((y_true - y_pred)**2))


In [13]:
model.eval()
test_loss = 0
test_mae = 0
test_rmse = 0
with torch.no_grad():
    for users, items, ratings in test_loader:
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device)

        outputs = model(users, items)
        loss = criterion(outputs, ratings)

        test_loss += loss.item()
        test_mae += mae(ratings, outputs).item()
        test_rmse += rmse(ratings, outputs).item()

print(f'Test Loss: {test_loss/len(test_loader)}')
print(f'Test MAE: {test_mae/len(test_loader)}')
print(f'Test RMSE: {test_rmse/len(test_loader)}')


Test Loss: 0.9784244567155838
Test MAE: 0.35926043093204496
Test RMSE: 0.5020737618207931


In [27]:
movies_df = pd.read_csv('../../data/lens_tmdb/cleaned/df_all.csv')

def get_movie_details(movie_id):
    original_id = item_enc.inverse_transform([movie_id])
    movie_row = movies_df[movies_df['movieId'] == original_id[0]]
    movie_name = movie_row['title'].values[0]
    genres = movie_row['genre'].values[0]
    vote_average = movie_row[movie_row['movieId'] == original_id[0]]['vote_average'].values[0]
    return movie_name, genres, vote_average


In [28]:
def recommend(user_id, model, num_items):
    user_ids = torch.tensor([user_id]*num_items).to(device)
    movie_ids = torch.tensor(range(num_items)).to(device)

    with torch.no_grad():
        predictions = model(user_ids, movie_ids)

    _, indices = torch.topk(predictions, 10)
    top_10_movies = indices.cpu().numpy()

    movie_details = [get_movie_details(movie_id) for movie_id in top_10_movies]

    return movie_details


In [29]:
user_id = 42  # Replace with the ID of the user you want to recommend movies to
recommendations = recommend(user_id, model, num_items)
for movie_name, genres, vote_average in recommendations:
    print(f'Movie: {movie_name}')
    print(f'Genres: {genres}')
    print(f'Vote Average: {vote_average}')
    print('\n')


Movie: Paperman
Genres: Animation
Vote Average: 8.0


Movie: Transamerica
Genres: Drama
Vote Average: 6.9


Movie: Mostly Martha
Genres: Romance
Vote Average: 6.6


Movie: The Crimson Pirate
Genres: Action
Vote Average: 6.9


Movie: Kikujiro
Genres: Comedy
Vote Average: 7.5


Movie: Smoke
Genres: Comedy
Vote Average: 7.2


Movie: What Time Is It There?
Genres: Drama
Vote Average: 7.5


Movie: Trouble in Paradise
Genres: Comedy
Vote Average: 7.3


Movie: Tom Jones
Genres: Adventure
Vote Average: 6.1


Movie: It Happened One Night
Genres: Comedy
Vote Average: 7.7




In [30]:
movies_df.columns

Index(['adult', 'budget', 'homepage', 'id', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title',
       'video', 'vote_average', 'vote_count', 'collection_name', 'genre',
       'production_company', 'production_country', 'spoken_language',
       'cast_names', 'director', 'keyword_list', 'movieId', 'imdbId'],
      dtype='object')