In [1]:
import random

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

In [2]:
# Active dataset: MovieLens 25M (ratings file + movie metadata file).
# For quick experiments, the small dataset lives under
# "../data/ml-latest-small/" with the same two file names.
data_csv_path, movies_path = (
    "../data/ml-25m/ratings.csv",
    "../data/ml-25m/movies.csv",
)

# Checkpoint file the trained model weights are restored from.
model_path = "./recommender_models/recommender.ckpt"

In [3]:
# Read the movie metadata and the ratings; ratings are ordered by timestamp
# (chained sort instead of an in-place mutation, same resulting frame).
movies = pd.read_csv(movies_path)
data = pd.read_csv(data_csv_path).sort_values(by="timestamp")

# map_column rewrites the movieId column and returns the lookup tables;
# `mapping` is keyed by the original movieId (it is indexed that way below).
data, mapping, inverse_mapping = map_column(data, col_name="movieId")

# One group per user.
grp_by_train = data.groupby(by="userId")

# Display ten randomly chosen user ids as a quick sanity check.
random.sample(list(grp_by_train.groups), k=10)

59047


[40432, 62743, 129131, 41121, 35362, 142929, 44820, 6066, 76589, 54125]

In [4]:
# Rebuild the network, then restore the trained weights from the checkpoint.
model = Recommender(
    # +2 presumably reserves ids for the PAD and MASK tokens — confirm against
    # the training script.
    vocab_size=len(mapping) + 2,
    lr=1e-4,
    dropout=0.3,
)
# Inference mode: disables dropout.
model.eval()
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on a
# CPU-only machine; predict() builds its input tensor on the CPU as well.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location="cpu")["state_dict"])



<All keys matched successfully>

In [11]:
# Sanity check: number of entries in the genres column vs. the movieId column.
print(len(movies["genres"]), len(movies["movieId"]))

62423 62423


In [20]:
# Catalogue rows whose movieId has an entry in `mapping`, stored as
# (title, mapped index, genres) in the order they appear in `movies`.
known_movies = [
    (title, mapping[movie_id], genres)
    for title, movie_id, genres in zip(
        movies.title.tolist(), movies.movieId.tolist(), movies.genres.tolist()
    )
    if movie_id in mapping
]

# title -> mapped index (if a title occurs more than once, the last row wins).
movie_to_idx = {title: idx for title, idx, _ in known_movies}

# mapped index -> (title, genres), used to render predictions.
idx_to_movie = {idx: (title, genres) for title, idx, genres in known_movies}

In [21]:
def predict(list_movies, model, movie_to_idx, idx_to_movie, max_len=120, top_k=10):
    """Recommend movies that follow a viewing history.

    The history is encoded as ``[PAD ... PAD, history..., MASK]`` with exactly
    ``max_len`` entries, fed to ``model``, and the scores produced at the final
    (MASK) position are ranked from highest to lowest.

    Args:
        list_movies: Movie titles, oldest first. Every title must be a key of
            ``movie_to_idx``.
        model: Callable taking a ``LongTensor`` of shape ``(1, max_len)`` and
            returning scores indexable as ``[batch, position, item_index]``.
        movie_to_idx: Mapping from title to model index.
        idx_to_movie: Mapping from model index to the value to return for it.
        max_len: Length of the sequence given to the model, MASK included.
            If the history has more than ``max_len - 1`` titles, only the most
            recent ones are fed to the model.
        top_k: Maximum number of recommendations to return.

    Returns:
        Up to ``top_k`` values of ``idx_to_movie``, best score first. Movies in
        ``list_movies`` and the PAD / MASK tokens are never returned.

    Raises:
        KeyError: If a title in ``list_movies`` is not in ``movie_to_idx``.
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    history = [movie_to_idx[title] for title in list_movies]

    # One slot is reserved for MASK. Truncate from the left so the sequence
    # never exceeds max_len (a negative pad count would otherwise be silently
    # treated as "no padding" and produce an over-long input).
    keep = max_len - 1
    window = history[-keep:] if keep > 0 else []
    ids = [PAD] * (keep - len(window)) + window + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)

    # .cpu() is a no-op for CPU tensors and makes GPU outputs convertible.
    masked_pred = prediction[0, -1].cpu().numpy()
    ranked_ids = np.argsort(masked_pred)[::-1].tolist()

    # Set membership instead of scanning a list for every vocabulary entry.
    # The full history is excluded, including titles truncated out of the window.
    excluded = set(history) | {PAD, MASK}

    # Filter before counting so that unknown indices do not consume one of the
    # top_k slots.
    recommendations = []
    for idx in ranked_ids:
        if len(recommendations) >= top_k:
            break
        if idx in excluded or idx not in idx_to_movie:
            continue
        recommendations.append(idx_to_movie[idx])

    return recommendations

### Scenario 1: Adventure/Fantasy


In [22]:
# Viewing history: the first four Harry Potter films.
list_movies = [
    "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
    "Harry Potter and the Chamber of Secrets (2002)",
    "Harry Potter and the Prisoner of Azkaban (2004)",
    "Harry Potter and the Goblet of Fire (2005)",
]

top_movie = predict(
    list_movies=list_movies,
    model=model,
    movie_to_idx=movie_to_idx,
    idx_to_movie=idx_to_movie,
)
top_movie

[('Ice Age (2002)', 'Adventure|Animation|Children|Comedy'),
 ('Harry Potter and the Order of the Phoenix (2007)',
  'Adventure|Drama|Fantasy|IMAX'),
 ("Pirates of the Caribbean: Dead Man's Chest (2006)",
  'Action|Adventure|Fantasy'),
 ('Shrek 2 (2004)', 'Adventure|Animation|Children|Comedy|Musical|Romance'),
 ('Harry Potter and the Half-Blood Prince (2009)',
  'Adventure|Fantasy|Mystery|Romance|IMAX'),
 ('Star Wars: Episode III - Revenge of the Sith (2005)',
  'Action|Adventure|Sci-Fi'),
 ('Avatar (2009)', 'Action|Adventure|Sci-Fi|IMAX'),
 ('Up (2009)', 'Adventure|Animation|Children|Drama'),
 ('I, Robot (2004)', 'Action|Adventure|Sci-Fi|Thriller'),
 ('Bruce Almighty (2003)', 'Comedy|Drama|Fantasy|Romance')]

### Scenario 2: Action/Adventure


In [24]:
# Viewing history: a run of superhero films.
list_movies = [
    "Black Panther (2017)",
    "Avengers, The (2012)",
    "Avengers: Infinity War - Part I (2018)",
    "Logan (2017)",
    "Spider-Man (2002)",
    "Spider-Man 3 (2007)",
    "Spider-Man: Far from Home (2019)",
]

top_movie = predict(
    list_movies=list_movies,
    model=model,
    movie_to_idx=movie_to_idx,
    idx_to_movie=idx_to_movie,
)
top_movie

[('Deadpool 2 (2018)', 'Action|Comedy|Sci-Fi'),
 ('X-Men Origins: Wolverine (2009)', 'Action|Sci-Fi|Thriller'),
 ('Thor: Ragnarok (2017)', 'Action|Adventure|Sci-Fi'),
 ("Pirates of the Caribbean: Dead Man's Chest (2006)",
  'Action|Adventure|Fantasy'),
 ('Doctor Strange (2016)', 'Action|Adventure|Sci-Fi'),
 ('Spider-Man 2 (2004)', 'Action|Adventure|Sci-Fi|IMAX'),
 ('I, Robot (2004)', 'Action|Adventure|Sci-Fi|Thriller'),
 ('Guardians of the Galaxy 2 (2017)', 'Action|Adventure|Sci-Fi'),
 ('X2: X-Men United (2003)', 'Action|Adventure|Sci-Fi|Thriller'),
 ('Matrix Reloaded, The (2003)', 'Action|Adventure|Sci-Fi|Thriller|IMAX')]

### Scenario 3: Comedy


In [25]:
# Viewing history: mostly animated family films plus two live-action comedies.
list_movies = [
    "Zootopia (2016)",
    "Toy Story 3 (2010)",
    "Toy Story 4 (2019)",
    "Finding Nemo (2003)",
    "Ratatouille (2007)",
    "The Lego Movie (2014)",
    "Ghostbusters (a.k.a. Ghost Busters) (1984)",
    "Ace Ventura: When Nature Calls (1995)",
]

top_movie = predict(
    list_movies=list_movies,
    model=model,
    movie_to_idx=movie_to_idx,
    idx_to_movie=idx_to_movie,
)
top_movie

[('Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'),
 ('Aladdin (1992)', 'Adventure|Animation|Children|Comedy|Musical'),
 ('Monsters, Inc. (2001)', 'Adventure|Animation|Children|Comedy|Fantasy'),
 ('Clueless (1995)', 'Comedy|Romance'),
 ('Mean Girls (2004)', 'Comedy'),
 ('Toy Story 2 (1999)', 'Adventure|Animation|Children|Comedy|Fantasy'),
 ('Forrest Gump (1994)', 'Comedy|Drama|Romance|War'),
 ("Bug's Life, A (1998)", 'Adventure|Animation|Children|Comedy'),
 ("Ferris Bueller's Day Off (1986)", 'Comedy'),
 ('Breakfast Club, The (1985)', 'Comedy|Drama')]