In [None]:
import numpy as np
import polars as pl
import torch
from torch import nn, functional as F

In [None]:
from zipfile import ZipFile
from urllib.request import urlretrieve
import os

urlretrieve("https://files.grouplens.org/datasets/movielens/ml-32m.zip", "../data/movielens.zip")
ZipFile("../data/movielens.zip", "r").extractall('../data/')
os.remove('../data/movielens.zip')


In [5]:
urlretrieve("https://www.kaggle.com/api/v1/datasets/download/asaniczka/tmdb-movies-dataset-2023-930k-movies", "../data/tmdb.zip")
ZipFile("../data/tmdb.zip", "r").extractall('../data/tmdb_dataset/')
os.remove('../data/tmdb.zip')

In [None]:
movies_df = pl.read_csv('../data/tmdb_dataset/TMDB_movie_dataset_v11.csv')
ratings_df = pl.read_csv('../data/ml-32m/ratings.csv')
links_df = pl.read_csv('../data/ml-32m/links.csv')

In [73]:
ratings_df.head()

userId,movieId,rating,timestamp
i64,i64,f64,i64
1,17,4.0,944249077
1,25,1.0,944250228
1,29,2.0,943230976
1,30,5.0,944249077
1,32,5.0,943228858


In [85]:
movies_df = links_df.join(movies_df,left_on="tmdbId",right_on="id")
movies_df = movies_df.drop(["title","status","backdrop_path","homepage","imdb_id","imdbId","poster_path"])
movies_df.head()

movieId,tmdbId,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,keywords
i64,i64,f64,i64,str,i64,i64,bool,i64,str,str,str,f64,str,str,str,str,str,str
79132,27205,8.364,34495,"""2010-07-15""",825532764,148,False,160000000,"""en""","""Inception""","""Cobb, a skilled thief who comm…",83.952,"""Your mind is the scene of the …","""Action, Science Fiction, Adven…","""Legendary Pictures, Syncopy, W…","""United Kingdom, United States …","""English, French, Japanese, Swa…","""rescue, mission, dream, airpla…"
109487,157336,8.417,32571,"""2014-11-05""",701729206,169,False,165000000,"""en""","""Interstellar""","""The adventures of a group of e…",140.241,"""Mankind was born on Earth. It …","""Adventure, Drama, Science Fict…","""Legendary Pictures, Syncopy, L…","""United Kingdom, United States …","""English""","""rescue, future, spacecraft, ra…"
58559,155,8.512,30619,"""2008-07-16""",1004558444,152,False,185000000,"""en""","""The Dark Knight""","""Batman raises the stakes in hi…",130.643,"""Welcome to a world without rul…","""Drama, Action, Crime, Thriller""","""DC Comics, Legendary Pictures,…","""United Kingdom, United States …","""English, Mandarin""","""joker, sadism, chaos, secret i…"
72998,19995,7.573,29815,"""2009-12-15""",2923706026,162,False,237000000,"""en""","""Avatar""","""In the 22nd century, a paraple…",79.932,"""Enter the world of Pandora.""","""Action, Adventure, Fantasy, Sc…","""Dune Entertainment, Lightstorm…","""United States of America, Unit…","""English, Spanish""","""future, society, culture clash…"
89745,24428,7.71,29166,"""2012-04-25""",1518815515,143,False,220000000,"""en""","""The Avengers""","""When an unexpected enemy emerg…",98.082,"""Some assembly required.""","""Science Fiction, Action, Adven…","""Marvel Studios""","""United States of America""","""English, Hindi, Russian""","""new york city, superhero, shie…"


In [None]:
genres = movies_df['genres'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of genres: ",len(genres))
production_companies = movies_df['production_companies'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of production_companies: ",len(production_companies))
production_countries = movies_df['production_countries'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of production_countries: ",len(production_countries))
spoken_languages = movies_df['spoken_languages'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of spoken_languages: ",len(spoken_languages))
keywords = movies_df['keywords'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of keywords: ",len(keywords))


Number of genres:  39
Number of production_companies:  56117
Number of production_countries:  358
Number of spoken_languages:  287
Number of keywords:  33602


In [87]:
# convert ids to categorical or string data type
movies_df = movies_df.with_columns(
    (pl.col("movieId").cast(str)).map_elements(lambda x: f"movie_{x}",return_dtype=pl.String).alias("movieId")
)

movies_df = movies_df.with_columns(
    (pl.col("movieId").cast(pl.Categorical))
)

ratings_df = ratings_df.with_columns([
    (pl.col("movieId").cast(str)).map_elements(lambda x: f"movie_{x}",return_dtype=pl.String).alias("movieId"),
    (pl.col("userId").cast(str)).map_elements(lambda x: f"user_{x}",return_dtype=pl.String).alias("userId"),
])

ratings_df = ratings_df.with_columns([
    pl.col("movieId").cast(pl.Categorical),
    pl.col("userId").cast(pl.Categorical),
])

In [93]:
from collections import Counter
from torchtext.vocab import vocab

# vocab for movie_ids
movie_ids = movies_df['movieId'].unique()
movie_counter = Counter(movie_ids)
movie_vocab = vocab(movie_counter, specials=['<unk>'])
movie_vocab_stoi = movie_vocab.get_stoi()
movie_title_dict = dict(zip(movies_df['movieId'].to_list(), movies_df['original_title'].to_list()))

# vocab for user_ids
user_ids = ratings_df['userId'].unique()
user_counter = Counter(user_ids)
user_vocab = vocab(user_counter, specials=['<unk>'])
user_vocab_stoi = user_vocab.get_stoi()

### Using timestamp to generate sequences

In [97]:
sequence_length = 25
step_size = 5

filtered_df = ratings_df.sort(["userId", "timestamp"])
filtered_df = filtered_df.with_columns([
    pl.col("userId").cum_count().over("userId").alias("idx")
])
for i in range(sequence_length):
    filtered_df = filtered_df.with_columns([
        pl.col("movieId").shift(-i).over("userId").alias(f"movie_{i}"),
        pl.col("rating").shift(-i).over("userId").alias(f"rating_{i}")
    ])

filtered_df = filtered_df.filter(pl.col("idx") % step_size == 0)
filtered_df = filtered_df.filter(
    pl.fold(
        acc=pl.lit(True),
        function=lambda acc, x: acc & x.is_not_null(),
        exprs=[pl.col(f"movie_{i}") for i in range(sequence_length)] +
              [pl.col(f"rating_{i}") for i in range(sequence_length)]
    )
)
filtered_df = filtered_df.with_columns([
    pl.concat_list([pl.col(f"movie_{i}") for i in range(sequence_length)]).alias("sequence"),
    pl.concat_list([pl.col(f"rating_{i}") for i in range(sequence_length)]).alias("rating_sequence")
])
result = filtered_df.select(["userId", "sequence", "rating_sequence"]).group_by(pl.col("userId"),maintain_order=True).agg(pl.col("sequence"),pl.col("rating_sequence"))

result = result.explode(["sequence", "rating_sequence"]).rename({
    "sequence": "sequence_movie_ids",
    "rating_sequence": "sequence_ratings"
})

In [98]:
result

userId,sequence_movie_ids,sequence_ratings
cat,list[cat],list[f64]
"""user_1""","[""movie_2882"", ""movie_541"", … ""movie_2640""]","[1.0, 5.0, … 5.0]"
"""user_1""","[""movie_1236"", ""movie_3030"", … ""movie_1199""]","[4.0, 4.0, … 2.0]"
"""user_1""","[""movie_166"", ""movie_232"", … ""movie_110""]","[5.0, 5.0, … 3.0]"
"""user_1""","[""movie_2352"", ""movie_2724"", … ""movie_1262""]","[3.0, 1.0, … 4.0]"
"""user_1""","[""movie_2336"", ""movie_260"", … ""movie_161""]","[5.0, 5.0, … 1.0]"
…,…,…
"""user_200948""","[""movie_7458"", ""movie_6534"", … ""movie_46970""]","[4.5, 0.5, … 3.5]"
"""user_200948""","[""movie_6863"", ""movie_49286"", … ""movie_57538""]","[4.0, 4.0, … 3.5]"
"""user_200948""","[""movie_54999"", ""movie_1059"", … ""movie_5171""]","[2.0, 4.0, … 2.0]"
"""user_200948""","[""movie_53974"", ""movie_53993"", … ""movie_80350""]","[1.0, 2.0, … 0.5]"


### Train Test Split

In [104]:
random_selection = np.random.rand(len(result)) <= 0.85

df_train_data = result.filter(random_selection)
train_data_raw = df_train_data[["userId", "sequence_movie_ids", "sequence_ratings"]].to_numpy()

df_test_data = result.filter(~random_selection)
test_data_raw = df_test_data[["userId", "sequence_movie_ids", "sequence_ratings"]].to_numpy()

### Creating DataLoaders

In [109]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class MovieSeqDataset(Dataset):
    def __init__(self, data, movie_vocab_stoi, user_vocab_stoi):
        self.data = data
        self.movie_vocab_stoi = movie_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        user, movie_sequence, rating_sequence = self.data[idx]
        movie_data = [self.movie_vocab_stoi.get(item,movie_vocab_stoi['<unk>']) for item in movie_sequence]
        user_data = self.user_vocab_stoi[user]
        return torch.tensor(movie_data), torch.tensor(user_data), torch.tensor(rating_sequence)
    
def collate_batch(batch):
    movie_list = [item[0] for item in batch]
    user_list = [item[1] for item in batch]
    rating_list = [item[2] for item in batch]
    return pad_sequence(movie_list, padding_value=movie_vocab_stoi['<unk>'], batch_first=True), torch.stack(user_list), pad_sequence(rating_list, padding_value=3, batch_first=True)

In [110]:
BATCH_SIZE = 16

train_dataset = MovieSeqDataset(train_data_raw, movie_vocab_stoi, user_vocab_stoi)
val_dataset = MovieSeqDataset(test_data_raw, movie_vocab_stoi, user_vocab_stoi)

train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE,shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=BATCH_SIZE,shuffle=False, collate_fn=collate_batch)

In [118]:
for i, (movie_data, user_data, ratings_data) in enumerate(train_iter):
    print(movie_data.shape, user_data.shape, ratings_data.shape)
    break

torch.Size([16, 25]) torch.Size([16]) torch.Size([16, 25])
