In [1]:
import numpy as np
import polars as pl
import torch
from torch import nn, functional as F


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Program Files\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Program Files\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\arnav\Documents\VSCodeProjects\RecSysTRX\RecSysTRX\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\arnav\Documents\VSCodeProjects\RecSysTRX\RecSysTRX\.venv\lib\site-packages\traitlets\config\application.py", line 

### Get the movielens and tmdb dataset

In [None]:
from zipfile import ZipFile
from urllib.request import urlretrieve
import os

urlretrieve("https://files.grouplens.org/datasets/movielens/ml-32m.zip", "../data/movielens.zip")
ZipFile("../data/movielens.zip", "r").extractall('../data/')
os.remove('../data/movielens.zip')

urlretrieve("https://www.kaggle.com/api/v1/datasets/download/asaniczka/tmdb-movies-dataset-2023-930k-movies", "../data/tmdb.zip")
ZipFile("../data/tmdb.zip", "r").extractall('../data/tmdb_dataset/')
os.remove('../data/tmdb.zip')

### Dataset cleaning and preparation

In [2]:
movies_df = pl.read_csv('../data/tmdb_dataset/TMDB_movie_dataset_v11.csv')
ratings_df = pl.read_csv('../data/ml-32m/ratings.csv')
links_df = pl.read_csv('../data/ml-32m/links.csv')

In [3]:
ratings_df.head()

userId,movieId,rating,timestamp
i64,i64,f64,i64
1,17,4.0,944249077
1,25,1.0,944250228
1,29,2.0,943230976
1,30,5.0,944249077
1,32,5.0,943228858


In [4]:
movies_df = links_df.join(movies_df,left_on="tmdbId",right_on="id")
movies_df = movies_df.drop(["title","status","backdrop_path","homepage","imdb_id","imdbId","poster_path"])
movies_df.head()

movieId,tmdbId,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,keywords
i64,i64,f64,i64,str,i64,i64,bool,i64,str,str,str,f64,str,str,str,str,str,str
79132,27205,8.364,34495,"""2010-07-15""",825532764,148,False,160000000,"""en""","""Inception""","""Cobb, a skilled thief who comm…",83.952,"""Your mind is the scene of the …","""Action, Science Fiction, Adven…","""Legendary Pictures, Syncopy, W…","""United Kingdom, United States …","""English, French, Japanese, Swa…","""rescue, mission, dream, airpla…"
109487,157336,8.417,32571,"""2014-11-05""",701729206,169,False,165000000,"""en""","""Interstellar""","""The adventures of a group of e…",140.241,"""Mankind was born on Earth. It …","""Adventure, Drama, Science Fict…","""Legendary Pictures, Syncopy, L…","""United Kingdom, United States …","""English""","""rescue, future, spacecraft, ra…"
58559,155,8.512,30619,"""2008-07-16""",1004558444,152,False,185000000,"""en""","""The Dark Knight""","""Batman raises the stakes in hi…",130.643,"""Welcome to a world without rul…","""Drama, Action, Crime, Thriller""","""DC Comics, Legendary Pictures,…","""United Kingdom, United States …","""English, Mandarin""","""joker, sadism, chaos, secret i…"
72998,19995,7.573,29815,"""2009-12-15""",2923706026,162,False,237000000,"""en""","""Avatar""","""In the 22nd century, a paraple…",79.932,"""Enter the world of Pandora.""","""Action, Adventure, Fantasy, Sc…","""Dune Entertainment, Lightstorm…","""United States of America, Unit…","""English, Spanish""","""future, society, culture clash…"
89745,24428,7.71,29166,"""2012-04-25""",1518815515,143,False,220000000,"""en""","""The Avengers""","""When an unexpected enemy emerg…",98.082,"""Some assembly required.""","""Science Fiction, Action, Adven…","""Marvel Studios""","""United States of America""","""English, Hindi, Russian""","""new york city, superhero, shie…"


In [5]:
# Building vocab
genres = movies_df['genres'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of genres: ",len(genres))
production_companies = movies_df['production_companies'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of production_companies: ",len(production_companies))
production_countries = movies_df['production_countries'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of production_countries: ",len(production_countries))
spoken_languages = movies_df['spoken_languages'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of spoken_languages: ",len(spoken_languages))
keywords = movies_df['keywords'].map_elements(lambda x: x.split(','),return_dtype=pl.List(pl.String)).explode().unique().to_list()
print("Number of keywords: ",len(keywords))


Number of genres:  39
Number of production_companies:  56117
Number of production_countries:  358
Number of spoken_languages:  287
Number of keywords:  33602


In [6]:
# convert ids to categorical or string data type
movies_df = movies_df.with_columns(
    (pl.col("movieId").cast(str)).map_elements(lambda x: f"movie_{x}",return_dtype=pl.String).alias("movieId")
)

ratings_df = ratings_df.with_columns([
    (pl.col("movieId").cast(str)).map_elements(lambda x: f"movie_{x}",return_dtype=pl.String).alias("movieId"),
    (pl.col("userId").cast(str)).map_elements(lambda x: f"user_{x}",return_dtype=pl.String).alias("userId"),
])

ratings_df = ratings_df.with_columns([
    pl.col("movieId").cast(pl.Categorical),
    pl.col("userId").cast(pl.Categorical),
])

In [7]:
from collections import Counter
from torchtext.vocab import vocab

# vocab for movie_ids
movie_ids = movies_df['movieId'].unique()
movie_counter = Counter(movie_ids)
movie_vocab = vocab(movie_counter, specials=['<unk>'])
movie_vocab_stoi = movie_vocab.get_stoi()
movie_title_dict = dict(zip(movies_df['movieId'].to_list(), movies_df['original_title'].to_list()))

# vocab for user_ids
user_ids = ratings_df['userId'].unique()
user_counter = Counter(user_ids)
user_vocab = vocab(user_counter, specials=['<unk>'])
user_vocab_stoi = user_vocab.get_stoi()

### Using timestamp to generate sequences

In [None]:
import polars as pl

# Parameters
sequence_length = 4
min_history = 1  # (not used directly here but you can add filtering later)
step_size = 2

# Sort by user and timestamp
filtered_df = ratings_df.sort(["userId", "timestamp"])

# Add index per user
filtered_df = filtered_df.with_columns([
    pl.col("userId").cum_count().over("userId").alias("idx")
])

# Generate shifted columns
for i in range(sequence_length):
    filtered_df = filtered_df.with_columns(
        pl.col("movieId").shift(-i).over("userId").alias(f"movie_{i}")
    )

# Filter to every `step_size` rows to get sliding window
filtered_df = filtered_df.filter(pl.col("idx") % step_size == 0)

# Drop rows where any of the shifted movie columns are null
filtered_df = filtered_df.filter(
    pl.fold(
        acc=pl.lit(True),
        function=lambda acc, x: acc & x.is_not_null(),
        exprs=[pl.col(f"movie_{i}") for i in range(sequence_length)]
    )
)

# Combine into a sequence column
filtered_df = filtered_df.with_columns([
    pl.concat_list([pl.col(f"movie_{i}") for i in range(sequence_length)]).alias("sequence")
])

# Keep only userId and sequence
result = filtered_df.select(["userId", "sequence"]).group_by(pl.col("userId"),maintain_order=True).agg(pl.col("sequence"))

shape: (200_948, 2)
┌─────────────┬─────────────────────────────────┐
│ userId      ┆ sequence                        │
│ ---         ┆ ---                             │
│ cat         ┆ list[list[cat]]                 │
╞═════════════╪═════════════════════════════════╡
│ user_1      ┆ [["movie_2966", "movie_2890", … │
│ user_2      ┆ [["movie_380", "movie_296", … … │
│ user_3      ┆ [["movie_466", "movie_2268", …… │
│ user_4      ┆ [["movie_1210", "movie_1833", … │
│ user_5      ┆ [["movie_150", "movie_590", … … │
│ …           ┆ …                               │
│ user_200944 ┆ [["movie_1196", "movie_318", …… │
│ user_200945 ┆ [["movie_8874", "movie_2762", … │
│ user_200946 ┆ [["movie_736", "movie_95", … "… │
│ user_200947 ┆ [["movie_527", "movie_2571", …… │
│ user_200948 ┆ [["movie_2616", "movie_2431", … │
└─────────────┴─────────────────────────────────┘


In [21]:
print(result.filter(pl.col("userId")=="user_1")["sequence"].to_list())


[[['movie_2966', 'movie_2890', 'movie_3078', 'movie_2882'], ['movie_3078', 'movie_2882', 'movie_541', 'movie_838'], ['movie_541', 'movie_838', 'movie_1136', 'movie_1211'], ['movie_1136', 'movie_1211', 'movie_1236', 'movie_3030'], ['movie_1236', 'movie_3030', 'movie_2396', 'movie_2918'], ['movie_2396', 'movie_2918', 'movie_1197', 'movie_166'], ['movie_1197', 'movie_166', 'movie_232', 'movie_34'], ['movie_232', 'movie_34', 'movie_1923', 'movie_2324'], ['movie_1923', 'movie_2324', 'movie_2352', 'movie_2724'], ['movie_2352', 'movie_2724', 'movie_2599', 'movie_1288'], ['movie_2599', 'movie_1288', 'movie_1784', 'movie_2336'], ['movie_1784', 'movie_2336', 'movie_260', 'movie_2232'], ['movie_260', 'movie_2232', 'movie_1196', 'movie_2640'], ['movie_1196', 'movie_2640', 'movie_1748', 'movie_1965'], ['movie_1748', 'movie_1965', 'movie_1210', 'movie_32'], ['movie_1210', 'movie_32', 'movie_1199', 'movie_29'], ['movie_1199', 'movie_29', 'movie_1653', 'movie_2529'], ['movie_1653', 'movie_2529', 'movi