In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import torchmetrics
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np

## Settings

In [2]:
WINDOW_SIZE = 20

## Data

In [3]:
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
ZipFile("movielens.zip", "r").extractall()

In [3]:
# users = pd.read_csv(
#     "ml-1m/users.dat",
#     sep="::",
#     names=["user_id", "sex", "age_group", "occupation", "zip_code"],
# )

ratings = pd.read_csv(
    "ml-1m/train_ratings.csv",
    sep=",",
)

movies = pd.read_csv(
    "ml-1m/titles.tsv", sep="\t"
)

genres = pd.read_csv(
    "ml-1m/genres.tsv", sep="\t"
)

In [4]:
## Movies
movies["year"] = movies["title"].apply(lambda x: x[-5:-1])
movies.year = pd.Categorical(movies.year)
movies["year"] = movies.year.cat.codes
## Users
# users.sex = pd.Categorical(users.sex)
# users["sex"] = users.sex.cat.codes


# users.age_group = pd.Categorical(users.age_group)
# users["age_group"] = users.age_group.cat.codes


# users.occupation = pd.Categorical(users.occupation)
# users["occupation"] = users.occupation.cat.codes


# users.zip_code = pd.Categorical(users.zip_code)
# users["zip_code"] = users.zip_code.cat.codes

#Ratings
ratings['unix_timestamp'] = pd.to_datetime(ratings['time'],unit='s')


In [5]:
# Save primary csv's
if not os.path.exists('data'):
    os.makedirs('data')
    
    
# users.to_csv("data/users.csv",index=False)
movies.to_csv("data/movies.csv",index=False)
ratings.to_csv("data/ratings.csv",index=False)

In [6]:
## Movies
movies["movie_id"] = movies["item"].astype(str)
## Users
# users["user_id"] = users["user_id"].astype(str)

##Ratings 
ratings["item"] = ratings["item"].astype(str)
ratings["movie_id"] = ratings["item"].astype(str)
ratings["user_id"] = ratings["user"].astype(str)
ratings["rating"] = np.ones(len(ratings))
ratings.rating = ratings["rating"].apply(int)
ratings.rating = ratings["rating"].apply(str)


In [7]:
first =True
for i, group in genres.groupby("item")["genre"]:
    tmp = pd.DataFrame({"item" : i, "genre":"|".join(group)}, index=[0])
    if first:
        genre_df = tmp
        first = False
    else:
        genre_df = pd.concat([genre_df, tmp], axis = 0, sort=False)

movies = movies.merge(genre_df, on="item")

In [8]:
genres_list = list(set(genres["genre"]))

for genre in genres_list:
    movies[genre] = movies["genre"].apply(
        lambda values: int(genre in values.split("|"))
    )


### Transform the movie ratings data into sequences

First, let's sort the the ratings data using the `unix_timestamp`, and then group the
`movie_id` values and the `rating` values by `user_id`.

The output DataFrame will have a record for each `user_id`, with two ordered lists
(sorted by rating datetime): the movies they have rated, and their ratings of these movies.

In [17]:
ratings

Unnamed: 0,user,item,time,unix_timestamp,movie_id,user_id,rating
0,11,4643,1230782529,2009-01-01 04:02:09,4643,11,1
1,11,170,1230782534,2009-01-01 04:02:14,170,11,1
2,11,531,1230782539,2009-01-01 04:02:19,531,11,1
3,11,616,1230782542,2009-01-01 04:02:22,616,11,1
4,11,2140,1230782563,2009-01-01 04:02:43,2140,11,1
...,...,...,...,...,...,...,...
5154466,138493,44022,1260209449,2009-12-07 18:10:49,44022,138493,1
5154467,138493,4958,1260209482,2009-12-07 18:11:22,4958,138493,1
5154468,138493,68319,1260209720,2009-12-07 18:15:20,68319,138493,1
5154469,138493,40819,1260209726,2009-12-07 18:15:26,40819,138493,1


In [9]:
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user")

ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "movie_ids": list(ratings_group.item.apply(list)),
        "ratings": list(ratings_group.rating.apply(list)),
        "timestamps": list(ratings_group.time.apply(list)),
    }
)


Now, let's split the `movie_ids` list into a set of sequences of a fixed length.
We do the same for the `ratings`. Set the `sequence_length` variable to change the length
of the input sequence to the model. You can also change the `step_size` to control the
number of sequences to generate for each user.

In [10]:
sequence_length = 8
step_size = 1


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break
        sequences.append(seq)
        start_index += step_size
    return sequences


ratings_data.movie_ids = ratings_data.movie_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

ratings_data.ratings = ratings_data.ratings.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

del ratings_data["timestamps"]

After that, we process the output to have each sequence in a separate records in
the DataFrame. In addition, we join the user features with the ratings data.

In [11]:
ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode(
    "movie_ids", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)
# ratings_data_transformed = ratings_data_transformed.join(
#     users.set_index("user_id"), on="user_id"
# )
ratings_data_transformed.movie_ids = ratings_data_transformed.movie_ids.apply(
    lambda x: ",".join(x)
)
ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
    lambda x: ",".join([str(v) for v in x])
)

# del ratings_data_transformed["zip_code"]

ratings_data_transformed.rename(
    columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"},
    inplace=True,
)

With `sequence_length` of 4 and `step_size` of 2, we end up with 498,623 sequences.

Finally, we split the data into training and testing splits, with 85% and 15% of
the instances, respectively, and store them to CSV files.

In [13]:
ratings_data_transformed

Unnamed: 0,user_id,sequence_movie_ids,sequence_ratings
0,11,46431705316162140272223132688,11111111
1,11,17053161621402722231326882428,11111111
2,11,531616214027222313268824283113,11111111
3,11,6162140272223132688242831131591,11111111
4,11,21402722231326882428311315912600,11111111
...,...,...,...
4966306,138493,43435459539966952660816337440224958,11111111
4966307,138493,545953996695266081633744022495868319,11111111
4966308,138493,5399669526608163374402249586831940819,11111111
4966309,138493,6952660816337440224958683194081927311,11111111


In [12]:
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
val_data = ratings_data_transformed[~random_selection]
test_data = ratings_data_transformed

train_data.to_csv("data/train_data.csv", index=False, sep=",")
val_data.to_csv("data/val_data.csv", index=False, sep=",")
test_data.to_csv("data/test_data.csv", index=False, sep=",")

In [None]:
test_data

Unnamed: 0,user_id,sequence_movie_ids,sequence_ratings
0,11,46431705316162140272223132688,11111111
4,11,21402722231326882428311315912600,11111111
9,11,311315912600816925725829375411367,11111111
12,11,8169257258293754113673247927444,11111111
19,11,7444539535694965025300051662515135836,11111111
...,...,...,...
4966270,138493,24201625269630204007240524064681,11111111
4966273,138493,30204007240524064681379363338636,11111111
4966295,138493,587253783349311014621611601690,11111111
4966302,138493,1690780550234048434354595399669526,11111111
