In [4]:
import torch
import pandas as pd
from torch_geometric.data import download_url, extract_zip
from sentence_transformers import SentenceTransformer

In [16]:
# Name of the MovieLens archive; it is reused for the download URL and
# for the folder that holds the extracted CSV files.
dataset_name = 'ml-latest-small'

# Relative locations of the two tables read later in the notebook.
movies_path = f'./{dataset_name}/movies.csv'
ratings_path = f'./{dataset_name}/ratings.csv'

# Fetch the zip into the working directory, then unpack it in place.
url = f'https://files.grouplens.org/datasets/movielens/{dataset_name}.zip'
archive_path = download_url(url, '.')
extract_zip(archive_path, '.')

Using existing file ml-latest-small.zip
Extracting .\ml-latest-small.zip


In [17]:
# Read the movies table and the ratings table from the extracted CSV files.
movies_df, ratings_df = (
    pd.read_csv(csv_path) for csv_path in (movies_path, ratings_path)
)

In [22]:
# Split the '|'-separated genre string of every movie into 0/1 indicator
# columns and keep the result as a plain NumPy array.
genres = (
    movies_df['genres']
    .str.get_dummies(sep='|')
    .to_numpy()
)
genres

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# Convert the genre indicator matrix to a float tensor so it can be
# concatenated with the title embeddings.
#
# torch.as_tensor accepts either a NumPy array or an existing tensor, so this
# cell is safe to re-run. torch.from_numpy would raise a TypeError on a second
# run, because `genres` has by then been rebound to a tensor.
genres = torch.as_tensor(genres, dtype=torch.float)
genres

In [9]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every movie title without tracking gradients, and move the resulting
# tensor to CPU memory.
with torch.no_grad():
    titles = model.encode(
        list(movies_df['title']),
        convert_to_tensor=True,
        show_progress_bar=True,
    ).cpu()

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [10]:
# Display the tensor of encoded movie titles produced by model.encode.
titles

tensor([[-0.0828,  0.0530,  0.0536,  ...,  0.0226,  0.0538,  0.1030],
        [-0.1053,  0.1508, -0.0264,  ...,  0.0106, -0.0726,  0.0086],
        [-0.0988,  0.0176, -0.0527,  ..., -0.0120,  0.0303,  0.0004],
        ...,
        [-0.1115,  0.0310, -0.0177,  ...,  0.0147,  0.0299,  0.0200],
        [ 0.0366,  0.0137,  0.0315,  ..., -0.0516, -0.0143,  0.1012],
        [-0.0500, -0.0141, -0.0031,  ...,  0.0320,  0.0546, -0.0271]])

In [14]:
# No user attributes are available, so users are represented by an identity
# matrix with one row per distinct userId found in the ratings.
user_features = torch.eye(ratings_df['userId'].unique().shape[0])

# Movie features: genre indicators joined with the title encodings along the
# last dimension.
movie_features = torch.cat((genres, titles), dim=-1)

In [15]:
# Display both feature matrices together as a tuple.
(user_features, movie_features)

(tensor([[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.]]),
 tensor([[ 0.0000e+00,  0.0000e+00,  1.0000e+00,  ...,  2.2615e-02,
           5.3814e-02,  1.0297e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00,  ...,  1.0561e-02,
          -7.2631e-02,  8.6105e-03],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.2006e-02,
           3.0255e-02,  4.1655e-04],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  1.4684e-02,
           2.9905e-02,  2.0007e-02],
         [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ..., -5.1593e-02,
          -1.4267e-02,  1.0123e-01],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  3.1982e-02,
           5.4629e-02, -2.7146e-02]]))

In [None]:
# Small example Series holding the four letters a, b, c, a.
s = pd.Series(['a', 'b', 'c', 'a'])