In [35]:
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.data import download_url, extract_zip
import pandas as pd

In [36]:
# Fetch the MovieLens "latest small" archive into the working directory,
# then unpack it in place.
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
archive_path = download_url(url, '.')
extract_zip(archive_path, '.')


Using existing file ml-latest-small.zip
Extracting .\ml-latest-small.zip


In [37]:
# Locations of the two CSV files read in the next cell, inside the
# extracted ml-latest-small directory.
movies_path, ratings_path = (
    './ml-latest-small/movies.csv',
    './ml-latest-small/ratings.csv',
)

In [38]:
# Load both tables. Ratings keep pandas' default row index; movies are
# indexed by their movieId column.
rating = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path, index_col='movieId')

In [39]:
# Preview the first five movies (title and '|'-separated genres).
movies.head(n=5)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [40]:
# Split each movie's '|'-separated genres string into one 0/1 indicator
# column per genre; the frame stays indexed by movieId.
genres = movies['genres'].str.get_dummies(sep='|')
genres

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193583,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
193587,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
# Turn the genre indicator table into a float tensor with one row per movie
# and one column per genre.
movie_feature = torch.from_numpy(genres.to_numpy()).float()
movie_feature

tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [42]:
# Sanity check: 9742 movies, each described by 20 genre indicator columns.
assert movie_feature.shape == (9742, 20)

In [49]:
# Preview the first five ratings (userId, movieId, rating, timestamp).
rating.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [44]:
# Lookup table pairing every distinct raw userId found in the ratings with a
# consecutive index 0..N-1 ('mappedId').
user_ids = rating['userId'].unique()
unique_user_Ids = pd.DataFrame({
    'userid': user_ids,
    'mappedId': pd.RangeIndex(len(user_ids)),
})
unique_user_Ids.head()

Unnamed: 0,userid,mappedId
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [50]:
# Translate the raw userId of every rating row into its consecutive mappedId
# by left-joining the ratings' userId column against the lookup table
# (rating.userId == unique_user_Ids.userid).
rating_user_ids = pd.merge(
    rating['userId'],
    unique_user_Ids,
    left_on='userId',
    right_on='userid',
    how='left',
)
# Keep only the mapped indices, as a tensor with one entry per rating row.
# Note: this rebinds rating_user_ids from the merged DataFrame to a tensor.
rating_user_ids = torch.from_numpy(rating_user_ids['mappedId'].values)
# (A stray bare `movie` expression used to end this cell; that name is not
# defined by any earlier cell, so it raised NameError on a fresh
# Restart & Run All. It has been removed.)

In [48]:
# NOTE(review): the saved output below (a merged DataFrame) comes from an
# earlier execution (In [48]), run before the cell above rebound this name
# to a tensor; re-running in order would display the tensor instead.
rating_user_ids

Unnamed: 0,userId,userid,mappedId
0,1,1,0
1,1,1,0
2,1,1,0
3,1,1,0
4,1,1,0
...,...,...,...
100831,610,610,609
100832,610,610,609
100833,610,610,609
100834,610,610,609


In [51]:
# Show the tensor of mapped user indices built from the merged 'mappedId'
# column.
rating_user_ids

tensor([  0,   0,   0,  ..., 609, 609, 609])