In [1]:
#https://www.youtube.com/watch?v=AQU3akndun4
#https://colab.research.google.com/drive/1_eR7DXBF3V4EwH946dDPOxeclDBeKNMD?usp=sharing#scrollTo=FTgGxZv6QQsO
import pandas as pd

In [2]:
anime_df = pd.read_csv("./datasets/anime.csv")
rating_df = pd.read_csv("./datasets/rating.csv")

In [3]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
sorted_df = anime_df.sort_values(by="anime_id").set_index("anime_id")

In [6]:
sorted_df = sorted_df.reset_index(drop=False)

In [7]:
movie_id_mapping = sorted_df["anime_id"]

In [8]:
node_features = sorted_df[["type","genre","episodes"]]

In [9]:
pd.set_option("mode.chained_assignment", None)

In [10]:
genres = node_features["genre"].str.split(",", expand=True)

In [11]:
node_features["main_genre"] = genres[0]

In [12]:
anime_node_features = pd.concat([node_features, pd.get_dummies(node_features["main_genre"])], axis=1, join='inner')

In [13]:
anime_node_features = pd.concat([anime_node_features, pd.get_dummies(anime_node_features["type"])], axis=1, join='inner')

In [14]:
anime_node_features.drop(["genre", "main_genre"], axis=1, inplace=True)

In [15]:
anime_node_features.head()

Unnamed: 0,type,episodes,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music,ONA,OVA,Special,TV
0,TV,26,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Movie,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,TV,26,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,TV,26,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,TV,52,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
anime_node_features.drop(["type"], axis=1, inplace=True)

In [17]:
x = anime_node_features.to_numpy()
x

array([['26', 1, 0, ..., 0, 0, 1],
       ['1', 1, 0, ..., 0, 0, 0],
       ['26', 1, 0, ..., 0, 0, 1],
       ...,
       ['Unknown', 0, 0, ..., 0, 0, 1],
       ['Unknown', 0, 0, ..., 0, 0, 1],
       ['1', 0, 1, ..., 0, 0, 0]], dtype=object)

In [18]:
# Find out mean rating and number of ratings per user
mean_rating = rating_df.groupby("user_id")["rating"].mean().rename("mean")
num_rating = rating_df.groupby("user_id")["rating"].count().rename("count")
user_node_features = pd.concat([mean_rating, num_rating], axis=1)

# Remap user ID (to start at 0)
user_node_features = user_node_features.reset_index(drop=False)
user_id_mapping = user_node_features["user_id"]

# Only keep features 
user_node_features = user_node_features[["mean", "count"]]
user_node_features.head()

Unnamed: 0,mean,count
0,-0.712418,153
1,2.666667,3
2,7.382979,94
3,-1.0,52
4,4.263383,467


In [19]:
x = user_node_features.to_numpy()

In [20]:
x.shape

(73515, 2)

In [29]:
print(set(rating_df["anime_id"].unique()) - set(anime_df["anime_id"].unique()))
rating = rating_df[~rating_df["anime_id"].isin([30913, 30924, 20261])]

{30913, 30924, 20261}


In [30]:
labels = rating["rating"]

In [32]:
y = labels.to_numpy()
y

array([-1, -1, -1, ..., 10,  9,  9])

In [33]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [35]:
# Map anime IDs 
movie_map = movie_id_mapping.reset_index().set_index("anime_id").to_dict()
rating["anime_id"] = rating["anime_id"].map(movie_map["index"]).astype(int)
# Map user IDs
user_map = user_id_mapping.reset_index().set_index("user_id").to_dict()
rating["user_id"] = rating["user_id"].map(user_map["index"]).astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [36]:
rating

Unnamed: 0,user_id,anime_id,rating
0,0,10,-1
1,0,14,-1
2,0,58,-1
3,0,202,-1
4,0,217,-1
...,...,...,...
7813732,73513,7470,7
7813733,73513,7622,9
7813734,73513,8624,10
7813735,73514,718,9


In [41]:
edge_index = rating[["user_id", "anime_id"]].values.transpose()
edge_index

array([[    0,     0,     0, ..., 73513, 73514, 73514],
       [   10,    14,    58, ...,  8624,   718,  5226]])

In [42]:
from torch_geometric.data import HeteroData
data = HeteroData()
data['user'].x = user_node_features
data['movie'].x = anime_node_features

In [44]:
data['user', 'rating', 'movie'].edge_index = edge_index

In [45]:
data['user', 'movie'].y = y

In [46]:
data

HeteroData(
  [1muser[0m={ x=            mean  count
0      -0.712418    153
1       2.666667      3
2       7.382979     94
3      -1.000000     52
4       4.263383    467
...          ...    ...
73510   7.846154     13
73511   7.515152     33
73512  10.000000      1
73513   7.719388    196
73514   9.000000      2

[73515 rows x 2 columns] },
  [1mmovie[0m={ x=      episodes  Action  Adventure  Cars  Comedy  Dementia  Demons  Drama  \
0           26       1          0     0       0         0       0      0   
1            1       1          0     0       0         0       0      0   
2           26       1          0     0       0         0       0      0   
3           26       1          0     0       0         0       0      0   
4           52       0          1     0       0         0       0      0   
...        ...     ...        ...   ...     ...       ...     ...    ...   
12289       18       1          0     0       0         0       0      0   
12290        9       0  

In [48]:
data

HeteroData(
  [1muser[0m={ x=            mean  count
0      -0.712418    153
1       2.666667      3
2       7.382979     94
3      -1.000000     52
4       4.263383    467
...          ...    ...
73510   7.846154     13
73511   7.515152     33
73512  10.000000      1
73513   7.719388    196
73514   9.000000      2

[73515 rows x 2 columns] },
  [1mmovie[0m={ x=      episodes  Action  Adventure  Cars  Comedy  Dementia  Demons  Drama  \
0           26       1          0     0       0         0       0      0   
1            1       1          0     0       0         0       0      0   
2           26       1          0     0       0         0       0      0   
3           26       1          0     0       0         0       0      0   
4           52       0          1     0       0         0       0      0   
...        ...     ...        ...   ...     ...       ...     ...    ...   
12289       18       1          0     0       0         0       0      0   
12290        9       0  