In [80]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader, LinkNeighborLoader
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, GATConv, Linear, to_hetero



In [76]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## Read data

In [2]:
movie_df = pd.read_csv('./datasets/movieLens100k/movie_info.csv', parse_dates=['release date'])
movie_df.drop(columns=['movie title', 'release date'], inplace=True, axis=1)
movie_df.rename(columns={'movie id': 'movie_id'}, inplace=True)
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   movie_id     1682 non-null   int64
 1   unknown      1682 non-null   int64
 2   Action       1682 non-null   int64
 3   Adventure    1682 non-null   int64
 4   Animation    1682 non-null   int64
 5   Children's   1682 non-null   int64
 6   Comedy       1682 non-null   int64
 7   Crime        1682 non-null   int64
 8   Documentary  1682 non-null   int64
 9   Drama        1682 non-null   int64
 10  Fantasy      1682 non-null   int64
 11  Film-Noir    1682 non-null   int64
 12  Horror       1682 non-null   int64
 13  Musical      1682 non-null   int64
 14  Mystery      1682 non-null   int64
 15  Romance      1682 non-null   int64
 16  Sci-Fi       1682 non-null   int64
 17  Thriller     1682 non-null   int64
 18  War          1682 non-null   int64
 19  Western      1682 non-null   int64
dtypes: int64

In [3]:
movie_df.head()

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [4]:
user_df = pd.read_csv('./datasets/movieLens100k/user_demographics.csv')
user_df.drop(columns=['zip_code'], axis=1, inplace=True)
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   sex         943 non-null    object
 3   occupation  943 non-null    object
dtypes: int64(2), object(2)
memory usage: 29.6+ KB


In [5]:
user_df.head()

Unnamed: 0,user_id,age,sex,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other


In [6]:
rating_df = pd.read_csv('./datasets/movieLens100k/ratings.csv')
rating_df.drop(columns=['unix_timestamp'], axis=1, inplace=True)
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   100000 non-null  int64
 1   movie_id  100000 non-null  int64
 2   rating    100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


In [7]:
movie_df.head()

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


## Preprocess ()

In [8]:
movie_id_encoder, user_id_encoder = LabelEncoder(), LabelEncoder()
sex_encoder, occupation_encoder = LabelEncoder(), LabelEncoder()

movie_df['movie_id'] = movie_id_encoder.fit_transform(movie_df['movie_id'])
user_df['user_id'] = user_id_encoder.fit_transform(user_df['user_id'])
rating_df['user_id'] = user_id_encoder.transform(rating_df['user_id'])
rating_df['movie_id'] = movie_id_encoder.transform(rating_df['movie_id'])

user_df['sex'] = sex_encoder.fit_transform(user_df['sex'])
user_df['occupation'] = occupation_encoder.fit_transform(user_df['occupation'])

In [9]:
user_df.head()

Unnamed: 0,user_id,age,sex,occupation
0,0,24,1,19
1,1,53,0,13
2,2,23,1,20
3,3,24,1,19
4,4,33,0,13


In [10]:
len(rating_df)

100000

In [11]:
df = rating_df.join(user_df.set_index('user_id'), on='user_id', validate='m:1')
df = df.join(movie_df.set_index('movie_id'), on='movie_id', validate='m:1')
df.shape

(100000, 25)

In [12]:
df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,195,241,3,49,1,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,185,301,3,39,0,6,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,21,376,1,25,1,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,243,50,2,28,1,19,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,165,345,1,47,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df.sort_values('movie_id')

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
25741,83,0,2,32,1,6,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
93639,805,0,4,27,1,11,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
55726,767,0,5,29,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
49529,91,0,4,32,1,5,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
89079,418,0,4,37,1,9,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75323,862,1677,1,17,1,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67302,862,1678,3,17,1,18,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
80394,862,1679,2,17,1,18,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
92329,895,1680,3,28,1,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df.sort_values('user_id')

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
66567,0,54,5,24,1,19,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
62820,0,202,4,24,1,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10207,0,182,5,24,1,19,0,1,0,0,...,0,0,1,0,0,0,1,1,0,0
9971,0,149,5,24,1,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22496,0,67,4,24,1,19,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96823,942,426,4,22,1,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70902,942,11,5,22,1,18,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
84518,942,283,2,22,1,18,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
72321,942,61,3,22,1,18,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0


## Create hetero dataset

In [15]:
movie_df

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1677,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1678,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1679,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1680,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
rating_df
# movie_df[movie_df.columns[1:]].values

Unnamed: 0,user_id,movie_id,rating
0,195,241,3
1,185,301,3
2,21,376,1
3,243,50,2
4,165,345,1
...,...,...,...
99995,879,475,3
99996,715,203,5
99997,275,1089,1
99998,12,224,2


In [54]:
data = HeteroData()

data['user'].x = torch.Tensor(user_df[user_df.columns[1:]].values)
data['movie'].x = torch.Tensor(movie_df[movie_df.columns[1:]].values)

data['user', 'rates', 'movie'].edge_index = torch.Tensor(rating_df[['user_id', 'movie_id']].values).to(torch.long).swapaxes(1, 0)
data['user', 'rates', 'movie'].edge_label = torch.Tensor(rating_df['rating'].values).to(torch.long)

data

HeteroData(
  [1muser[0m={ x=[943, 3] },
  [1mmovie[0m={ x=[1682, 19] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100000],
    edge_label=[100000]
  }
)

In [49]:
data.has_isolated_nodes(), data.has_self_loops(), data.is_undirected()

(False, False, False)

In [55]:
data = T.ToUndirected()(data)
#data = T.AddSelfLoops()(data)
# data = T.NormalizeFeatures()(data)

data

HeteroData(
  [1muser[0m={ x=[943, 3] },
  [1mmovie[0m={ x=[1682, 19] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100000],
    edge_label=[100000]
  },
  [1m(movie, rev_rates, user)[0m={
    edge_index=[2, 100000],
    edge_label=[100000]
  }
)

In [60]:
del data['movie', 'rev_rates', 'user'].edge_label
data

HeteroData(
  [1muser[0m={ x=[943, 3] },
  [1mmovie[0m={ x=[1682, 19] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100000],
    edge_label=[100000]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100000] }
)

In [64]:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)

In [69]:
# different types of nodes in Hetero graph
node_types, edge_types = data.metadata()
print('Different types of nodes in graph:',node_types)
print('Different types of edges in graph:',edge_types)

Different types of nodes in graph: ['user', 'movie']
Different types of edges in graph: [('user', 'rates', 'movie'), ('movie', 'rev_rates', 'user')]


In [68]:
weight = torch.bincount(train_data['user', 'movie'].edge_label)
weight = weight.max() / weight
weight

tensor([   inf, 5.6057, 2.9977, 1.2643, 1.0000, 1.6152])

In [70]:
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [72]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # these convolutions have been replicated to match the number of edge types
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [73]:
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)
        
    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        # concat user and movie embeddings
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)
        # concatenated embeddings passed to linear layer
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

In [74]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        # z_dict contains dictionary of movie and user embeddings returned from GraphSage
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [77]:
model = Model(hidden_channels=32).to(device)

In [78]:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [79]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'movie'].edge_label_index)
    target = train_data['user', 'movie'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [81]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [83]:
for epoch in range(1, 300):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d},\tLoss: {loss:.4f},\tTrain: {train_rmse:.4f}, '
          f'\tVal: {val_rmse:.4f},\tTest: {test_rmse:.4f}')

Epoch: 001,	Loss: 3.0477,	Train: 1.2656, 	Val: 1.2727,	Test: 1.2719
Epoch: 002,	Loss: 3.0175,	Train: 1.1609, 	Val: 1.1678,	Test: 1.1670
Epoch: 003,	Loss: 2.9603,	Train: 1.1344, 	Val: 1.1405,	Test: 1.1401
Epoch: 004,	Loss: 2.9769,	Train: 1.2588, 	Val: 1.2651,	Test: 1.2651
Epoch: 005,	Loss: 3.0055,	Train: 1.1158, 	Val: 1.1221,	Test: 1.1210
Epoch: 006,	Loss: 2.9971,	Train: 1.1926, 	Val: 1.1998,	Test: 1.1980
Epoch: 007,	Loss: 2.9463,	Train: 1.2088, 	Val: 1.2161,	Test: 1.2140
Epoch: 008,	Loss: 2.9560,	Train: 1.1118, 	Val: 1.1183,	Test: 1.1168
Epoch: 009,	Loss: 3.0050,	Train: 1.2366, 	Val: 1.2429,	Test: 1.2425
Epoch: 010,	Loss: 2.9773,	Train: 1.1589, 	Val: 1.1648,	Test: 1.1644
Epoch: 011,	Loss: 2.9456,	Train: 1.1480, 	Val: 1.1540,	Test: 1.1534
Epoch: 012,	Loss: 2.9533,	Train: 1.2212, 	Val: 1.2276,	Test: 1.2269
Epoch: 013,	Loss: 2.9622,	Train: 1.1484, 	Val: 1.1548,	Test: 1.1537
Epoch: 014,	Loss: 2.9456,	Train: 1.1623, 	Val: 1.1687,	Test: 1.1677
Epoch: 015,	Loss: 2.9376,	Train: 1.2109, 	Val: 1

In [29]:
# from torch_geometric.nn import SAGEConv, GATConv, Linear, to_hetero


# class GNN(torch.nn.Module):
#     def __init__(self, hidden_channels, out_channels):
#         super().__init__()
#         self.conv1 = SAGEConv((-1, -1), hidden_channels)
#         self.conv2 = SAGEConv((-1, -1), out_channels)

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index).relu()
#         x = self.conv2(x, edge_index)
#         return x


# class GAT(torch.nn.Module):
#     def __init__(self, hidden_channels, out_channels):
#         super().__init__()
#         self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
#         self.lin1 = Linear(-1, hidden_channels)
#         self.conv2 = GATConv((-1, -1), out_channels, add_self_loops=False)
#         self.lin2 = Linear(-1, out_channels)

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index) + self.lin1(x)
#         x = x.relu()
#         x = self.conv2(x, edge_index) + self.lin2(x)
#         return x


# gnn_model = GNN(hidden_channels=64, out_channels=1)
# gnn_model = to_hetero(gnn_model, data.metadata(), aggr='sum')

# gat_model = GAT(hidden_channels=64, out_channels=1)
# gat_model = to_hetero(gat_model, data.metadata(), aggr='sum')

In [30]:
model = gnn_model

In [31]:
def train(model, optimizer, loss_f):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    mask = data['paper'].train_mask
    loss = loss_f
    loss.backward()
    optimizer.step()
    return float(loss)

In [32]:
with torch.no_grad():  # Initialize lazy modules.
     out = model(data.x_dict, data.edge_index_dict)
     out

In [33]:
data.edge_index_dict

{('user',
  'rates',
  'movie'): tensor([[ 195,  185,   21,  ...,  275,   12,   11],
         [ 241,  301,  376,  ..., 1089,  224,  202]]),
 ('movie',
  'rev_rates',
  'user'): tensor([[ 241,  301,  376,  ..., 1089,  224,  202],
         [ 195,  185,   21,  ...,  275,   12,   11]])}

In [34]:
train_loader = LinkNeighborLoader(
    train_data,
    # Sample 15 neighbors for each node and each edge type for 2 iterations:
    num_neighbors=[4] * 2,
    # Use a batch size of 128 for sampling training nodes of type "paper":
    batch_size=4,
    edge_label_index=(('user', 'rates', 'movie'), train_data[('user', 'rates', 'movie')].edge_label_index), 
    edge_label=train_data[('user', 'rates', 'movie')].edge_label
    )

batch = next(iter(train_loader))

In [35]:
batch

HeteroData(
  [1muser[0m={ x=[73, 3] },
  [1mmovie[0m={ x=[69, 19] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 75],
    edge_attr=[75, 1],
    edge_label=[4],
    edge_label_index=[2, 4]
  },
  [1m(movie, rev_rates, user)[0m={
    edge_index=[2, 76],
    edge_attr=[76, 1]
  }
)

In [36]:
batch[('user', 'rates', 'movie')].edge_attr

tensor([[2.],
        [3.],
        [3.],
        [3.],
        [3.],
        [4.],
        [4.],
        [5.],
        [5.],
        [5.],
        [4.],
        [3.],
        [5.],
        [5.],
        [4.],
        [4.],
        [1.],
        [4.],
        [2.],
        [5.],
        [2.],
        [4.],
        [4.],
        [3.],
        [5.],
        [1.],
        [3.],
        [2.],
        [3.],
        [2.],
        [5.],
        [4.],
        [4.],
        [3.],
        [5.],
        [3.],
        [3.],
        [4.],
        [5.],
        [3.],
        [3.],
        [4.],
        [4.],
        [3.],
        [3.],
        [4.],
        [5.],
        [4.],
        [3.],
        [5.],
        [3.],
        [5.],
        [4.],
        [4.],
        [3.],
        [3.],
        [3.],
        [2.],
        [3.],
        [3.],
        [3.],
        [5.],
        [3.],
        [2.],
        [4.],
        [3.],
        [4.],
        [3.],
        [2.],
        [4.],
        [3.],
      

In [37]:
train_data.edge_label

AttributeError: 'HeteroData' has no attribute 'edge_label'