# Setup

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"
os.environ["CUDA_LAUNCH_BLOCKING"]="0, 1, 2, 3, 4"

In [2]:
import torch
import random
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch_geometric.data import HeteroData
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.loader import DataLoader, NeighborLoader, HGTLoader, NeighborLoader
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score

In [4]:
torch.cuda.is_available()

False

In [5]:
torch.__version__

'1.13.1+cu117'

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
items_csv = "../Datasets/ml-100k/Text/items.csv"
train_ratings = "../Datasets/ml-100k/Text/u1.base"
test_ratings = "../Datasets/ml-100k/Text/u1.test"
item_path = "../Datasets/ml-100k/"

In [8]:
n_users = 943
n_items = 1682

## Construct Graph

In [9]:
class MovielensDataset():
    def __init__(self, ratings = train_ratings, item_path = item_path, device = device):
        self.video_embeddings = pd.read_csv(item_path + "Video/embeddings.csv").to_numpy()
        self.audio_embeddings = pd.read_csv(item_path + "Audio/embeddings.csv").to_numpy()
        self.meta_embeddings = pd.read_csv(item_path + "Meta/embeddings.csv").to_numpy()
        self.text_embeddings = pd.read_csv(item_path + "Text/embeddings.csv").to_numpy()
        self.user_embeddings = pd.read_csv(item_path + "User/embeddings.csv").to_numpy()
        self.ratings = pd.read_csv(ratings, sep='\t', 
                                   names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],encoding='latin-1')
        self.indices = None
        self.device = device
        self.data = None
        self.n_users = None
        self.n_items = None
        self.dataset = HeteroData()
        self.fill_ratings()
        self.embeddings()
    
    def fill_ratings(self, threshold=4):
        self.n_users = self.ratings.user_id.unique().shape[0]
        self.n_items = self.ratings.movie_id.unique().shape[0]
        self.edge_index = []
        self.edge_label = []
        
        self.data = np.zeros((n_users, n_items))
        for line in self.ratings.itertuples():
            if(line[3] >= 1):
                self.data[line[1] - 1, line[2] - 1] = line[3]
                self.edge_index.append(torch.tensor([line[1] - 1, line[2] - 1], dtype = torch.long))
                self.edge_label.append(line[3] - 1)

        self.edge_index = torch.stack(self.edge_index, 1).to(self.device)
        self.edge_label = torch.tensor(self.edge_label, dtype = torch.long).to(self.device)
    
    def embeddings(self):
        self.audio_embeddings = np.nan_to_num(self.audio_embeddings)
        self.video_embeddings = np.nan_to_num(self.video_embeddings)
        self.audio_embeddings = normalize(self.audio_embeddings, axis = 0)
        
#         self.dataset['movies'].x = torch.tensor(self.text_embeddings, dtype = torch.float).to(self.device)
#         self.dataset['users'].x  = torch.tensor(self.user_embeddings, dtype = torch.float).to(self.device)
        self.dataset['movies'].x = torch.tensor(self.data.T, dtype = torch.float).to(self.device)
        self.dataset['users'].x  = torch.tensor(self.data, dtype = torch.float).to(self.device)
        self.dataset['users', 'likes', 'movies'].edge_index = self.edge_index
        self.dataset['users', 'likes', 'movies'].edge_label  = self.edge_label

In [10]:
train_data = MovielensDataset(ratings = train_ratings).dataset
test_data = MovielensDataset(ratings = test_ratings).dataset

In [11]:
train_data

HeteroData(
  [1mmovies[0m={ x=[1682, 943] },
  [1musers[0m={ x=[943, 1682] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  }
)

In [12]:
test_data

HeteroData(
  [1mmovies[0m={ x=[1682, 943] },
  [1musers[0m={ x=[943, 1682] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  }
)

## DataLoader

In [18]:
import torch_geometric.transforms as T
import torch_geometric.utils as utils

train_data  = T.ToUndirected()(train_data)
test_data  = T.ToUndirected()(test_data)
train_data, val_data, temp = T.RandomLinkSplit(edge_types=[('users', 'likes', 'movies')], 
                                            rev_edge_types=[('movies', 'rev_likes', 'users')],
                                            # is_undirected = True,
                                            num_val = 0,
                                            num_test = 0)(train_data)

In [19]:
# print(val_data['users', 'movies'].edge_label.unique())

In [20]:
train_data

HeteroData(
  [1mmovies[0m={ x=[1682, 943] },
  [1musers[0m={ x=[943, 1682] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[160000],
    edge_label_index=[2, 160000]
  },
  [1m(movies, rev_likes, users)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  },
  [1m(users, rev_rev_likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  }
)

In [21]:
val_data

HeteroData(
  [1mmovies[0m={ x=[1682, 943] },
  [1musers[0m={ x=[943, 1682] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[0],
    edge_label_index=[2, 0]
  },
  [1m(movies, rev_likes, users)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  },
  [1m(users, rev_rev_likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  }
)

In [22]:
test_data

HeteroData(
  [1mmovies[0m={ x=[1682, 943] },
  [1musers[0m={ x=[943, 1682] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  },
  [1m(movies, rev_likes, users)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  },
  [1m(users, rev_rev_likes, movies)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  }
)

In [23]:
test_data['users', 'movies'].edge_label_index = test_data['users', 'movies'].edge_index

TypeError: HeteroData.get_edge_store() missing 1 required positional argument: 'dst'

In [None]:
# from torch_geometric.loader import LinkNeighborLoader

# loader = LinkNeighborLoader(
#     train_data,
#     num_neighbors=[30] * 2,
#     batch_size=128,
#     edge_label_index=train_data['users', 'likes', 'movies'].edge_index,
# )

loader = HGTLoader(
    train_data,
    # Sample 512 nodes per type and per iteration for 4 iterations
    num_samples={key: [512] * 4 for key in train_data.node_types},
    # Use a batch size of 128 for sampling training nodes of type paper
    batch_size=128,
    input_nodes='movies',
)

In [None]:
sampled_data = next(iter(loader))

In [None]:
weight = torch.bincount(train_data['users', 'movies'].edge_label)
weight = weight.max() / weight

In [None]:
def weighted_mse_loss(pred, target, weight=weight):
    target = target.long()
    weight = weight[target].to(pred.dtype)
    loss = (pred - target.to(pred.dtype)).pow(2)
    return ((weight * loss).mean(), loss.mean())

In [None]:
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
from collections import OrderedDict
import torch_geometric.nn as nng

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels = 100, out_channels = 100):
        super().__init__()
        self.conv1 = nng.GATConv((-1, -1), hidden_channels)#, train_data.metadata())
        self.conv2 = nng.GATConv((-1, -1), out_channels)#, train_data.metadata())
        self.conv3 = nng.GATConv((-1, -1), out_channels)#, train_data.metadata())
#         self.conv1 = nng.HGTConv(-1, hidden_channels, train_data.metadata())
#         self.conv2 = nng.HGTConv(-1, out_channels, train_data.metadata())
#         self.conv3 = nng.HGTConv(-1, out_channels, train_data.metadata())

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = self.conv3(x, edge_index).relu()
        return x
    
class SiameseDecoder(torch.nn.Module):
    def __init__(self, hidden_channels = 100, out_channels = 300):
        super().__init__()
#         self.encoder_user = nn.Sequential(OrderedDict([
#             ('linr1', nn.Linear(hidden_channels, 256)),
#             ('relu1', nn.Sigmoid()),
#             ('linr2', nn.Linear(256, out_channels)),
#             ('relu2', nn.LeakyReLU()),
#         ]))
        
#         self.encoder_item = nn.Sequential(OrderedDict([
#             ('linr1', nn.Linear(hidden_channels, 256)),
#             ('relu1', nn.Sigmoid()),
#             ('linr2', nn.Linear(256, out_channels)),
#             ('relu2', nn.LeakyReLU()),
#         ]))
        
        self.siamese = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(hidden_channels, 200)),
            ('relu1', nn.LeakyReLU()),
            ('linr2', nn.Linear(200, 100)),
#             ('relu2', nn.LeakyReLU()),
#             ('linr3', nn.Linear(256, 100)),
#             ('relu3', nn.LeakyReLU()),
        ]))
        
        self.ffn = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(2000, 164)),
            ('actv1', nn.ReLU()),
            ('linr2', nn.Linear(164, 1)),
        ]))

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
#         print(z_dict['movies'][row])
#         z_i = self.encoder_item(z_dict['movies'][row])
        z_i = self.siamese(z_dict['movies'][row])
#         z_u = self.encoder_user(z_dict['users'][row])
        z_u = self.siamese(z_dict['users'][row])
#         z = self.ffn(torch.cat((z_i, z_u), axis=1))
        z = self.ffn(torch.cat((z_dict['users'][row], z_dict['movies'][row]), axis=1))
        return(z_i, z_u, z)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = nng.to_hetero(self.encoder, train_data.metadata(), aggr='sum')
        self.decoder = SiameseDecoder(hidden_channels)
        
    def forward(self, data):
        x_dict, edge_index_dict, edge_label_index = data.x_dict, data.edge_index_dict, data['users', 'movies'].edge_label_index
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)
    
    def train(self, data = train_data, epochs = 1000):
        self.pred_loss = weighted_mse_loss
        self.embd_loss = nn.CosineEmbeddingLoss()
        self.optim = torch.optim.Adam(self.parameters(), lr = 1e-4)
        self.train_loss = []
        self.val_loss = []
        for i in tqdm(range(epochs)):
            self.optim.zero_grad()
            otpt = self.forward(data)
            trgt = data['users', 'movies'].edge_label.float()
            loss, loss_ = self.pred_loss(otpt[2], trgt)
            loss += self.embd_loss(otpt[0], otpt[1], torch.div(trgt, 2, rounding_mode='floor') * 2 - 1)
            loss.backward()
            self.optim.step()
            self.train_loss.append(torch.sqrt(loss_).cpu().detach().numpy())
            self.val_loss.append(self.test().cpu().detach().numpy())
        return(self.train_loss, self.val_loss)
            
    def test(self, data = test_data):
        with torch.no_grad():
            otpt = self.forward(data)
            trgt = data['users', 'movies'].edge_label.float()
            _, loss = self.pred_loss(otpt[2], trgt)
            return(torch.sqrt(loss))

model = Model(hidden_channels = 1000).to(device)

In [None]:
output = model.train(train_data)

In [None]:
model.test(test_data)

In [None]:
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

fig, ax = plt.subplots()
ax2 = ax.twinx()
ax.plot(output[0][50:], label = "Train Loss", color = "orange")
ax2.plot(output[1][50:], label = "Test Loss")
fig.legend([ax, ax2], labels = ["Train Loss", "Test Loss"], loc = "upper right")
plt.show()

In [None]:
pred = model(train_data)[2].cpu().detach().numpy()

In [None]:
true = train_data['users', 'movies'].edge_label.cpu().detach().numpy()

In [None]:
list(zip(pred, true))