# Setup

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
os.environ["CUDA_LAUNCH_BLOCKING"]="0, 1, 2, 3, 4"

In [2]:
import torch
import random
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch_geometric.data import HeteroData
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.loader import DataLoader, NeighborLoader, HGTLoader
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score

In [4]:
torch.cuda.is_available()

False

In [5]:
torch.__version__

'1.13.1+cu117'

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
items_csv = "../Datasets/ml-100k/Text/items.csv"
train_ratings = "../Datasets/ml-100k/Text/u1.base"
test_ratings = "../Datasets/ml-100k/Text/u1.test"
item_path = "../Datasets/ml-100k/"

In [8]:
n_users = 943
n_items = 1682

## Construct Graph

In [9]:
# class MovielensDataset():
#     def __init__(self, ratings = train_ratings, item_path = item_path, device = device):
#         self.video_embeddings = pd.read_csv(item_path + "Video/embeddings.csv").to_numpy()
#         self.audio_embeddings = pd.read_csv(item_path + "Audio/embeddings.csv").to_numpy()
#         self.meta_embeddings = pd.read_csv(item_path + "Meta/embeddings.csv").to_numpy()
#         self.text_embeddings = pd.read_csv(item_path + "Text/embeddings.csv").to_numpy()
#         self.user_embeddings = pd.read_csv(item_path + "User/embeddings.csv").to_numpy()
#         self.ratings = pd.read_csv(ratings, sep='\t', 
#                                    names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],encoding='latin-1')
#         self.indices = None
#         self.device = device
#         self.data = None
#         self.n_users = None
#         self.n_items = None
#         self.dataset = HeteroData()
#         self.fill_ratings()
#         self.embeddings()
    
#     def fill_ratings(self, threshold=4):
#         self.n_users = self.ratings.user_id.unique().shape[0]
#         self.n_items = self.ratings.movie_id.unique().shape[0]
#         self.edge_index = []
#         self.edge_label = []
        
#         self.data = np.zeros((n_users, n_items))
#         for line in self.ratings.itertuples():
#             if(line[3] >= 1):
#                 self.data[line[1] - 1, line[2] - 1] = line[3]
#                 self.edge_index.append(torch.tensor([line[1] - 1, line[2] - 1], dtype = torch.long))
#                 self.edge_label.append(line[3] - 1)

#         self.edge_index = torch.stack(self.edge_index, 1).to(self.device)
#         self.edge_label = torch.tensor(self.edge_label, dtype = torch.long).to(self.device)
    
#     def embeddings(self):
#         self.audio_embeddings = np.nan_to_num(self.audio_embeddings)
#         self.video_embeddings = np.nan_to_num(self.video_embeddings)
#         self.audio_embeddings = normalize(self.audio_embeddings, axis = 0)
        
#         self.dataset['movies'].x = torch.tensor(self.audio_embeddings, dtype = torch.float).to(self.device)
#         self.dataset['users'].x  = torch.tensor(self.user_embeddings, dtype = torch.float).to(self.device)
#         self.dataset['users', 'likes', 'movies'].edge_index = self.edge_index
#         self.dataset['users', 'likes', 'movies'].edge_label  = self.edge_label

In [10]:
class MovielensDataset():
    def __init__(self, ratings = train_ratings, item_path = item_path, device = device):
        self.video_embeddings = pd.read_csv(item_path + "Video/embeddings.csv").to_numpy()
        self.audio_embeddings = pd.read_csv(item_path + "Audio/embeddings.csv").to_numpy()
        self.meta_embeddings = pd.read_csv(item_path + "Meta/embeddings.csv").to_numpy()
        self.text_embeddings = pd.read_csv(item_path + "Text/embeddings.csv").to_numpy()
        self.user_embeddings = pd.read_csv(item_path + "User/embeddings.csv").to_numpy()
        self.ratings = pd.read_csv(ratings, sep='\t', 
                                   names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],encoding='latin-1')
        self.indices = None
        self.device = device
        self.data = None
        self.n_users = None
        self.n_items = None
        self.dataset = HeteroData()
        self.fill_ratings()
        self.embeddings()
    
    def fill_ratings(self, threshold=4):
        self.n_users = self.ratings.user_id.unique().shape[0]
        self.n_items = self.ratings.movie_id.unique().shape[0]
        self.edge_index = []
        self.edge_label = []
        
        self.data = np.zeros((n_users, n_items))
        for line in self.ratings.itertuples():
            if(line[3] >= 1):
                self.data[line[1] - 1, line[2] - 1] = line[3]
                self.edge_index.append(torch.tensor([line[1] - 1, line[2] - 1], dtype = torch.long))
                self.edge_label.append(line[3] - 1)
        
        self.edge_index = torch.stack(self.edge_index, 1).to(self.device)
        self.edge_label = torch.tensor(self.edge_label, dtype = torch.long).to(self.device)
    
    def embeddings(self):
#         self.audio_embeddings = np.nan_to_num(self.audio_embeddings)
#         self.video_embeddings = np.nan_to_num(self.video_embeddings)
#         self.audio_embeddings = normalize(self.audio_embeddings, axis = 0)
        in_ = torch.cat( (torch.tensor(self.text_embeddings, dtype = torch.float).to(self.device),
                        torch.tensor(self.video_embeddings, dtype = torch.float).to(self.device)), axis=1)
#         print(in_.shape)
        self.dataset['movies'].x  = in_
#         self.dataset['movies'].x = torch.tensor(self.meta_embeddings, dtype = torch.float).to(self.device)
        self.dataset['users'].x  = torch.tensor(self.user_embeddings, dtype = torch.float).to(self.device)
#         self.dataset['movies'].x = torch.tensor(self.data.T, dtype = torch.float).to(self.device)
#         self.dataset['users'].x  = torch.tensor(self.data, dtype = torch.float).to(self.device)
        self.dataset['users', 'likes', 'movies'].edge_index = self.edge_index
        self.dataset['users', 'likes', 'movies'].edge_label  = self.edge_label

In [11]:
train_data = MovielensDataset(ratings = train_ratings).dataset
test_data = MovielensDataset(ratings = test_ratings).dataset

In [12]:
train_data

HeteroData(
  [1mmovies[0m={ x=[1682, 1408] },
  [1musers[0m={ x=[943, 1220] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  }
)

In [13]:
test_data

HeteroData(
  [1mmovies[0m={ x=[1682, 1408] },
  [1musers[0m={ x=[943, 1220] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  }
)

## DataLoader

In [25]:
import torch_geometric.transforms as T

train_data  = T.ToUndirected()(train_data)
test_data  = T.ToUndirected()(test_data)
train_data, val_data, temp = T.RandomLinkSplit(edge_types=[('users', 'likes', 'movies')], 
                                            rev_edge_types=[('movies', 'rev_likes', 'users')],
                                            num_val = 0,
                                            num_test = 0)(train_data)

In [15]:
train_data

HeteroData(
  [1mmovies[0m={ x=[1682, 1408] },
  [1musers[0m={ x=[943, 1220] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[160000],
    edge_label_index=[2, 160000]
  },
  [1m(movies, rev_likes, users)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  }
)

In [16]:
val_data

HeteroData(
  [1mmovies[0m={ x=[1682, 1408] },
  [1musers[0m={ x=[943, 1220] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 80000],
    edge_label=[0],
    edge_label_index=[2, 0]
  },
  [1m(movies, rev_likes, users)[0m={
    edge_index=[2, 80000],
    edge_label=[80000]
  }
)

In [17]:
test_data

HeteroData(
  [1mmovies[0m={ x=[1682, 1408] },
  [1musers[0m={ x=[943, 1220] },
  [1m(users, likes, movies)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  },
  [1m(movies, rev_likes, users)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  }
)

In [18]:
test_data['users', 'movies'].edge_label_index = test_data['users', 'movies'].edge_index

In [19]:
# train_loader = HGTLoader(
#     train_data,
#     # Sample 15 neighbors for each node and each edge type for 2 iterations:
#     num_neighbors = [15] * 2,
#     # Use a batch size of 128 for sampling training nodes of type "paper":
#     batch_size = 10,
#     input_nodes = torch.arange(0, n_users)
# )

In [20]:
# sampled_data = next(iter(train_loader))

In [21]:
weight = torch.bincount(train_data['users', 'movies'].edge_label)
weight = weight.max() / weight

In [22]:
def weighted_mse_loss(pred, target, weight=weight):
    target = target.long()
    weight = weight[target].to(pred.dtype)
    loss = (pred - target.to(pred.dtype)).pow(2)
    return ((weight * loss).mean(), loss.mean())

In [23]:
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
import torch_geometric.nn as nng 
from collections import OrderedDict
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = nng.GATConv((-1, -1), hidden_channels)#, train_data.metadata())
        self.conv2 = nng.GATConv((-1, -1), hidden_channels)#, train_data.metadata())
        self.conv3 = nng.GATConv((-1, -1), out_channels)#, train_data.metadata())

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).sigmoid()
        x = self.conv2(x, edge_index).relu()
        return x
    
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, hidden_channels)
        self.lin3 = nn.Linear(hidden_channels, hidden_channels)
        self.lin4 = nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['users'][row], z_dict['movies'][col]], dim=-1)
        z = self.lin1(z).sigmoid()
#         z = self.lin2(z).relu()
#         z = self.lin3(z).sigmoid()
        z = self.lin4(z).relu()
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = nng.to_hetero(self.encoder, train_data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)
        
    def forward(self, X):
        x_dict, edge_index_dict, edge_label_index = X.x_dict, X.edge_index_dict, X['users', 'movies'].edge_label_index
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)
    
    def train(self, train_data = train_data, epochs = 1000):
        self.loss = weighted_mse_loss
        self.optim = torch.optim.Adam(self.parameters(), lr = 1e-4)
        self.train_loss = []
        self.val_loss = []
        for i in tqdm(range(epochs)):
            self.optim.zero_grad()
            otpt = self.forward(train_data)
            trgt = train_data['users', 'movies'].edge_label.float()
            loss, loss_ = self.loss(otpt, trgt)
            loss.backward()
            self.optim.step()
            self.train_loss.append(torch.sqrt(loss_).cpu().detach().numpy())
            self.val_loss.append(self.test()[0].cpu())
        return(self.train_loss, self.val_loss)
            
    def test(self, data = test_data):
        with torch.no_grad():
            otpt_cont = self.forward(data)
            print(otpt_cont)
            trgt_cont = data['users', 'movies'].edge_label.float()
            loss, loss_ = self.loss(otpt_cont, trgt_cont)
            trgt = torch.div(trgt_cont, 4, rounding_mode="floor").cpu()
            otpt = (otpt_cont > 2.5).float().cpu()
            print(classification_report(trgt, otpt, zero_division=0))
        return(torch.sqrt(loss_),
               precision_score(trgt, otpt, zero_division=0, average='weighted'), 
               recall_score(trgt, otpt, average='weighted'))

model = Model(hidden_channels = 500).to(device)

ValueError: 'add_self_loops' attribute set to 'True' on module 'GATConv((-1, -1), 500, heads=1)' for use with edge type(s) '[('users', 'likes', 'movies'), ('movies', 'rev_likes', 'users')]'. This will lead to incorrect message passing results.

In [None]:
output = model.train(train_data, epochs= 1000)

In [None]:
model.test(train_data)

In [None]:
min(output[1])

In [None]:
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

fig, ax = plt.subplots()
ax2 = ax.twinx()
ax.plot(output[0][50:], label = "Train Loss", color = "orange")
ax2.plot(output[1][50:], label = "Test Loss")
fig.legend([ax, ax2], labels = ["Train Loss", "Test Loss"], loc = "upper right")
plt.show()

In [None]:
pred = model(train_data).cpu().detach().numpy()

In [None]:
true = train_data['users', 'movies'].edge_label.cpu().detach().numpy()

In [None]:
list(zip(pred, true))