## Install Libraries & Prepare the Working Directory

In [201]:
# %pip install scikit-optimize
# %pip install boto3
# %pip install dgl==0.9.0
# %pip install dgl-cu101

from torch.multiprocessing import Pool, Process, set_start_method
import dgl.function as dgl_func
import dgl
import torch
import dgl.function as fn
import dgl.nn.pytorch as dglnn
from dgl.dataloading import MultiLayerNeighborSampler, as_edge_prediction_sampler
import pandas as pd
import copy
import torch
import torch.nn as nn
import torch.nn.functional as torch_nn_func
import dgl
import dgl.nn.pytorch as dglnn
import dgl.function as dgl_func
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import timedelta
from collections import defaultdict
from typing import Tuple
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from datetime import datetime
import textwrap
import math
import sys
import time
import pickle



try:
    set_start_method('spawn')
except RuntimeError:
    pass

## Đọc dữ liệu user, product, interact từ csv

In [202]:
user_feat_df = pd.read_csv('./collect_data/user.csv')
interact_df = pd.read_csv('./collect_data/rating.csv')
product_feat_df = pd.read_csv('./collect_data/product.csv')
interact_df.sort_values(by='Timestamp')
interact_df.head()
print(user_feat_df.shape)
print(product_feat_df.shape)

(6040, 4)
(33, 12)


In [203]:
duplicate_rows = interact_df.duplicated(subset=['UserID', 'ItemID'], keep=False)
num_duplicates = duplicate_rows.sum()
num_duplicates

0

In [204]:
interact_df

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,U00006040,P0017,3,956704584
1,U00006037,P0017,4,956709701
2,U00006036,P0024,2,956710067
3,U00006036,P0028,4,956710566
4,U00006035,P0025,1,956710846
...,...,...,...,...
15924,U00003391,P0005,2,1046188297
15925,U00003391,P0003,3,1046188641
15926,U00003391,P0025,3,1046189237
15927,U00003391,P0026,4,1046189862


In [205]:
interact_df[interact_df['Rating'] < 3].shape

(2270, 4)

## Chia tập train và test

In [206]:
test_size = 0.28
split_idx = int(len(interact_df) * (1 - test_size))
rating_train_df = interact_df[: split_idx]
rating_test_df = interact_df[split_idx: ]
rating_all_df = interact_df[:]

In [207]:
print(rating_train_df.head())
print(interact_df.shape)
print(rating_train_df.shape)
print(rating_test_df.shape)
print(rating_all_df.shape)

      UserID ItemID  Rating  Timestamp
0  U00006040  P0017       3  956704584
1  U00006037  P0017       4  956709701
2  U00006036  P0024       2  956710067
3  U00006036  P0028       4  956710566
4  U00006035  P0025       1  956710846
(15929, 4)
(11468, 4)
(4461, 4)
(15929, 4)


In [208]:
rating_all_df.shape

(15929, 4)

## Xử lý và đặt id node cho user trên tập train

In [209]:
user_id_train_df = pd.DataFrame(rating_train_df['UserID'].unique(), columns=['UserID'])
user_id_train_df.shape
user_id_train_df['user_new_id'] = user_id_train_df.index
user_id_train_df.shape

(3334, 2)

In [210]:
user_id_table = user_id_train_df[['UserID', 'user_new_id']]
user_id_table.to_csv('user_id_table.csv', index=False)

## Xử lý và đặt id node cho sản phẩm (lấy tất cả sản phẩm)

In [211]:
train_product = rating_train_df['ItemID'].unique().tolist()
all_product = product_feat_df['ItemID'].unique().tolist()

unbuy_product = [product for product in all_product if product not in train_product]
train_product.extend(unbuy_product)  

In [212]:
rating_train_df.sort_values(by='ItemID', ignore_index=True, inplace=False)  

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,U00004506,P0001,5,964981763
1,U00005557,P0001,5,959440917
2,U00005206,P0001,4,961629983
3,U00001552,P0001,4,974742562
4,U00002849,P0001,4,972509670
...,...,...,...,...
11463,U00003847,P0032,4,965880572
11464,U00003850,P0032,5,965880197
11465,U00004562,P0032,4,967430875
11466,U00003650,P0033,3,966460690


In [213]:

product_id_all_df = pd.DataFrame(train_product, columns=['ItemID'])
product_id_all_df['product_new_id'] = product_id_all_df.index
product_id_all_df.shape

(33, 2)

In [214]:
rating_train_df.sort_values(by='ItemID', ignore_index=True, inplace=False)  

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,U00004506,P0001,5,964981763
1,U00005557,P0001,5,959440917
2,U00005206,P0001,4,961629983
3,U00001552,P0001,4,974742562
4,U00002849,P0001,4,972509670
...,...,...,...,...
11463,U00003847,P0032,4,965880572
11464,U00003850,P0032,5,965880197
11465,U00004562,P0032,4,967430875
11466,U00003650,P0033,3,966460690


In [215]:
product_id_all_df.sort_values(by='ItemID', ignore_index=True, inplace=False)  

Unnamed: 0,ItemID,product_new_id
0,P0001,11
1,P0002,18
2,P0003,14
3,P0004,6
4,P0005,13
5,P0006,15
6,P0007,12
7,P0008,16
8,P0009,28
9,P0010,19


In [216]:
product_id_table = product_id_all_df[['ItemID', 'product_new_id']]
product_id_table.to_csv('product_id_table.csv', index=False)

## Xử lý df rating trên tập train

In [217]:
rating_train_df = rating_train_df.merge(user_id_train_df, how='left', on='UserID')
rating_train_df = rating_train_df.merge(product_id_all_df, how='left', on='ItemID')

rating_train_df.drop_duplicates(subset=['Rating', 'user_new_id', 'product_new_id'], keep='last', inplace=True)
rating_train_df.sort_values(by=['Rating', 'user_new_id', 'product_new_id'], ignore_index=True, inplace=True) 
rating_train_df.sort_values(by='Timestamp', ignore_index=True, inplace=True)  
rating_train_df.shape

(11468, 6)

## Xử lý df rating trên tập test

In [218]:
rating_test_df = rating_test_df.merge(user_id_train_df, how='inner', on='UserID')
rating_test_df = rating_test_df.merge(product_id_all_df, how='inner', on='ItemID')
user_id_rating_test_src_df = rating_test_df['user_new_id'].values
product_id_rating_test_dst_df = rating_test_df['product_new_id'].values
ground_truth_test = (user_id_rating_test_src_df, product_id_rating_test_dst_df)
user_id_rating_test_src_df.shape

(1045,)

In [219]:
rating_train_df[(rating_train_df['UserID'] == 'U00002389') & (rating_train_df['Rating'] == 1)]


Unnamed: 0,UserID,ItemID,Rating,Timestamp,user_new_id,product_new_id


In [220]:
rating_test_df[(rating_test_df['UserID'] == 'U00002389') & (rating_test_df['Rating'] == 1)]


Unnamed: 0,UserID,ItemID,Rating,Timestamp,user_new_id,product_new_id


## Tạo ma trận liền kề trên tập train
- src: các id node user
- dst: các id node product
- w: điểm rating của user cho product


In [221]:
rating_train_df.shape

(11468, 6)

In [222]:
adjacency_dict = {}

adjacency_dict['user_product_rating'] = rating_train_df.Rating.values
adjacency_dict['user_product_src'] = rating_train_df.user_new_id.values
adjacency_dict['user_product_dst'] = rating_train_df.product_new_id.values

In [223]:
np.unique(adjacency_dict['user_product_dst']).size

33

## Tạo graph Heterogeneous để train
- 2 loại cạnh: rating và bought-by
- 2 loại node: user và node

In [224]:
graph_schema = {('user', 'rating', 'product'): list(zip(adjacency_dict['user_product_src'], adjacency_dict['user_product_dst'])),
                ('product', 'bought-by', 'user'): list(zip(adjacency_dict['user_product_dst'], adjacency_dict['user_product_src']))
               }

In [225]:
len(list(zip(adjacency_dict['user_product_src'], adjacency_dict['user_product_dst'])))

11468

In [226]:
hetero_graph = dgl.heterograph(graph_schema)
print('Graph --> \n', hetero_graph) 

Graph --> 
 Graph(num_nodes={'product': 33, 'user': 3334},
      num_edges={('product', 'bought-by', 'user'): 11468, ('user', 'rating', 'product'): 11468},
      metagraph=[('product', 'user', 'bought-by'), ('user', 'product', 'rating')])


## Rút trích đặc tính của user train
- Hiện tại chỉ có giới tính

In [227]:
user_feat_df.head()
user_feat_df.shape

(6040, 4)

In [228]:
user_id_train_df.shape

(3334, 2)

In [229]:
# Join với user được training để lấy những feature của các user đó và bỏ các user còn lại -> có 3664 user được training
user_feat_train_df = user_feat_df.merge(user_id_train_df, how='inner', on='UserID')
ids = user_feat_train_df.user_new_id.values.astype(int)
feats = np.stack((user_feat_train_df.F.values, user_feat_train_df.M.values), axis=1)

user_feat = np.zeros((hetero_graph.number_of_nodes('user'), 2))
user_feat[ids] = feats

user_feat = torch.tensor(user_feat).float()

In [230]:
user_feat.shape

torch.Size([3334, 2])

In [231]:
user_feat_train_df.shape

(3334, 5)

## Rút trích đặc tính của product
- 10 hương vị

In [232]:
print(product_feat_df.shape)
print(product_id_all_df.shape)

(33, 12)
(33, 2)


In [233]:
# có 33 sp và cả 33 được đi training 
product_feat_df = product_feat_df.merge(product_id_all_df, how='left', on='ItemID')
product_feat_df = product_feat_df[product_feat_df.product_new_id < hetero_graph.number_of_nodes('product')]  

ids = product_feat_df.product_new_id.values.astype(int)

feats = np.stack((product_feat_df.Type.values,
                  product_feat_df.Coffee.values,
                  product_feat_df.Tea.values,
                  product_feat_df.Chocolate.values,
                  product_feat_df.Matcha.values,
                  product_feat_df.Cream.values,
                  product_feat_df.Vani.values,
                  product_feat_df.Milk.values,
                  product_feat_df.Pumpkin.values,
                  product_feat_df.Cheese.values,
                  ),
                 axis=1)

product_feat = np.zeros((hetero_graph.number_of_nodes('product'), feats.shape[1]))
product_feat[ids] = feats
product_feat = torch.tensor(product_feat).float()
product_feat.shape

torch.Size([33, 10])

## Thêm các giá trị vào node và cạnh cho graph train

### Thêm các giá trị vào 2 node user và product 

In [234]:
features_dict = {}
features_dict['user_feat'] = user_feat
features_dict['product_feat'] = product_feat

In [235]:
hetero_graph.nodes['user'].data['features'] = features_dict['user_feat']
hetero_graph.nodes['product'].data['features'] = features_dict['product_feat']

### Thêm các giá trị vào cạnh rating và bought-by

In [236]:
hetero_graph.edges['rating'].data['rating'] = torch.tensor(adjacency_dict['user_product_rating'])
hetero_graph.edges['bought-by'].data['rating'] = torch.tensor(adjacency_dict['user_product_rating'])

In [237]:
torch.tensor(adjacency_dict['user_product_rating'])

tensor([3, 4, 2,  ..., 1, 4, 4])

## Node Embeddings

* We calculate Node embeddings to represent nodes as vectors an capture the topology of the network
* The embeddings --> based on similarity.
* We will use the embeddings for prediction tasks.

In [238]:
class NodeEmbedding(nn.Module):
    def __init__(self, input_features, output_features,):
        super().__init__()
        self.project_features = nn.Linear(input_features, output_features)

    def forward(self, node_features):
        return self.project_features(node_features)

## Message passing layer

In [239]:

class MessagePassing(nn.Module):
    
      def __init__(self, input_features, output_features, dropout,):
        super().__init__()
        self._in_neigh_feats, self._in_self_feats = input_features
        self._output_features = output_features 
        self.dropout = nn.Dropout(dropout)
        self.fc_self = nn.Linear(self._in_self_feats, output_features, bias=False)
        self.fc_neighbour = nn.Linear(self._in_neigh_feats, output_features, bias=False)
        self.fc_pre_agg = nn.Linear(self._in_neigh_feats, self._in_neigh_feats, bias=False)
      
      def forward(self, graph, x):
        
        h_neighbor, h_self = x
        h_self = self.dropout(h_self)
        h_neighbor = self.dropout(h_neighbor)
        

        graph.srcdata['h'] = torch_nn_func.relu(self.fc_pre_agg(h_neighbor))
        graph.update_all(dgl_func.copy_u('h', 'm'), dgl_func.mean('m', 'neigh'))
        h_neighbor = graph.dstdata['neigh']

        #message passing
        z = self.fc_self(h_self) + self.fc_neighbour(h_neighbor)
        z = torch_nn_func.relu(z)

        z_normalization = z.norm(2, 1, keepdim=True)
        z_normalization = torch.where(z_normalization == 0, torch.tensor(1.).to(z_normalization), z_normalization)
        z = z / z_normalization

        return z

## Calculate Similarity between Users & Products

Ref: https://docs.dgl.ai/en/0.6.x/guide/training-edge.html#guide-training-edge-classification

### Prediction for NN Similarity

In [240]:
class NnSimilarityPredictingLayer(nn.Module):

    def __init__(self, embedding_layer_dim: int):
        super(NnSimilarityPredictingLayer, self).__init__()
        self.hidden_layer_1 = nn.Linear(2 * embedding_layer_dim, hidden_layer_1_output_dim)
        self.hidden_layer_2 = nn.Linear(hidden_layer_1_output_dim, hidden_layer_2_output_dim)
        self.output = nn.Linear(hidden_layer_2_output_dim, 1)
        self.relu_layer = nn.ReLU()
        self.sigmoid_layer = nn.Sigmoid()

    def forward(self, x):
        x = self.hidden_layer_1(x)
        x = self.relu_layer(x)
        x = self.hidden_layer_2(x)
        x = self.relu_layer(x)
        x = self.output(x)
        x = self.sigmoid_layer(x)
        return x

class NnPredictingModule(nn.Module):
    def __init__(self, predicting_layer, embed_dim: int):
        super(NnPredictingModule, self).__init__()
        self.layer_nn = NnSimilarityPredictingLayer(embed_dim)

    def forward(self, graph, h ):
        ratings_dict = {}
        edge_types_list = ['user', 'product']

        for edge_type in graph.canonical_etypes:
            if edge_type[0] in edge_types_list and edge_type[2] in edge_types_list:
                u_type, _, v_type = edge_type
                src_nid, dst_nid = graph.all_edges(etype=edge_type) 
                emb_heads = h[u_type][src_nid]
                emb_tails = h[v_type][dst_nid]
                cat_embed = torch.cat((emb_heads, emb_tails), 1)
                ratings = self.layer_nn(cat_embed)
                ratings_dict[edge_type] = torch.flatten(ratings)
        
        ratings_dict = {key: torch.unsqueeze(ratings_dict[key], 1) for key in ratings_dict.keys()}

        return ratings_dict

### Prediction for Cosine Similarity

In [241]:
class CosinePrediction(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, graph, h):
        with graph.local_scope():
            for edge_type in graph.canonical_etypes:
                try:
                    graph.nodes[edge_type[0]].data['norm_h'] = torch_nn_func.normalize(h[edge_type[0]], p=2, dim=-1)
                    graph.nodes[edge_type[2]].data['norm_h'] = torch_nn_func.normalize(h[edge_type[2]], p=2, dim=-1)
                    graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'), etype=edge_type)
                except ValueError:
                   print("Cosine similarity fucntion is not correct!")
            ratings = graph.edata['cos']
        return ratings

### Prediction for Dot-Product Similarity

Refs:
- https://docs.dgl.ai/en/0.6.x/guide/minibatch-link.html
- https://issueexplorer.com/issue/dmlc/dgl/3447

In [242]:
class DotProductPredictor(nn.Module):

    def __init__(self):
      super().__init__()

    def forward(self, graph, h):
      
      with graph.local_scope():
        for edge_type in graph.canonical_edge_types:
          try:
            graph.n_data['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=edge_type)
          except KeyError:
            pass
        ratings = graph.edge_data['score']

      return ratings

### Prediction for PairWise Distance Similarity

In [243]:
class PairWiseDistancePredictor(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, graph, h):

        with graph.local_scope():
            for edge_type in graph.canonical_edge_types:
                try:
                    graph.nodes[edge_type[0]].data['h'] = h[edge_type[0]]
                    graph.nodes[edge_type[2]].data['h'] = h[edge_type[2]]
                    graph.apply_edges(lambda edges: {'pwdist': nn.PairwiseDistance('h', 'h')}, etype=edge_type )
                except KeyError:
                    pass
            ratings = graph.edge_data['pwdist']

        return ratings

## GNN Model

In [244]:
class GNNModel(nn.Module):

    def __init__(self, g, n_layers: int, dim_dict, dropout, pred, aggregator_hetero, embedding_layer,):

        super().__init__()
        self.embedding_layer = embedding_layer

        if embedding_layer:
            self.user_embed = NodeEmbedding(dim_dict['user'], dim_dict['hidden'])
            self.item_embed = NodeEmbedding(dim_dict['product'], dim_dict['hidden'])

        self.layers = nn.ModuleList()

        # input layer
        if not embedding_layer:
            self.layers.append(
                dglnn.HeteroGraphConv(
                    {etype[1]: MessagePassing((dim_dict[etype[0]], dim_dict[etype[2]]), dim_dict['hidden'], dropout) for etype in g.canonical_etypes}, 
                    aggregate=aggregator_hetero)
                    )

        # hidden layers
        for i in range(n_layers - 2):
            self.layers.append(
                dglnn.HeteroGraphConv(
                    {etype[1]: MessagePassing((dim_dict['hidden'], dim_dict['hidden']), dim_dict['hidden'], dropout) for etype in g.canonical_etypes},
                    aggregate=aggregator_hetero)
                    )

        # output layer
        self.layers.append(
            dglnn.HeteroGraphConv(
                {etype[1]: MessagePassing((dim_dict['hidden'], dim_dict['hidden']), dim_dict['out'], dropout) for etype in g.canonical_etypes}, 
                aggregate=aggregator_hetero)
                )

        if pred == 'cos':
            self.pred_fn = CosinePrediction()
        elif pred == 'nn':
            self.pred_fn = NnPredictingModule(NnSimilarityPredictingLayer, dim_dict['out'])
        elif pred == 'dotprod':
            self.pred_fn = DotProductPredictor()
        elif pred == 'pw':
            self.pred_fn = PairWiseDistancePredictor()
        else:
            raise KeyError('Prediction function does not exist')
            sys.exit(1)

    def get_repr(self, blocks, h):

        for i in range(len(blocks)):    
            layer = self.layers[i]
            h = layer(blocks[i], h)
          
        return h

    def forward(self, blocks, h, pos_g, neg_g, embedding_layer: bool=True, ):
        if embedding_layer:
            h['user'] = self.user_embed(h['user'])
            h['product'] = self.item_embed(h['product'])

        h = self.get_repr(blocks, h)
        pos_score = self.pred_fn(pos_g, h)
        neg_score = self.pred_fn(neg_g, h)

        return h, pos_score, neg_score

## Định nghĩa hàm lưu các kết quả epoch

In [245]:
def save_txt(data_to_save, filepath, mode='a'):

    with open(filepath, mode) as text_file:
        text_file.write(data_to_save + '\n')

## Định nghĩa hàm loss

### Max Margin Loss


In [246]:
def max_margin_loss(pos_score, neg_score, delta, neg_sample_size, negative_mask, cuda, device):

    all_scores = torch.empty(0)

    if cuda:
        all_scores = all_scores.to(device)

    for etype in pos_score.keys():
        neg_score_tensor = neg_score[etype]
        pos_score_tensor = pos_score[etype]
        neg_score_tensor = neg_score_tensor.reshape(-1, neg_sample_size)
        
        negative_mask_tensor = negative_mask[etype].reshape(-1, neg_sample_size)
       
        if cuda:
            negative_mask_tensor = negative_mask_tensor.to(device)
        scores = (delta- pos_score_tensor + neg_score_tensor-negative_mask_tensor).clamp(min=0)
        
        if etype == edge_types[0]:
            all_scores = torch.cat((all_scores, scores), 0)

    return torch.mean(all_scores)


### Bayesian Personalized Ranking Loss

In [247]:
def bpr_loss(pos_score, neg_score, delta, neg_sample_size, negative_mask, cuda, device):

    all_scores = torch.empty(0)

    if cuda:
        all_scores = all_scores.to(device)

    for etype in pos_score.keys():
        neg_score_tensor = neg_score[etype]
        pos_score_tensor = pos_score[etype]
        neg_score_tensor = neg_score_tensor.reshape(-1, neg_sample_size)

        negative_mask_tensor = negative_mask[etype].reshape(-1, neg_sample_size)
        

        if cuda:
            negative_mask_tensor = negative_mask_tensor.to(device)

        logsig = nn.LogSigmoid()
        scores = - logsig(pos_score_tensor - neg_score_tensor - negative_mask_tensor)
        
        if etype == edge_types[0]: 
          all_scores = torch.cat((all_scores, scores), 0)

    return torch.mean(all_scores)

### Cross Entropy Loss

In [248]:
def cross_entropy_loss(positive_score, negative_score, delta, negative_sample_size , negative_mask, cuda, device):
    
    all_scores = torch.empty(0)
    
    if cuda:
        all_scores = all_scores.to(device)
    
    for edge_type in positive_score.keys():
        negative_score_tensor = negative_score[edge_type].reshape(-1, 1)
        positive_score_tensor = positive_score[edge_type]
    
        negative_mask_tensor = negative_mask[edge_type].reshape(-1, 1)
    
        if cuda:
            negative_mask_tensor = negative_mask_tensor.to(device)
    
        relu = nn.ReLU()

        if edge_type == edge_types[0]:
            all_scores = torch.cat((all_scores, positive_score_tensor), 0)
            negative_score_tensor = negative_score_tensor - negative_mask_tensor
            all_scores = torch.cat((all_scores, negative_score_tensor), 0).reshape(-1, 1)
            all_scores = relu(all_scores)
            labels = torch.cat([torch.ones(positive_score_tensor.shape[0]), torch.zeros(negative_score_tensor.shape[0])]).reshape(-1, 1)

            if cuda:
                all_scores = all_scores.to(device) 
                labels = labels.to(device)
        
    loss = torch_nn_func.binary_cross_entropy_with_logits(all_scores, labels)

    return loss

## Chia graph train thành các tập data để train và validate

In [249]:
def train_validation_split(valid_graph, ground_truth_test, etypes, valid_size, reverse_etype):
    # valid 0.15 - training 0.85
    np.random.seed(11)

    all_eids_dict, valid_eids_dict, train_eids_dict = {}, {}, {}
    valid_uids_all, valid_iids_all = [], []

    for etype in etypes:
      all_eids = np.arange(valid_graph.number_of_edges(etype))
      all_eids_dict[etype] = all_eids
      # laasy index từ 0.85:1 -> tổng có 0.15
      valid_eids = all_eids[int(len(all_eids) * (1 - valid_size)):]
      valid_eids_dict[etype] = valid_eids
    
    all_eids_dict[etypes[0]] = all_eids
    valid_eids = all_eids[int(len(all_eids) * (1 - valid_size)):]

    valid_uids, valid_iids = valid_graph.find_edges(valid_eids, etype=etypes[0])
    valid_uids_all.extend(valid_uids.tolist())
    valid_iids_all.extend(valid_iids.tolist())
        
    ground_truth_valid = (np.array(valid_uids_all), np.array(valid_iids_all))
    valid_uids = np.array(np.unique(valid_uids_all))

    train_graph = valid_graph.clone()
    
    # từ graph ban đầu xóa đi các cạnh được validation -> còn cạnh để training 
    for etype in etypes:
      train_graph.remove_edges(valid_eids_dict[etype], etype=etype)
      train_eids = np.arange(train_graph.number_of_edges(etype))
      train_eids_dict[etype] = train_eids

    validation_graph = valid_graph.clone()

    # từ graph ban đầu xóa đi các cạnh được training -> còn cạnh để validation 
    for etype in etypes:
      validation_graph.remove_edges(train_eids_dict[etype], etype=etype)

    train_uids, train_iids =train_graph.find_edges(train_eids_dict[etypes[0]], etype=etypes[0])
    unique_train_uids = np.unique(train_uids)

    ground_truth_train = (np.array(train_uids), np.array(train_iids))
    train_uids = np.array(np.unique(train_uids))
    test_uids, _ = ground_truth_test
    test_uids = np.unique(test_uids)
    all_iids = np.arange(valid_graph.num_nodes('product'))
    
    # Chuyển đổi tất cả các mảng numpy thành kiểu torch.int64
    for etype in etypes:
        all_eids_dict[etype] = torch.tensor(all_eids_dict[etype], dtype=torch.int64)
        valid_eids_dict[etype] = torch.tensor(valid_eids_dict[etype], dtype=torch.int64)
        train_eids_dict[etype] = torch.tensor(train_eids_dict[etype], dtype=torch.int64)

    ground_truth_train = (torch.tensor(ground_truth_train[0], dtype=torch.int64), torch.tensor(ground_truth_train[1], dtype=torch.int64))
    ground_truth_valid = (torch.tensor(ground_truth_valid[0], dtype=torch.int64), torch.tensor(ground_truth_valid[1], dtype=torch.int64))
    train_uids = torch.tensor(train_uids, dtype=torch.int64)
    valid_uids = torch.tensor(valid_uids, dtype=torch.int64)
    test_uids = torch.tensor(test_uids, dtype=torch.int64)
    all_iids = torch.tensor(all_iids, dtype=torch.int64)

    return train_graph, train_eids_dict, valid_eids_dict, train_uids, valid_uids, test_uids, all_iids, ground_truth_train, ground_truth_valid, all_eids_dict

## Chia tập data chia thành các batch nhỏ

In [250]:

class NegativeSampler(object):
    def __init__(self, g, k):
        self.g = g
        self.k = k
        self.low_rating_edges = (g.edges['rating'].data['rating'] < 3).nonzero().squeeze()

        self.user_ids, self.product_ids = g.edges(etype='rating')
        self.num_users = g.number_of_nodes('user')
        self.num_products = g.number_of_nodes('product')

    def __call__(self, g, eids_dict):
        result_dict = {}
        for etype, eids in eids_dict.items():
            src, _ = g.find_edges(eids, etype=etype)
            src = src.repeat_interleave(self.k)
            if etype == 'rating':
                # Create negative samples from low rating edges
                filtered_dst_indices = self.low_rating_edges[
                    torch.randint(0, len(self.low_rating_edges), (len(src),))
                ]
                dst_low_rating = self.user_ids[filtered_dst_indices]
                
                # Create negative samples from non-existent edges
                neg_src = torch.randint(0, self.num_users, (len(src),))
                neg_dst = torch.randint(0, self.num_products, (len(src),))
                
                # Ensure negative edges do not exist in the graph
                mask = self.g.has_edges_between(neg_src, neg_dst, etype='rating')
                while mask.any():
                    neg_dst[mask] = torch.randint(0, self.num_products, (mask.sum().item(),))
                    mask = self.g.has_edges_between(neg_src, neg_dst, etype='rating')
                
                dst = torch.cat([dst_low_rating, neg_dst], dim=0)
                src = torch.cat([src, neg_src], dim=0)
            else:
                dst = torch.randint(0, self.num_products, (len(src),))
            result_dict[etype] = (src, dst)
        return result_dict

In [251]:
def load_data(validation_graph, train_graph, train_edge_ids_dict, validation_edge_ids_dict, train_u_ids, validation_u_ids, test_u_ids, all_i_ids, 
                          number_workers, embedding_layer, number_layers, neighbor_sampler, neg_sample_size,  edge_batch_size, 
                          node_batch_size):
    number_workers = 0 
    if embedding_layer:
        number_layers -= 1
    if neighbor_sampler == 'full':
        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(number_layers)
    elif neighbor_sampler == 'partial':
        sampler = dgl.dataloading.MultiLayerNeighborSampler([50, 40 ], replace=False)
    else:
        print('Neighbor sampler does not exit')
        sys.exit(1)

    # Tạo bộ negative sampler tổng hợp
    validation_sampler_n = NegativeSampler(validation_graph, neg_sample_size)
    train_sampler_n = NegativeSampler(train_graph, neg_sample_size)
    
    # sampler_n = dgl.dataloading.negative_sampler.Uniform(neg_sample_size)
    # train_sampler_n = sampler_n
    # validation_sampler_n = sampler_n

    #loading nodes
    nodeLoad_train = dgl.dataloading.DataLoader(train_graph, {'user': train_u_ids, 'product': all_i_ids}, sampler, batch_size=node_batch_size, shuffle=True, 
                                                      drop_last=False, num_workers=number_workers,)

    nodeLoad_validation = dgl.dataloading.DataLoader(validation_graph, {'user': validation_u_ids, 'product': all_i_ids}, sampler, batch_size=node_batch_size, 
                                                           shuffle=True, drop_last=False, num_workers=number_workers,)

    
    # Tạo edge prediction sampler cho training
    train_sampler = as_edge_prediction_sampler(
        sampler, exclude='reverse_types',
        reverse_etypes={'rating': 'bought-by', 'bought-by': 'rating'},
        negative_sampler=train_sampler_n
    )

    # Tạo DataLoader cho training
    edgeLoad_train = dgl.dataloading.DataLoader(
        train_graph, train_edge_ids_dict, train_sampler,
        batch_size=edge_batch_size, shuffle=True, drop_last=False, num_workers=number_workers
    )

    # Tạo edge prediction sampler cho validation
    validation_sampler = as_edge_prediction_sampler(
        sampler, exclude='reverse_types',
        reverse_etypes={'rating': 'bought-by', 'bought-by': 'rating'},
        negative_sampler=validation_sampler_n
    )

    # Tạo DataLoader cho validation
    edgeLoad_validation = dgl.dataloading.DataLoader(
        validation_graph, validation_edge_ids_dict, validation_sampler,
        batch_size=edge_batch_size, shuffle=True, drop_last=False, num_workers=number_workers
    )

    
    test_node_ids = {'user': test_u_ids, 'product': all_i_ids}

    nodeLoad_test = dgl.dataloading.DataLoader(validation_graph, test_node_ids, sampler, batch_size=node_batch_size, shuffle=True, drop_last=False, 
                                                     num_workers=number_workers)

    return edgeLoad_train, edgeLoad_validation, nodeLoad_train, nodeLoad_validation, nodeLoad_test


## Thông số cấu hình

In [252]:
edge_types= [('user', 'rating', 'product'), ('product', 'bought-by', 'user')]
reverse_edge_types= {('user', 'rating', 'product'): ('product', 'bought-by', 'user'), 
                 ('product', 'bought-by', 'user') : ('user', 'rating', 'product') }
validation_size = .15
cuda = torch.cuda.is_available()
number_workers = 1 if cuda else 0
device = torch.device('cuda') if cuda else torch.device('cpu')

embedding_layer = True  
number_layers = 3
neighbor_sampler = 'partial' 
block_sampler = [50, 40 ]
neg_sample_size = 15 
edge_batch_size = 4096 
node_batch_size = 128

# GNN Conv Layer parameters:
out_dim = 64 
hidden_dim = 256
validation_graph = hetero_graph
prediction = 'cos' 
aggregator_hetero = 'mean' 
dropout = 0.3 

# nn PredLayer parameters:
hidden_layer_1_output_dim = 256 
hidden_layer_2_output_dim = 128 

num_epochs = 15 
delta = 0.6 

optimizer=torch.optim.Adam 
lr = 0.001 
weight_decay = 1e-5
loss_function = max_margin_loss 
patience = 20
result_filepath = './result/output.txt'
k = 10 #number of recommendations
cuda

True

In [253]:
validation_graph.edata['rating'][('product', 'bought-by', 'user')]

tensor([3, 4, 2,  ..., 1, 4, 4])

In [254]:
validation_graph.edges['bought-by'].data['rating']

tensor([3, 4, 2,  ..., 1, 4, 4])

## Thực hiện chia dữ liệu để train và validate

In [255]:
train_graph, train_edge_ids_dict, validation_edge_ids_dict, train_u_ids, validation_u_ids, test_u_ids, all_i_ids, ground_truth_train, ground_truth_validation, all_edge_ids_dict = train_validation_split(hetero_graph, ground_truth_test, edge_types, validation_size, reverse_edge_types)

edgeLoad_train, edgeLoad_validation, nodeLoad_train, nodeLoad_validation, nodeLoad_test = load_data(validation_graph, train_graph, train_edge_ids_dict, 
                                                                                                                      validation_edge_ids_dict, train_u_ids, validation_u_ids, 
                                                                                                                      test_u_ids, all_i_ids, number_workers,  
                                                                                                                      embedding_layer, number_layers, neighbor_sampler, 
                                                                                                                      neg_sample_size = neg_sample_size, 
                                                                                                                      edge_batch_size = edge_batch_size,  # edge_batch_size = 4096,
                                                                                                                     node_batch_size = node_batch_size ) # node_batch_size = 128 )

## Các Batches cho Train, Validation và Test Data

In [256]:
train_edge_ids_len = 0
validation_edge_ids_len = 0

for edge_type in train_edge_ids_dict.keys():
    train_edge_ids_len += len(train_edge_ids_dict[edge_type])
    validation_edge_ids_len += len(validation_edge_ids_dict[edge_type])

num_batches_train_loss = math.ceil(train_edge_ids_len / edge_batch_size)
num_batches_train_metrics = math.ceil((len(train_u_ids) + len(all_i_ids)) / node_batch_size)
num_batches_validation_loss = math.ceil(validation_edge_ids_len /edge_batch_size)
num_batches_validation_metrics = math.ceil((len(validation_u_ids) + len(all_i_ids)) / node_batch_size)
num_batches_test = math.ceil((len(test_u_ids) + len(all_i_ids)) / node_batch_size)
print(num_batches_train_loss, num_batches_train_metrics, num_batches_validation_loss, num_batches_validation_metrics, num_batches_test, sep=' - ')


5 - 23 - 1 - 5 - 4


## Hàm tính toán Embeddings dựa trên từng node load data đã chia từng batch

In [257]:
def get_embeddings(g, out_dim, trained_model, nodeLoad_test, num_batches_valid, cuda, device, embedding_layer):

    if cuda:  
        trained_model = trained_model.to(device)
    i = 0
    y = {ntype: torch.zeros(g.num_nodes(ntype), out_dim) for ntype in g.ntypes}
    
    if cuda: 
        y = {ntype: torch.zeros(g.num_nodes(ntype), out_dim).to(device) for ntype in g.ntypes}
    
    # print('yyyyyy', y)
    for input_nodes, output_nodes, blocks in nodeLoad_test:
        i += 1
       
        if i % 10 == 0:
            # print("Computing embeddings: Batch "+ str(i)+ " out of "  + str(num_batches_valid))
            pass
       
        if cuda:
            blocks = [b.to(device) for b in blocks]
        
        input_features = blocks[0].srcdata['features']
        # print('input_features ', input_features)
        if embedding_layer:
            input_features['user'] = trained_model.user_embed(input_features['user'])
            input_features['product'] = trained_model.item_embed(input_features['product'])
        
        h = trained_model.get_repr(blocks, input_features)
        
        for ntype in h.keys():
            if ntype in output_nodes:
                y[ntype][output_nodes[ntype]] = h[ntype]
                # print('ntype ', ntype)
                # print(output_nodes[ntype])

    return y

## Hàm tính Metrics và Recommendations

In [258]:
def calculate_metrics(recs, ground_truth_dict, g):

    k_relevant = 0
    k_total = 0

    for uid, iids in recs.items():
        k_total += len(iids)
        k_relevant += len([id_ for id_ in iids if id_ in ground_truth_dict[uid]])
    
    precision = k_relevant/k_total

    nb_total = g.num_nodes('product')
    recs_flatten = [item for sublist in list(recs.values()) for item in sublist]
    nb_recommended = len(set(recs_flatten))
    coverage = nb_recommended / nb_total
    
    return precision, coverage
  
def get_recom(g, h, model, embed_dim, k, user_ids, cuda, device, pred: str, epoch):

    if cuda:  
        model = model.to(device)

    recom = {}

    for user in user_ids:
        user_emb = h['user'][user]
        
        user_emb_rpt = torch.cat(g.num_nodes('product') * [user_emb]).reshape(-1, embed_dim) 
        
        if pred == 'cos':
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            ratings = cos(user_emb_rpt, h['product'])
        elif pred == 'nn':
            cat_embed = torch.cat((user_emb_rpt, h['product']), 1)
            ratings = model.pred_fn.layer_nn(cat_embed)
        elif pred == 'dotprod':
            ratings = torch.sum(user_emb_rpt * h['product'], dim=1)
            print("ratings shape: ", ratings.shape)
        elif pred == 'pw':
            ratings =nn.PairwiseDistance(user_emb_rpt, h['product'])
        else:
            print ('the prediction function not found!')
            sys.exit(1)
            
        ratings_formatted = ratings.cpu().detach().numpy().reshape(g.num_nodes('product'),)
        order = np.argsort(-ratings_formatted)
        
        rec = order[:k] # top k recommendations
        recom[user] = rec
        
    return recom

def get_metrics(h, g, model, embed_dim, ground_truth, k, cuda=False, device=None, pred='cos', epoch=4):

    users, items = ground_truth
    user_ids = np.unique(users).tolist()
    ground_truth_arr = np.stack((np.asarray(users), np.asarray(items)), axis=1)
    ground_truth_dict = defaultdict(list)
    for key, val in ground_truth_arr:
        ground_truth_dict[key].append(val)
    recs = get_recom(g, h, model, embed_dim, k, user_ids, cuda, device, pred, epoch)
    
    precision, coverage = calculate_metrics(recs, ground_truth_dict, g)
    return precision, coverage


## Hàm tính AUC
AUC càng cao càng tốt

In [259]:
def compute_auc(scores):
    edge_type = ('user', 'rating', 'product')
    pos_score = scores[0][edge_type]
    neg_score = scores[1][edge_type]

    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).detach().numpy()

    return roc_auc_score(labels, scores)

## Hàm Train  Model

In [260]:
def train_model(model,
                num_epochs,
                num_batches_train_loss,
                num_batches_validation_loss,
                num_batches_val_metrics,
                num_batches_train_metric,
                edgeLoad_train,
                edgeLoad_validation,
                nodeLoad_validation,
                nodeLoad_train,
                k,
                loss_function, 
                delta, 
                negative_sample_size, 
                cuda=False,
                device=None,
                optimizer=torch.optim.Adam,
                lr=0.001,
                train_graph=None,
                validation_graph=None,
                out_dim=None, 
                ground_truth_train=None,
                ground_truth_validation=None,
                result_filepath=None,
                patience=5,
                prediction=None,
                embedding_layer=True,
                ):

    model.train_loss_list, model.train_precision_list, model.train_coverage_list, model.validation_loss_list, model.validation_precision_list, model.validation_coverage_list = [], [], [], [], [], []
    best_metrics = {} 
    
    max_metric = - 0.1
    patience_counter = 0  
    min_loss = 1.1
    max_auc = 0.5
    best_epoch = 0
    opt = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)

    start = time.time()
    print('Start training for '+ str(num_epochs)+ ' epochs')
    
    for epoch in range(1, num_epochs):
        print('Epoch ', epoch)
        if epoch == 1:
            mode = 'w'  
        else:
            mode = 'a'  

        model.train()  
        i = 0
        total_loss = 0
        for _, pos_g, neg_g, blocks in edgeLoad_train:
            
            opt.zero_grad()

            # Negative mask
            negative_mask = {}

            nids = neg_g.ndata[dgl.NID]

            for etype in pos_g.canonical_etypes:
                neg_src, neg_dst = neg_g.edges(etype=etype)
                neg_src = nids[etype[0]][neg_src]
                neg_dst = nids[etype[2]][neg_dst]
                negative_mask_tensor = train_graph.has_edges_between(neg_src, neg_dst, etype=etype)
                negative_mask[etype] = negative_mask_tensor.type(torch.float)

                if cuda:
                    negative_mask[etype] = negative_mask[etype].to(device)

            if cuda:
                blocks = [b.to(device) for b in blocks]
                pos_g = pos_g.to(device)
                neg_g = neg_g.to(device)

            i += 1
            
            if i % 10 == 0:
                # print("Edge batch " + str(i) + " out of ", str(num_batches_train_loss))
                pass

            input_features = blocks[0].srcdata['features']

            _, pos_score, neg_score = model(blocks, input_features, pos_g, neg_g, embedding_layer,)
            train_score = (pos_score, neg_score)
            loss = loss_function(pos_score, neg_score, delta, neg_sample_size, negative_mask=negative_mask, cuda=cuda, device=device,)

            loss.backward()
            opt.step()

            total_loss += loss.item()

        train_avg_loss = total_loss / i
        model.train_loss_list.append(train_avg_loss)

        model.eval()

        with torch.no_grad():
            total_loss = 0
            i = 0

            for _, pos_g, neg_g, blocks in edgeLoad_validation:
                i += 1

                if i % 10 == 0:
                    # print("Edge batch {} out of {}".format(i, num_batches_validation_loss))
                    pass

                # Negative mask
                negative_mask = {}

                nids = neg_g.ndata[dgl.NID]

                for etype in pos_g.canonical_etypes:
                    neg_src, neg_dst = neg_g.edges(etype=etype)
                    neg_src = nids[etype[0]][neg_src]
                    neg_dst = nids[etype[2]][neg_dst]
                    negative_mask_tensor = validation_graph.has_edges_between(neg_src, neg_dst, etype=etype)
                    negative_mask[etype] = negative_mask_tensor.type(torch.float)

                    if cuda:
                        negative_mask[etype] = negative_mask[etype].to(device)

                if cuda:
                    blocks = [b.to(device) for b in blocks]
                    pos_g = pos_g.to(device)
                    neg_g = neg_g.to(device)

                input_features = blocks[0].srcdata['features']
                _, pos_score, neg_score = model(blocks, input_features, pos_g, neg_g, embedding_layer,)
                
                validation_score = (pos_score, neg_score)

                validation_loss = loss_function(pos_score, neg_score, delta, neg_sample_size, negative_mask=negative_mask, 
                                   cuda=cuda, device=device,)
                total_loss += validation_loss.item()

            validation_avg_loss = total_loss / i
            model.validation_loss_list.append(validation_avg_loss)

        model.eval()
        AUC_val = 0
        with torch.no_grad():
            y = get_embeddings(train_graph, out_dim, model, nodeLoad_train, num_batches_train_metrics, cuda, device, embedding_layer,)
            train_precision, train_coverage = get_metrics(y, train_graph, model, out_dim, ground_truth_train, k, 
                                                                cuda, device, prediction, epoch)

            # validation metrics
            y = get_embeddings(validation_graph,
                                out_dim,
                                model,
                                nodeLoad_validation,
                                num_batches_val_metrics,
                                cuda,
                                device,
                                embedding_layer,
                                )

            validation_precision, validation_coverage = get_metrics(y, validation_graph, model, out_dim, ground_truth_validation,  k, 
                                                                          cuda, device, prediction, epoch)
            AUC_val = compute_auc(validation_score)
            AUC_train = compute_auc(train_score)
            results = "Epoch " + str(epoch) +" | Training loss " + str(round(train_avg_loss,4)) +" | Training Precision " + str(round(train_precision * 100,2)) + " | Training AUC " + str(round(AUC_train,2))+ ' | Validation loss '+ str(round(validation_avg_loss,4)) + ' | Validation Precision '+str(round(validation_precision * 100,2)) + ' | AUC '+str(round(AUC_val,2))
            print(results)
            
            print('process time: ' +str(round(float(time.time()-start)/60, 2))+ ' minutes')
            
            save_txt(results, result_filepath, mode=mode)

            model.train_precision_list.append(train_precision * 100)
            model.validation_precision_list.append(validation_precision * 100)

            if validation_precision > max_metric:
                max_metric = validation_precision
                best_metrics = { 'precision': validation_precision}

        # if max_auc < AUC_val:
        #     max_auc = AUC_val
        #     best_model = copy.deepcopy(model)
        #     best_epoch = epoch

        if validation_avg_loss < min_loss:
            min_loss = validation_avg_loss
            patience_counter = 0
            best_model = copy.deepcopy(model)
            best_epoch = epoch
            # print('patience_counter = ', patience_counter)
        else:
            patience_counter += 1
            # print('patience_counter = ', patience_counter)

        # if patience_counter == patience:
        #     print('ngắt training')
        #     break

    viz_dict = {'train_loss_list': model.train_loss_list,
           'train_precision_list': model.train_precision_list,
           'val_loss_list': model.validation_loss_list,
           'validation_precision_list': model.validation_precision_list,
          }
    
    print('end of training!!')
    print ('process time: ' +str(round(float(time.time()-start)/60, 2))+ ' minutes')
    print('best epoch: ', best_epoch)
    return best_model, viz_dict, validation_score, train_score, best_metrics

## Training model

In [261]:
validation_graph

Graph(num_nodes={'product': 33, 'user': 3334},
      num_edges={('product', 'bought-by', 'user'): 11468, ('user', 'rating', 'product'): 11468},
      metagraph=[('product', 'user', 'bought-by'), ('user', 'product', 'rating')])

In [262]:
edgeLoad_train.device

device(type='cpu')

In [263]:
dim_dict = {'user': validation_graph.nodes['user'].data['features'].shape[1],
            'product': validation_graph.nodes['product'].data['features'].shape[1],
            'out':  out_dim,
            'hidden':hidden_dim}

model = GNNModel(validation_graph, number_layers, dim_dict, dropout, prediction, aggregator_hetero, embedding_layer)

if cuda:
    model = model.to(device)



trained_model, viz_dict, validation_score, train_score, best_metrics = train_model(model, 
                                                                       2,                                                      
                                                                        num_batches_train_loss, 
                                                                        num_batches_validation_loss,
                                                                        num_batches_validation_metrics,
                                                                        num_batches_train_metrics, 
                                                                        edgeLoad_train, 
                                                                        edgeLoad_validation,
                                                                        nodeLoad_validation, 
                                                                        nodeLoad_train, 
                                                                        k, # số lượng recommend
                                                                        loss_function, 
                                                                        delta, 
                                                                        neg_sample_size,
                                                                        cuda,
                                                                        device,
                                                                        optimizer,
                                                                        lr,
                                                                        train_graph, 
                                                                        validation_graph,
                                                                        out_dim, 
                                                                        ground_truth_train, 
                                                                        ground_truth_validation, 
                                                                        result_filepath,
                                                                        patience,
                                                                        prediction,
                                                                        embedding_layer
                                                                      )




Start training for 1 epochs
end of training!!
process time: 0.0 minutes
best epoch:  0


UnboundLocalError: local variable 'best_model' referenced before assignment

## Testing model

In [None]:
def model_inference(valid_graph, out_dim: int, trained_model, nodeLoad_test, num_batches_test: int, cuda: bool, device, embedding_layer: bool, 
                    ground_truth_test, all_eids_dict):

  trained_model.eval()

  with torch.no_grad():
        embeddings = get_embeddings(valid_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer,)
        print(embeddings['product'].size())
        print(embeddings['user'].size())
        precision,_ = get_metrics(embeddings, valid_graph, trained_model, out_dim, ground_truth_test, k, cuda, device, prediction, epoch = 4)
        sentence = " TEST Precision {:.3f}% ".format(precision * 100)
        print(sentence)
        save_txt(sentence, result_filepath, mode= 'a')

In [None]:
model_inference(validation_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer, ground_truth_test, all_edge_ids_dict)

## Save graph, model và draw kết quả

In [None]:
dgl.save_graphs('graph.dgl', validation_graph)

In [None]:
torch.save(trained_model.state_dict(), 'model.pth')

In [None]:
fig = plt.figure()
x = list(range(1,len(viz_dict['train_loss_list'])+1))
plt.title('\n'.join(textwrap.wrap('train and validation loss', 60)))
fig.tight_layout()
plt.rcParams["axes.titlesize"] = 6
plt.plot(x, viz_dict['train_loss_list'])
plt.plot(x, viz_dict['val_loss_list'])
plt.legend(['training loss', 'valid loss'], loc='upper left')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.savefig('./result/' + 'loss.png')
plt.close(fig)

fig = plt.figure()

x = list(range(1,len(viz_dict['train_precision_list'])+1))

plt.title('\n'.join(textwrap.wrap('train and validation precision', 60)))
fig.tight_layout()
plt.rcParams["axes.titlesize"] = 6
plt.plot(x, viz_dict['train_precision_list'])
plt.plot(x, viz_dict['validation_precision_list'])
plt.legend(['training precision','valid precision'], loc='upper left')
plt.xlabel('Epochs')
plt.ylabel('Percentage of metrics(%)')
plt.savefig('./result/' + 'metrics.png')
plt.close(fig)

## Load model và graph nếu đã lưu

In [None]:
prediction = 'cos' 
aggregator_hetero = 'mean' 
dropout = 0.3 
embedding_layer = True

validation_graph, _ = dgl.load_graphs('graph_train.dgl')
validation_graph = validation_graph[0]

dim_dict = {'user': validation_graph.nodes['user'].data['features'].shape[1],
            'product': validation_graph.nodes['product'].data['features'].shape[1],
            'out':  out_dim,
            'hidden':hidden_dim}

trained_model = GNNModel(validation_graph, 3, dim_dict, dropout, prediction, aggregator_hetero, embedding_layer)

# Load trọng số đã lưu vào mô hình khởi tạo
trained_model.load_state_dict(torch.load('model.pth'))

# Chuyển mô hình sang thiết bị cần thiết (nếu cần)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model.to(device)

In [None]:
validation_graph

## Test recommend với full embedded

In [None]:
def get_full_graph_embeddings(g, trained_model, device, embedding_layer):
    # Ensure the graph is on the correct device
    g = g.to(device)
    
    h = g.ndata['features']
        
    if embedding_layer:
        
        # Assuming 'user' and 'product' are keys in input_features
        h['user'] = trained_model.user_embed(h['user'].to(device))
        h['product'] = trained_model.item_embed(h['product'].to(device))
    
    # Calculate representations
    with torch.no_grad():  
        for i in range(len(trained_model.layers)):
            # print(trained_model.layers[i])
            h = trained_model.layers[i](g, h)
    with open('./embeddings.pkl', 'wb') as f:
        pickle.dump(h, f)
    return h

# Assuming 'validation_graph' is already on the correct device
embeddings_full = get_full_graph_embeddings(validation_graph, trained_model, device, embedding_layer)

# Example call to `get_recom` with embeddings



In [None]:
user_id_table[user_id_table['UserID'] == 'U00006035']

In [None]:
interact_df[interact_df['Rating'] < 3]

In [None]:
get_recom(validation_graph, embeddings_full, trained_model, out_dim, 33, [3], cuda, device, 'cos', 4)

### Thêm user node và edge

In [None]:
def add_new_user_feat(graph, user_feature):
    temple = graph.ndata['features']['user'].clone()
    graph = dgl.add_nodes(graph, 1, ntype='user')
    graph.nodes['user'].data['features'] = torch.cat((temple, user_feature.unsqueeze(0)), dim=0)
    return graph

def add_new_edge(graph, src_id, dest_id,edge_feature, etype):
    graph.add_edges(src_id, dest_id,  data = edge_feature, etype = etype)
    return graph

In [None]:
all_user_df = pd.read_csv('./collect_data/user.csv')
trained_user_df = pd.read_csv('./user_id_table.csv')
print(all_user_df.shape)
print(trained_user_df.shape)
print(all_user_df.shape[0] - trained_user_df.shape[0])


In [None]:
# Tìm các UserID chỉ có trong all_user_df mà không có trong trained_user_df
merged_df = pd.merge(all_user_df, trained_user_df, on='UserID', how='left', indicator=True)
not_trained_user_df = merged_df.loc[merged_df['_merge'] == 'left_only', ['UserID', 'F', 'M']]
not_trained_user_df.head()

In [None]:
print(rating_all_df.shape)
# do rating train bị drop trùng lặp nên giá trị ban đầu có thể lớn hơn 11468 => edge chưa train có nhỏ hơn 4461
print(rating_train_df.shape)
print(rating_all_df.shape[0] - rating_train_df.shape[0])

In [None]:
not_trained_user_df.shape

In [None]:
not_edge_trained_df = pd.merge(not_trained_user_df, rating_all_df, on='UserID', how='inner', indicator=True)
not_edge_trained_df = pd.merge(not_edge_trained_df, product_id_all_df, on='ItemID', how='inner', indicator=False)
not_edge_trained_df.head()
not_edge_trained_df.shape

In [None]:
data05 = not_edge_trained_df.loc[not_edge_trained_df['UserID'] == 'U00000005']
data05

In [None]:
x_graph = validation_graph.clone()
x_graph

In [None]:
max_user_id = trained_user_df['user_new_id'].max() if not trained_user_df.empty else 0

user05_data = not_trained_user_df.loc[not_trained_user_df['UserID'] == 'U00000005']
user05_data = user05_data.reset_index(drop=True)
user05_data

In [None]:
trained_user_df.shape

### Thêm thử 1 node user U00000005

In [None]:
user05_data['F'][0]

In [None]:
validation_graph

In [None]:

user_id = user05_data['UserID'][0]

features = torch.tensor([user05_data['F'][0], user05_data['M'][0]])  # Tạo tensor đặc trưng từ 'F' và 'M'

validation_graph = add_new_user_feat(validation_graph, features)

max_user_id += 1
trained_user_df.loc[len(trained_user_df)] = [user_id, max_user_id]
validation_graph

### Thêm thử cạnh của user U00000005

In [None]:
data05

In [None]:
user_id_05_mapped = trained_user_df['user_new_id'].max() if not trained_user_df.empty else 0
user_id_05_mapped

In [None]:
validation_graph

In [None]:
for index, row in data05.iterrows():
    data05
    src = user_id_05_mapped
    dst = row['product_new_id']
    feat = torch.tensor([row['Rating']])
    validation_graph = add_new_edge(validation_graph, src, dst, {'rating': feat}, 'rating')
    
for index, row in data05.iterrows():
    data05
    dst = user_id_05_mapped
    src = row['product_new_id']
    feat = torch.tensor([row['Rating']])
    validation_graph = add_new_edge(validation_graph, src, dst, {'rating': feat}, 'bought-by')

In [None]:
validation_graph

In [None]:
max_user_id = trained_user_df['user_new_id'].max() if not trained_user_df.empty else 0
# for index, row in not_trained_user_df.iterrows():
#     user_id = row['UserID']
#     features = torch.tensor([row['F'], row['M']])  # Tạo tensor đặc trưng từ 'F' và 'M'

#     validation_graph = add_new_user_feat(validation_graph, features)

#     max_user_id += 1
#     trained_user_df.loc[len(trained_user_df)] = [user_id, max_user_id]

validation_graph


## Thêm node và edges toàn bộ

In [None]:
def add_new_user_feat(graph, user_feature):
    temple = graph.ndata['features']['user'].clone()
    graph = dgl.add_nodes(graph, 1, ntype='user')
    graph.nodes['user'].data['features'] = torch.cat((temple, user_feature.unsqueeze(0)), dim=0)
    return graph

def add_new_edge(graph, src_id, dest_id,edge_feature, etype):
    graph.add_edges(src_id, dest_id,  data = edge_feature, etype = etype)
    return graph

In [None]:
all_user_df = pd.read_csv('./collect_data/user.csv')
trained_user_df = pd.read_csv('./user_id_table.csv')
print(all_user_df.shape)
print(trained_user_df.shape)
print(all_user_df.shape[0] - trained_user_df.shape[0])


In [None]:
# Tìm các UserID chỉ có trong all_user_df mà không có trong trained_user_df
merged_df = pd.merge(all_user_df, trained_user_df, on='UserID', how='left', indicator=True)
not_trained_user_df = merged_df.loc[merged_df['_merge'] == 'left_only', ['UserID', 'F', 'M']]
not_trained_user_df.head()

In [None]:
not_edge_trained_df = pd.merge(not_trained_user_df, rating_all_df, on='UserID', how='inner', indicator=True)
not_edge_trained_df = pd.merge(not_edge_trained_df, product_id_all_df, on='ItemID', how='inner', indicator=False)
not_edge_trained_df.head()

In [None]:
not_edge_trained_df.shape

In [None]:
# Tìm các UserID chỉ có trong all_user_df mà không có trong trained_user_df
merged_df = pd.merge(all_user_df, trained_user_df, on='UserID', how='left', indicator=True)
not_trained_user_df = merged_df.loc[merged_df['_merge'] == 'left_only', ['UserID', 'F', 'M']]
not_trained_user_df.head()

In [None]:
validation_graph

In [None]:
max_user_id = trained_user_df['user_new_id'].max() if not trained_user_df.empty else 0
for index, row in not_trained_user_df.iterrows():
    user_id = row['UserID']
    features = torch.tensor([row['F'], row['M']])  # Tạo tensor đặc trưng từ 'F' và 'M'

    validation_graph = add_new_user_feat(validation_graph, features)

    max_user_id += 1
    trained_user_df.loc[len(trained_user_df)] = [user_id, max_user_id]

validation_graph


In [None]:
for index, row in not_edge_trained_df.iterrows():
    user_new_id_value = trained_user_df[trained_user_df['UserID'] == row['UserID']]['user_new_id'].iloc[0]
    
    src = user_new_id_value
    dst = row['product_new_id']
    feat = torch.tensor([row['Rating']])
    validation_graph = add_new_edge(validation_graph, src, dst, {'rating': feat}, 'rating')
    
    src2 = row['product_new_id']
    dst2 = user_new_id_value
    feat2 = torch.tensor([row['Rating']])
    validation_graph = add_new_edge(validation_graph, src2, dst2, {'rating': feat2}, 'bought-by')
    
validation_graph

In [None]:
trained_user_df.to_csv('user_id_table.csv', index=False)

## Tính embedded dựa trên nodeLoad_test và recommend

In [None]:
embeddings = get_embeddings(validation_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer,)

In [None]:

get_recom(validation_graph, embeddings, trained_model, out_dim, k, [0, 3648, 26, 44, 53, 73, 7, 23], cuda, device, 'cos', 4)

# precision,_ = get_metrics(embeddings, validation_graph, trained_model, out_dim, ground_truth_test, k, cuda, device, prediction, epoch = 4)
# sentence = " TEST Precision {:.3f}% ".format(precision * 100)

## Vẽ biểu đồ các node user, product embedded 2D

In [None]:
validation_graph

In [None]:
import dgl
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import torch

# Giả sử bạn có một mô hình đã được huấn luyện và validation_graph


# Sử dụng hàm để lấy embeddings cho toàn bộ đồ thị
full_embeddings = get_full_graph_embeddings(validation_graph, trained_model, device, True)

# Xử lý nhúng cho từng loại node
user_embeddings = full_embeddings['user'].cpu().numpy()
product_embeddings = full_embeddings['product'].cpu().numpy()

# Sử dụng TSNE để giảm chiều dữ liệu cho từng loại node
tsne = TSNE(n_components=2, random_state=42)
user_embeddings_2d = tsne.fit_transform(user_embeddings)
product_embeddings_2d = tsne.fit_transform(product_embeddings)

# Sử dụng KMeans để phân cụm các node user
kmeans = KMeans(n_clusters=10, random_state=42)  # Số cụm tùy chỉnh
user_labels = kmeans.fit_predict(user_embeddings_2d)
product_labels = kmeans.fit_predict(product_embeddings_2d)

# Tạo DataFrame cho việc vẽ
import pandas as pd
user_df = pd.DataFrame(user_embeddings_2d, columns=['x', 'y'])
user_df['label'] = user_labels
user_df['type'] = 'user'

product_df = pd.DataFrame(product_embeddings_2d, columns=['x', 'y'])
product_df['label'] = product_labels
product_df['type'] = 'product'

# Kết hợp dữ liệu user và product
df = pd.concat([user_df, product_df])

# Vẽ biểu đồ
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='x', y='y', hue='label', style='type', palette='tab10', s=100, legend='full')

plt.title('Node Embeddings Visualization with Clustering')
plt.show()


In [None]:
import pandas as pd
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import torch

# Giả sử bạn có một mô hình đã được huấn luyện và validation_graph
full_embeddings = get_full_graph_embeddings(validation_graph, trained_model, device, True)

# Tạo dữ liệu giả để minh họa
user_embeddings = full_embeddings['user'].cpu().numpy()
product_embeddings = full_embeddings['product'].cpu().numpy()


# Sử dụng TSNE để giảm chiều dữ liệu cho từng loại node
tsne = TSNE(n_components=2, random_state=42)
user_embeddings_2d = tsne.fit_transform(user_embeddings)
product_embeddings_2d = tsne.fit_transform(product_embeddings)

# Sử dụng KMeans để phân cụm các node user và product
kmeans_users = KMeans(n_clusters=5, random_state=42)  # Số cụm tùy chỉnh
kmeans_products = KMeans(n_clusters=5, random_state=42)

user_labels = kmeans_users.fit_predict(user_embeddings_2d)
product_labels = kmeans_products.fit_predict(product_embeddings_2d)

# Tạo DataFrame cho việc vẽ
user_df = pd.DataFrame(user_embeddings_2d, columns=['x', 'y'])
user_df['label'] = user_labels
user_df['type'] = 'user'

product_df = pd.DataFrame(product_embeddings_2d, columns=['x', 'y'])
product_df['label'] = product_labels
product_df['type'] = 'product'

# Tạo nguồn dữ liệu cho Bokeh
source_user = ColumnDataSource(data=dict(
    x=user_df['x'],
    y=user_df['y'],
    label=user_df['label'].astype(str),  # Convert labels to string for categorical coloring
    type=user_df['type']
))

source_product = ColumnDataSource(data=dict(
    x=product_df['x'],
    y=product_df['y'],
    label=product_df['label'].astype(str),
    type=product_df['type']
))

# Tạo màu sắc cho các nhãn phân cụm
unique_user_labels = user_df['label'].unique().astype(str)
unique_product_labels = product_df['label'].unique().astype(str)

user_color_mapping = factor_cmap('label', palette=Category10[len(unique_user_labels)], factors=unique_user_labels)
product_color_mapping = factor_cmap('label', palette=Category10[len(unique_product_labels)], factors=unique_product_labels)

# Tạo figure cho Bokeh
p = figure(title="Node Embeddings Visualization with Clustering", tools="pan,wheel_zoom,box_zoom,reset,hover,save", tooltips="@type: @label")

# Thêm scatter plot với màu sắc dựa trên nhãn phân cụm
p.scatter('x', 'y', source=source_user, legend_field='type', fill_alpha=0.6, size=8, color=user_color_mapping, marker='circle')
p.scatter('x', 'y', source=source_product, legend_field='type', fill_alpha=0.6, size=8, color=product_color_mapping, marker='triangle')

# Cấu hình tooltips
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Index", "$index"),
    ("(x, y)", "(@x, @y)"),
    ("Type", "@type"),
    ("Label", "@label"),
]

# Hiển thị biểu đồ
output_notebook()
show(p, notebook_handle=True)

## define recommend function

In [None]:
def recommend_for_user(user_id, validation_graph, embeddings, out_dim, k):
    user_emb = embeddings['user'][user_id]
    print(user_emb.size())
    user_emb_rpt = user_emb.repeat(validation_graph.num_nodes('product'), 1)
    
    print(user_emb_rpt.size())
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    ratings = cos(user_emb_rpt, embeddings['product'])
    
    ratings_formatted = ratings.cpu().detach().numpy().reshape(validation_graph.num_nodes('product'),)
    order = np.argsort(-ratings_formatted)
    
    rec = order[:k]  # top k recommendations
    return rec


In [None]:
def calculate_embedded_and_recommend(user_id, validation_graph, trained_model, out_dim, device, k):
    embeddings = get_embeddings(validation_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer)
    
    recommended_products = recommend_for_user(user_id, validation_graph, embeddings, out_dim, k)
    print(user_id, ' - ' , recommended_products)
    return recommended_products

In [None]:
embeddings = get_embeddings(validation_graph, out_dim, trained_model, nodeLoad_train, num_batches_test, cuda, device, embedding_layer)

In [None]:
get_recom(validation_graph, embeddings, trained_model, out_dim, k, [0, 12, 2, 3, 4], cuda, device, 'cos', 4)

## Thêm các user còn lại vào graph all (Vì các user này chưa rating nên lúc đầu graph all không có chứa)

In [None]:
validation_graph

In [None]:
all_user_df = pd.read_csv('./collect_data/user.csv')
trained_user_df = pd.read_csv('./user_id_table.csv')
print(all_user_df.shape)
print(trained_user_df.shape)
print(all_user_df.shape[0] - trained_user_df.shape[0])

In [None]:
# Tìm các UserID chỉ có trong all_user_df mà không có trong trained_user_df
merged_df = pd.merge(all_user_df, trained_user_df, on='UserID', how='left', indicator=True)

In [None]:
not_trained_user_df = merged_df.loc[merged_df['_merge'] == 'left_only', ['UserID', 'F', 'M']]
not_trained_user_df.head()

In [None]:
def add_new_user_feat(graph, user_feature):
    temple = graph.ndata['features']['user'].clone()
    graph = dgl.add_nodes(graph, 1, ntype='user')
    graph.nodes['user'].data['features'] = torch.cat((temple, user_feature.unsqueeze(0)), dim=0)
    return graph

In [None]:
u, v = validation_graph.edges(etype='rating')
edges = list(zip(u.tolist(), v.tolist()))

print("Các cạnh loại 'follows' trong đồ thị:", len(edges))

In [None]:
def add_new_edge(graph, src_id, dest_id, edge_feature, etype):
    graph.add_edges(src_id, dest_id, edge_feature, etype)
    return graph

In [None]:
max_user_id = trained_user_df['user_new_id'].max() if not trained_user_df.empty else 0
for index, row in not_trained_user_df.iterrows():
    user_id = row['UserID']
    features = torch.tensor([row['F'], row['M']])  # Tạo tensor đặc trưng từ 'F' và 'M'

    validation_graph = add_new_user_feat(validation_graph, features)

    max_user_id += 1
    trained_user_df.loc[len(trained_user_df)] = [user_id, max_user_id]

validation_graph


In [None]:
# Tìm các UserID chỉ có trong all_user_df mà không có trong trained_user_df
merged_df = pd.merge(rating_train_df
                     , trained_user_df, on='UserID', how='left', indicator=True)

In [None]:
import pickle
with open('embeddings.pkl', 'wb') as f:
    embeddings = get_embeddings(validation_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer)
    pickle.dump(embeddings, f)

In [None]:
trained_user_df.to_csv('user_id_table.csv', index=False)

In [None]:
# save lại graph mới (thêm user mới)
dgl.save_graphs('graph_train.dgl', validation_graph)

# add new edge

In [None]:
x_graph = validation_graph.clone()
x_graph.num_edges('rating')


In [None]:
x_graph.edges['bought-by'].data['rating'].size()

In [None]:
rating_star = 99
x_graph = dgl.add_edges(x_graph, torch.tensor([3]), torch.tensor([3]),  data={'weight': torch.tensor(rating_star)}, etype='rating')
x_graph = dgl.add_edges(x_graph, torch.tensor([3]), torch.tensor([3]), data={'weight': torch.tensor(rating_star)}, etype='bought-by')
x_graph.num_edges('rating')

# recomend user có trong 6040 user với user_new_id

In [None]:
def recommend_for_user(user_id, validation_graph, embeddings, out_dim, k):
    user_emb = embeddings['user'][user_id]
    print(user_emb.size())
    user_emb_rpt = user_emb.repeat(validation_graph.num_nodes('product'), 1)
    
    print(user_emb_rpt.size())
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    ratings = cos(user_emb_rpt, embeddings['product'])
    
    ratings_formatted = ratings.cpu().detach().numpy().reshape(validation_graph.num_nodes('product'),)
    order = np.argsort(-ratings_formatted)
    
    rec = order[:k]  # top k recommendations
    return rec


In [None]:
user_id_map = pd.read_csv('user_id_table.csv')
product_id_map = pd.read_csv('product_id_table.csv')

In [None]:
def calculate_embedded_and_recommend(user_id, validation_graph, trained_model, out_dim, device, k):
    embeddings = get_embeddings(validation_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer)
    
    recommended_products = recommend_for_user(user_id, validation_graph, embeddings, out_dim, k)
    print(user_id, ' - ' , recommended_products)
    return recommended_products

# with open('./recommend api/config/embeddings.pkl', 'rb') as f:
    #     embeddings = pickle.load(f)

In [None]:
def getRealUserId(mappedId):
    result = user_id_map.loc[user_id_map['user_new_id'] == mappedId, 'UserID']
    
    if not result.empty:
        return result.iloc[0]
    else:
        return None 

def getMappedUserId(realId):
    result = user_id_map.loc[user_id_map['UserID'] == realId, 'user_new_id']
    
    if not result.empty:
        return result.iloc[0]
    else:
        return None 
    
def getRealProductId(mappedId):
    result = product_id_map.loc[product_id_map['product_new_id'] == mappedId, 'ItemID']
    
    if not result.empty:
        return result.iloc[0]
    else:
        return None 

def getMappedProductId(realId):
    result = product_id_map.loc[product_id_map['ItemID'] == realId, 'product_new_id']
    
    if not result.empty:
        return result.iloc[0]
    else:
        return None 
    
def getRealProductIds(recommended_products):
    mapped_ids = []
    for id in recommended_products:
        mapped_id = getRealProductId(id)
        if mapped_id is not None:
            mapped_ids.append(mapped_id)
    return mapped_ids

In [None]:

recommended_products = calculate_embedded_and_recommend(132, validation_graph, trained_model, out_dim, device, k=10)

In [None]:
user_id_real = 'U00002222'
user_id_mapped = getMappedUserId(user_id_real)
recommended_products = calculate_embedded_and_recommend(user_id_mapped, validation_graph, trained_model, out_dim, device, k=10)
mapped_ids = getRealProductIds(recommended_products)

print("Các sản phẩm dành cho ",  user_id_real, " là :")
print(mapped_ids)