In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/000GithubRepos/000recsys_key_papers_implementation/model_comparison/kyeongchan/NGCF')

In [3]:
from tqdm import tqdm 
import os 

import numpy as np 
import pandas as pd

import scipy.sparse  as sp 

from sklearn.model_selection import train_test_split 

import torch 
from torch import nn, optim 
from torch.utils.data import Dataset, DataLoader 

In [4]:
class args:
    seed = 42
    num_layers = 3
    batch_size= 512
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    SAVE_PATH = 'Parameters'

In [5]:
d_set = pd.read_csv('Data/movielens1m/ratings.dat', sep='::', names=['user_id','business_id','stars','ts'], encoding='latin-1',header=None)
d_set = d_set.drop(columns=['ts'])

  d_set = pd.read_csv('Data/movielens1m/ratings.dat', sep='::', names=['user_id','business_id','stars','ts'], encoding='latin-1',header=None)


In [20]:
d_set

Unnamed: 0,user_id,business_id,stars
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [6]:
d_train, d_test = train_test_split(d_set, train_size=0.6, random_state=args.seed)
d_valid, d_test = train_test_split(d_test, train_size=0.5, random_state=args.seed)


In [7]:
d_train = d_train.astype({'user_id':'category', 'business_id':'category'})
d_valid = d_valid.astype({'user_id':'category', 'business_id':'category'})
d_test = d_test.astype({'user_id':'category', 'business_id':'category'})

In [21]:
d_train

Unnamed: 0,user_id,business_id,stars
0,6022,3236,5
1,5529,1044,4
2,2908,2323,5
3,4610,2553,4
4,1194,1883,4
...,...,...,...
600120,1585,985,5
600121,2128,2452,5
600122,853,2839,3
600123,4032,3188,5


In [8]:
u_cat = d_train.user_id.cat.categories
b_cat = d_train.business_id.cat.categories

In [22]:
u_cat

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            6031, 6032, 6033, 6034, 6035, 6036, 6037, 6038, 6039, 6040],
           dtype='int64', length=6040)

In [9]:
d_valid.user_id = d_valid.user_id.cat.set_categories(u_cat)
d_valid.business_id = d_valid.business_id.cat.set_categories(b_cat)

d_test.user_id = d_test.user_id.cat.set_categories(u_cat)
d_test.business_id = d_test.business_id.cat.set_categories(b_cat)

In [10]:
d_train.user_id = d_train.user_id.cat.codes
d_train.business_id = d_train.business_id.cat.codes 

d_valid.user_id = d_valid.user_id.cat.codes
d_valid.business_id = d_valid.business_id.cat.codes 

d_test.user_id = d_test.user_id.cat.codes
d_test.business_id = d_test.business_id.cat.codes 

In [11]:
d_train = d_train.dropna()
d_valid = d_valid.dropna()
d_test = d_test.dropna()

d_train.reset_index(drop=True, inplace=True)
d_valid.reset_index(drop=True, inplace=True)
d_test.reset_index(drop=True, inplace=True)

In [12]:
d_train = d_train.astype({'user_id': int, 'business_id': int})
d_valid = d_valid.astype({'user_id': int, 'business_id': int})
d_test = d_test.astype({'user_id': int, 'business_id': int})

In [13]:
args.num_users = d_train.user_id.max() + 1
args.num_items = d_train.business_id.max() + 1
args.latent_dim = 64
args.num_epochs = 50

In [14]:
class GNNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GNNLayer, self).__init__()
        self.in_feats = in_feats
        self.out_feats = out_feats 

        self.W1 = nn.Linear(in_feats, out_feats)
        self.W2 = nn.Linear(in_feats, out_feats)

    def forward(self, L, SelfLoop, feats):
        # (L+I)EW_1
        sf_L = L + SelfLoop
        L = L.cuda()
        sf_L = sf_L.cuda()
        sf_E = torch.sparse.mm(sf_L, feats)
        left_part = self.W1(sf_E) # left part

        # EL odot EW_2, odot indicates element-wise product 
        LE = torch.sparse.mm(L, feats)
        E = torch.mul(LE, feats)
        right_part = self.W2(E)

        return left_part + right_part 

class NGCF(nn.Module):
    def __init__(self, args, matrix):
        super(NGCF, self).__init__()
        self.num_users = args.num_users 
        self.num_items = args.num_items 
        self.latent_dim = args.latent_dim 
        self.device = args.device

        self.user_emb = nn.Embedding(self.num_users, self.latent_dim)
        self.item_emb = nn.Embedding(self.num_items, self.latent_dim)

        self.num_layers = args.num_layers
        self.L = self.LaplacianMatrix(matrix)
        self.I = self.SelfLoop(self.num_users + self.num_items)

        self.leakyrelu = nn.LeakyReLU()
        self.GNNLayers = nn.ModuleList()

        for i in range(self.num_layers-1):
            self.GNNLayers.append(GNNLayer(self.latent_dim, self.latent_dim))

        self.fc_layer = nn.Sequential(
            nn.Linear(self.latent_dim * self.num_layers * 2, 64), 
            nn.ReLU(), 
            nn.Linear(64, 32), 
            nn.ReLU(), 
            nn.Linear(32, 1)
        )

    def SelfLoop(self, num):
        i = torch.LongTensor([[k for k in range(0, num)], [j for j in range(0, num)]])
        val = torch.FloatTensor([1]*num)
        return torch.sparse.FloatTensor(i, val)

    def LaplacianMatrix(self, ratings):
        iids = ratings['business_id'] + self.num_users 
        matrix = sp.coo_matrix((ratings['stars'], (ratings['user_id'], ratings['business_id'])))
        
        upper_matrix = sp.coo_matrix((ratings['stars'], (ratings['user_id'], iids)))
        lower_matrix = matrix.transpose()
        lower_matrix.resize((self.num_items, self.num_users + self.num_items))

        A = sp.vstack([upper_matrix, lower_matrix])
        row_sum = (A > 0).sum(axis=1)
        # row_sum = np.array(row_sum).flatten()
        diag = list(np.array(row_sum.flatten())[0])
        D = np.power(diag, -0.5)
        D = sp.diags(D)
        L = D * A * D
        L = sp.coo_matrix(L)
        row = L.row 
        col = L.col
        idx = np.stack([row, col])
        idx = torch.LongTensor(idx)
        data = torch.FloatTensor(L.data)
        SparseL = torch.sparse.FloatTensor(idx, data)
        return SparseL 

    def FeatureMatrix(self):
        uids = torch.LongTensor([i for i in range(self.num_users)]).to(self.device)
        iids = torch.LongTensor([i for i in range(self.num_items)]).to(self.device)
        user_emb = self.user_emb(uids)
        item_emb = self.item_emb(iids)
        features = torch.cat([user_emb, item_emb], dim=0)
        return features

    def forward(self, uids, iids):
        iids = self.num_users + iids 

        features = self.FeatureMatrix()
        final_emb = features.clone()

        for gnn in self.GNNLayers:
            features = gnn(self.L, self.I, features)
            features = self.leakyrelu(features)
            final_emb = torch.concat([final_emb, features],dim=-1)

        user_emb = final_emb[uids]
        item_emb = final_emb[iids]

        inputs = torch.concat([user_emb, item_emb], dim=-1)
        outs = self.fc_layer(inputs)
        return outs.flatten()

In [15]:
class GraphDataset(Dataset):
    def __init__(self, dataframe):
        super(Dataset, self).__init__()
        
        self.uid = list(dataframe['user_id'])
        self.iid = list(dataframe['business_id'])
        self.ratings = list(dataframe['stars'])
    
    def __len__(self):
        return len(self.uid)
    
    def __getitem__(self, idx):
        uid = self.uid[idx]
        iid = self.iid[idx]
        rating = self.ratings[idx]
        
        return (uid, iid, rating)

In [16]:
def get_loader(args, dataset, num_workers):
    d_set = GraphDataset(dataset)
    return DataLoader(d_set, batch_size=args.batch_size, num_workers=num_workers)

In [26]:
d_train

Unnamed: 0,user_id,business_id,stars
0,6022,3236,5
1,5529,1044,4
2,2908,2323,5
3,4610,2553,4
4,1194,1883,4
...,...,...,...
600120,1585,985,5
600121,2128,2452,5
600122,853,2839,3
600123,4032,3188,5


In [24]:
max(d_train.user_id)

6039

In [25]:
min(d_train.user_id)

0

In [17]:
train_loader = get_loader(args, d_train, 4)
valid_loader = get_loader(args, d_valid, 4)
test_loader = get_loader(args, d_test, 4)

In [18]:
def graph_evaluate(args, model, test_loader, criterion):
    output = []
    test_loss = 0

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='evaluating...'):
            batch = tuple(b.to(args.device) for b in batch)
            inputs = {'uids':   batch[0], 
                      'iids':   batch[1]}
            gold_y = batch[2].float()
            
            pred_y = model(**inputs)
            output.append(pred_y)
            
            loss = criterion(pred_y, gold_y)
            loss = torch.sqrt(loss)
            test_loss += loss.item()
    test_loss /= len(test_loader)
    return test_loss, output

In [19]:
def graph_train(args, model, train_loader, valid_loader, optimizer, criterion):
    best_loss = float('inf')
    train_losses, valid_losses = [], []
    for epoch in range(1, args.num_epochs + 1):
        train_loss = 0.0

        model.train()
        for batch in tqdm(train_loader, desc='training...'):
            batch = tuple(b.to(args.device) for b in batch)
            inputs = {'uids':   batch[0], 
                      'iids':   batch[1]}
            
            gold_y = batch[2].float()
            

            pred_y = model(**inputs)
            
            loss = criterion(pred_y, gold_y)
            loss = torch.sqrt(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        valid_loss , outputs = graph_evaluate(args, model, valid_loader, criterion)
        valid_losses.append(valid_loss)
        

        print(f'Epoch: [{epoch}/{args.num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}\tValid Loss: {valid_loss:.4f}')

        if best_loss > valid_loss:
            best_loss = valid_loss
            if not os.path.exists(args.SAVE_PATH):
                os.makedirs(args.SAVE_PATH)
            torch.save(model.state_dict(), os.path.join(args.SAVE_PATH, f'{model._get_name()}_parameters.pt'))

    return {
        'train_loss': train_losses, 
        'valid_loss': valid_losses
    }, outputs

In [None]:
models = NGCF(args, d_train).to(args.device)

optimizer = optim.Adam(models.parameters(), lr = 1e-3)
criterion = nn.L1Loss()

In [None]:
results = graph_train(args, models, train_loader, valid_loader, optimizer, criterion)

training...: 100%|██████████| 1173/1173 [02:25<00:00,  8.08it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.60it/s]


Epoch: [1/50]
Train Loss: 0.8890	Valid Loss: 0.8625


training...: 100%|██████████| 1173/1173 [02:20<00:00,  8.33it/s]
evaluating...: 100%|██████████| 391/391 [00:41<00:00,  9.32it/s]


Epoch: [2/50]
Train Loss: 0.8521	Valid Loss: 0.8505


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.42it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.65it/s]


Epoch: [3/50]
Train Loss: 0.8445	Valid Loss: 0.8464


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.43it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.62it/s]


Epoch: [4/50]
Train Loss: 0.8392	Valid Loss: 0.8439


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.39it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.62it/s]


Epoch: [5/50]
Train Loss: 0.8345	Valid Loss: 0.8429


training...: 100%|██████████| 1173/1173 [02:20<00:00,  8.35it/s]
evaluating...: 100%|██████████| 391/391 [00:41<00:00,  9.48it/s]


Epoch: [6/50]
Train Loss: 0.8305	Valid Loss: 0.8411


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.38it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.59it/s]


Epoch: [7/50]
Train Loss: 0.8273	Valid Loss: 0.8413


training...: 100%|██████████| 1173/1173 [02:20<00:00,  8.38it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.58it/s]


Epoch: [8/50]
Train Loss: 0.8242	Valid Loss: 0.8418


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.39it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.73it/s]


Epoch: [9/50]
Train Loss: 0.8215	Valid Loss: 0.8409


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.41it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.67it/s]


Epoch: [10/50]
Train Loss: 0.8188	Valid Loss: 0.8409


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.43it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.65it/s]


Epoch: [11/50]
Train Loss: 0.8163	Valid Loss: 0.8395


training...: 100%|██████████| 1173/1173 [02:20<00:00,  8.37it/s]
evaluating...: 100%|██████████| 391/391 [00:41<00:00,  9.52it/s]


Epoch: [12/50]
Train Loss: 0.8140	Valid Loss: 0.8402


training...: 100%|██████████| 1173/1173 [02:19<00:00,  8.39it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.55it/s]


Epoch: [13/50]
Train Loss: 0.8120	Valid Loss: 0.8402


training...: 100%|██████████| 1173/1173 [02:21<00:00,  8.30it/s]
evaluating...: 100%|██████████| 391/391 [00:40<00:00,  9.60it/s]


Epoch: [14/50]
Train Loss: 0.8101	Valid Loss: 0.8418


training...: 100%|██████████| 1173/1173 [02:20<00:00,  8.32it/s]
evaluating...: 100%|██████████| 391/391 [00:41<00:00,  9.34it/s]


Epoch: [15/50]
Train Loss: 0.8079	Valid Loss: 0.8422


training...: 100%|██████████| 1173/1173 [02:21<00:00,  8.27it/s]
evaluating...: 100%|██████████| 391/391 [00:41<00:00,  9.43it/s]


Epoch: [16/50]
Train Loss: 0.8053	Valid Loss: 0.8424


training...:  24%|██▍       | 284/1173 [00:34<01:43,  8.56it/s]