# Import libraries

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import matplotlib.pyplot as plt
import random
from typing import Dict

from torch import nn, optim, Tensor, torch
from torch_geometric.utils.num_nodes import maybe_num_nodes

In [None]:
from data_loader.data_loader import UserUserDataset
from light_gcn.light_gcn import LightGCN, bpr_loss, get_user_positive_items, RecallPrecision_ATk, NDCGatK_r, get_metrics, evaluation

In [2]:
# Display the installed PyTorch build string as this cell's output.
torch.__version__

'2.1.1+cu118'

In [3]:
# Read the raw exports. Postings and votes each arrive as two
# semicolon-separated CSV files (the file names suggest one per half of
# May 2019); the two parts are stacked into a single frame per data set.
postings_1, postings_2 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        "../data/Postings_01052019_15052019.csv",
        "../data/Postings_16052019_31052019.csv",
    )
)
votes_1, votes_2 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        "../data/Votes_01052019_15052019.csv",
        "../data/Votes_16052019_31052019.csv",
    )
)

# Row-wise concatenation; the original row labels of each part are kept.
postings = pd.concat((postings_1, postings_2))
votes = pd.concat((votes_1, votes_2))

# Bundle handed to the dataset wrapper, keyed by data-set name.
datasets_dict = {"postings": postings, "votes": votes}

In [None]:
# Wrap the raw postings/votes frames in the project's dataset class.
# verbose=True presumably turns on progress/log output -- TODO confirm.
uu_dataset = UserUserDataset(
    datasets_dict=datasets_dict,
    verbose=True,
)

In [None]:
# Split the edges into train / validation / test index tensors.
# NOTE(review): how val_split_ratio and test_split_ratio combine is defined by
# the dataset class -- confirm. The setup cell later reads
# uu_dataset.train_sparse_edge_index / val_sparse_edge_index, so this call
# presumably populates them even with sparse=False -- confirm.
train_edge_index, val_edge_index, test_edge_index = uu_dataset.get_train_test_val_split(
    val_split_ratio=0.2,
    test_split_ratio=0.5,
    random_state=1,
    sparse=False,
)

In [None]:
# Ask the dataset for negative samples over the training edges (presumably one
# sampled negative per training edge -- confirm); the call returns three values.
# NOTE(review): source, target and negative_target are not referenced again in
# the visible part of the notebook -- looks like an exploratory check.
source, target, negative_target = uu_dataset.get_negative_samples(train_edge_index)

In [15]:
# Draw a single mini-batch of size 512 as a quick check of the sampler.
# NOTE(review): 512 duplicates BATCH_SIZE, which is only defined in a later
# cell; the training loop overwrites these three variables on every iteration.
user_indices, pos_item_indices, neg_item_indices = uu_dataset.get_random_mini_batches(512)

In [18]:
# Seed every global RNG the notebook imports before the model is created, so
# that parameter initialisation and any sampling drawing from these generators
# is repeatable across "Restart Kernel -> Run All".
SEED = 1  # same value as the random_state passed to the train/val/test split
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Instantiate LightGCN with the two entity counts reported by the dataset.
num_users, num_articles = uu_dataset.num_users, uu_dataset.num_articles
model = LightGCN(num_users, num_articles)

In [21]:
# Training hyperparameters (tune here; the setup and training cells read them).
ITERATIONS = 10_000        # number of optimisation steps in the training loop
BATCH_SIZE = 512           # size passed to get_random_mini_batches each step
LR = 0.001                 # initial learning rate handed to the optimizer
ITERS_PER_EVAL = 1_000     # run validation every this many iterations
ITERS_PER_LR_DECAY = 200   # step the LR scheduler every this many iterations
K = 100                    # cut-off used for recall@K / precision@K / ndcg@K
LAMBDA = 0.000001          # passed to bpr_loss / evaluation (presumably the regularisation weight -- confirm)

In [22]:
# Training setup: choose the device, move the model and the graph tensors onto
# it, and build the optimizer / learning-rate scheduler.

# Use the GPU when CUDA is available, otherwise the CPU
# (assign 'cpu' to device_name to force CPU execution).
if torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'
device = torch.device(device_name)
print(f"Using device {device}.")

# Model onto the device and into training mode.
model = model.to(device)
model.train()

# Adam on all model parameters; ExponentialLR multiplies the learning rate by
# gamma each time scheduler.step() is called.
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# Full edge set held by the dataset.
edge_index = uu_dataset.edge_index.to(device)

# Training edges: the index tensor from the split plus the sparse form stored
# on the dataset object.
train_edge_index = train_edge_index.to(device)
train_sparse_edge_index = uu_dataset.train_sparse_edge_index.to(device)

# Validation edges, same two representations.
val_edge_index = val_edge_index.to(device)
val_sparse_edge_index = uu_dataset.val_sparse_edge_index.to(device)

Using device cuda.


In [23]:
# training loop
#
# Each iteration runs the model on the sparse training graph, scores one random
# mini-batch of (user, positive item, negative item) indices with bpr_loss and
# takes one optimizer step. Every ITERS_PER_EVAL iterations the model is
# evaluated on the validation edges and both losses are recorded; every
# ITERS_PER_LR_DECAY iterations (except iteration 0) the LR scheduler is stepped.
#
# NOTE(review): in the recorded run the logged losses are negative and keep
# falling (train_loss about -0.69 at iteration 0, about -447 at iteration 9000).
# A standard BPR objective is non-negative, so bpr_loss is worth checking -- confirm.
# NOTE(review): evaluation fires at iterations 0, ITERS_PER_EVAL, 2*ITERS_PER_EVAL, ...
# so the weights after the very last step are never evaluated here.
train_losses = []  # train loss of the current batch, sampled at each evaluation point
val_losses = []    # validation loss at each evaluation point

# The loop variable is a notebook-level global; it is named `iteration` so it
# does not shadow the builtin iter() for every later cell.
for iteration in tqdm(range(ITERATIONS)):

    # forward propagation; the module is called directly (not .forward) so any
    # registered nn.Module hooks still run
    users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model(train_sparse_edge_index)

    # mini batching: sample indices, move them to the device, then narrow the
    # embedding tables to the sampled rows
    user_indices, pos_item_indices, neg_item_indices = uu_dataset.get_random_mini_batches(BATCH_SIZE)
    user_indices, pos_item_indices, neg_item_indices = user_indices.to(device), pos_item_indices.to(device), neg_item_indices.to(device)
    users_emb_final, users_emb_0 = users_emb_final[user_indices], users_emb_0[user_indices]
    pos_items_emb_final, pos_items_emb_0 = items_emb_final[pos_item_indices], items_emb_0[pos_item_indices]
    neg_items_emb_final, neg_items_emb_0 = items_emb_final[neg_item_indices], items_emb_0[neg_item_indices]

    # loss computation
    train_loss = bpr_loss(users_emb_final, users_emb_0, pos_items_emb_final,
                          pos_items_emb_0, neg_items_emb_final, neg_items_emb_0, LAMBDA)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    if iteration % ITERS_PER_EVAL == 0:
        model.eval()
        # Validation needs no gradients; no_grad avoids building an autograd
        # graph in case `evaluation` does not already disable it.
        with torch.no_grad():
            val_loss, recall, precision, ndcg = evaluation(
                model, val_edge_index, val_sparse_edge_index, [train_edge_index], K, LAMBDA)
        print(f"[Iteration {iteration}/{ITERATIONS}] train_loss: {round(train_loss.item(), 5)}, val_loss: {round(val_loss, 5)}, val_recall@{K}: {round(recall, 5)}, val_precision@{K}: {round(precision, 5)}, val_ndcg@{K}: {round(ndcg, 5)}")
        train_losses.append(train_loss.item())
        val_losses.append(val_loss)
        model.train()  # back to training mode for the next step

    # Decay the learning rate on a fixed cadence, skipping the very first iteration.
    if iteration % ITERS_PER_LR_DECAY == 0 and iteration != 0:
        scheduler.step()

  0%|          | 0/10000 [00:00<?, ?it/s]

33703


  0%|          | 3/10000 [03:33<153:43:52, 55.36s/it] 

[Iteration 0/10000] train_loss: -0.69219, val_loss: -0.48347, val_recall@100: 0.01776, val_precision@100: 0.00091, val_ndcg@100: 0.00562


 10%|▉         | 999/10000 [04:41<10:01, 14.97it/s]  

33703


 10%|█         | 1003/10000 [08:05<53:33:13, 21.43s/it]

[Iteration 1000/10000] train_loss: -44.31754, val_loss: -3.8935, val_recall@100: 0.2042, val_precision@100: 0.0099, val_ndcg@100: 0.07721


 20%|█▉        | 1999/10000 [09:13<09:29, 14.05it/s]   

33703


 20%|██        | 2003/10000 [12:37<47:46:52, 21.51s/it]

[Iteration 2000/10000] train_loss: -118.54932, val_loss: -11.38222, val_recall@100: 0.2121, val_precision@100: 0.01, val_ndcg@100: 0.08028


 30%|██▉       | 2999/10000 [13:46<08:33, 13.64it/s]   

33703


 30%|███       | 3003/10000 [17:11<41:50:33, 21.53s/it]

[Iteration 3000/10000] train_loss: -195.56348, val_loss: -18.76358, val_recall@100: 0.21428, val_precision@100: 0.01003, val_ndcg@100: 0.08089


 40%|███▉      | 3999/10000 [18:20<07:05, 14.09it/s]   

33703


 40%|████      | 4003/10000 [21:45<35:58:44, 21.60s/it]

[Iteration 4000/10000] train_loss: -271.21112, val_loss: -25.38278, val_recall@100: 0.21547, val_precision@100: 0.01005, val_ndcg@100: 0.08119


 50%|████▉     | 4999/10000 [22:54<06:09, 13.54it/s]   

33703


 50%|█████     | 5003/10000 [26:16<29:31:19, 21.27s/it]

[Iteration 5000/10000] train_loss: -304.68597, val_loss: -30.89661, val_recall@100: 0.21658, val_precision@100: 0.01006, val_ndcg@100: 0.08145


 60%|█████▉    | 5999/10000 [27:25<04:48, 13.85it/s]   

33703


 60%|██████    | 6003/10000 [30:46<23:30:28, 21.17s/it]

[Iteration 6000/10000] train_loss: -355.63431, val_loss: -35.38113, val_recall@100: 0.2166, val_precision@100: 0.01006, val_ndcg@100: 0.08153


 70%|██████▉   | 6999/10000 [31:54<03:40, 13.60it/s]   

33703


 70%|███████   | 7003/10000 [35:19<17:56:15, 21.55s/it]

[Iteration 7000/10000] train_loss: -399.13916, val_loss: -38.83504, val_recall@100: 0.21684, val_precision@100: 0.01007, val_ndcg@100: 0.08154


 80%|███████▉  | 7999/10000 [36:27<02:17, 14.59it/s]   

33703


 80%|████████  | 8003/10000 [39:49<11:49:13, 21.31s/it]

[Iteration 8000/10000] train_loss: -457.30225, val_loss: -41.64065, val_recall@100: 0.21689, val_precision@100: 0.01007, val_ndcg@100: 0.08161


 90%|████████▉ | 8999/10000 [40:56<01:08, 14.53it/s]   

33703


 90%|█████████ | 9003/10000 [44:17<5:51:19, 21.14s/it]

[Iteration 9000/10000] train_loss: -446.88718, val_loss: -43.8705, val_recall@100: 0.21698, val_precision@100: 0.01007, val_ndcg@100: 0.08166


100%|██████████| 10000/10000 [45:24<00:00,  3.67it/s] 
