In [1]:
import torch

TORCH = torch.__version__.split('+')[0]
CUDA = 'cu' + torch.version.cuda.replace('.', '')
device = torch.device("cuda")

In [2]:
#!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
#!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
#!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
#!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric



In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from torch.utils.data import DataLoader, Dataset
from torch_geometric.nn import GCNConv, GATConv, GraphConv, SAGEConv
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset
from IPython import embed
from statistics import mean
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import from_scipy_sparse_matrix
from scipy.sparse import identity, coo_matrix

In [4]:
%load_ext tensorboard

In [5]:
import os
logs_dir = "runs"
os.makedirs(logs_dir, exist_ok=True)

In [6]:
from torch.utils.tensorboard import SummaryWriter

tb_gcn = SummaryWriter(log_dir=f'{logs_dir}/{logs_dir}_NGCF/')

In [7]:
if not os.path.exists('ml-100k'):
    !wget "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
    !unzip "ml-100k.zip"

In [8]:
def ml100k_dataset(path='ml-100k'):

    if not os.path.exists(os.path.join(path, 'interactions.csv')):

        df = pd.read_csv(os.path.join(path, "u.data"), delimiter='\t',
                         names=['user','item','rating','timestamp'])

        df['user'] = pd.Categorical(df['user']).codes
        df['item'] = pd.Categorical(df['item']).codes
        df['rating'] = 1

        df.to_csv(os.path.join(path, 'interactions.csv'), index=False)
    else:
        df = pd.read_csv(os.path.join(path, 'interactions.csv'))
    return df

df = ml100k_dataset()

In [9]:
df

Unnamed: 0,user,item,rating,timestamp
0,195,241,1,881250949
1,185,301,1,891717742
2,21,376,1,878887116
3,243,50,1,880606923
4,165,345,1,886397596
...,...,...,...,...
99995,879,475,1,880175444
99996,715,203,1,879795543
99997,275,1089,1,874795795
99998,12,224,1,882399156


In [10]:
def split_train_test(data, n_users):
    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame(data, columns=['user_id', 'item_id', 'timestamp'])

    train_x = []
    test_x = []

    for u in trange(n_users, desc='Splitting train/test and removing timestamp...'):
        user_data = df[df['user_id'] == u]
        if user_data.empty:
            continue

        user_data = user_data.sort_values(by='timestamp')

        if len(user_data) == 1:
            train_x.append(user_data.iloc[0][['user_id', 'item_id']].values)
        else:
            train_x.append(user_data.iloc[:-1][['user_id', 'item_id']].values)
            test_x.append(user_data.iloc[-1][['user_id', 'item_id']].values)

    return np.vstack(train_x), np.vstack(test_x)

##### SHOULD CHANGE THE DATA VALUE ####
data = df[['user', 'item', 'timestamp']].astype('int32').to_numpy()
n_users = len(np.unique(data[:, 0]))

train_data, test_data = split_train_test(data, n_users)

Splitting train/test and removing timestamp...: 100%|██████████| 943/943 [00:03<00:00, 257.12it/s]


In [11]:
train_data

array([[  0, 171],
       [  0, 167],
       [  0, 164],
       ...,
       [942, 229],
       [942, 228],
       [942, 448]], dtype=int32)

In [12]:
# dims = [min_item_id, max_item_id, max_user_id]
add_dims=0
for i in range(data.shape[1] - 1):

    data[:, i] -= np.min(data[:, i])
    data[:, i] += add_dims
    add_dims = np.max(data[:, i]) + 1

dims = np.max(data, axis=0) + 1

In [13]:
assert train_data.shape[0] + test_data.shape[0] == 100000

In [14]:
train_data = train_data[:, :2]
dims = dims[:2]
# dims = [min_item_id, max_item_id, max_user_id]

In [17]:
def construct_interaction_mat(n_features, data):
    train_mat = sp.dok_matrix((n_features, n_features), dtype=np.float32)
    for x in tqdm(data, desc=f"BUILDING ADJACENCY MATRIX..."):
        train_mat[x[0], x[1]] = 1.0
        train_mat[x[1], x[0]] = 1.0

        if data.shape[1] > 2:
            for idx in range(len(x[2:])):
                train_mat[x[0], x[2 + idx]] = 1.0
                train_mat[x[1], x[2 + idx]] = 1.0
                train_mat[x[2 + idx], x[0]] = 1.0
                train_mat[x[2 + idx], x[1]] = 1.0
    return train_mat

In [18]:
def ng_sample(data, dims, num_ng=4):
    rating_mat = construct_interaction_mat(dims[-1], data)
    interactions = []
    min_item, max_item = dims[0], dims[1]

    data_df = pd.DataFrame(data, columns=['user_id', 'item_id'])

    for _, row in tqdm(data_df.iterrows(), desc='PERFORMING NEGATIVE SAMPLING...', total=len(data_df)):
        user, item = int(row['user_id']), int(row['item_id'])
        interactions.append([user, item, 1])

        for _ in range(num_ng):
            j = np.random.randint(min_item, max_item)
            while rating_mat[user, j] != 0 or j == item:
                j = np.random.randint(min_item, max_item)
            interactions.append([user, j, 0])

    interactions = np.array(interactions)
    return interactions, rating_mat

train_data, rating_mat = ng_sample(train_data, dims)

BUILDING ADJACENCY MATRIX...: 100%|██████████| 99057/99057 [00:01<00:00, 52726.35it/s]
PERFORMING NEGATIVE SAMPLING...: 100%|██████████| 99057/99057 [00:17<00:00, 5630.50it/s]


In [19]:
train_data

array([[   0,  171,    1],
       [   0, 1452,    0],
       [   0, 2242,    0],
       ...,
       [ 942, 1799,    0],
       [ 942, 1615,    0],
       [ 942, 1090,    0]])

In [20]:
rating_mat

<2625x2625 sparse matrix of type '<class 'numpy.float32'>'
	with 188061 stored elements in Dictionary Of Keys format>

In [None]:
###
def ng_sample(data, dims, num_ng=4):
    rating_mat = construct_interaction_mat(dims[-1], data)
    interactions = []
    min_item, max_item = dims[0], dims[1]
    for num, x in tqdm(enumerate(data), desc='perform negative sampling...'):
        interactions.append(np.append(x, 1))
        for t in range(num_ng):
            j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            # IDEA: Loop to exclude true interactions (set to 1 in adj_train) user - item
            while (x[0], j) in rating_mat or j == int(x[1]):
                j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            interactions.append(np.concatenate([[x[0], j], x[2:], [0]]))
    return np.vstack(interactions), rating_mat

train_data, rating_mat = ng_sample(train_data, dims)

In [21]:
class PointData(Dataset):
    def __init__(self, train_data, dims):
        super(PointData, self).__init__()
        self.interactions = train_data
        self.dims = dims

    def __len__(self):
        return len(self.interactions)

    def __getitem__(self, index):

        return self.interactions[index][:-1], self.interactions[index][-1]

train_dataset = PointData(train_data, dims)

In [23]:
def compute_zero_positions_and_items(rating_mat, dims):
    # Convert sparse matrix to dense array
    dense_matrix = rating_mat.A

    # Find zero positions
    zero_positions = np.argwhere(dense_matrix == 0)

    # Initialize items2compute list
    compute_items = []

    # Process users to find items to compute
    for user in trange(dims[0], desc='Computing items to compute...'):
        # Mask to find the zeros for the current user
        user_mask = (zero_positions[:, 0] == user)
        # Extract the item indices where the value is zero and is an item (greater than or equal to dims[0])
        items_for_user = zero_positions[user_mask, 1]
        items_to_compute = items_for_user[items_for_user >= dims[0]]
        compute_items.append(items_to_compute)

    return compute_items

compute_items = compute_zero_positions_and_items(rating_mat, dims)

Computing items to compute...: 100%|██████████| 943/943 [00:17<00:00, 54.25it/s]


In [25]:
def build_test_set(itemsnoninteracted, gt_test_interactions):
    #max_users, max_items = dims # number users (943), number items (2625)
    test_set = []
    for pair, negatives in tqdm(zip(gt_test_interactions, itemsnoninteracted), desc="BUILDING TEST SET..."):
        # APPEND TEST SETS FOR SINGLE USER
        negatives = np.delete(negatives, np.where(negatives == pair[1]))
        single_user_test_set = np.vstack([pair, ] * (len(negatives)+1))
        single_user_test_set[:, 1][1:] = negatives
        test_set.append(single_user_test_set.copy())
    return test_set

test_dataset = build_test_set(compute_items, test_data)

# Build A in an "edge index" format using torch_sparse library
edge_idx, edge_attr = from_scipy_sparse_matrix(rating_mat)

BUILDING TEST SET...: 943it [00:01, 688.44it/s]


In [26]:
# Build features or side-information (identity tensor for the moment) using an alternative method
def build_sparse_mat_to_sparse_tensor(size):
    """ Build identity features as a torch sparse tensor using PyTorch's native COO format."""
    identity_mat = identity(size).tocoo().astype(np.float32)
    indices = torch.tensor([identity_mat.row, identity_mat.col], dtype=torch.int64)
    values = torch.tensor(identity_mat.data, dtype=torch.float32)
    shape = torch.Size(identity_mat.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

X = build_sparse_mat_to_sparse_tensor(rating_mat.shape[0])

  indices = torch.tensor([identity_mat.row, identity_mat.col], dtype=torch.int64)


In [27]:
edge_idx

tensor([[  0, 171,   0,  ..., 228, 942, 448],
        [171,   0, 167,  ..., 942, 448, 942]])

In [28]:
class FM_operation(torch.nn.Module):

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        square_of_sum = torch.sum(x, dim=1) ** 2
        sum_of_square = torch.sum(x ** 2, dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix

In [29]:
class GCELayer(torch.nn.Module):
    def __init__(self, field_dims, embed_dim, features, train_mat, attention=False, use_additional_gnn=False):
        super(GCELayer, self).__init__()

        self.A = train_mat
        self.features = features
        self.use_additional_gnn = use_additional_gnn

        if attention:
            self.GCN_module = GATConv(int(field_dims), embed_dim, heads=8, dropout=0.6)
        else:
            self.GCN_module = GCNConv(field_dims, embed_dim)

        if use_additional_gnn:
            self.gnn_layer1 = GraphConv(embed_dim, embed_dim)
            self.gnn_layer2 = SAGEConv(embed_dim, embed_dim)
            self.dropout = torch.nn.Dropout(0.5)
            self.activation = torch.nn.ReLU()

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        out = self.GCN_module(self.features, self.A)
        if self.use_additional_gnn:
            out = self.gnn_layer1(out, self.A)
            out = self.activation(out)
            out = self.dropout(out)
            out = self.gnn_layer2(out, self.A)
            out = self.activation(out)
        return out[x]

class FactorizationMachineModel_withGCN(torch.nn.Module):

    def __init__(self, field_dims, embed_dim, X, A, attention=False, use_additional_gnn=False):
        super(FactorizationMachineModel_withGCN, self).__init__()
        self.linear = torch.nn.Linear(len(field_dims), 1)
        self.embedding = GCELayer(field_dims[-1], embed_dim, X, A, attention=attention, use_additional_gnn=use_additional_gnn)
        self.fm = FM_operation(reduce_sum=True)

    def forward(self, interaction_pairs):
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        """
        linear_out = self.linear(interaction_pairs.float())
        embedding_out = self.embedding(interaction_pairs)
        fm_out = self.fm(embedding_out)
        return (linear_out + fm_out).squeeze(1)

    def predict(self, interactions, device):
        # Return the score, inputs are numpy arrays, outputs are tensors
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        output_scores = self.forward(test_interactions)
        return output_scores

In [30]:
data_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)

In [34]:
def getHitRatio(recommend_list, gt_item):
    return int(gt_item in recommend_list)

def getNDCG(recommend_list, gt_item):
    if gt_item in recommend_list:
        index = recommend_list.index(gt_item)
        return 1 / (torch.log2(index + 2))
    return 0

def train_and_evaluate(model, optimizer, criterion, data_loader, device, test_data, tb, epochs=10, topk=10):
    for epoch_i in trange(epochs, desc='Training'):
        # Training phase
        model.train()
        total_loss = []
        for interactions, targets in data_loader:
            interactions = interactions.to(device)
            targets = targets.to(device)

            predictions = model(interactions)
            loss = criterion(predictions, targets.float())
            model.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss.append(loss.item())

        train_loss = mean(total_loss)

        # Evaluation phase
        model.eval()
        HR, NDCG = [], []
        with torch.no_grad():
            for user_test in test_dataset:
                gt_item = user_test[0][1]
                predictions = model.predict(user_test, device)
                _, indices = torch.topk(predictions, topk)
                recommend_list = user_test[indices.cpu().numpy()][:, 1]

                HR.append(getHitRatio(recommend_list, gt_item))
                NDCG.append(getNDCG(recommend_list, gt_item))

        hr = mean(HR)
        ndcg = mean(NDCG)

        # Logging
        tb.add_scalar('train/loss', train_loss, epoch_i)
        tb.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
        tb.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)

        print(f'Epoch {epoch_i}: Training loss = {train_loss:.4f} | HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f}')

In [36]:
model_GCN = FactorizationMachineModel_withGCN(train_dataset.dims,
                                              32,
                                              X.to(device),
                                              edge_idx.to(device),
                                              ).to(device)

GCN_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
GCN_optimizer = torch.optim.Adam(params=model_GCN.parameters(), lr=0.01)

#model_GAT = FactorizationMachineModel_withGCN(train_dataset.dims,
#                                                  32,
#                                                  X.to(device),
#                                                  edge_idx.to(device),
#                                                  attention=True
#                                                  ).to(device)

#GAT_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
#GAT_optimizer = torch.optim.Adam(params=model_GAT.parameters(), lr=0.01)

In [37]:
train_and_evaluate(model_GCN, GCN_optimizer, GCN_criterion, data_loader, device, test_dataset, tb_gcn)

Training:   0%|          | 0/10 [00:10<?, ?it/s]


AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [None]:
%tensorboard --logdir runs