In [28]:
import warnings
warnings.filterwarnings('ignore')
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import time
import os
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset,DataLoader

In [29]:
#check gpu device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

In [30]:
# import cluster
df_cluster = pd.read_csv("poi_cluster.csv")
df_cluster.head()

Unnamed: 0,poi_id,clusters
0,3fd66200f964a52000e71ee3,53
1,3fd66200f964a52001e81ee3,123
2,3fd66200f964a52003e51ee3,66
3,3fd66200f964a52003e71ee3,26
4,3fd66200f964a52004e41ee3,37


In [31]:
df_cluster.shape

(9989, 2)

In [32]:
# load poi sequential data
dir = 'E:\\Sebnewrepo/Rec_sys_lab/paper1_experiment/'
checkin_file = 'ny_ordered.csv'
df = pd.read_csv(dir + checkin_file)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,datetime
0,1,4abc1f51f964a520798620e3,4bf58dd8d48988d1ce941735,Seafood Restaurant,40.781558,-73.975792,-240,Wed Apr 04 23:31:31 +0000 2012,2012-04-04 23:31:31
1,1,4d4ac10da0ef54814b6ffff6,4bf58dd8d48988d157941735,American Restaurant,40.784018,-73.974524,-240,Sat Apr 07 17:42:24 +0000 2012,2012-04-07 17:42:24
2,1,4db44994cda1c57c82583709,4bf58dd8d48988d1f1931735,General Entertainment,40.739398,-73.99321,-240,Sun Apr 08 18:20:29 +0000 2012,2012-04-08 18:20:29
3,1,4a541923f964a52008b31fe3,4bf58dd8d48988d14e941735,American Restaurant,40.785677,-73.976498,-240,Sun Apr 08 20:02:10 +0000 2012,2012-04-08 20:02:10
4,1,40f1d480f964a5205b0a1fe3,4bf58dd8d48988d143941735,Breakfast Spot,40.719929,-74.008532,-240,Mon Apr 09 16:20:52 +0000 2012,2012-04-09 16:20:52


In [33]:
# remove infrequent items and users
from copy import deepcopy
def rm_infrequent_items(data, min_counts):
    df = deepcopy(data)
    counts = df['poi_id'].value_counts()
    df = df[df['poi_id'].isin(counts[counts >= min_counts].index)]
    print("POIs with < {} interactoins are removed".format(min_counts))
    return df
def rm_infrequent_users(data, min_counts):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]
    print("users with < {} interactoins are removed".format(min_counts))
    return df
          
filtered_df = rm_infrequent_users(df, 5)
filtered_df = rm_infrequent_items(filtered_df, 5)
print('num of users:{}, num of POIs:{}'.format(len(filtered_df['user_id'].unique()), len(filtered_df['poi_id'].unique())))

users with < 5 interactoins are removed
POIs with < 5 interactoins are removed
num of users:1083, num of POIs:9989


In [35]:
df = filtered_df.merge(df_cluster, how = 'left', on = 'poi_id')
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,datetime,clusters
0,1,4abc1f51f964a520798620e3,4bf58dd8d48988d1ce941735,Seafood Restaurant,40.781558,-73.975792,-240,Wed Apr 04 23:31:31 +0000 2012,2012-04-04 23:31:31,115
1,1,4d4ac10da0ef54814b6ffff6,4bf58dd8d48988d157941735,American Restaurant,40.784018,-73.974524,-240,Sat Apr 07 17:42:24 +0000 2012,2012-04-07 17:42:24,115
2,1,4db44994cda1c57c82583709,4bf58dd8d48988d1f1931735,General Entertainment,40.739398,-73.99321,-240,Sun Apr 08 18:20:29 +0000 2012,2012-04-08 18:20:29,143
3,1,4a541923f964a52008b31fe3,4bf58dd8d48988d14e941735,American Restaurant,40.785677,-73.976498,-240,Sun Apr 08 20:02:10 +0000 2012,2012-04-08 20:02:10,109
4,1,40f1d480f964a5205b0a1fe3,4bf58dd8d48988d143941735,Breakfast Spot,40.719929,-74.008532,-240,Mon Apr 09 16:20:52 +0000 2012,2012-04-09 16:20:52,151


In [36]:
df_input = pd.DataFrame({
    'user_id': df['user_id'],  # user_id offset by 1
    'cluster_id': df['clusters'] - 1,
    #'implicit': np.ones(179468)
})
df_input.head()

Unnamed: 0,user_id,cluster_id
0,1,114
1,1,114
2,1,142
3,1,108
4,1,150


In [37]:
df_input['user_id'].nunique()

1083

In [38]:
df_input['cluster_id'].nunique()

200

In [39]:
from copy import deepcopy 
def convert_data(data):
    df = deepcopy(data)
    data = df.groupby('user_id')['cluster_id'].apply(list)
    unique_data = df.groupby('user_id')['cluster_id'].nunique()
    print(data[:10])
    print(len(data))
    return data

In [40]:
seq_data = convert_data(df_input)

user_id
1     [114, 114, 142, 108, 150, 171, 59, 23, 59, 197...
2     [102, 51, 40, 99, 3, 172, 81, 81, 65, 81, 65, ...
3     [131, 0, 61, 172, 43, 29, 180, 114, 0, 52, 61,...
4     [124, 124, 124, 124, 124, 0, 124, 52, 33, 110,...
5     [168, 168, 0, 50, 79, 50, 0, 50, 51, 51, 51, 1...
6     [44, 14, 44, 164, 82, 120, 44, 13, 4, 82, 44, ...
7     [0, 42, 42, 33, 114, 0, 81, 69, 8, 83, 0, 79, ...
8     [81, 81, 81, 37, 81, 81, 81, 81, 0, 61, 81, 81...
9     [39, 183, 81, 108, 39, 133, 10, 100, 141, 51, ...
10    [36, 0, 0, 170, 131, 0, 65, 65, 6, 20, 48, 81,...
Name: cluster_id, dtype: object
1083


## Attention Encoder Kernel

In [41]:
class att_encoder(nn.Module):
    def __init__(self, num_users, num_items, model_args, device):
        super(att_encoder, self).__init__()

        self.args = model_args

        # init args
        L = self.args.L
        dims = self.args.d

        # user and item embeddings
        self.user_embeddings = nn.Embedding(num_users, dims).to(device)
        self.item_embeddings = nn.Embedding(num_items, dims).to(device)

        self.att_weight_item = Variable(torch.zeros(dims, 1).type(torch.FloatTensor), requires_grad=True).to(device)
        self.att_weight_user = Variable(torch.zeros(dims, L).type(torch.FloatTensor), requires_grad=True).to(device)
        self.att_weight_item = torch.nn.init.xavier_uniform_(self.att_weight_item)
        self.att_weight_user = torch.nn.init.xavier_uniform_(self.att_weight_user)

        self.W2 = nn.Embedding(num_items, dims, padding_idx=0).to(device)
        self.b2 = nn.Embedding(num_items, 1, padding_idx=0).to(device)

        # weight initialization
        self.user_embeddings.weight.data.normal_(0, 1.0 / self.user_embeddings.embedding_dim)
        self.item_embeddings.weight.data.normal_(0, 1.0 / self.item_embeddings.embedding_dim)
        self.W2.weight.data.normal_(0, 1.0 / self.W2.embedding_dim)
        self.b2.weight.data.zero_()

    def forward(self, item_seq, user_ids, items_to_predict, for_pred=False):
        item_embs = self.item_embeddings(item_seq)
        user_emb = self.user_embeddings(user_ids)
        #print(user_emb.mm(self.att_weight_user).shape)
        #attention score
        attention_score = torch.sigmoid(torch.matmul(item_embs, self.att_weight_item.unsqueeze(0)).squeeze() +
                                        user_emb.mm(self.att_weight_user))
        union_out = torch.sum(item_embs, dim=1)
        #print(union_out.shape)
        union_out = union_out * torch.sum(attention_score, dim=1).unsqueeze(1) 
        w2 = self.W2(items_to_predict)
        b2 = self.b2(items_to_predict)
         
        if for_pred:
            w2 = w2.squeeze()
            b2 = b2.squeeze()

            # MF
            #print('to predict', items_to_predict.shape)
            res = user_emb.mm(w2.t()) + b2
            res += union_out.mm(w2.t())
            #print('res',res.shape)

        else:
            # MF
            res = torch.baddbmm(b2, w2, user_emb.unsqueeze(2)).squeeze()
            res += torch.bmm(union_out.unsqueeze(1), w2.permute(0, 2, 1)).squeeze()
            #print('to train', res.shape)
        return res

In [42]:
from interactions import Interactions
from eval_metrics import *

import argparse
import logging
from time import time
import datetime

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


In [43]:
def evaluation(model, train, test_set, topk=20):
    num_users = train.num_users
    num_items = train.num_items
    batch_size = 1024
    num_batches = int(num_users / batch_size) + 1
    user_indexes = np.arange(num_users)
    item_indexes = np.arange(num_items)
    pred_list = None
    train_matrix = train.tocsr()
    test_sequences = train.test_sequences.sequences

    for batchID in range(num_batches):
        start = batchID * batch_size
        end = start + batch_size

        if batchID == num_batches - 1:
            if start < num_users:
                end = num_users
            else:
                break

        batch_user_index = user_indexes[start:end]

        batch_test_sequences = test_sequences[batch_user_index]
        batch_test_sequences = np.atleast_2d(batch_test_sequences)

        batch_test_sequences = torch.from_numpy(batch_test_sequences).type(torch.LongTensor).to(device)
        item_ids = torch.from_numpy(item_indexes).type(torch.LongTensor).to(device)
        #print('item_ids', item_ids.shape)
        batch_user_ids = torch.from_numpy(np.array(batch_user_index)).type(torch.LongTensor).to(device)

        rating_pred = model(batch_test_sequences, batch_user_ids, item_ids, True)
        rating_pred = rating_pred.cpu().data.numpy().copy()
        rating_pred[train_matrix[batch_user_index].toarray() > 0] = 0
        ind = np.argpartition(rating_pred, -topk)
        ind = ind[:, -topk:]
        arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]
        arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]
        batch_pred_list = ind[np.arange(len(rating_pred))[:, None], arr_ind_argsort]
        #print('batch_pred_list', batch_pred_list)

        if batchID == 0:
            pred_list = batch_pred_list
        else:
            pred_list = np.append(pred_list, batch_pred_list, axis=0)

    precision, recall, MAP, ndcg = [], [], [], []
    for k in [5, 10, 15, 20]:
        precision.append(precision_at_k(test_set, pred_list, k))
        recall.append(recall_at_k(test_set, pred_list, k))
        MAP.append(mapk(test_set, pred_list, k))
        ndcg.append(ndcg_k(test_set, pred_list, k))

    return precision, recall, MAP, ndcg


def negsamp_vectorized_bsearch_preverif(pos_inds, n_items, n_samp=32):
    """ Pre-verified with binary search
    `pos_inds` is assumed to be ordered
    reference: https://tech.hbc.com/2018-03-23-negative-sampling-in-numpy.html
    """
    raw_samp = np.random.randint(0, n_items - len(pos_inds), size=n_samp)
    pos_inds_adj = pos_inds - np.arange(len(pos_inds))
    neg_inds = raw_samp + np.searchsorted(pos_inds_adj, raw_samp, side='right')
    return neg_inds


def generate_negative_samples(train_matrix, num_neg=3, num_sets=10):
    neg_samples = []
    for user_id, row in enumerate(train_matrix):
        pos_ind = row.indices
        neg_sample = negsamp_vectorized_bsearch_preverif(pos_ind, train_matrix.shape[1], num_neg * num_sets)
        neg_samples.append(neg_sample)

    return np.asarray(neg_samples).reshape(num_sets, train_matrix.shape[0], num_neg)


def train_model(model, train_data, test_data, config):
    num_users = train_data.num_users
    num_items = train_data.num_items

    # convert to sequences, targets and users
    sequences_np = train_data.sequences.sequences
    #print(sequences_np)
    targets_np = train_data.sequences.targets
    users_np = train_data.sequences.user_ids
    train_matrix = train_data.tocsr()

    n_train = sequences_np.shape[0]
    logger.info("Total training records:{}".format(n_train))

    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.l2)

    record_indexes = np.arange(n_train)
    batch_size = config.batch_size
    num_batches = int(n_train / batch_size) + 1
    for epoch_num in range(config.n_iter):

        t1 = time()

        # set model to training mode
        model.train()

        np.random.shuffle(record_indexes)

        t_neg_start = time()
        negatives_np_multi = generate_negative_samples(train_matrix, config.neg_samples, config.sets_of_neg_samples)
        logger.info("Negative sampling time: {}s".format(time() - t_neg_start))

        epoch_loss = 0.0
        for batchID in range(num_batches):
            start = batchID * batch_size
            end = start + batch_size

            if batchID == num_batches - 1:
                if start < n_train:
                    end = n_train
                else:
                    break

            batch_record_index = record_indexes[start:end]

            batch_users = users_np[batch_record_index]
            batch_sequences = sequences_np[batch_record_index]
            batch_targets = targets_np[batch_record_index]
            negatives_np = negatives_np_multi[batchID % config.sets_of_neg_samples]
            batch_neg = negatives_np[batch_users]

            batch_users = torch.from_numpy(batch_users).type(torch.LongTensor).to(device)
            batch_sequences = torch.from_numpy(batch_sequences).type(torch.LongTensor).to(device)
            batch_targets = torch.from_numpy(batch_targets).type(torch.LongTensor).to(device)
            batch_negatives = torch.from_numpy(batch_neg).type(torch.LongTensor).to(device)

            items_to_predict = torch.cat((batch_targets, batch_negatives), 1)
            prediction_score = model(batch_sequences, batch_users, items_to_predict, False)
            #print(prediction_score.shape)
            (targets_prediction, negatives_prediction) = torch.split(
                prediction_score, [batch_targets.size(1), batch_negatives.size(1)], dim=1)

            # compute the BPR loss
            loss = -torch.log(torch.sigmoid(targets_prediction - negatives_prediction) + 1e-8)
            loss = torch.mean(torch.sum(loss))

            epoch_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # clean the grad, 
            #optimizer.zero_grad()
        epoch_loss /= num_batches

        t2 = time()

        output_str = "Epoch %d [%.1f s]  loss=%.4f" % (epoch_num + 1, t2 - t1, epoch_loss)
        logger.info(output_str)

        if (epoch_num + 1) % 20 == 0:
            model.eval()
            precision, recall, MAP, ndcg = evaluation(model, train_data, test_data, topk=20)
            logger.info(', '.join(str(e) for e in precision))
            logger.info(', '.join(str(e) for e in recall))
            logger.info(', '.join(str(e) for e in MAP))
            logger.info(', '.join(str(e) for e in ndcg))
            logger.info("Evaluation time:{}".format(time() - t2))
    logger.info("\n")
    logger.info("\n")
    torch.save(model.state_dict(), 'car.pkl')
    print('model arg save complete')
    

In [44]:
# split train test data        
def split_data_sequentially(user_records, test_radio=0.2):
    train_set = []
    test_set = []

    for item_list in user_records:
        len_list = len(item_list)
        num_test_samples = int(math.ceil(len_list * test_radio))
        train_sample = []
        test_sample = []
        for i in range(len_list - num_test_samples, len_list):
            test_sample.append(item_list[i])
            
        for place in item_list:
            if place not in set(test_sample):
                train_sample.append(place)
                
        train_set.append(train_sample)
        test_set.append(test_sample)

    return train_set, test_set
    

def generate_dataset(seq_data):
    user_records = seq_data.tolist()
    # split dataset
    train_val_set, test_set = split_data_sequentially(user_records, test_radio=0.2)
    train_set, val_set = split_data_sequentially(train_val_set, test_radio=0.1)

    return train_set, val_set, train_val_set, test_set, 1083, 200

In [45]:
train_set, val_set, train_val_set, test_set, num_users, num_items = generate_dataset(seq_data)

In [46]:
parser = argparse.ArgumentParser()

# data arguments
parser.add_argument('--L', type=int, default=5)
parser.add_argument('--T', type=int, default=3)
# train arguments
parser.add_argument('--n_iter', type=int, default=200)
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--batch_size', type=int, default=4096)
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--l2', type=float, default=1e-3)
parser.add_argument('--neg_samples', type=int, default=3)
parser.add_argument('--sets_of_neg_samples', type=int, default=50)

# model dependent arguments
parser.add_argument('--d', type=int, default=50)
config = parser.parse_args(
    args = [
        '--L', '5',
        '--T', '3',
        '--n_iter', '100',
        '--seed', '1200',
        '--batch_size', '500',
        '--learning_rate', '0.001',
        '--l2', '0.001',
        '--neg_samples', '3',
        '--sets_of_neg_samples', '30'
    ])

In [47]:
train = Interactions(train_val_set, num_users, num_items)
train.to_sequence(config.L, config.T)

logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
logger.info(config)
model = att_encoder(num_users, num_items, config, device).to(device)
train_model(model, train, test_set, config)

INFO:__main__:2021-06-15 21:36:00
INFO:__main__:Namespace(L=5, T=3, batch_size=500, d=50, l2=0.001, learning_rate=0.001, n_iter=100, neg_samples=3, seed=1200, sets_of_neg_samples=30)
INFO:__main__:Total training records:37352
INFO:__main__:Negative sampling time: 0.0619509220123291s
INFO:__main__:Epoch 1 [0.4 s]  loss=881.5618
INFO:__main__:Negative sampling time: 0.05385589599609375s
INFO:__main__:Epoch 2 [0.3 s]  loss=665.5214
INFO:__main__:Negative sampling time: 0.053856611251831055s
INFO:__main__:Epoch 3 [0.3 s]  loss=600.8601
INFO:__main__:Negative sampling time: 0.052858829498291016s
INFO:__main__:Epoch 4 [0.3 s]  loss=557.6674
INFO:__main__:Negative sampling time: 0.05388379096984863s
INFO:__main__:Epoch 5 [0.3 s]  loss=524.6755
INFO:__main__:Negative sampling time: 0.05385565757751465s
INFO:__main__:Epoch 6 [0.3 s]  loss=502.7089
INFO:__main__:Negative sampling time: 0.05285763740539551s
INFO:__main__:Epoch 7 [0.3 s]  loss=484.9997
INFO:__main__:Negative sampling time: 0.05385

INFO:__main__:Epoch 64 [0.3 s]  loss=221.0240
INFO:__main__:Negative sampling time: 0.05385541915893555s
INFO:__main__:Epoch 65 [0.3 s]  loss=215.0292
INFO:__main__:Negative sampling time: 0.05385565757751465s
INFO:__main__:Epoch 66 [0.3 s]  loss=216.9201
INFO:__main__:Negative sampling time: 0.05382394790649414s
INFO:__main__:Epoch 67 [0.3 s]  loss=212.5184
INFO:__main__:Negative sampling time: 0.05285763740539551s
INFO:__main__:Epoch 68 [0.3 s]  loss=206.8478
INFO:__main__:Negative sampling time: 0.052828311920166016s
INFO:__main__:Epoch 69 [0.3 s]  loss=207.5565
INFO:__main__:Negative sampling time: 0.05385541915893555s
INFO:__main__:Epoch 70 [0.3 s]  loss=207.0263
INFO:__main__:Negative sampling time: 0.05283093452453613s
INFO:__main__:Epoch 71 [0.3 s]  loss=203.9979
INFO:__main__:Negative sampling time: 0.05283021926879883s
INFO:__main__:Epoch 72 [0.3 s]  loss=200.8496
INFO:__main__:Negative sampling time: 0.05382800102233887s
INFO:__main__:Epoch 73 [0.3 s]  loss=199.2951
INFO:__m

model arg save complete


## Output Encoder Results

In [48]:
best_encoder = att_encoder(num_users, num_items, config, device).to(device)

In [49]:
best_encoder.load_state_dict(torch.load('car.pkl'))

<All keys matched successfully>

In [50]:
pred_1 = Interactions(seq_data.to_list(), num_users, num_items)
pred_1.to_sequence(config.L, config.T)

In [51]:
def encoder_output(model, dataset):
    num_users = dataset.num_users
    num_items = dataset.num_items
    batch_size = 1083
    num_batches = int(num_users / batch_size) + 1
    user_indexes = np.arange(num_users)
    item_indexes = np.arange(num_items)
    pred_list = None
    dataset_matrix = dataset.tocsr()
    dataset_sequences = dataset.sequences.sequences

    for batchID in range(num_batches):
        start = batchID * batch_size
        end = start + batch_size

        if batchID == num_batches - 1:
            if start < num_users:
                end = num_users
            else:
                break
        

        batch_user_index = user_indexes[start:end]

        batch_test_sequences = dataset_sequences[batch_user_index]
        batch_test_sequences = np.atleast_2d(batch_test_sequences)

        batch_test_sequences = torch.from_numpy(batch_test_sequences).type(torch.LongTensor).to(device)
        item_ids = torch.from_numpy(item_indexes).type(torch.LongTensor).to(device)
        batch_user_ids = torch.from_numpy(np.array(batch_user_index)).type(torch.LongTensor).to(device)

        rating_pred = model(batch_test_sequences, batch_user_ids, item_ids, True)
        rating_pred = rating_pred.cpu().data.numpy().copy()
    return rating_pred

In [52]:
output = encoder_output(best_encoder, pred_1)
output.shape

(1083, 200)

In [53]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

In [54]:
save_obj(output, 'clusters_score')