In [71]:
import warnings
warnings.filterwarnings('ignore')
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import time
import os
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset,DataLoader

### Load Data

In [72]:
# load data
dir = 'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'
checkin_file = 'dataset_TSMC2014_NYC.txt'
col = ['user_id',
       'poi_id',
       'poi_category_id',
       'poi_category_name',
       'latitude', 
       'longitude',
       'time_offset',
       'UTC_time']
df = pd.read_csv(dir + checkin_file, delimiter = "\t", names = col)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [73]:
# remove infrequent items and users
from copy import deepcopy
def rm_infrequent_items(data, min_counts):
    df = deepcopy(data)
    counts = df['poi_id'].value_counts()
    df = df[df['poi_id'].isin(counts[counts >= min_counts].index)]
    print("POIs with < {} interactoins are removed".format(min_counts))
    return df
def rm_infrequent_users(data, min_counts):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]
    print("users with < {} interactoins are removed".format(min_counts))
    return df
          
filtered_df = rm_infrequent_users(df, 5)
filtered_df = rm_infrequent_items(filtered_df, 5)
print('num of users:{}, num of POIs:{}'.format(len(filtered_df['user_id'].unique()), len(filtered_df['poi_id'].unique())))

users with < 5 interactoins are removed
POIs with < 5 interactoins are removed
num of users:1083, num of POIs:9989


In [74]:
# POIs encode, and generate encode mapping
poi_cat = pd.Categorical(filtered_df['poi_id'])
poi_encode = poi_cat.codes
#generate poi mapping table
poi_mapping = pd.DataFrame({
    'poi_encode': poi_encode,
    'poi_id': filtered_df['poi_id']
    })
#drop duplicate
poi_mapping_output = poi_mapping.drop_duplicates()
filtered_df['poi_encode'] = poi_encode
filtered_df.drop(['poi_id'], axis = 1, inplace = True)
filtered_df.head(5)

Unnamed: 0,user_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,poi_encode
0,470,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,1230
1,979,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,1879
2,69,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,6161
4,87,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,6859
5,484,4bf58dd8d48988d118951735,Food & Drink Shop,40.690427,-73.954687,-240,Tue Apr 03 18:04:00 +0000 2012,4017


In [75]:
# convert to sequential data per user
df_ordered = filtered_df.groupby('user_id').apply(pd.DataFrame.sort_values, 'UTC_time')
df_input = pd.DataFrame({
    'user_id': df_ordered['user_id'],
    'poi_id': df_ordered['poi_encode'],
    'implicit': np.ones(179468)
})

In [76]:
df_input = df_input.reset_index(drop = True)
df_input

Unnamed: 0,user_id,poi_id,implicit
0,1,9066,1.0
1,1,405,1.0
2,1,773,1.0
3,1,678,1.0
4,1,499,1.0
...,...,...,...
179463,1083,1956,1.0
179464,1083,672,1.0
179465,1083,8703,1.0
179466,1083,1956,1.0


### Train Test split

In [77]:
def neg_sample_item(num_item, neg_num,item_list):
    neg_list = []
    while len(neg_list)<neg_num:
        neg_item = np.random.choice(num_item, 1)[0]
        while neg_item in item_list:
            neg_item = np.random.choice(num_item, 1)[0]
        neg_list.append(neg_item)
    return neg_list

def generate_train_test_data(data, neg_num):
    # user rating item
    num_item = len(data['poi_id'].unique())
    
    train = []
    test = []
    # split data
    for uid in data['user_id'].unique():
        item_list = data[data['user_id']==uid]['poi_id'].tolist()
        for i in range(len(item_list)-8):
            item_seq = item_list[i:i+8]
            if i == len(item_list)-9:
                neg_list = neg_sample_item(num_item, neg_num,item_list)
                result_slice = [uid] + item_seq + neg_list
                test.append(result_slice)
            else:
                neg_list = neg_sample_item(num_item, neg_num,item_list)
                result_slice = [uid] + item_seq + neg_list
                train.append(result_slice)
    return train, test

In [78]:
%%time
train_data, test_data = generate_train_test_data(df_input, 3)

train_data = torch.from_numpy(np.array(train_data))
test_data = torch.from_numpy(np.array(test_data))
train_x = train_data[:,:6]
train_y = train_data[:,6:]

# construct dataset for train test
train_dataset = TensorDataset(train_x, train_y)
dataloader = DataLoader(dataset=train_dataset, batch_size=512, shuffle=True)

Wall time: 21.5 s


### self attention model

In [80]:
class SelfAttenion(nn.Module):
    def __init__(self, embedding_dim):
        """
        embeding_dim: int, laten vector dim of item
        """
        super(SelfAttenion, self).__init__()
        self.linear1 = nn.Linear(embedding_dim, embedding_dim)
        self.linear1.weight.data.normal_(mean=0, std=np.sqrt(2.0 / embedding_dim))
        # self.linear2 = nn.Linear(embedding_dim, embedding_dim)
        # init weight
        # nn.init.normal_(self.linear1.weight, mean=0, std=np.sqrt(2.0 / embedding_dim)
    
    def forward(self, item_embedding):
        """
        item_embeding: L*d user history L squence interaction item
        """
        Q = F.relu(self.linear1(item_embedding))
        K = F.relu(self.linear1(item_embedding))
        d = torch.FloatTensor([100]).cuda()
        affinity = torch.matmul(Q, torch.transpose(K, 1, 2))/torch.sqrt(d)
        
        # mask the diagonal value
        mask = torch.eye(item_embedding.size(1), item_embedding.size(1)).byte().cuda()
        affinity = affinity.masked_fill(mask, 0)
        S = F.softmax(affinity)
        A = torch.mean(torch.matmul(S, item_embedding), dim=1)
        return A

In [93]:
class AttSeqModel(nn.Module):
    def __init__(self, num_user, num_item, L, w, embedding_dim):
        """
        num_user: int, user number in dataset
        num_item: int, item number in dataset
        L: int the number of history item will consider
        embeding_dim: int, laten vector dim of item
        """
        super(AttSeqModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_item = num_item
        self.L = L
        self.user_embed = nn.Embedding(num_user, embedding_dim)
        self.item_embed_short = nn.Embedding(num_item, embedding_dim)
        self.item_embed_long = nn.Embedding(num_item, embedding_dim)
        self.item_position_embed = nn.Embedding.from_pretrained(self.position_embed(L),freeze=True)
        self.att = SelfAttenion(embedding_dim).cuda()
        self.w = w
        
        # embedding init
        self.user_embed.weight.data.normal_(0,1.0/self.user_embed.embedding_dim)
        self.item_embed_short.weight.data.normal_(0, 1.0/self.item_embed_short.embedding_dim)
        self.item_embed_long.weight.data.normal_(0, 1.0/self.item_embed_long.embedding_dim)
        
    def position_embed(self, L):
        position_embedding = np.array([[pos/np.power(1000, 2.*i)/ self.embedding_dim for i in range(self.embedding_dim)]
                                      for pos in range(L)])
        position_embedding[:,0::2] = np.sin(position_embedding[:,0::2])
        position_embedding[:,1::2] = np.cos(position_embedding[:,1::2])
        return torch.from_numpy(position_embedding).cuda()
    
    def forward(self, user, seq_item, target=None, for_pred=False):
        """
        user: uid of user
        seq_item: L item id user interacte before
        target: item
        """
        # sequential item embedding
        item_embedding = self.item_embed_short(seq_item)  # L*d
        # item position embedding
        position_idx = torch.range(0,self.L-1).unsqueeze(0).expand(seq_item.size(0),-1).long().cuda()
        position_embedding = self.item_position_embed(position_idx)
        # add position embedding
        item_embedding_cat = item_embedding.float() + position_embedding.float()
        
        # attention
        attention = self.att(item_embedding_cat)
        
        # user embedding
        user_embedding = self.user_embed(user).squeeze()
        # target embedding short and long note: those two embedding is different 
        if target is None:
            target = torch.range(0,self.num_item-1).long().unsqueeze(0).cuda()
            target_embedding_short = self.item_embed_short(target).squeeze()
            target_embedding_long = self.item_embed_long(target).squeeze()
        else:
            target_embedding_short = self.item_embed_short(target).squeeze()
            target_embedding_long = self.item_embed_long(target).squeeze()
        # pred
        if for_pred == False:
            user_embedding = user_embedding.unsqueeze(1).expand(-1,target.size(1),-1)
            attention = attention.unsqueeze(1).expand(-1,target.size(1),-1)
            y_pred = self.w* torch.sqrt(torch.sum((user_embedding - target_embedding_long)**2, dim=2)) + (1-self.w)*torch.sqrt(torch.sum((attention-target_embedding_short)**2, dim=2))
            return y_pred
        else:
            user_embedding = user_embedding.unsqueeze(0).expand(target.size(1),-1)
            attention = attention.expand(target.size(1),-1)
            y_pred = self.w* torch.sqrt(torch.sum((user_embedding - target_embedding_long)**2, dim=1)) + (1-self.w)*torch.sqrt(torch.sum((attention-target_embedding_short)**2, dim=1))
            return y_pred

In [94]:
# parameters
num_user = len(df_input['user_id'].unique())
num_item = len(df_input['poi_id'].unique())
L = 5
embedding_dim = 100
w = 0.2

In [95]:
def train(model, dataloader, test_data, epochs):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001,weight_decay=0.0001)
    for epoch in range(epochs):
        losses = []
        start = time.time()
        for train_x, train_y in dataloader:
            user = train_x[:,0]
            item_seq = train_x[:,1:]
            target_pos = train_y[:,:3]
            target_neg = train_y[:,3:]
            y_pred_pos = model(user, item_seq, target_pos,for_pred=False)
            y_pred_neg = model(user, item_seq, target_neg, for_pred=False)
            optimizer.zero_grad()
            loss = torch.zeros(y_pred_pos.size(0),1).to(device)
            for i in range(y_pred_pos.size(1)):
                l = y_pred_pos[:,i].view(-1,1)
                y_pos_slice = l.expand(-1,y_pred_pos.size(1))
                loss += torch.sum(y_pos_slice - y_pred_neg + 0.5,dim=1).unsqueeze(1)
            loss = torch.mean(loss)
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
        print("Epoch %d loss is %.3f and consume time is %.2f" %(epoch+1, np.mean(losses), (time.time() - start)))
        hr, mrr = test(model, test_data, 50)
        print("hr is %.3f and mrr is %.3f" %(hr, mrr))

In [96]:
def hr(y_target, y_pred, topk):
    y_pred = y_pred[:topk].cpu().numpy()
    for item in y_pred:
        if item in y_target:
            return 1
    return 0

def mrr(y_target, y_pred, topk):
    y_pred = y_pred[:topk].cpu().numpy()
    for idx in range(len(y_pred)):
        if y_pred[idx] in y_target:
            return 1/(idx+1)
    return 0

def test(model, test_data, topk):
    model.eval()
    HR = []
    MRR = []
    for idx in range(test_data.size(0)):
        uid = test_data[idx,0].unsqueeze(0)
        item_seq = test_data[idx, 1:6].unsqueeze(0)
        y_target = test_data[idx,6:9].numpy()
        y_pred = model(uid, item_seq,for_pred=True)
        y_pred = torch.argsort(y_pred)
        hits = hr(y_target, y_pred, topk)
        mrrs = mrr(y_target, y_pred, topk)
        HR.append(hits)
        MRR.append(mrrs)
    return np.mean(HR), np.mean(MRR)

In [97]:
selfatt = AttSeqModel(num_user, num_item, L, w, embedding_dim)
train(selfatt,dataloader,test_data,20)

RuntimeError: CUDA error: device-side assert triggered