This notebook develops a basic understanding of the workflow of KG-enhanced recommendation using [PyKEEN](https://pykeen.readthedocs.io/).
We begin by importing the necessary packages.

In [None]:
import os, sys, re, pickle, torch
import numpy as np
from numpy.random import default_rng
from pykeen.pipeline import pipeline
from pykeen.models import predict
import torch
import numpy as np
import pandas as pd
from pykeen.triples import TriplesFactory
import pickle
import cupy as cp
from tqdm import tqdm
import time
import sys, os
import pickle

Now, we load the KG and the recommendation data, which are stored in separate files. We then put the recommendation data into the <user, likes, item> format, assigning the "likes" relation the relation number "0" in our "user-item" graph. Accordingly, we shift the KG relation ids up by one so that none of them collides with 0.

In [None]:
#%%
# Input files: KG triples and MovieLens ratings, both tab-separated integers.
kg_path = 'datasets/www_data/www_data/Movielens/kg/train.dat'
rec_path = 'datasets/www_data/www_data/Movielens/rs/ratings.txt'

kg = np.genfromtxt(kg_path, delimiter='\t', dtype=np.int32)
# Keep only the first three rating columns (the time column is dropped).
rec = np.genfromtxt(rec_path, delimiter='\t', dtype=np.int32)[:, :3]

# Keep positive interactions only: a rating of 4 or more counts as "likes".
rec = rec[rec[:, 2] >= 4]
# The rating column is now redundant; reuse it to hold relation id 0 ("likes").
rec[:, 2] = 0
# Reorder columns into the <user, likes, item> layout.
rec = rec[:, [0, 2, 1]]

# Shift KG relation ids by one so that id 0 stays reserved for "likes",
# then drop the relations that occur rarely.
kg[:, 1] += 1
kg = remove_rare(kg)
#kg_train, kg_test = split_kg(kg)

In [None]:
# remove kg rels with very low frequency
def remove_rare(kg, min_count=100):
    """Drop every triple whose relation occurs fewer than ``min_count`` times.

    Args:
        kg: 2-D array of triples whose column 1 holds the relation id.
        min_count: minimum number of occurrences a relation needs to be kept.

    Returns:
        A new array containing only the rows of ``kg`` whose relation is
        frequent enough; row order is preserved.
    """
    # np.unique returns the distinct relation ids themselves, so the rare ids
    # are looked up by value; an index into `counts` is NOT a relation id
    # unless the ids happen to be contiguous.
    rels, counts = np.unique(kg[:, 1], return_counts=True)
    rare_rels = rels[counts < min_count]
    return kg[~np.isin(kg[:, 1], rare_rels)]

Entity matching is a crucial step in data processing for KG-enhanced recommendation. In this step, we use the provided entity mappings to match MovieLens item ids to their Freebase links, and those links to the KG entity ids.

In [None]:

# Largest value found anywhere in kg; user ids are later allocated after it.
# NOTE(review): item ids below come from line numbers in e_map.dat, which may
# exceed np.max(kg) if kg does not mention every entity - confirm no overlap
# with the user ids derived from this constant.
TOTAL_FB_IDS = np.max(kg)
# paths for converting data
item2kg_path = 'datasets/www_data/www_data/Movielens/rs/i2kg_map.tsv'
emap_path = 'datasets/www_data/www_data/Movielens/kg/e_map.dat'

# maps MovieLens ids to Freebase links (text before / after the first tab)
ml2fb_map = {}
with open(item2kg_path) as f:
    for line in f:
        ml_id, sep, fb_http = line.rstrip('\n').partition('\t')
        # Skip blank or incomplete lines instead of crashing on them; this
        # also makes a final line without a trailing newline parse correctly.
        if not sep or not ml_id or not fb_http:
            continue
        ml2fb_map[int(ml_id)] = fb_http

# maps Freebase links to KG ids (final format); the id is the line number
id2html_map = {}
fb2id_map = {}
with open(emap_path) as f:
    for kg_id, line in enumerate(f):
        _prefix, sep, fb_http = line.rstrip('\n').partition('\t')
        if not sep or not fb_http:
            continue
        fb2id_map[fb_http] = kg_id
        id2html_map[kg_id] = fb_http

# Convert MovieLens item ids to KG ids in a single pass. Items without a
# Freebase link get the sentinel -1 (real ids are line numbers, hence >= 0)
# and are dropped with one boolean mask instead of one np.delete per row.
# An item whose link is missing from e_map still raises KeyError.
item_kg_ids = np.array(
    [fb2id_map[ml2fb_map[ml_id]] if ml_id in ml2fb_map else -1
     for ml_id in rec[:, 2].tolist()],
    dtype=rec.dtype,
)
in_kg = item_kg_ids >= 0
rec = rec[in_kg]
rec[:, 2] = item_kg_ids[in_kg]

Finally, we assign each user a new id that follows the largest id in the KG, so that users and KG entities share a single id space.

In [None]:
umap_path = 'datasets/www_data/www_data/Movielens/rs/u_map.dat'

# Build the membership set once; testing `x in rec[:,0]` per file line would
# rescan the whole column every time.
rec_users = set(rec[:, 0].tolist())

userid2fbid_map = {}
new_ids = 0
with open(umap_path) as f:
    for line in f:
        # The MovieLens user id is the text after the first tab.
        _prefix, sep, ml_id = line.rstrip('\n').partition('\t')
        # Skip blank or incomplete lines (also tolerates a missing final newline).
        if not sep or not ml_id:
            continue
        user_id = int(ml_id)
        if user_id in rec_users:
            # New ids are allocated consecutively after TOTAL_FB_IDS.
            new_ids += 1
            userid2fbid_map[user_id] = TOTAL_FB_IDS + new_ids

# convert MovieLens user ids into the new ids; a user that is missing from
# u_map.dat raises KeyError
rec[:, 0] = np.array(
    [userid2fbid_map[user_id] for user_id in rec[:, 0].tolist()],
    dtype=rec.dtype,
)
NEW_USER_IDS = new_ids


Next, we split the recommendation and KG data into train, test and validation sets. We also remove the items and users from the test/validation sets that do not appear in the training set (we can't expect the model to predict items/users that it hasn't been trained on).

In [None]:
# split data randomly into train, test, and val sets
def split(rec, split_test = 0.2, split_val = 0.2):
    """Shuffle ``rec`` in place and cut it into train / test / validation parts.

    Args:
        rec: array of rows; its row order is shuffled in place.
        split_test: fraction of rows that goes to the test part.
        split_val: fraction of rows that goes to the validation part.

    Returns:
        The tuple ``(train, test, val)``, consecutive slices of the shuffled
        array in that order.
    """
    np.random.shuffle(rec)
    n_rows = rec.shape[0]
    # Row indices at which the test and validation parts begin.
    test_start = int((1-(split_test + split_val))*n_rows)
    val_start = int((1-(split_val))*n_rows)
    train_rows = slice(None, test_start)
    test_rows = slice(test_start, val_start)
    val_rows = slice(val_start, None)
    return rec[train_rows], rec[test_rows], rec[val_rows]

In [None]:
# remove items and users from test and val that aren't in train
def remove_new(rec_test, rec_train):
    """Keep only the rows of ``rec_test`` whose user and item both occur in ``rec_train``.

    Args:
        rec_test: 2-D array with the user in column 0 and the item in column 2.
        rec_train: 2-D array with the same column layout.

    Returns:
        A new array with the surviving rows of ``rec_test`` in their original
        order; ``rec_test`` itself is not modified.
    """
    # Vectorized membership tests replace the per-element `in` checks and the
    # repeated np.delete calls, which recopied the array for every removal.
    known_item = np.isin(rec_test[:, 2], rec_train[:, 2])
    known_user = np.isin(rec_test[:, 0], rec_train[:, 0])
    return rec_test[known_item & known_user]


In [None]:
# Split the user-item interactions, then drop held-out rows that refer to
# users or items absent from the training part.
rec_train, rec_test, rec_val = split(rec)
rec_test, rec_val = (remove_new(held_out, rec_train) for held_out in (rec_test, rec_val))

# The KG triples are split the same way (no filtering is applied to them).
kg_train, kg_test, kg_val = split(kg)

# Each final dataset is the recommendation part followed by the KG part.
train = np.concatenate((rec_train, kg_train))
test = np.concatenate((rec_test, kg_test))
val = np.concatenate((rec_val, kg_val))

# Write the three datasets as tab-separated integer triples.
for filename, triples in (("train.tsv", train), ("test.tsv", test), ("val.tsv", val)):
    np.savetxt(filename, triples, fmt="%1d", delimiter="\t")

To make testing efficient, we build a dictionary for each of the train, test, and validation sets, with users as the keys and the items they like as the values, and save these dictionaries.

In [None]:
# user likes for testing recommendation
def user_likes(test, train):
    """Group liked items by user for two interaction arrays.

    Args:
        test: first array; column 0 holds the user and column 2 the item.
        train: second array with the same column layout.

    Returns:
        A 2-tuple of dicts, the first built from ``test`` and the second from
        ``train``. Each dict maps a user to the list of that user's distinct
        items, in order of first appearance.
    """
    def _group(data):
        # One pass over the rows; each item is stored once per user.
        likes_by_user = {}
        for row in data:
            items = likes_by_user.setdefault(row[0], [])
            if row[2] not in items:
                items.append(row[2])
        return likes_by_user

    return (_group(test), _group(train))


In [None]:
# Per-user liked items for each split.
ul_train, ul_test = user_likes(rec_train, rec_test)
# ul_val was previously never computed, so dumping it raised NameError.
# user_likes processes two arrays per call; only its first result is needed here.
ul_val = user_likes(rec_val, rec_test)[0]

# Persist the three dictionaries next to the notebook.
for filename, likes in (('ul_train.pkl', ul_train),
                        ('ul_test.pkl', ul_test),
                        ('ul_val.pkl', ul_val)):
    with open(filename, 'wb') as f:
        pickle.dump(likes, f)

In [None]:
# Reload the per-user like dictionaries from disk.
# NOTE(review): these absolute paths differ from the relative paths the dump
# cell writes to - confirm both refer to the same directory.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path,'rb') as f:
        return pickle.load(f)


ul_train = _read_pickle('/home/admin/Desktop/Empirical/ul_train.pkl')
ul_test = _read_pickle('/home/admin/Desktop/Empirical/ul_test.pkl')
ul_val = _read_pickle('/home/admin/Desktop/Empirical/ul_val.pkl')

Training and testing using the PyKEEN library. Once your datasets are ready, PyKEEN makes training KG models easy (see the following lines). However, PyKEEN evaluates all triples (from all relations), whereas we are only interested in evaluating the recommendation performance of the system (only the "likes" relation), so we implement our own testing.

In [None]:
def train_func(config):
    """Train a TransE model with PyKEEN and print filtered Hits@1/3/10 for relation "0".

    Args:
        config: mapping providing the keys "embedding_dim", "lr", "epoch",
            "batch_size", "reg_lambda" and "neg_ratio"; each value is passed
            to the matching PyKEEN pipeline keyword below.

    Reads the module-level ``rec_train``, ``ul_train``, ``ul_test`` and
    ``ul_val``. Saves the pipeline result to the ``transe2`` directory and
    prints the metrics, computed over at most the first 1000 users of
    ``ul_test``. Returns nothing.
    """
    hit1s_all=[]
    hit3s_all=[]
    hit10s_all=[]
    checkpoint_freq = 1
    # in order to train a KG embedding model with Pykeen, you just need to make a "pipeline" like this.
    #You specify the hyperparameters, datasets, optimizer, embedding model, and the rest is done by Pykeen
    result = pipeline(
            training='/home/admin/Desktop/Empirical/train.tsv',
            testing='/home/admin/Desktop/Empirical/test.tsv',
            #validation='/home/admin/Desktop/Empirical/val.tsv',
            model='TransE',
            model_kwargs=dict(embedding_dim=config["embedding_dim"]),
            optimizer='Adam',
            optimizer_kwargs=dict(lr=config["lr"]),
            training_kwargs=dict(num_epochs=config["epoch"], batch_size=config["batch_size"],
            checkpoint_name='my_checkpoint.pt',
            checkpoint_frequency=checkpoint_freq),
            regularizer_kwargs=dict(weight=config["reg_lambda"]),
            negative_sampler_kwargs=dict(num_negs_per_pos=config["neg_ratio"]),
                #num_negs_per_pos=dict(type=int, low=10, high=100, step=10, log=True),
            )

    result.save_to_directory('transe2')
    #testing performance of the trained model
    ranks=[]
    all_items = np.unique(rec_train[:,2])

    for user in tqdm(list(ul_test.keys())[0:1000]):
        # Ask Pykeen for tail predictions of relation "0" for this specific user.
        predicted_tails_df = predict.get_tail_prediction_df(
            result.model, str(user), '0', triples_factory=result.training)
        # Keep only the candidates that are items (what we want to recommend).
        # NOTE(review): no explicit sort happens here; this relies on the
        # returned frame already being ordered best-first - confirm.
        # NOTE(review): `tail_id` is compared with the raw entity codes from
        # rec_train; confirm both are the same id space (the frame's label
        # column may be the one that holds the original codes).
        tails_sorted = predicted_tails_df['tail_id']
        items_sorted = np.array(tails_sorted[tails_sorted.isin(all_items)])

        # All items the user likes in any split, needed to filter out the other
        # correct answers when ranking each ground truth. A fresh list is built
        # so the lists stored in ul_train are not extended in place, and a
        # user missing from a split simply contributes nothing.
        liked_items_all = (
            list(ul_train.get(user, []))
            + list(ul_test.get(user, []))
            + list(ul_val.get(user, []))
        )
        whole_liked_items = np.array(liked_items_all)
        # 0-based position of every liked item in the sorted candidate list
        # (raises IndexError if a liked item is not among the candidates).
        whole_ranks = np.array(
            [np.where(items_sorted == item)[0][0] for item in whole_liked_items])

        # filter other correct items and calculate the ranks
        for gt in ul_test[user]:
            whole_index = np.where(whole_liked_items == gt)[0][0]
            unfiltered_rank = whole_ranks[whole_index]
            higher_others = int((whole_ranks < unfiltered_rank).sum())
            # +1 turns the 0-based position into a 1-based rank, so that the
            # Hits@k thresholds below count exactly the top k positions.
            ranks.append(unfiltered_rank - higher_others + 1)

    #calculating the metrics (ranks are 1-based: Hits@k means rank <= k)
    ranksarray=np.array(ranks)
    hits1=((ranksarray<=1).sum())/len(ranks)
    hit1s_all.append(hits1)
    hits3=((ranksarray<=3).sum())/len(ranks)
    hit3s_all.append(hits3)
    hits10=((ranksarray<=10).sum())/len(ranks)
    hit10s_all.append(hits10)
    print("hit1s:",hit1s_all)
    print("hit3s:",hit3s_all)
    print("hit10s:",hit10s_all)



Passing in hyperparameters, training, and testing

In [None]:
# Hyperparameters read by train_func, one entry per key it looks up.
config = {
    "embedding_dim": 64,   # embedding size passed to the model
    "lr": 0.0291,          # optimizer learning rate
    "batch_size": 256,     # training batch size
    "reg_lambda": 0.2,     # regularizer weight
    "neg_ratio": 10,       # negatives sampled per positive
    "epoch": 1,            # number of training epochs
}

# Train the model and print the evaluation metrics.
train_func(config)