# Packages Load

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import model_selection
import gzip
import json
from tqdm import tqdm
import os
from collections import Counter
import os
import sys
import seaborn as sns
import matplotlib as mpl
from scipy import sparse
import scipy
#import spacy
from collections import Counter
from string import punctuation
from copy import deepcopy
import bottleneck as bn

In [None]:
# Output locations, built by plain string concatenation (each ends with "/").
# df_path receives the CSV splits, matr_path the saved matrices.
pre_path = "Set4/"
df_path = pre_path+"DFs/"
matr_path = pre_path+"matrix/"

# Property Encoding Methods

In [None]:
# Module-level lookup tables shared by encode_property / decode_property.
# encoded_c_p: property name -> {code (as string) -> original value (as string)}
# encoded_p_c: property name -> {original value (as string) -> integer code}
encoded_c_p = {}
encoded_p_c = {}

In [None]:
# Encodes a property value as a sequential int
def encode_property(property_name, plaintext):
  """Return the integer code of `plaintext` within `property_name`.

  Codes are handed out per property in order of first appearance, starting
  at 0. The value is always stored as `str(plaintext)`. Both module-level
  tables are updated: `encoded_p_c` (value -> code) and `encoded_c_p`
  (str(code) -> value).

  The running "last given code" counter lives inside the value -> code dict
  under the sentinel key '$%lgc%$', so a value whose string form equals that
  sentinel would collide with the counter.

  Args:
    property_name: name of the property / column being encoded.
    plaintext: value to encode; converted with `str()`.

  Returns:
    The integer code assigned to the value.
  """
  global encoded_c_p, encoded_p_c
  plaintext_str = str(plaintext)
  const_last_given_code = '$%lgc%$'

  forward = encoded_p_c.get(property_name)
  if forward is None:
    # First value ever seen for this property: it receives code 0.
    encoded_p_c[property_name] = {const_last_given_code: 0, plaintext_str: 0}
    encoded_c_p[property_name] = {str(0): plaintext_str}
    return 0

  if plaintext_str in forward:
    # Value already known: reuse its code.
    return forward[plaintext_str]

  # New value for a known property: take the next consecutive code.
  # Explicit membership tests replace the former bare `except:` blocks, which
  # would also swallow unrelated errors (e.g. KeyboardInterrupt) and then
  # reset the whole mapping for the property.
  code = forward[const_last_given_code] + 1
  forward[const_last_given_code] = code
  forward[plaintext_str] = code
  encoded_c_p.setdefault(property_name, {})[str(code)] = plaintext_str
  return code

In [None]:
def decode_property(property_name, code):
  """Return the original (string) value stored for `code` of `property_name`.

  The reverse table `encoded_c_p` is keyed by the string form of the code
  (see encode_property), so the code is converted with `str()` before the
  lookup. This makes the integer codes returned by encode_property work,
  while string codes keep working as before.

  Args:
    property_name: name of the property / column.
    code: integer (or string) code previously returned by encode_property.

  Returns:
    The original value as a string.

  Raises:
    KeyError: if the property or the code is unknown.
  """
  global encoded_c_p
  codes_for_property = encoded_c_p[property_name]  # KeyError if property unknown
  return codes_for_property[str(code)]  # KeyError if code unknown

# Load Json Dataset

In [None]:
# Read a gzipped file line by line, parse one record per line, return a DataFrame
def parse_json(filename_gzipped_python_json, read_max=-1):
  """Parse a gzipped, line-delimited JSON (or Python-literal) file.

  Each non-blank line is parsed with `json.loads`. Lines that are not valid
  JSON fall back to the legacy path: `true`/`false` are rewritten to
  `True`/`False` and the line is parsed with `ast.literal_eval`, which only
  accepts literals and therefore cannot execute code found in the file
  (the previous implementation used `eval`).

  Args:
    filename_gzipped_python_json: path of the gzipped input file.
    read_max: maximum number of records to read; -1 reads the whole file.

  Returns:
    A pandas DataFrame with one row per parsed record.

  Raises:
    ValueError / SyntaxError: if a line is neither JSON nor a Python literal.
  """
  import ast  # local import: only needed for the legacy fallback

  parse_data = []
  # Context manager closes the file even if parsing fails half-way.
  with gzip.open(filename_gzipped_python_json, 'rt', encoding='utf-8') as f:
    for line in tqdm(f):  # tqdm shows a progress bar while reading
      # Check the limit before parsing so exactly read_max records are kept.
      if read_max != -1 and len(parse_data) >= read_max:
        print(f'Break reading after {read_max} records')
        break
      line = line.strip()
      if not line:
        continue  # tolerate blank lines
      try:
        parsed_result = json.loads(line)
      except json.JSONDecodeError:
        # Legacy format: Python-style dict literals mixed with JSON booleans.
        legacy_line = line.replace('true', 'True').replace('false', 'False')
        parsed_result = ast.literal_eval(legacy_line)
      parse_data.append(parsed_result)
  print(f"Reading {len(parse_data)} rows.")

  # One row per record, columns taken from the record keys.
  return pd.DataFrame(parse_data)

# Data Load

In [None]:
# Load the books file completely; cap the number of interaction records read
# via read_max. Both paths are relative to the current working directory.
df_books_v1 = parse_json('books.json.gz')
df_interactions_v0 = parse_json('interactions.json.gz', read_max= 1000000)

# Data Preprocessing

Interactions for books that are not present in the books DataFrame are removed.


In [None]:
# Work on a copy so the raw load (df_interactions_v0) stays untouched.
df_interactions_v1 = df_interactions_v0.copy()

In [None]:
# Left join starting from the books frame: every resulting row has a book_id
# that exists in df_books_v1. Only the two id columns are kept afterwards.
df_interactions_v1 = df_books_v1.merge(df_interactions_v1, how='left', on='book_id')[['book_id','user_id']]

In [None]:
# Remove rows with a missing book_id or user_id.
df_interactions_v1 = df_interactions_v1.dropna()

## Selecting only relevant interaction

Here we keep only the books that were read more than N times and the users who have read more than M books.

In [None]:
# Minimum activity thresholds (strictly greater than).
min_item_count = 5  # a book must have more than this many interactions
min_user_count = 5  # a user must have more than this many interactions

# Keep a single row per (user, book) pair.
df_interactions_v1_match = df_interactions_v1.drop_duplicates(subset=["user_id","book_id"])

# Count interactions on the de-duplicated frame, i.e. the same rows that are
# filtered below. Counting on the frame that still contains duplicates would
# inflate the counts and let books/users with too few distinct interactions
# pass the filter.
book_size = df_interactions_v1_match.groupby('book_id', as_index=False).user_id.size()
book_size = book_size.rename({'size': 'count_item'}, axis='columns')
user_size = df_interactions_v1_match.groupby('user_id', as_index=False).book_id.size()
user_size = user_size.rename({'size': 'count_user'}, axis='columns')

# Attach the per-book and per-user counts to every interaction row.
df_interactions_v1_match = pd.merge(df_interactions_v1_match, book_size, how='left', on=['book_id'])
df_interactions_v1_match = pd.merge(df_interactions_v1_match, user_size, how='left', on=['user_id'])

# Both counts were computed before filtering, so the two filters are independent.
df_interactions_v1_match = df_interactions_v1_match[df_interactions_v1_match['count_item'] > min_item_count]
df_interactions_v1_match = df_interactions_v1_match[df_interactions_v1_match['count_user'] > min_user_count]

In [None]:
# Continue with the filtered interactions, ordered by user.
df_interactions_v1 = df_interactions_v1_match.copy()
df_interactions_v1 = df_interactions_v1.sort_values(by=['user_id'])

In [None]:
# Drop the helper count columns; only the id pair is needed from here on.
df_interactions_v1 = df_interactions_v1[['book_id', 'user_id']]

In [None]:
# Display the filtered interactions (notebook cell output).
df_interactions_v1

In [None]:
# Number of distinct books, then number of distinct users, after filtering.
print(df_interactions_v1.book_id.nunique())
print(df_interactions_v1.user_id.nunique())

# Prepare Train and Test

In [None]:
# Array of the distinct user ids present in the filtered interactions.
uuid = df_interactions_v1.user_id.unique()

In [None]:
# Shuffle the user ids with a fixed seed so the split below is reproducible.
np.random.seed(98765)
uuid_perm = np.random.permutation(uuid.size)  # random ordering of positions
uuid = uuid[uuid_perm]

Randomly divide the users into a training set and a test set.

In [None]:
# Hold out 30% of the (already shuffled) users for testing; the count is
# truncated to an integer.
n_users = uuid.size
n_heldout_users = int(n_users*0.30)

# Everything before the split position is training, the rest is held out.
split_position = n_users - n_heldout_users
tr_users, te_users = uuid[:split_position], uuid[split_position:]

In [None]:
# Interactions of the training users. An explicit copy is taken because the
# id columns of train_set are overwritten in place further down (encoding
# step); assigning into a filtered selection triggers pandas'
# SettingWithCopyWarning.
train_set = df_interactions_v1.loc[df_interactions_v1['user_id'].isin(tr_users)].copy()

In [None]:
# Distinct book ids that occur in the training interactions.
buid = train_set.book_id.unique()

In [None]:
# Interactions of the held-out users ...
test_set = df_interactions_v1.loc[df_interactions_v1['user_id'].isin(te_users)]
# ... restricted to books that also occur in the training set.
test_set = test_set.loc[test_set['book_id'].isin(buid)]

In [None]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's rows into a "train" part and a held-out "test" part.

    Users with at least 5 rows have `int(test_prop * n_rows)` randomly chosen
    rows moved to the test part; users with fewer rows go entirely to the
    train part. The global NumPy random state is re-seeded with a fixed value
    on every call, so the split is reproducible (and the caller's random
    state is reset as a side effect).

    Args:
        data: DataFrame with a 'user_id' column.
        test_prop: fraction of each eligible user's rows to hold out.

    Returns:
        (data_tr, data_te): two DataFrames with the same columns as `data`.
        Either one is an empty frame when no rows qualify (previously
        `pd.concat` raised on an empty list in that case).
    """
    tr_list, te_list = [], []

    np.random.seed(98765)

    for i, (_, group) in enumerate(data.groupby('user_id')):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over the user's rows: True = held out.
            held_out = np.zeros(n_items_u, dtype='bool')
            chosen = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            held_out[chosen.astype('int64')] = True

            tr_list.append(group[np.logical_not(held_out)])
            te_list.append(group[held_out])
        else:
            # Too few rows to hold anything out.
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat refuses an empty list; fall back to an empty frame that keeps
    # the input's columns.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0].copy()
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0].copy()

    return data_tr, data_te

In [None]:
# Split the held-out users' interactions with test_prop=0.3: test_set_tr is
# the part that stays, test_set_te the randomly held-out part.
test_set_tr, test_set_te = split_train_test_proportion(test_set, 0.3)

## Encode DF Properties (creates consecutive integer ids)

In [None]:
# Replace the raw ids by consecutive integer codes, frame by frame. Codes are
# assigned in order of first appearance, so the test frames are encoded
# before the training frame (book_id first, then user_id, within each frame).
df_list = [test_set_tr, test_set_te, train_set]
for udf in df_list:
    for id_column in ('book_id', 'user_id'):
        udf[id_column] = udf[id_column].apply(
            lambda raw_id, prop=id_column: encode_property(prop, raw_id)
        )


In [None]:
# Display the encoded training interactions (notebook cell output).
train_set

In [None]:
# Largest encoded book/user id per frame (train, test-train, test-held-out).
print((train_set.book_id.max()))
print((train_set.user_id.max()))
print((test_set_tr.book_id.max()))
print((test_set_tr.user_id.max()))
print((test_set_te.book_id.max()))
print((test_set_te.user_id.max()))

In [None]:
# Persist the three encoded splits as CSV (without the index column).
# Create the output directory first: to_csv does not create missing folders.
os.makedirs(df_path, exist_ok=True)
test_set_tr.to_csv(df_path+'test_set_tr.csv', index = False)
test_set_te.to_csv(df_path+'test_set_te.csv', index = False)
train_set.to_csv(df_path+'train_set.csv', index = False)

# Training

In [None]:
# Reload the encoded splits from disk, so training can start from this cell
# without re-running the preprocessing above.
train_set = pd.read_csv(df_path+'train_set.csv')
test_set_tr = pd.read_csv(df_path+'test_set_tr.csv')
test_set_te = pd.read_csv(df_path+'test_set_te.csv')

In [None]:
# Collapse each frame to one row per user; the book_id column then holds the
# list of that user's book ids.
train_set = train_set.groupby('user_id').book_id.apply(list).reset_index()
test_set_tr = test_set_tr.groupby('user_id').book_id.apply(list).reset_index()
test_set_te = test_set_te.groupby('user_id').book_id.apply(list).reset_index()

Creates a matrix from a dataframe

In [None]:
def create_sparse_matrix(sessions_df, column='history', shape=None):
  """Build a user x item CSR matrix from a one-row-per-user DataFrame.

  Args:
    sessions_df: DataFrame with a 'user_id' column and a column of item lists.
    column: name of the column holding each user's list of item ids.
    shape: optional (n_users, n_items); forwarded to scipy as-is.

  Returns:
    scipy.sparse.csr_matrix of dtype int32 with one unit of weight per
    (user, item) occurrence in the lists.
  """
  # Flatten the per-user lists into parallel coordinate lists.
  row_coords, col_coords = [], []
  for user, items in zip(sessions_df['user_id'], sessions_df[column]):
    row_coords.extend([user] * len(items))
    col_coords.extend(items)

  # One unit of weight per listed interaction.
  weights = np.ones(len(row_coords))
  return sparse.csr_matrix((weights, (row_coords, col_coords)), shape=shape, dtype=np.int32)

Creates the pair list and matrix Z

In [None]:
def create_list_feature_pairs(XtX, threshold):
  """Return the index pairs (i, j) whose |XtX[i, j]| exceeds `threshold`.

  Only the upper triangle is inspected; the diagonal and the lower triangle
  are treated as zero.

  Returns:
    Tuple (row_indices, col_indices) of index arrays.
  """
  # k=1 keeps only the entries strictly above the diagonal.
  strict_upper = sparse.triu(np.abs(XtX), k=1, format='csr').toarray()
  return np.nonzero(strict_upper > threshold)

def create_matrix_Z(ii_pairs, X):
  """Build the pair-feature matrix Z for the item pairs in `ii_pairs`.

  Args:
    ii_pairs: indexable of two index sequences (first items, second items).
    X: sparse user x item matrix.

  Returns:
    [Z, CCmask]: Z is a sparse users x pairs matrix with 1.0 where the
    product of X with the pair-indicator matrix equals exactly 2.0;
    CCmask is the (rows, cols) tuple of the non-zero positions of the
    pairs x items indicator matrix.
  """
  n_pairs = len(ii_pairs[0])
  pair_rows = np.arange(n_pairs)

  # Indicator matrix: row p has a 1 in the two item columns of pair p.
  indicator = sparse.lil_matrix((n_pairs, X.shape[1]), dtype=np.float32)
  indicator[pair_rows, ii_pairs[0]] = 1.0
  indicator[pair_rows, ii_pairs[1]] = 1.0
  indicator = sparse.csr_matrix(indicator)

  # Positions (pair, item) where a pair involves the item.
  CCmask = indicator.nonzero()

  # Per user and pair: sum of X over the pair's two items; keep exact 2.0 hits.
  pair_sums = X @ sparse.csc_matrix(indicator.T)
  Z = (pair_sums == 2.0) * 1.0
  return [Z, CCmask]

Calculate matrix PP and QQ

In [None]:
def calculate_PP(XtX, XtXdiag, lambdaBB):
    """Invert XtX after overwriting its diagonal with `XtXdiag + lambdaBB`.

    Note that `XtX` is modified in place.

    Returns:
        [PP, XtX]: the dense inverse and the (mutated) input matrix.
    """
    diagonal_positions = np.diag_indices(XtX.shape[0])
    XtX[diagonal_positions] = XtXdiag + lambdaBB
    regularized_dense = XtX.todense()
    return [np.linalg.inv(regularized_dense), XtX]

def calculate_QQ(ZtZ, ZtZdiag, lambdaCC, rho):
    """Invert ZtZ after overwriting its diagonal with `ZtZdiag + lambdaCC + rho`.

    Note that `ZtZ` is modified in place.

    Returns:
        [QQ, ZtZ]: the inverse as a CSR matrix and the (mutated) input matrix.
    """
    diagonal_positions = np.diag_indices(ZtZ.shape[0])
    ZtZ[diagonal_positions] = ZtZdiag + lambdaCC + rho
    dense_inverse = np.linalg.inv(ZtZ.todense())
    return [sparse.csr_matrix(dense_inverse), ZtZ]

Param Setting

In [None]:
# epochs:    number of iterations run by train_higher.
# threshold: an item pair becomes a pair-feature when |XtX[i, j]| exceeds it.
# lambdaBB:  added to the diagonal of XtX before inversion (calculate_PP).
# lambdaCC:  added, together with rho, to the diagonal of ZtZ (calculate_QQ).
# rho:       also scales the (DD - UU) term inside train_higher.
# The trailing comment on the assignment is presumably an earlier setting.
epochs = 40
threshold, lambdaBB, lambdaCC, rho = 110,  500,  5000, 10000 #79,  500,  5000, 10000

In [None]:
# User x item training matrix and its item x item product X^T X.
X = create_sparse_matrix(train_set, 'book_id')
XtX = sparse.csr_matrix(X.T) @ X
# Keep a private copy of the diagonal: XtX's diagonal is overwritten later.
XtXdiag= deepcopy( XtX.diagonal())
X.shape

In [None]:
# Restore the original diagonal. This matters when the cell is re-run after
# calculate_PP, which overwrites the diagonal of XtX in place.
XtX[ np.diag_indices(XtX.shape[0]) ]=XtXdiag

# Item pairs whose |XtX| entry exceeds the threshold, and the matching
# pair-feature matrix Z plus the mask of (pair, item) positions.
ii_feature_pairs = create_list_feature_pairs(XtX, threshold)
print("number of feature-pairs: {}".format(len(ii_feature_pairs[0])))
Z, CCmask = create_matrix_Z(np.array(ii_feature_pairs), X)

In [None]:
# Z and its transpose as float64 CSR matrices.
Z = sparse.csr_matrix(Z, dtype=np.float64)
Zt = sparse.csr_matrix(Z.T, dtype = np.float64)
# The index arrays of Zt are cast to unsigned 64-bit integers.
# NOTE(review): presumably a workaround for an index-size problem in the
# product computed in the next cell — confirm it is still required.
Zt.indptr = Zt.indptr.astype(np.uint64)
Zt.indices = Zt.indices.astype(np.uint64)

In [None]:
# Pair x pair and pair x item products used by the training loop.
ZtZ = Zt.dot(Z)
ZtX = sparse.csr_matrix(Z.T) @ X
# Copy of the diagonal, taken before calculate_QQ overwrites it.
ZtZdiag=deepcopy(ZtZ.diagonal())

In [None]:
# Regularized inverses. Both calls also overwrite the diagonals of XtX / ZtZ.
PP, XtX = calculate_PP(XtX, XtXdiag, lambdaBB)
QQ, ZtZ = calculate_QQ(ZtZ, ZtZdiag, lambdaCC, rho)
# Convert PP to a plain ndarray; train_higher multiplies it with `*`.
# NOTE(review): presumably needed so that `PP * gamma` is element-wise — confirm.
PP = np.array(PP)

In [None]:
def train_higher(XtX, XtXdiag, ZtZ, CCmask, ZtX, rho, epochs, QQ, PP):
    """Alternately update the item-item weights BB and the pair-item weights.

    Each epoch updates, in this order:
      * BB: PP @ (XtX - ZtX^T @ CC), corrected so that its diagonal is zero;
      * CC: QQ @ (ZtX - ZtX @ BB + rho * (DD - UU));
      * DD: a copy of CC with the entries at `CCmask` set to zero;
      * UU: accumulates CC - DD (called Gamma in the paper).

    Args:
        XtX: sparse item x item matrix; its diagonal is reset to `XtXdiag`
            in place.
        XtXdiag: original diagonal of XtX.
        ZtZ: pair x pair matrix (only its shape is used here).
        CCmask: (rows, cols) positions that must be zero in DD.
        ZtX: sparse pair x item matrix.
        rho: weight of the (DD - UU) term.
        epochs: number of iterations; must be at least 1.
        QQ: sparse matrix multiplied from the left in the CC update.
        PP: dense matrix multiplied from the left in the BB update.

    Returns:
        [BB, DD]: dense item x item matrix and sparse pair x item matrix.

    Raises:
        ValueError: if `epochs` is smaller than 1 (there would be no BB).
    """
    if epochs < 1:
        raise ValueError("epochs must be >= 1, got {}".format(epochs))

    ii_diag = np.diag_indices(XtX.shape[0])
    cc_shape = (ZtZ.shape[0], XtX.shape[0])
    # np.float64 replaces the `np.float` alias, which was removed in NumPy 1.24.
    CC = sparse.csr_matrix(cc_shape, dtype=np.float64)
    DD = sparse.csr_matrix(cc_shape, dtype=np.float64)
    UU = sparse.csr_matrix(cc_shape, dtype=np.float64)  # is Gamma in paper

    # Loop-invariant values, computed once instead of once per epoch.
    PP_arr = np.asarray(PP)
    PP_diag = PP_arr.diagonal()
    ZtX_dense = ZtX.todense()
    QQ_dense = QQ.todense()
    # Undo the regularization that was written onto the diagonal of XtX.
    XtX[ii_diag] = XtXdiag

    BB = None
    for epoch in range(epochs):
        print("epoch {}".format(epoch))
        # learn BB
        print('learn BB')
        tmp = XtX - (ZtX_dense.T @ CC.todense())
        BB = np.matmul(PP, tmp)
        # np.asarray makes the diagonal 1-D whether BB is an ndarray or an
        # np.matrix; the former `[0]` indexing only worked for np.matrix.
        gamma = np.asarray(BB).diagonal() / PP_diag
        # Element-wise column scaling; this zeroes the diagonal of BB.
        BB -= np.multiply(PP_arr, gamma)
        # learn CC
        print('learn CC')
        rhs = ZtX - sparse.csr_matrix(ZtX_dense @ BB) + (rho * (DD - UU))
        CC = sparse.csr_matrix(QQ_dense @ rhs.todense())
        # learn DD
        print('learn DD')
        DD_lil = sparse.lil_matrix(CC)
        DD_lil[CCmask] = 0  # a pair must not predict its own two items
        DD = sparse.csr_matrix(DD_lil)
        # learn UU (is Gamma in paper)
        print('learn UU')
        UU = UU + (CC - DD)
    return [BB, DD]

In [None]:
# Run the training. train_higher returns [BB, DD]; the second value is kept
# under the name CC from here on.
BB, CC = train_higher(XtX, XtXdiag, ZtZ, CCmask, ZtX, rho, epochs, QQ, PP)

In [None]:
# Persist the learned matrices (CC is densified first).
# Create the output directory first: np.save does not create missing folders.
os.makedirs(matr_path, exist_ok=True)
np.save(matr_path + "CC.npy", CC.todense())
np.save(matr_path + "BB.npy", BB)

# Evaluation

In [None]:
# Reload the learned matrices from disk, so evaluation can start from this
# cell without re-training.
CC = np.load(matr_path + "CC.npy")
BB = np.load(matr_path + "BB.npy")

In [None]:
# Number of rows in test_set_tr (one row per test user after the group-by).
N_test = test_set_tr.shape[0]
# NOTE(review): idxlist_test is not used in the visible cells.
idxlist_test = range(N_test)
N_test

In [None]:
# Sparse user x item matrices for the two test parts. No shape is passed, so
# each matrix is sized from the ids it actually contains.
# NOTE(review): the two matrices can therefore differ in row count if the
# highest user id of test_set_tr has no rows in test_set_te — confirm.
test_data_tr = create_sparse_matrix(test_set_tr, 'book_id')
test_data_te = create_sparse_matrix(test_set_te, 'book_id')

In [None]:
# Pad both test matrices with empty columns on the right so that they have
# as many item columns as the training matrix X.
# Assumes X has at least as many columns as each test matrix.
supp_tr = sparse.csr_matrix((test_data_tr.shape[0], (X.shape[1] - test_data_tr.shape[1])), dtype=int)
supp_te = sparse.csr_matrix((test_data_te.shape[0], (X.shape[1] - test_data_te.shape[1])), dtype=int)
test_data_tr = sparse.csr_matrix(sparse.hstack([test_data_tr, supp_tr]))
test_data_te = sparse.csr_matrix(sparse.hstack([test_data_te, supp_te]))

In [None]:
# Pair-feature matrix for the test users, built from the same item pairs as
# in training; the mask returned alongside is not needed here.
Z_test_data_tr , _ = create_matrix_Z(ii_feature_pairs, test_data_tr)

In [None]:
# Sanity check: print the shapes of all matrices involved in the prediction.
print(test_data_tr.shape)
print(test_data_te.shape)
print(X.shape)
print(Z.shape)
print(CC.shape)
print(BB.shape)
print(Z_test_data_tr.shape)

In [None]:
# Shapes of Z and CC once more (both are also printed in the previous cell).
print(Z.shape)
print(CC.shape)

In [None]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized discounted cumulative gain@k for binary relevance.
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance

    X_pred is a dense users x items score array, heldout_batch a sparse
    users x items matrix of held-out interactions. Returns one NDCG value
    per user; users whose NDCG is NaN get 0.
    '''
    n_users = X_pred.shape[0]
    user_rows = np.arange(n_users)[:, np.newaxis]

    # Unordered candidates for the k best-scored items of every user ...
    candidate_items = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    candidate_scores = X_pred[user_rows, candidate_items]
    # ... put into descending score order.
    rank_order = np.argsort(-candidate_scores, axis=1)
    ranked_items = candidate_items[user_rows, rank_order]

    # Discount template: 1 / log2(rank + 1) for ranks 1..k.
    discounts = 1. / np.log2(np.arange(2, k + 2))

    hits = heldout_batch[user_rows, ranked_items].toarray()
    DCG = (hits * discounts).sum(axis=1)
    # Ideal DCG: all of a user's held-out items (at most k) ranked first.
    IDCG = np.array([(discounts[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])

    NDCG = DCG / IDCG
    NDCG[np.isnan(NDCG)] = 0
    return NDCG

In [None]:
def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    """Recall@k per user.

    X_pred is a dense users x items score array, heldout_batch a sparse
    users x items matrix of held-out interactions. For every user the number
    of held-out items among the k best-scored items is divided by
    min(k, number of held-out items); NaN results are replaced by 0.
    The shape of the boolean prediction mask is printed on every call.
    """
    n_users = X_pred.shape[0]

    # Mark the k best-scored items of every user.
    topk_items = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(n_users)[:, np.newaxis], topk_items] = True
    print(X_pred_binary.shape)

    X_true_binary = (heldout_batch > 0).toarray()
    hit_counts = np.logical_and(X_true_binary, X_pred_binary).sum(axis=1).astype(np.float64)
    best_possible = np.minimum(k, X_true_binary.sum(axis=1))

    return np.nan_to_num(np.divide(hit_counts, best_possible), nan=0.0)

In [None]:
# Inputs for the prediction: the test users' interactions and pair-features.
Xtest = test_data_tr
Ztest = Z_test_data_tr

# Densify both matrices when Xtest is sparse.
if sparse.isspmatrix(Xtest):
        Xtest = Xtest.toarray()
        Ztest = Ztest.toarray()
Xtest = Xtest.astype('int64')
Ztest = Ztest.astype('float64')
# Predicted scores: item-item part plus pair-item part.
pred_val = (Xtest @ BB) + (Ztest @ CC)

In [None]:
# Recall@k of the predictions against the held-out test interactions,
# evaluated for several cut-offs.
r20_list = Recall_at_k_batch(pred_val, test_data_te, k=20)
r50_list = Recall_at_k_batch(pred_val, test_data_te, k=50)
r10_list = Recall_at_k_batch(pred_val, test_data_te, k=10)
r3_list = Recall_at_k_batch(pred_val, test_data_te, k=3)
r5_list = Recall_at_k_batch(pred_val, test_data_te, k=5)

# Report the mean and the standard error of the mean for every cut-off.
recall_report = [
    ("Test Recall@3=%.5f (%.5f)", r3_list),
    ("Test Recall@5=%.5f (%.5f)", r5_list),
    ("Test Recall@10=%.5f (%.5f)", r10_list),
    ("Test Recall@20=%.5f (%.5f)", r20_list),
    ("Test Recall@50=%.5f (%.5f)", r50_list),
]
for report_format, recalls in recall_report:
    print(report_format % (np.mean(recalls), np.std(recalls) / np.sqrt(len(recalls))))

In [None]:
# Store the (densified) test inputs back in sparse CSR form.
sparse.save_npz(matr_path+"Xtest.npz", sparse.csr_matrix(Xtest))
sparse.save_npz(matr_path+"Ztest.npz", sparse.csr_matrix(Ztest))

# Plot

In [None]:
from matplotlib import pyplot as plt 

# Histogram (log-scaled counts) of all predicted scores greater than -3.0,
# with a thin red vertical line marking 0.
a = pred_val[np.where(pred_val > -3.0)].flatten()
plt.figure(dpi=300)
plt.hist(a, bins=500, log=True)
plt.title("Predicted Values Distribution") 
plt.axvline(x=0.0, label='0', c='r', linewidth=0.1)
plt.show()