In [32]:
import numpy as np
import torch
import pandas as pd
from sklearn.metrics import roc_auc_score
np.random.seed(2020)
torch.manual_seed(2020)
import pdb
import scipy.sparse as sps
import numpy as np
import torch
torch.manual_seed(2020)
from torch import nn
import torch.nn.functional as F
from math import sqrt

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive form evaluates ``exp(-x)``, which overflows (RuntimeWarning, or
    FloatingPointError under ``np.errstate(over='raise')``) for large negative
    ``x``.  Here ``exp`` is only ever applied to non-positive arguments, so it
    cannot overflow.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``x``, holding values in ``[0, 1]``.
    """
    x = np.asarray(x)
    # e = exp(-|x|) lies in (0, 1] for every finite x.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d array back into a scalar and leaves n-d arrays as is.
    return out[()]

from dataset import load_data, load_features
from matrix_factorization import MF, MF_N_IPS, MF_N_DR_JL, MF_N_MRDR_JL
from baselines import MF, MF_IPS, MF_ASIPS, MF_SNIPS, MF_DR, MF_DR_JL, MF_MRDR_JL, MF_BaseModel
from models import MLP, MLP_exp, MLP_weibull, MLP_lognormal, MF_IPS_DF, MF_DR_JL_DF

from utils import gini_index, ndcg_func, get_user_wise_ctr, rating_mat_to_sample, binarize, shuffle, minU,recall_func, precision_func
from utils import ndcg_func_both, ndcg_func_feature, recall_func_both, recall_func_feature, generate_total_sample
def mse_func(x, y):
    """Return the mean of the element-wise squared differences ``(x - y)**2``."""
    return np.mean((x - y) ** 2)


def acc_func(x, y):
    """Return the number of positions where ``x == y`` divided by ``len(x)``."""
    return np.sum(x == y) / len(x)


# Benchmark to run on; the branches below handle "coat" and "yahoo".
dataset_name = "coat"

if dataset_name == "coat":
    # coat is loaded as two rating matrices (rows = users, columns = items,
    # per the shape usage below) and converted to samples by rating_mat_to_sample.
    train_mat, test_mat = load_data("coat")
    x_train, y_train = rating_mat_to_sample(train_mat)
    x_test, y_test = rating_mat_to_sample(test_mat)
    num_user = train_mat.shape[0]
    num_item = train_mat.shape[1]

elif dataset_name == "yahoo":
    # yahoo is loaded directly as sample arrays; only the training set is shuffled.
    x_train, y_train, x_test, y_test = load_data("yahoo")
    x_train, y_train = shuffle(x_train, y_train)
    # NOTE(review): assumes 0-based ids and that no id in x_test exceeds the
    # largest id seen in x_train -- confirm against the data loader.
    num_user = x_train[:,0].max() + 1
    num_item = x_train[:,1].max() + 1

else:
    # Fail here rather than with a NameError on num_user further down.
    raise ValueError(
        "Unknown dataset_name: {!r} (expected 'coat' or 'yahoo')".format(dataset_name)
    )

print("# user: {}, # item: {}".format(num_user, num_item))
# Binarize the labels with utils.binarize (the rule itself is defined there).
y_train = binarize(y_train)
y_test = binarize(y_test)
n_train = x_train.shape[0]

# Column 0 of each sample holds the user index, column 1 the item index.
train_user_ind = x_train[:, 0].astype('int')
train_item_ind = x_train[:, 1].astype('int')
test_user_ind = x_test[:, 0].astype('int')
test_item_ind = x_test[:, 1].astype('int')

# Fit a plain MF model on the training samples, then reuse its user/item
# embeddings as the feature vectors for the later experiments.

# Prefer the second GPU (the original setup) when it exists.  On a single-GPU
# machine 'cuda:1' is an invalid device ordinal, so fall back to 'cuda:0'.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

mf = MF(num_user, num_item, batch_size=128)
mf.to(device)
mf.fit(x_train, y_train,
    lr=0.05,
    lamb=1e-3,
    tol=1e-4)

# forward(..., True) returns three values; the last two are used below as the
# per-sample user and item embeddings.
# NOTE(review): the meaning of the first returned value is defined in MF.forward -- confirm.
x_train_x, u_emb_train, v_emb_train = mf.forward(x_train, True)
x_test_x, u_emb_test, v_emb_test = mf.forward(x_test, True)

# Per-sample feature = [user embedding | item embedding], detached and moved to CPU.
feature_train = torch.cat([u_emb_train, v_emb_train], dim=1).detach().cpu()
feature_test = torch.cat([u_emb_test, v_emb_test], dim=1).detach().cpu()

# Outputs of mf.W / mf.H for every user / item index.
# NOTE(review): presumably these are the full embedding tables -- confirm in MF.
user_W = mf.W(torch.LongTensor(np.arange(num_user) ).to(device) ).detach().cpu()
item_H = mf.H(torch.LongTensor(np.arange(num_item) ).to(device) ).detach().cpu()

# Author's record of tried window lengths L with two percentages each
# (presumably target vs. measured mask proportion -- confirm):
#   L = 5,    20%, 19.66%
#   L = 6.7,  15%, 15.05%
#   L = 3.2,  30%, 30.12%
#   L = 9.7,  10%, 10.02%
#   L = 3.92, 25%, 24.99%

L = 5
sigmaH = 0.1
sigmaB = 0.1

num_feature = feature_train.shape[1]
identity_p = np.eye(num_feature)
mean_p = np.zeros(num_feature)

# Random linear weights for the Weibull shape (W_d) and scale (W_b), each
# drawn from an isotropic Gaussian.  W_d must be drawn before W_b so the
# global RNG stream is consumed in the same order.
W_d = np.random.multivariate_normal(mean_p, sigmaH**2*identity_p)
W_b = np.random.multivariate_normal(mean_p, sigmaB**2*identity_p)
print(W_d)
print(W_b)

# y_train_mask: labels after masking (stays 0 where the delay exceeds the window)
# e_train:      elapsed time L - ts_i for every sample
# d_train:      sampled delay where it fits the window, otherwise the 1e5 sentinel
y_train_mask = np.zeros_like(y_train)
e_train = np.zeros_like(y_train, dtype='float')
d_train = np.full_like(y_train, 1e5, dtype='float')

# prod counts the samples with y_train == 1 whose label was masked out.
prod = 0
for i in range(n_train):
    # Per sample: one uniform draw, then one Weibull draw (order matters for the RNG).
    ts_i = np.random.uniform(0, L)
    shape_i = np.exp(np.dot(W_d, feature_train[i, :]))
    scale_i = np.exp(np.dot(W_b, feature_train[i, :]))
    d_i = scale_i*np.random.weibull(shape_i)
    e_i = L - ts_i
    e_train[i] = e_i

    if d_i <= e_i:
        # Delay fits inside the elapsed window: keep the label and record the delay.
        y_train_mask[i] = y_train[i]
        d_train[i] = d_i
    elif y_train[i] == 1:
        # Delay too long: the mask entry keeps its initial 0; count the lost positive.
        prod += 1

print('mask proportion (within the original y_train = 1 )')
n_positive = sum(y_train)
print(prod / n_positive * 100)

# Evaluate the base MF model (fit on the unmasked y_train) on the test set.
test_pred = mf.predict(x_test)
auc_mf = roc_auc_score(y_test, test_pred)
mse_mf = mse_func(y_test, test_pred)
print('prediction for base model', auc_mf, sep='\n')

# Take a random 5% of the test labels as y_ips, which is passed to the
# fit() calls of the IPS/DR-style models in the cells below.
ips_idxs = np.arange(len(y_test))
np.random.shuffle(ips_idxs)
n_ips = int(0.05 * len(ips_idxs))
y_ips = y_test[ips_idxs[:n_ips]]

# The test features are fed to models living on `device`, so move them there.
feature_test = feature_test.to(device)

===>Load from coat data set<===
[train] rating ratio: 0.080000
[test]  rating ratio: 0.053333
# user: 290, # item: 300
[MF] epoch:8, xent:37.39917492866516
[-0.04494046 -0.09064704  0.11771674 -0.20835941  0.08493842 -0.07913483
 -0.04008389  0.0490871 ]
[ 0.09056021  0.04759849  0.07210223  0.03382676 -0.02803162  0.0334028
  0.08332263 -0.12170204]
mask proportion (within the original y_train = 1 )
20.016565433462176
prediction for base model
0.6076920935476612


In [33]:
# MF_IPS_DF (our model, IPS variant): build and fit a new model 10 times and
# collect one metric row per run:
#   [mse, auc, mean ndcg_5, mean ndcg_10, mean recall_5, mean recall_10]
mf_ips_df_acc = []
for repeat in range(10):
    mf_ips_df = MF_IPS_DF(num_user, num_item, num_feature, batch_size=128)
    mf_ips_df.to(device)
    mf_ips_df.fit(
        x_train, y_train_mask, e_train, d_train, feature_train, y_ips,
        lr=0.025, lamb=6e-3, lamb1=4e-2, tol=1e-5,
    )

    test_pred = mf_ips_df.predict(x_test, feature_test)
    mse = mse_func(y_test, test_pred)
    auc = roc_auc_score(y_test, test_pred)
    ndcg_res = ndcg_func_both(mf_ips_df, x_test, y_test, feature_test)
    recall_res = recall_func_both(mf_ips_df, x_test, y_test, feature_test)

    print(auc)

    ranking_means = [np.mean(ndcg_res[key]) for key in ("ndcg_5", "ndcg_10")]
    ranking_means += [np.mean(recall_res[key]) for key in ("recall_5", "recall_10")]
    mf_ips_df_acc.append([mse, auc, *ranking_means])

# Column-wise mean and standard deviation over the 10 runs.
mf_ips_df_acc = np.array(mf_ips_df_acc)
mf_ips_df_mean = mf_ips_df_acc.mean(axis=0)
mf_ips_df_sd = mf_ips_df_acc.std(axis=0)

print("[MF_IPS_DF] test auc:", mf_ips_df_mean[1], ' sd: ', mf_ips_df_sd[1])
print(mf_ips_df_mean)
print(mf_ips_df_sd)

[MF-IPS-DF] epoch:26, xent:532.1969261169434
0.6969418493781507
[MF-IPS-DF] epoch:33, xent:529.6597728729248
0.696547949633417
[MF-IPS-DF] epoch:30, xent:531.7536392211914
0.6964803825360997
[MF-IPS-DF] epoch:26, xent:532.2422885894775
0.6992843687435188
[MF-IPS-DF] epoch:30, xent:531.7623872756958
0.6981660801185314
[MF-IPS-DF] epoch:35, xent:530.8436880111694
0.696543503157771
[MF-IPS-DF] epoch:30, xent:531.9662780761719
0.7029873163315572
[MF-IPS-DF] epoch:32, xent:530.8772306442261
0.6980517283644161
[MF-IPS-DF] epoch:24, xent:532.2524747848511
0.7013356439540691
[MF-IPS-DF] epoch:30, xent:531.7971487045288
0.69763530625391
[MF_IPS_DF] test auc: 0.6983974128471442  sd:  0.0020950439467808894
[0.23286581 0.69839741 0.60200379 0.67233188 0.411722   0.68977626]
[0.00030359 0.00209504 0.0067427  0.0055132  0.01020151 0.00572407]


In [34]:
# MF_DR_JL_DF (our model, DR variant): build and fit a new model 10 times and
# collect one metric row per run:
#   [mse, auc, mean ndcg_5, mean ndcg_10, mean recall_5, mean recall_10]
mf_dr_df_acc = []
for repeat in range(10):
    mf_dr_df = MF_DR_JL_DF(num_user, num_item, num_feature, batch_size=128)
    mf_dr_df.to(device)
    mf_dr_df.fit(
        x_train, y_train_mask, e_train, d_train, feature_train, user_W, item_H, y_ips,
        lr=0.03, lamb=8e-2, lambv=8e-3, tol=1e-5,
    )

    # Unlike MF_IPS_DF, prediction here takes only x_test (no feature matrix).
    test_pred = mf_dr_df.predict(x_test)
    mse = mse_func(y_test, test_pred)
    auc = roc_auc_score(y_test, test_pred)
    ndcg_res = ndcg_func(mf_dr_df, x_test, y_test)
    recall_res = recall_func(mf_dr_df, x_test, y_test)

    print(auc)

    ranking_means = [np.mean(ndcg_res[key]) for key in ("ndcg_5", "ndcg_10")]
    ranking_means += [np.mean(recall_res[key]) for key in ("recall_5", "recall_10")]
    mf_dr_df_acc.append([mse, auc, *ranking_means])

# Column-wise mean and standard deviation over the 10 runs.
mf_dr_df_acc = np.array(mf_dr_df_acc)
mf_dr_df_mean = mf_dr_df_acc.mean(axis=0)
mf_dr_df_sd = mf_dr_df_acc.std(axis=0)

print("[MF_DR_JL_DF] test auc:", mf_dr_df_mean[1], ' sd: ', mf_dr_df_sd[1])
print(mf_dr_df_mean)
print(mf_dr_df_sd)

[MF-DR-JL-DF] epoch:19, xent:48671060.5625
0.7138383601707138
[MF-DR-JL-DF] epoch:20, xent:48721251.375
0.7092493073164243
[MF-DR-JL-DF] epoch:24, xent:48653858.375
0.7123517873672147
[MF-DR-JL-DF] epoch:26, xent:48670957.4375
0.7101658612745996
[MF-DR-JL-DF] epoch:22, xent:48737774.3125
0.7218780714513837
[MF-DR-JL-DF] epoch:24, xent:48686773.0
0.746241954778956
[MF-DR-JL-DF] epoch:18, xent:48722182.625
0.7111101573743058
[MF-DR-JL-DF] epoch:22, xent:48654837.125
0.7183001084940057
[MF-DR-JL-DF] epoch:22, xent:48770902.9375
0.7258584404547314
[MF-DR-JL-DF] epoch:21, xent:48719167.6875
0.7336002378671145
[MF_DR_JL_DF] test auc: 0.7202594286549451  sd:  0.011408109737734599
[0.3374387  0.72025943 0.62182461 0.69322807 0.44132738 0.72354069]
[0.00087395 0.01140811 0.02343545 0.01937338 0.02048756 0.01474513]


In [36]:
# MLP_weibull: build and fit a new model 10 times and collect one metric row
# per run:
#   [mse, auc, mean ndcg_5, mean ndcg_10, mean recall_5, mean recall_10]
mlp_weibull_acc = []
for repeat in range(10):
    mlp_weibull = MLP_weibull(num_feature, batch_size=128)
    mlp_weibull.to(device)
    mlp_weibull.fit(
        x_train, y_train_mask, e_train, d_train, feature_train,
        lr=0.05, lamb=1e-5, tol=1e-5,
    )

    # This model predicts from the feature matrix alone.
    test_pred = mlp_weibull.predict(feature_test)
    mse = mse_func(y_test, test_pred)
    auc = roc_auc_score(y_test, test_pred)
    ndcg_res = ndcg_func_feature(mlp_weibull, x_test, y_test, feature_test)
    recall_res = recall_func_feature(mlp_weibull, x_test, y_test, feature_test)

    print(auc)

    ranking_means = [np.mean(ndcg_res[key]) for key in ("ndcg_5", "ndcg_10")]
    ranking_means += [np.mean(recall_res[key]) for key in ("recall_5", "recall_10")]
    mlp_weibull_acc.append([mse, auc, *ranking_means])

# Column-wise mean and standard deviation over the 10 runs.
mlp_weibull_acc = np.array(mlp_weibull_acc)
mlp_weibull_mean = mlp_weibull_acc.mean(axis=0)
mlp_weibull_sd = mlp_weibull_acc.std(axis=0)

print("[MLP_weibull] test auc:", mlp_weibull_mean[1], ' sd: ', mlp_weibull_sd[1])
print(mlp_weibull_mean)
print(mlp_weibull_sd)

[MLP_weibull] epoch:17, xent:46.37765032052994
0.6327377375867933
[MLP_weibull] epoch:18, xent:46.303588688373566
0.6326629208009225
[MLP_weibull] epoch:20, xent:46.223091304302216
0.6327529136014984
[MLP_weibull] epoch:17, xent:46.290791511535645
0.6326787734532258
[MLP_weibull] epoch:19, xent:46.35124361515045
0.6328103311348412
[MLP_weibull] epoch:17, xent:46.25705134868622
0.6326357586344757
[MLP_weibull] epoch:17, xent:46.34184193611145
0.6326512246367231
[MLP_weibull] epoch:17, xent:46.352620124816895
0.6326238691452482
[MLP_weibull] epoch:25, xent:46.30915826559067
0.6328831180079171
[MLP_weibull] epoch:20, xent:46.305408239364624
0.6327790124802906
[MLP_weibull] test auc: 0.6327215659481935  sd:  8.102378606978565e-05
[0.23746481 0.63272157 0.52693991 0.6039505  0.36258174 0.64406837]
[3.22806066e-03 8.10237861e-05 9.60235543e-04 4.15404686e-04
 1.41966820e-03 8.40981003e-04]


In [39]:
# MF_DR baseline (imported from baselines; not the MF_DR_JL_DF model): build
# and fit a new model 10 times on the masked labels and collect one metric
# row per run:
#   [mse, auc, mean ndcg_5, mean ndcg_10, mean recall_5, mean recall_10]
mf_dr_acc = []
for repeat in range(10):
    mf_dr = MF_DR(num_user, num_item, batch_size=128)
    mf_dr.to(device)
    mf_dr.fit(x_train, y_train_mask, y_ips, lr=0.05, lamb=1e-4, tol=1e-5)

    test_pred = mf_dr.predict(x_test)
    mse = mse_func(y_test, test_pred)
    auc = roc_auc_score(y_test, test_pred)
    ndcg_res = ndcg_func(mf_dr, x_test, y_test)
    recall_res = recall_func(mf_dr, x_test, y_test)

    print(auc)

    ranking_means = [np.mean(ndcg_res[key]) for key in ("ndcg_5", "ndcg_10")]
    ranking_means += [np.mean(recall_res[key]) for key in ("recall_5", "recall_10")]
    mf_dr_acc.append([mse, auc, *ranking_means])

# Column-wise mean and standard deviation over the 10 runs.
mf_dr_acc = np.array(mf_dr_acc)
mf_dr_mean = mf_dr_acc.mean(axis=0)
mf_dr_sd = mf_dr_acc.std(axis=0)

print("[MF_DR] test auc:", mf_dr_mean[1], ' sd: ', mf_dr_sd[1])
print(mf_dr_mean)
print(mf_dr_sd)

[MF-DR] epoch:14, xent:56306.5693359375
0.6793686043247582
[MF-DR] epoch:16, xent:56248.477600097656
0.6639758722631943
[MF-DR] epoch:17, xent:56253.173889160156
0.6798941584136213
[MF-DR] epoch:15, xent:56235.28790283203
0.6851399363883327
[MF-DR] epoch:13, xent:56240.603576660156
0.6709156607965455
[MF-DR] epoch:17, xent:56270.27429199219
0.6695574558116983
[MF-DR] epoch:15, xent:56295.341796875
0.6664842645026637
[MF-DR] epoch:15, xent:56292.69189453125
0.6648363619632235
[MF-DR] epoch:14, xent:56259.29040527344
0.6601560596956755
[MF-DR] epoch:19, xent:56316.093505859375
0.6670462603593217
[MF_DR] test auc: 0.6707374634519034  sd:  0.007690383316882942
[0.23439641 0.67073746 0.57992421 0.65429288 0.39471662 0.68412066]
[0.00030512 0.00769038 0.01025854 0.00553215 0.01357578 0.00806498]
