In [1]:
from argparse import ArgumentParser
from cpo import CPO
from memory import Memory
from models import Actor, Critic
# from simulators import SinglePathSimulator
import pandas as pd
import numpy as np

from data_util import read_file
# from fair_env_simulator import *
from fair_env import *
import torch
import os
import time
import torch.nn.functional as F
from gini import gini
import matplotlib.pyplot as plt
from test_ranking import *

In [2]:
# ---- Experiment identity -------------------------------------------------
model_name = 'cpo'
data_name = 'ml-100k'

# ---- Training data and pre-trained embeddings ----------------------------
# Every input file lives under the same per-dataset directory.
data_dir = '../data/' + data_name + '/'
data = read_file(data_dir + 'train_data.csv')
item_embeddings = np.load(data_dir + 'pmf_item_embed.npy')
user_embeddings = np.load(data_dir + 'pmf_user_embed.npy')
item_indicator = np.load(data_dir + 'item_cost_indicator_28.npy')

# The first axis of each embedding matrix gives the entity count.
nb_item, nb_user = item_embeddings.shape[0], user_embeddings.shape[0]
print('num of users: %d, num of items: %d' %(nb_user, nb_item))

# NOTE(review): the GPU index is hard-coded; a host with fewer than four GPUs
# still reports CUDA as available, so adjust the index for your machine.
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

# ---- Environment ---------------------------------------------------------
# Keyword arguments for Environment; the same dict is later forwarded to the
# simulator, which reads 'item_embeddings', 'nb_item' and 'device' from it.
env_args = dict(
    data=data,
    nb_user=nb_user,
    nb_item=nb_item,
    item_embeddings=item_embeddings,
    user_embeddings=user_embeddings,
    item_indicator=item_indicator,
    device=device,
    gamma=0.95,
    frac=1,
)

env = Environment(**env_args)
print('Successfully create Training Env!')

num of users: 943, num of items: 1682
Successfully create Training Env!


In [3]:
from collections import defaultdict, namedtuple
import numpy as np
import torch

from autoassign import autoassign
from memory import Memory, Trajectory
from torch_utils.torch_utils import get_device
from gini import gini
import random


class Simulator:
    """Minimal base class holding the environment and the policy to roll out.

    ``n_trajectories``, ``trajectory_len`` and ``env_args`` are accepted so
    that subclasses can share the signature, but they are not stored here.
    """

    def __init__(self, env, policy, n_trajectories, trajectory_len, **env_args):
        self.env, self.policy = env, policy


class SinglePathSimulator(Simulator):
    """Collects one epsilon-greedy rollout per trajectory slot from the environment.

    Required ``env_args`` keys: ``'item_embeddings'`` (NumPy array, converted
    with ``torch.from_numpy``), ``'nb_item'`` and ``'device'``.
    """

    def __init__(self, env, policy, n_trajectories, trajectory_len, **env_args):
        Simulator.__init__(self, env, policy, n_trajectories, trajectory_len, **env_args)
        self.item_embeddings = env_args['item_embeddings']
        self.trajectory_len = trajectory_len
        self.n_trajectories = n_trajectories
        self.nb_item = env_args['nb_item']
        self.device = env_args['device']
        # Metric histories. run_sim does not fill them; they are kept so that
        # existing code reading these attributes keeps working.
        self.hit_rate = []
        self.gini_coefficient = []
        self.pop_rate = []

    def run_sim(self):
        """Roll the policy out for ``trajectory_len`` steps and return the result.

        At every step a single random draw decides whether the whole batch
        follows the policy (scores are the sampled policy output multiplied
        with the item embeddings) or explores with uniformly random scores.
        Epsilon decays linearly from 0.2 towards 0.1 over the rollout. Items
        already recommended in a trajectory are excluded from selection.

        Returns:
            Memory: wraps the array of ``Trajectory`` objects, each holding the
            per-step observations, users, actions, rewards and costs.
        """
        self.policy.eval()
        with torch.no_grad():
            trajectories = np.asarray([Trajectory() for _ in range(self.n_trajectories)])
            ra_length = 1  # one item is recommended per step

            epsilon_start = 0.2
            epsilon_end = 0.1

            item_embeds = torch.from_numpy(self.item_embeddings).to(self.device).float()

            states = self.env.reset()
            users = self.env.current_user

            # 1.0 where an item has already been recommended in that trajectory.
            recommended_item_onehot = torch.zeros(
                self.n_trajectories, self.nb_item, device=self.device)

            for t in range(self.trajectory_len):
                # Linear epsilon decay.
                remaining = max((self.trajectory_len - t) / self.trajectory_len, 0)
                epsilon = (epsilon_start - epsilon_end) * remaining + epsilon_end

                policy_input = torch.FloatTensor(states).to(self.device)
                user_input = torch.FloatTensor(users).to(self.device)

                if np.random.rand() >= epsilon:
                    # Exploit: score every item against the sampled policy output.
                    weight_dists = self.policy(policy_input, user_input)
                    w = weight_dists.sample()
                    item_weights = torch.mm(w, item_embeds.transpose(0, 1))
                else:
                    # Explore: uniformly random scores. Uses the instance's own
                    # item count and device rather than notebook globals.
                    item_weights = torch.rand(
                        self.n_trajectories, ra_length, self.nb_item, device=self.device)

                item_weights = item_weights.view(self.n_trajectories, ra_length, -1)
                # Mask with -inf rather than multiplying by zero: a zeroed score
                # would still win argmax whenever all remaining scores are negative.
                already_recommended = (recommended_item_onehot > 0).unsqueeze(1)
                item_weights = item_weights.masked_fill(already_recommended, float('-inf'))
                item_idxes = torch.argmax(item_weights, dim=2)

                recommended_item_onehot = recommended_item_onehot.scatter_(1, item_idxes, 1)

                actions = item_embeds[item_idxes]
                states_prime, users_prime, rewards, costs, info = self.env.step(actions, item_idxes)

                for i, trajectory in enumerate(trajectories):
                    trajectory.observations.append(policy_input[i].to(self.device).squeeze())
                    trajectory.users.append(user_input[i].to(self.device).squeeze())
                    trajectory.actions.append(actions[i].to(self.device).squeeze())
                    trajectory.rewards.append(rewards[i].to(self.device).squeeze())
                    trajectory.costs.append(costs[i].to(self.device).squeeze())

                states = states_prime
                users = users_prime

            return Memory(trajectories)

### Train

In [4]:
# ---- Network dimensions --------------------------------------------------
history_length = 5
ra_length = 1
item_embedding_size = item_embeddings.shape[1]
user_embedding_size = user_embeddings.shape[1]

# Input width shared by actor and critics: one user embedding plus two item
# embeddings. The critics take one extra input feature on top of that.
state_feature_size = user_embedding_size + 2 * item_embedding_size

# ---- Critics (reward value and cost value) -------------------------------
vf_hidden_dims = [64]
vf_args = (history_length, ra_length, state_feature_size + 1, vf_hidden_dims, 1)
value_fun = Critic(*vf_args)
cost_fun = Critic(*vf_args)

# ---- Actor ---------------------------------------------------------------
policy_hidden_dims = [64]
policy_args = (history_length, ra_length, state_feature_size, policy_hidden_dims, item_embedding_size)
policy = Actor(*policy_args)

for network in (policy, value_fun, cost_fun):
    network.to(device)

# ---- Rollout simulator: one trajectory per user --------------------------
n_trajectories = env.nb_user
trajectory_len = 10
simulator = SinglePathSimulator(env, policy, n_trajectories, trajectory_len, **env_args)

# ---- CPO trainer ---------------------------------------------------------
# max_val_step: lr for value_fun
# max_cost_step: lr for cost_fun
# line_search_max_step_len: initial search step for policy
# line_search_coef: search step decay rate
max_constraint_val = 10
cpo_hyperparams = dict(
    max_kl=1e-4,
    max_val_step=5e-2,
    max_cost_step=5e-2,
    max_constraint_val=max_constraint_val,
    val_l2_reg=1e-3,
    cost_l2_reg=1e-3,
    discount_val=0.99,
    discount_cost=0.99,
    line_search_max_step_len=1,
    line_search_coef=0.9,
    line_search_max_iter=20,
    line_search_accept_ratio=1e-2,
    continue_from_file=False,
)
cpo = CPO(policy, value_fun, cost_fun, simulator, device, model_name=model_name, **cpo_hyperparams)

# ---- Train ---------------------------------------------------------------
n_episodes = 100

# Path prefix handed to cpo.train.
modelPath = "model/"+data_name+"/tempmodel_" + data_name + '_'
cpo.train(n_episodes, modelPath)

Step Len.: tensor(0.9000, device='cuda:3')
[Episode]: 1 | [Avg. Reward]: 0.8822905421257019 | [Avg. Cost]: 3.7476139068603516 | [Elapsed Time]: 0:00:08
Step Len.: tensor(0.9000, device='cuda:3')
[Episode]: 2 | [Avg. Reward]: 0.7433722019195557 | [Avg. Cost]: 3.121951103210449 | [Elapsed Time]: 0:00:17
Step Len.: tensor(0.9000, device='cuda:3')
[Episode]: 3 | [Avg. Reward]: 0.924708366394043 | [Avg. Cost]: 4.029692649841309 | [Elapsed Time]: 0:00:26
Step Len.: tensor(1.0000, device='cuda:3')
[Episode]: 4 | [Avg. Reward]: 0.8388122916221619 | [Avg. Cost]: 3.7794272899627686 | [Elapsed Time]: 0:00:35
Step Len.: tensor(1.0000, device='cuda:3')
[Episode]: 5 | [Avg. Reward]: 0.7338281869888306 | [Avg. Cost]: 3.3541886806488037 | [Elapsed Time]: 0:00:45
Step Len.: tensor(1.0000, device='cuda:3')
[Episode]: 6 | [Avg. Reward]: 0.7804877758026123 | [Avg. Cost]: 3.420996904373169 | [Elapsed Time]: 0:00:54
Step Len.: tensor(1.0000, device='cuda:3')
[Episode]: 7 | [Avg. Reward]: 0.9003181457519531 

KeyboardInterrupt: 

### Test

In [None]:
from fair_env import *
import pandas as pd
import numpy as np

from data_util import read_file
# from fair_env_simulator import *
# from fair_env import *
import torch
import os
import time
import torch.nn.functional as F
from gini import gini
import matplotlib.pyplot as plt
from test_ranking import *

"""
10:_
4:_1.0413574
3:_0.8780488
"""
data_name = 'ml-100k'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
modelPath = "model/"+data_name+"/tempmodel_" + data_name + '_1.0413574.pkl'
# SECURITY NOTE: torch.load unpickles arbitrary Python objects; only load
# checkpoints you produced yourself.
# map_location remaps tensors saved on the training GPU onto `device`, so the
# checkpoint also loads on hosts without that GPU index (or without CUDA).
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled modules; pass weights_only=False there if needed.
model = torch.load(modelPath, map_location=device)
model.to(device)
model.eval()  # inference mode, matching what the training simulator does
item_embeddings = np.load('../data/'+data_name+'/pmf_item_embed.npy')
user_embeddings = np.load('../data/'+data_name+'/pmf_user_embed.npy')
item_indicator = np.load('../data/'+data_name+'/item_cost_indicator_28.npy')

nb_item = item_embeddings.shape[0]
nb_user = user_embeddings.shape[0]

history_length = 5 # N in article
ra_length = 1 # K in article

with torch.no_grad():
    # Build an environment over the held-out interactions.
    test_data = read_file('../data/'+data_name+'/test_data.csv')
    test_env_args = {}
    test_env_args['data'] = test_data
    test_env_args['nb_user'] = nb_user
    test_env_args['nb_item'] = nb_item
    test_env_args['item_embeddings'] = item_embeddings
    test_env_args['user_embeddings'] = user_embeddings
    test_env_args['item_indicator'] = item_indicator
    test_env_args['device'] = device
    test_env_args['gamma'] = 0.95
    test_env_args['frac'] = 1

    test_trajectory_len = 200
    test_env = Environment(**test_env_args)
    states = test_env.reset()
    users = test_env.current_user
    item_embeds = torch.from_numpy(item_embeddings).to(device).float()

    num_click = 0
    num_cost = 0
    test_res = []
    recommendations = []
    # 1.0 where an item has already been recommended to that user.
    recommended_item_onehot = torch.zeros(test_env.nb_user, test_env.nb_item, device=device)
    test_gini_coefficient = []
    test_pop_rate = []

    for t in range(test_trajectory_len):
        policy_input = torch.FloatTensor(states).to(device)
        user_input = torch.FloatTensor(users).to(device)
        weight_dists = model(policy_input, user_input)
        # Evaluate deterministically with the distribution mean, not a sample.
        w = weight_dists.mean
        item_weights = torch.mm(w.view(-1,item_embeds.shape[1]), item_embeds.transpose(0,1)).view(test_env.nb_user, ra_length, -1)
        # Mask with -inf rather than multiplying by zero: a zeroed score would
        # still win argmax whenever all remaining scores are negative.
        already_recommended = (recommended_item_onehot > 0).unsqueeze(1)
        item_weights = item_weights.masked_fill(already_recommended, float('-inf'))
        item_idxes = torch.argmax(item_weights,dim=2)
        actions = item_embeds[item_idxes.cpu().detach()]
        recommendations.append(item_idxes.squeeze())
        recommended_item_onehot = recommended_item_onehot.scatter_(1, item_idxes, 1)

        states_prime, users_prime, rewards, costs, test_info = test_env.step(actions, item_idxes)
        states = states_prime
        # Advance the user input together with the state, as the training loop does.
        users = users_prime

        num_click += torch.sum(test_info)
        num_cost += torch.sum(costs).detach().cpu()
        # Running average cost per recommendation made so far.
        test_pop_rate.append(num_cost/((t+1)*states.shape[0]))
        idx, val = torch.unique(torch.stack(recommendations), return_counts=True)

        test_res.append(test_info.squeeze())
        # Pad the counts with zeros for items that were never recommended, so
        # the Gini coefficient is taken over the whole catalogue.
        val_ = torch.cat((val.float(),torch.zeros(nb_item-len(val)).to(device)))
        g = gini(val_.cpu().numpy())
        test_gini_coefficient.append(g)

    # Look up the cost indicator of every recommended item; after the
    # transpose, axis 0 is the stacked-list entry and axis 1 the step.
    recommendation_indicator = item_indicator[torch.stack(recommendations).transpose(0,1).detach().cpu().numpy()]
    pop_rate = []
    for i in range(recommendation_indicator.shape[1]):
        # Share of flagged items among all recommendations in the first i+1 steps.
        nb_rec = (i+1)*recommendation_indicator.shape[0]
        nb_pop = np.sum(recommendation_indicator[:, :i+1])
        pop_rate.append(nb_pop/nb_rec)

In [None]:
# Display the recommended item indices: the per-step tensors are stacked along
# a new leading axis, then transposed so that the step axis becomes dim 1.
torch.stack(recommendations).transpose(0,1)

In [None]:
max_k = 200
user_history_length = torch.sum(test_env.current_user_history, 1).detach().cpu().numpy()
test_res_ = torch.stack(test_res).transpose(0,1).detach().cpu().numpy()
report = get_test_results(test_res_, user_history_length, test_gini_coefficient, max_k)

%matplotlib inline
recall = np.mean(report["recall"],0)
hit_rate = np.mean(report["hit_rate"],0)
precision = np.mean(report["precision"],0)
ndcg = np.mean(report["ndcg"],0)
gini_index = report["gini_index"]

plot_k = 100
plt.figure(figsize = (10,10))
plt.subplot(3,2,1)
plt.plot(np.arange(plot_k), recall[:plot_k])
plt.title("Recall")
plt.subplot(3,2,2)
plt.plot(np.arange(plot_k), hit_rate[:plot_k])
plt.title("Hit Rate")
plt.subplot(3,2,3)
plt.plot(np.arange(plot_k), precision[:plot_k])
plt.title("Precision")
plt.subplot(3,2,4)
plt.plot(np.arange(plot_k), ndcg[:plot_k])
plt.title("NDCG")
plt.subplot(3,2,5)
plt.plot(np.arange(plot_k), gini_index[:plot_k])
plt.title("Gini")
plt.subplot(3,2,6)
plt.plot(np.arange(plot_k), pop_rate[:plot_k])
plt.title("Pop Rate")
plt.show()

In [None]:
# Constraint value used only to tag the output file names.
# NOTE(review): this overrides the value set in the training cell; make sure it
# matches the checkpoint that was actually evaluated.
max_constraint_val = 4

# np.save does not create missing directories, so make sure the target exists.
results_dir = '../results/' + data_name + '/'
os.makedirs(results_dir, exist_ok=True)

# One .npy file per metric: <model_name>_<max_constraint_val>_<metric>.npy
result_prefix = results_dir + model_name + '_' + str(max_constraint_val) + '_'
for metric_name, metric_values in (
    ('recall', recall),
    ('hit_rate', hit_rate),
    ('precision', precision),
    ('ndcg', ndcg),
    ('gini_index', gini_index),
    ('pop_rate', pop_rate),
):
    np.save(result_prefix + metric_name + '.npy', metric_values)

In [None]:
# Show the entries at index 99 (the 100th position) of the NDCG and Gini series.
ndcg[99], gini_index[99]

In [None]:
# Show the NDCG entry at index 99 (the 100th position).
ndcg[99]

In [None]:
# Show the NDCG entry at index 100 (the 101st position).
ndcg[100]