In [1]:
from argparse import ArgumentParser
# from gym import make
# from gym.spaces import Box, Discrete
# import gym
# import gym_fairrec
# import roboschool
# from yaml import load
# import yaml

from models import build_diag_gauss_policy, build_mlp, build_multinomial_policy
# from simulators import *
from transforms import *
from torch_utils import get_device
from trpo import TRPO

import pandas as pd
import numpy as np
# from read_data import read_file, read_embeddings, Embeddings

from data_util import read_file
from environment import *
# from env import *
# from ddpg import *
import torch
import os
import time
import torch.nn.functional as F

In [2]:
model_name = 'fairrec'
data_name = 'ml-1m'
data = read_file('./data/'+data_name+'/train_data.csv')
item_embeddings = np.load('./data/'+data_name+'/item_embed.npy')
user_embeddings = np.load('./data/'+data_name+'/user_embed.npy')


nb_item = item_embeddings.shape[0]
nb_user = user_embeddings.shape[0]
print('num of users: %d, num of items: %d' %(nb_user, nb_item))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


env_args = {}
env_args['data'] = data
env_args['nb_user'] = nb_user
env_args['nb_item'] = nb_item
env_args['item_embeddings'] = item_embeddings
env_args['user_embeddings'] = user_embeddings
env_args['device'] = device
env_args['gamma'] = 0.95

env = Environment(**env_args)
print('Successfully create Training Env!')

num of users: 6040, num of items: 3952
Successfully create Training Env!


In [3]:
history_length = 5 # N in article
ra_length = 1 # K in article
state_space_size = item_embeddings.shape[1] * history_length
action_space_size = item_embeddings.shape[1] * ra_length


vf_hidden_dims = [64]
vf_args = (state_space_size + 1, vf_hidden_dims, 1)
value_fun = build_mlp(*vf_args)

policy_hidden_dims = [64]
policy_args = (state_space_size, policy_hidden_dims, action_space_size)
policy = build_diag_gauss_policy(*policy_args)


policy.to(device)
value_fun.to(device)
print(policy)
print(value_fun)

Sequential(
  (0): Linear(in_features=500, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=100, bias=True)
  (3): DiagGaussianLayer()
)
Sequential(
  (0): Linear(in_features=501, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=1, bias=True)
)


In [8]:
from collections import defaultdict
import gym
from gym.spaces import Box, Discrete
import numpy as np
import torch
# from torch_utils import get_device


class Simulator:
    def __init__(self, env, policy, n_trajectories, trajectory_len, **env_args):
        self.env = env
        self.policy = policy


class SinglePathSimulator(Simulator):
    def __init__(self, env, policy, n_trajectories, trajectory_len, **env_args):
        Simulator.__init__(self, env, policy, n_trajectories, trajectory_len, **env_args)
        self.item_embeddings= env_args['item_embeddings']
        self.trajectory_len = trajectory_len
        self.n_trajectories = n_trajectories
        self.nb_item = env_args['nb_item']
        self.device = env_args['device']

    def sample_trajectories(self):
        self.policy.eval()

        with torch.no_grad():
            memory = np.asarray([defaultdict(list) for i in range(self.n_trajectories)])
        #     done = [False] * n_trajectories

            ra_length = 1
#             len_trajectory = 10
#             epsilon = 0.9
            item_embeds = torch.from_numpy(self.item_embeddings).to(self.device).float()

#             memory_states = []
#             memory_actions = []
#             memory_rewards = []
#             memory_done = []

            score = 0
            states = self.env.reset()
            recommended_item_onehot = torch.FloatTensor(self.n_trajectories, self.nb_item).zero_().to(device)
            recommendations = []
            for t in range(self.trajectory_len): 
                policy_input = torch.FloatTensor(states).to(self.device).view(self.n_trajectories, -1)
                weight_dists = self.policy(policy_input)
                w = weight_dists.sample()
                item_weights = torch.mm(w.view(-1,item_embeds.shape[1]), item_embeds.transpose(0,1)).view(self.n_trajectories, ra_length, -1)
                item_weights = torch.mul(item_weights.transpose(0,1), 1-recommended_item_onehot).reshape(states.shape[0],ra_length,-1)
                item_idxes = torch.argmax(item_weights,dim=2)

                recommendations.append(item_idxes)
                recommended_item_onehot = recommended_item_onehot.scatter_(1, item_idxes, 1)

                actions = item_embeds[item_idxes.cpu().detach()]
                states_prime, rewards, info = self.env.step(actions, item_idxes)

        #         states_prime, rewards, info = env.step(item_idxes)
        #         memory_states.append(policy_input)
        #         memory_actions.append(actions)
        #         memory_rewards.append(rewards)
        #         memory_done.append(done)

                for i in range(len(memory)):
                    trajectory = memory[i]
                    trajectory['states'].append(policy_input[i].to(device).squeeze())
                    trajectory['actions'].append(actions[i].to(device).squeeze())
                    trajectory['rewards'].append(rewards[i].to(device).squeeze())


                states = states_prime
                score += torch.sum(info).detach().cpu()
                
            for trajectory in memory:
                trajectory['done'] = True    
            print(score/self.trajectory_len)
            print(torch.cat(recommendations,1))
            return memory

In [9]:
n_trajectories = nb_user
trajectory_len = 10
simulator = SinglePathSimulator(env, policy, n_trajectories, trajectory_len, **env_args)

In [10]:
try:
    trpo_args = config['trpo_args']
except:
    trpo_args = {}

trpo = TRPO(policy, value_fun, simulator, model_name=model_name,
            continue_from_file=False, **trpo_args)

In [11]:
n_episodes = 10
trpo.train(n_episodes)

tensor([[3179, 2492,  954,  ..., 3223, 3947, 3543],
        [1855, 2506, 2194,  ..., 2092, 1923, 3615],
        [3568,  828,  717,  ..., 1421, 1034, 3798],
        ...,
        [2430, 1404, 2314,  ..., 3577,  140, 2040],
        [3158, 1242, 3821,  ..., 1981,  321, 3871],
        [2238, 1725, 3162,  ..., 3909,  582,  713]], device='cuda:0')
[EPISODE]: 1	[AVG. REWARD]: 4.4139	 [ELAPSED TIME]: 0:00:10
tensor([[3408, 3744, 2771,  ..., 1940, 3793, 1204],
        [3090, 3846, 3879,  ..., 2549,  777, 1168],
        [3124, 2605,  212,  ..., 1742, 1942, 3752],
        ...,
        [ 677, 2148, 1326,  ...,  179, 2065, 2313],
        [3265, 2274, 3816,  ..., 2816,   69, 1599],
        [   5, 1720,  972,  ..., 1915,  923, 2790]], device='cuda:0')
[EPISODE]: 2	[AVG. REWARD]: 4.4997	 [ELAPSED TIME]: 0:00:21
tensor([[3577, 1210, 1214,  ..., 1249,  827,  607],
        [2382,  241,  717,  ..., 3942, 2312, 2458],
        [ 319, 2711, 1250,  ..., 3625, 3452, 1428],
        ...,
        [2871, 1494, 3826

In [8]:
samples = trpo.simulator.sample_trajectories()

In [12]:
np.sum(samples[0]['rewards'])

tensor(6., device='cuda:0')

In [20]:
mean_reward = np.mean(torch.stack([np.sum(trajectory['rewards']) for trajectory in samples]))

TypeError: mean() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, dtype=NoneType, ), but expected one of:
 * (torch.dtype dtype)
 * (tuple of names dim, bool keepdim, torch.dtype dtype)
      didn't match because some of the keywords were incorrect: out, axis
 * (tuple of ints dim, bool keepdim, torch.dtype dtype)
      didn't match because some of the keywords were incorrect: out, axis


In [23]:
np.mean([np.sum(trajectory['rewards']) for trajectory in samples])

AttributeError: 'torch.dtype' object has no attribute 'type'