In [1]:
from data_util import read_file
from environment import *
# from env import *
from ddpg import *
import torch
import os
import time
import torch.nn.functional as F


# Run on the first CUDA device when one is visible, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Training interactions plus the item / user embedding tables stored as .npy files.
data_name = 'ml-1m'
data = read_file('./data/' + data_name + '/train_data.csv')
item_embeddings = np.load('./data/' + data_name + '/item_embed.npy')
user_embeddings = np.load('./data/' + data_name + '/user_embed.npy')

# The leading axis of each embedding table gives the number of entities.
nb_item, nb_user = len(item_embeddings), len(user_embeddings)
print('num of users: %d, num of items: %d' % (nb_user, nb_item))

# Keyword arguments forwarded verbatim to the Environment constructor.
env_args = {
    'data': data,
    'nb_user': nb_user,
    'nb_item': nb_item,
    'item_embeddings': item_embeddings,
    'user_embeddings': user_embeddings,
    'device': device,
    'gamma': 0.95,
}

env = Environment(**env_args)
print('Successfully create Training Env!')

num of users: 6040, num of items: 3952
Successfully create Training Env!


In [2]:
history_length = 5 # N in article
ra_length = 1 # K in article
embedding_dim = item_embeddings.shape[1]
state_space_size = embedding_dim * history_length
action_space_size = embedding_dim * ra_length
print('size of state space: %d, size of action space: %d' %(state_space_size, action_space_size))

#Hyperparameters
lr_mu        = 1e-5
lr_q         = 1e-4
gamma        = 0.99
batch_size   = 1000
buffer_limit = 100000
tau          = 5e-3 # for target network soft update

# Rollout / schedule settings
num_episodes        = 100
len_trajectory      = 10
epsilon             = 0.9    # probability of acting with the actor `mu`; otherwise act randomly
min_buffer_size     = 50000  # transitions required in the buffer before any update runs
updates_per_episode = 10     # train() + soft-update rounds per episode
print_interval      = 1

memory = ReplayBuffer(buffer_limit)

# Every network goes through `.to(device)` so the cell also runs on CPU-only machines.
q, q_target = QNet(state_space_size, action_space_size).to(device), QNet(state_space_size, action_space_size).to(device)
q_target.load_state_dict(q.state_dict())
mu, mu_target = MuNet(state_space_size, action_space_size).to(device), MuNet(state_space_size, action_space_size).to(device)
mu_target.load_state_dict(mu.state_dict())

score = 0.0

mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)

item_embeds = torch.from_numpy(item_embeddings).to(device).float()


def select_unseen_items(item_weights, seen_onehot):
    """Return the highest-scoring item per slot, excluding already recommended items.

    Args:
        item_weights: tensor of shape (n_users, ra_length, n_items) with one score per item.
        seen_onehot: tensor of shape (n_users, n_items), non-zero where the item
            has already been recommended to that user in the current episode.

    Returns:
        Index tensor of shape (n_users, ra_length).
    """
    # Seen items are set to -inf instead of being multiplied by 0: scores can be
    # negative, in which case a zeroed (seen) item could still win the argmax.
    # unsqueeze(1) broadcasts the per-user mask over the ra_length axis.
    masked = item_weights.masked_fill(seen_onehot.unsqueeze(1) > 0, float('-inf'))
    return torch.argmax(masked, dim=2)


start = time.time()
for n_epi in range(1, num_episodes + 1):
    states = env.reset()
    done = [False] * nb_user
    # Per-user record of the items recommended so far in this episode.
    recommended_item_onehot = torch.zeros(nb_user, nb_item, device=device)
    recommendations = []
    for t in range(len_trajectory):
        if np.random.rand() <= epsilon:
            # Rollout only: nothing is back-propagated through this forward pass.
            with torch.no_grad():
                w = mu(torch.from_numpy(states).float().to(device))
            # Score every item by its dot product with the actor output.
            item_weights = torch.mm(w.view(-1, embedding_dim), item_embeds.transpose(0, 1)).view(nb_user, ra_length, -1)
        else:
            # Exploration: uniform random scores give a uniformly random unseen item.
            item_weights = torch.rand(nb_user, ra_length, nb_item, device=device)

        item_idxes = select_unseen_items(item_weights, recommended_item_onehot)

        recommendations.append(item_idxes)
        recommended_item_onehot.scatter_(1, item_idxes, 1)

        actions = item_embeds[item_idxes]
        states_prime, rewards, info = env.step(actions, item_idxes)

        # Only the last step of the fixed-length trajectory is flagged as done.
        if t == len_trajectory - 1:
            done = [True] * nb_user

        for s, a, r, s_prime, do in zip(states, actions, rewards, states_prime, done):
            memory.put((s, a, r, s_prime, do))

        # Accumulate as a plain float so `score` never holds a tensor.
        score += torch.sum(info).item()
        states = states_prime

    print(memory.size())
    if memory.size() > min_buffer_size:
        for i in range(updates_per_episode):
            train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer, batch_size, gamma)
            soft_update(mu, mu_target, tau)
            soft_update(q,  q_target, tau)

    if n_epi % print_interval == 0:
        end = time.time()
        print("# of episode:{}, avg score: {:.2f}, time: {:.2f}s".format(n_epi, score/print_interval/len_trajectory, end-start))
        print(torch.cat(recommendations, 1))
        score = 0.0
        start = time.time()

size of state space: 500, size of action space: 100
60400
# of episode:1, avg score: 2140.30, time: 5.20s
tensor([[1618, 3116, 2324,  ..., 2231, 3948, 3538],
        [1187, 3879, 2231,  ..., 2164,  696, 3948],
        [2963, 2164, 3948,  ...,   47,   91, 2249],
        ...,
        [2571, 2494, 3538,  ..., 2386,   23, 1388],
        [ 124, 2383, 3042,  ..., 3202, 3652,   96],
        [3690, 2383, 1524,  ..., 1859, 2231, 2317]], device='cuda:0')


KeyboardInterrupt: 

### Test: run the trained actor on the held-out test data

In [9]:
from env import *
with torch.no_grad():
    # Build a second environment over the test split, reusing the embeddings
    # and sizes from the training setup.
    test_data = read_file('./data/'+data_name+'/test_data.csv')
    test_env_args = {
        'data': test_data,
        'nb_user': nb_user,
        'nb_item': nb_item,
        'item_embeddings': item_embeddings,
        'user_embeddings': user_embeddings,
        'gamma': 0.95,
        'device': device,
    }

    test_env = Environment(**test_env_args)
    test_states = test_env.reset()

    # Actor output for the initial test states.
    w = mu(torch.from_numpy(test_states).float().to(device))

    # Score every item by its dot product with the actor output and keep the
    # k best items per user (top-k rather than a single argmax).
    k = 2
    item_values, item_idxes = (
        w.view(-1, item_embeddings.shape[1])
        .mm(item_embeds.t())
        .view(nb_user, ra_length, -1)
        .topk(k, dim=2)
    )

    # This environment's step() takes the flattened item indices only.
    states_prime, _, test_info = test_env.step(item_idxes.view(nb_user, -1))
    states = states_prime
    print(test_info.sum())
    print(item_idxes)

tensor(128., device='cuda:0')
tensor([[[2383, 3563]],

        [[1463, 3798]],

        [[2383, 3919]],

        ...,

        [[3798,   91]],

        [[2231, 2958]],

        [[2616, 2383]]], device='cuda:0')


In [5]:
# Distinct recommended item ids and how many times each one was recommended.
item_idxes.unique(return_counts=True)

(tensor([  16,   17,   18,   23,   24,   27,   47,   54,   62,   83,   91,   96,
          151,  155,  165,  174,  179,  201,  202,  207,  213,  230,  233,  236,
          248,  259,  265,  287,  308,  319,  335,  343,  356,  365,  406,  416,
          428,  495,  503,  508,  512,  514,  531,  532,  535,  554,  589,  610,
          669,  673,  679,  696,  717,  734,  735,  736,  837,  846,  884,  895,
          900,  919,  923,  967, 1026, 1034, 1058, 1084, 1113, 1151, 1172, 1182,
         1185, 1193, 1198, 1204, 1205, 1210, 1212, 1214, 1242, 1244, 1250, 1252,
         1260, 1284, 1295, 1297, 1309, 1310, 1328, 1346, 1348, 1353, 1360, 1370,
         1372, 1373, 1386, 1388, 1390, 1393, 1397, 1405, 1406, 1444, 1445, 1453,
         1463, 1482, 1524, 1534, 1537, 1561, 1574, 1599, 1604, 1610, 1623, 1638,
         1645, 1654, 1672, 1680, 1693, 1720, 1725, 1742, 1795, 1811, 1823, 1825,
         1833, 1844, 1859, 1886, 1896, 1904, 1920, 1922, 1965, 1969, 1977, 1998,
         2002, 2020, 2023, 2