In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F 
import os
import tqdm

Data

In [19]:
!unzip "ml-1m.zip" -d "data/"

Archive:  ml-1m.zip
replace data/ml-1m/movies.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace data/ml-1m/ratings.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace data/ml-1m/README? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [42]:
DATA_PATH = "data/ml-1m"

In [43]:
ratings = [i.strip().split("::") for i in open(os.path.join(DATA_PATH,'ratings.dat'), 'r').readlines()]
users = [i.strip().split("::") for i in open(os.path.join(DATA_PATH,'users.dat'), 'r').readlines()]
movies = [i.strip().split("::") for i in open(os.path.join(DATA_PATH,'movies.dat'), 'r', encoding='latin-1').readlines()]

In [44]:
#UserID::MovieID::Rating::Timestamp
ratings = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype=np.int32)
ratings = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype=np.int32)
movies = pd.DataFrame(movies, columns=['MovieID', 'Title', 'Genres'])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [45]:
movies['MovieID'] = movies['MovieID'].apply(pd.to_numeric)

In [46]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [47]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [48]:
watched_movies = ratings[['UserID', 'MovieID']]
watched_movies.head()

Unnamed: 0,UserID,MovieID
0,1,1193
1,1,661
2,1,914
3,1,3408
4,1,2355


In [49]:
train_data = watched_movies.sample(frac=0.8, random_state=16)
test_data = watched_movies.drop(train_data.index).values.tolist()
train_data = train_data.values.tolist()

In [50]:
user_num = max(watched_movies['UserID'])
movie_num = max(watched_movies['MovieID'])
user_num, movie_num

(6040, 3952)

Enviroment

In [28]:
class Enviroment():
  def __init__(self, num_users=6040):

    self.num_users = num_users
    self.movie_num = movie_num
    self.user_id = None
    self.N = 5
    self.memory = None
    self.eps = 0.1

  def reset(self, user_id):
    self.user_id = torch.tensor([user_id])
    
    self.watched = np.array(ratings[ratings['UserID'] == user_id]['MovieID'])
    self.memory = np.zeros([1, self.N], dtype=int)
    return self.user_id, torch.tensor(self.memory)
    
  def step(self, state, pos_memory, action, movies, movies_emb):
    
    scores = torch.matmul(action, movies_emb)
    
    #epsilon-greedy
    p = np.random.random()
    if self.eps > p:
      best_choice = np.random.choice(movies).detach().cpu().numpy()
    else:
      best_choice = movies[torch.argmax(scores).item()].detach().cpu().numpy()

    reward = self.get_reward(best_choice)
    
    if reward > 0:
      self.memory[:, :-1] = self.memory[:, 1:]
      self.memory[:, -1] =  best_choice.item() 
      
    return state, action, reward, self.memory
  
  def get_reward(self, next_step):
    r = (ratings.UserID == int(self.user_id)) & (ratings.MovieID == int(next_step))
    rate = ratings[r]['Rating']
    if len(rate):
      reward = 1/2 * (int(rate) - 3)
    else:
      reward = 0
    return reward

Actor, Critic, State

In [29]:
class StateRepresentation(nn.Module):
    def __init__(self, user_num, item_num, N=5, embedding_dim=100):
        super().__init__()
        self.user_emb = nn.Embedding(user_num, embedding_dim)
        self.item_emb = nn.Embedding(item_num+1, embedding_dim, padding_idx=0) #чтобы с чего-то начинать, вводим 0
        self.ave_with_weights = torch.nn.Conv1d(in_channels=N, out_channels=1, kernel_size=1) #потому что ave с весами это свертка

    def forward(self, user, memory):
        user_embed = self.user_emb(user)
        item_embed = self.item_emb(memory)
        ave = self.ave_with_weights(item_embed).squeeze(dim=0)
        s = torch.cat((user_embed, user_embed * ave, ave), 1)
        return s

In [30]:
class Actor(nn.Module):
  def __init__(self, hidden_dim=200, embedding_dim=100):
    super().__init__()
    
    self.fc1 = nn.Linear(3*embedding_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, hidden_dim)
    self.fc3 = nn.Linear(hidden_dim, embedding_dim)

  def forward(self, states):
    y = F.relu(self.fc1(states))
    y = F.relu(self.fc2(y))
    y = self.fc3(y)
    y = F.tanh(y)
    return y

In [31]:
class Critic(nn.Module):
  def __init__(self, hidden_dim=200, embedding_dim=100):
    super().__init__()
    self.fc1 = nn.Linear(3*embedding_dim, embedding_dim)
    self.fc2 = nn.Linear(2*embedding_dim, hidden_dim)
    self.fc3 = nn.Linear(hidden_dim, hidden_dim)
    self.fc_out = nn.Linear(hidden_dim, 1)

  def forward(self, y, a):
    y = F.relu(self.fc1(y))
    x = torch.cat([a, y], 1)
    z = F.relu(self.fc2(x))
    f = F.relu(self.fc3(z))
    q_sa = self.fc_out(f)
    
    return q_sa

Train

In [34]:
class Play_Info(object):
  def __init__(self, state, action, reward, next_state):
    self.state = state
    self.action = action
    self.reward = reward
    self.next_state = next_state

class ReplayBuffer(object):

    def __init__(self, capacity=5):
        self.alpha = 0.6
        self.capacity = capacity
        self.memory = []
        self.position = 0
        self.priorities = np.zeros((capacity,))

    def push(self, play_info):
        state, action, reward, next_state = play_info
        max_priority = self.priorities.max() if self.memory else 1.0

        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Play_Info(state, action, reward, next_state)
        self.priorities[self.position] = max_priority
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):

        priorities = self.priorities if len(self.memory) == self.capacity else self.priorities[:self.position]
        probs = priorities ** self.alpha
        probs = probs/probs.sum()

        indx = np.random.choice(np.arange(len(self.memory)), batch_size, p=probs)
        samples = [self.memory[i] for i in indx]
        samples_probs = probs[indx]
        weights = (len(self.memory) * samples_probs) ** (-beta)
        weights /= weights.max()
        
        states, actions, rewards, next_states = [], [], [], []        
        for sample in samples:
          states.append(sample.state)
          actions.append(sample.action)
          rewards.append(sample.reward)
          next_states.append(sample.next_state)
        
        states = torch.cat(states, dim=0)
        actions = torch.cat(actions, dim=0)
        rewards = torch.tensor(rewards)
        next_states = torch.cat(next_states, dim=0)
        return states, actions, rewards, next_states

In [35]:
class Agent():
  def __init__(self, user_num, item_num):

    self.actor = Actor() 
    self.target_actor = Actor()

    self.critic = Critic() 
    self.target_critic = Critic()

    for target_param, param in zip(self.critic.parameters(), self.target_critic.parameters()):
      target_param.data.copy_(target_param.data)

    for target_param, param in zip(self.actor.parameters(), self.target_actor.parameters()):
      target_param.data.copy_(target_param.data)

    self.state_repr = StateRepresentation(user_num, item_num)
    self.env = Enviroment(user_num) 
    self.item_num = item_num
    self.eps = 0.1

  def recommend(self, user, memory):
    state = self.state_repr(user, torch.tensor(memory)) #f(i_s) embedding #ВСТАВИТЬ НОРМАЛЬНУЮ ИНИЦИАЛИЗАЦИЮ

    action = self.actor(state) 
   
    movies = torch.tensor(self.env.watched) #это изменится, сейчас цель просто завести
    movies_emb = self.state_repr.item_emb(movies).T
    state, action, reward, memory = self.env.step(state, torch.tensor(memory), action, movies, movies_emb)
    next_state = self.state_repr(user, torch.tensor(memory))
    play_info = (state, action, reward, next_state)
    return play_info, memory

In [37]:
class Trainer():

  def __init__(self, user_num, item_num):
    self.memory = [] #должна быть другая инициализация 
    self.agent = Agent(user_num, item_num)
    self.gamma = 0.9 # discount rate
    self.criterion_critic = nn.MSELoss()
    self.optimizer_critic = torch.optim.Adam(self.agent.critic.parameters(), lr=1e-4)
    self.optimizer_state = torch.optim.Adam(self.agent.state_repr.parameters(), lr=1e-4)
    self.optimizer_actor = torch.optim.Adam(self.agent.actor.parameters(), lr=1e-4)
    self.tau = 1e-3
    self.N = 5
    self.T = 10

  def train_step(self, user, memory=[]):
    
    self.memory = memory
    self.replay_buffer = ReplayBuffer(self.N) 

    for i in range(1, self.T+1):
      play_info, memory = self.agent.recommend(user, self.memory) #от чего рекомменд
      self.memory = memory
      self.replay_buffer.push(play_info)
      
      states, actions, rewards, next_states = self.replay_buffer.sample(self.N)
      rewards = rewards.unsqueeze(dim=1)
      actions_target = self.agent.target_actor(states)
 
      Q_target = self.agent.target_critic(states, actions_target)
      Q = self.agent.critic(states, actions)
      y = rewards + self.gamma * Q_target
      
      #update critic net

      self.optimizer_critic.zero_grad()
      loss_critic = self.criterion_critic(Q, y.detach()) #если расписать формулу это действительно то, что нужно
      loss_critic.backward(retain_graph=True)
      self.optimizer_critic.step()

      #update actor net
      
      self.optimizer_actor.zero_grad()
      self.optimizer_state.zero_grad()
      
      loss_actor = -self.agent.critic(states.detach(), actions_target).mean()  #если расписать формулу это действительно то, что нужно
      loss_actor.backward(retain_graph=True)
      self.optimizer_actor.step()
      self.optimizer_state.step()

      #update target net

      for target_param, param in zip(self.agent.critic.parameters(), self.agent.target_critic.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - self.tau) + param.data * self.tau)

      for target_param, param in zip(self.agent.actor.parameters(), self.agent.target_actor.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - self.tau) + param.data * self.tau)

Offline Evaluation

Experiments

In [38]:
user_num, movie_num

(6040, 3952)

In [39]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f973e418910>

In [51]:
trainer = Trainer(user_num, movie_num)
users = np.arange(user_num) + 1

for user in tqdm.tqdm(users):
  user_tensor, memory_tensor = trainer.agent.env.reset(user)
  trainer.train_step(user_tensor, memory_tensor)
  print("Ok")

  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelapp.py", line 499, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.7/dist-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
    self._run_once()
  File "/usr/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
    handle._run()
  File "/usr/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/usr

RuntimeError: ignored