In [None]:
import tensorflow as tf
import numpy as np
from collections import deque
import random
from datetime import datetime
import matplotlib.pyplot as plt
import os
# import wandb
from tensorflow.python.ops.gen_math_ops import Exp
import itertools
import matplotlib.pyplot as plt
import pandas as pd

# Actor network

In [None]:
class ActorNetwork(tf.keras.Model):
    """Keras model that maps a state vector to an action vector.

    The body is a three-layer MLP: two ReLU layers of width ``hidden_dim``
    followed by a tanh layer of width ``embedding_dim``.
    """

    def __init__(self, embedding_dim, hidden_dim):
        """Create the input layer and the MLP.

        Args:
            embedding_dim: Width of the output layer; the declared input
                width is ``10 * embedding_dim``.
            hidden_dim: Width of the two hidden layers.
        """
        super().__init__()
        self.inputs = tf.keras.layers.InputLayer(
            name='input_layer',
            input_shape=(10 * embedding_dim,),
        )
        # Layers are created in the same order as before so that weight order
        # and auto-generated layer names stay unchanged.
        mlp_layers = [
            tf.keras.layers.Dense(hidden_dim, activation='relu'),
            tf.keras.layers.Dense(hidden_dim, activation='relu'),
            tf.keras.layers.Dense(embedding_dim, activation='tanh'),
        ]
        self.fc = tf.keras.Sequential(mlp_layers)

    def call(self, x):
        """Run ``x`` through the input layer and then the MLP."""
        return self.fc(self.inputs(x))

class Actor(object):
    """Bundles the online actor network, its target copy and the optimizer."""

    def __init__(self, embedding_dim, hidden_dim, learning_rate, state_size, tau):
        """Build both networks and the Adam optimizer.

        Args:
            embedding_dim: Passed to ``ActorNetwork``; also used to size the
                dummy input in ``build_networks``.
            hidden_dim: Hidden width passed to ``ActorNetwork``.
            learning_rate: Learning rate of the Adam optimizer.
            state_size: Stored on the instance; not used by this class.
            tau: Interpolation factor of the soft target update.
        """
        self.embedding_dim = embedding_dim
        self.state_size = state_size

        # Online network first, then the target network.
        self.network = ActorNetwork(embedding_dim, hidden_dim)
        self.target_network = ActorNetwork(embedding_dim, hidden_dim)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        # Soft target-update hyperparameter.
        self.tau = tau

    def build_networks(self):
        """Create the weights of both networks by calling them on a zero input."""
        dummy_state = np.zeros((1, 10 * self.embedding_dim))
        self.network(dummy_state)
        self.target_network(dummy_state)

    def update_target_network(self):
        """Soft update: target <- tau * online + (1 - tau) * target."""
        online_weights = self.network.get_weights()
        target_weights = self.target_network.get_weights()
        for idx, online_w in enumerate(online_weights):
            target_weights[idx] = self.tau * online_w + (1 - self.tau) * target_weights[idx]
        self.target_network.set_weights(target_weights)

    def train(self, states, dq_das):
        """Apply one optimizer step using ``-dq_das`` as the output gradient.

        Args:
            states: Batch of states fed to the online network.
            dq_das: Gradient passed (negated) as ``output_gradients`` to the
                tape, i.e. the upstream gradient for the network output.
        """
        with tf.GradientTape() as tape:
            predicted_actions = self.network(states)
        params = self.network.trainable_weights
        param_grads = tape.gradient(predicted_actions, params, -dq_das)
        self.optimizer.apply_gradients(zip(param_grads, params))

    def save_weights(self, path):
        """Save the *target* network weights to ``path``."""
        self.target_network.save_weights(path)

    def load_weights(self, path):
        """Load weights from ``path`` into the *online* network."""
        self.network.load_weights(path)

# Critic network

In [None]:
class CriticNetwork(tf.keras.Model):
    """Keras model producing a single linear output from an (action, state) pair."""

    def __init__(self, embedding_dim,hidden_dim):
        """Create the layers.

        Args:
            embedding_dim: Width of the first dense layer applied to the state.
            hidden_dim: Width of the two hidden layers after concatenation.
        """
        super().__init__()
        # Declared but not used inside ``call``.
        self.inputs = tf.keras.layers.InputLayer(
            input_shape=(embedding_dim, 10 * embedding_dim))
        self.fc1 = tf.keras.layers.Dense(embedding_dim, activation='relu')
        self.concat = tf.keras.layers.Concatenate()
        self.fc2 = tf.keras.layers.Dense(hidden_dim, activation='relu')
        self.fc3 = tf.keras.layers.Dense(hidden_dim, activation='relu')
        self.out = tf.keras.layers.Dense(1, activation='linear')

    def call(self, x):
        """Compute the critic output.

        Args:
            x: Indexable pair; ``x[0]`` is concatenated as-is (the action) and
                ``x[1]`` is first passed through ``fc1`` (the state).
        """
        action, state = x[0], x[1]
        state_features = self.fc1(state)
        merged = self.concat([action, state_features])
        hidden = self.fc3(self.fc2(merged))
        return self.out(hidden)

class Critic(object):
    """Bundles the online critic network, its target copy, optimizer and loss."""

    def __init__(self, hidden_dim, learning_rate, embedding_dim, tau):
        """Build both networks, the Adam optimizer and an unreduced MSE loss.

        Args:
            hidden_dim: Hidden width passed to ``CriticNetwork``.
            learning_rate: Learning rate of the Adam optimizer.
            embedding_dim: Passed to ``CriticNetwork``; also sizes the dummy
                inputs in ``build_networks``.
            tau: Interpolation factor of the soft target update.
        """
        self.embedding_dim = embedding_dim

        # Online network first, then the target network.
        self.network = CriticNetwork(embedding_dim, hidden_dim)
        self.target_network = CriticNetwork(embedding_dim, hidden_dim)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        # MSE without reduction, so ``train`` can weight the losses itself.
        self.loss = tf.keras.losses.MeanSquaredError(
            reduction=tf.keras.losses.Reduction.NONE)

        # Soft target-update hyperparameter.
        self.tau = tau

    def build_networks(self):
        """Create the weights of both networks with zero inputs, then compile."""
        dummy_action = np.zeros((1, self.embedding_dim))
        dummy_state = np.zeros((1, 10 * self.embedding_dim))
        self.network([dummy_action, dummy_state])
        self.target_network([dummy_action, dummy_state])
        self.network.compile(self.optimizer, self.loss)

    def update_target_network(self):
        """Soft update: target <- tau * online + (1 - tau) * target."""
        online_weights = self.network.get_weights()
        target_weights = self.target_network.get_weights()
        for idx, online_w in enumerate(online_weights):
            target_weights[idx] = self.tau * online_w + (1 - self.tau) * target_weights[idx]
        self.target_network.set_weights(target_weights)

    def dq_da(self, inputs):
        """Return the gradient of the critic output with respect to the actions.

        Args:
            inputs: Indexable pair ``(actions, states)``.
        """
        actions, states = inputs[0], inputs[1]
        with tf.GradientTape() as tape:
            actions = tf.convert_to_tensor(actions)
            tape.watch(actions)
            q_values = self.network([actions, states])
        return tape.gradient(q_values, actions)

    def train(self, inputs, td_targets, weight_batch):
        """Run one weighted-MSE optimizer step and return the weighted loss.

        Args:
            inputs: Input passed unchanged to the online network.
            td_targets: Regression targets for the network output.
            weight_batch: Per-sample weights multiplied into the unreduced loss.
        """
        weight_batch = tf.convert_to_tensor(weight_batch, dtype=tf.float32)
        with tf.GradientTape() as tape:
            predictions = self.network(inputs)
            per_sample_loss = self.loss(td_targets, predictions)
            weighted_loss = tf.reduce_mean(per_sample_loss * weight_batch)
        params = self.network.trainable_weights
        param_grads = tape.gradient(weighted_loss, params)
        self.optimizer.apply_gradients(zip(param_grads, params))
        return weighted_loss


    def train_on_batch(self, inputs, td_targets, weight_batch):
        """Train through Keras ``train_on_batch`` using ``weight_batch`` as sample weights."""
        return self.network.train_on_batch(inputs, td_targets, sample_weight=weight_batch)

    def save_weights(self, path):
        """Save the *target* network weights to ``path``."""
        self.target_network.save_weights(path)

    def load_weights(self, path):
        """Load weights from ``path`` into the *online* network."""
        self.network.load_weights(path)

# State


## embedding item and user

In [None]:
class MovieGenreEmbedding(tf.keras.Model):
    """Keras model that scores a (movie, genre) pair from two embedding tables.

    The movie and genre embeddings are combined with a ``Dot`` layer created
    with ``normalize=True`` and the result goes through a one-unit sigmoid layer.
    """
    def __init__(self, len_movies, len_genres, embedding_dim):
        """Create the layers.

        Args:
            len_movies: ``input_dim`` of the movie embedding table.
            len_genres: ``input_dim`` of the genre embedding table.
            embedding_dim: ``output_dim`` of both embedding tables.
        """
        super(MovieGenreEmbedding, self).__init__()
        self.m_g_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))
        # Embedding tables; the layer names are what get_layer() lookups use.
        self.m_embedding = tf.keras.layers.Embedding(name='movie_embedding', input_dim=len_movies, output_dim=embedding_dim)
        self.g_embedding = tf.keras.layers.Embedding(name='genre_embedding', input_dim=len_genres, output_dim=embedding_dim)
        # Dot product of the two embeddings (normalize=True).
        self.m_g_merge = tf.keras.layers.Dot(name='movie_genre_dot', normalize=True, axes=1)
        # Output layer: single sigmoid unit.
        self.m_g_fc = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, x):
        """Score the pair: ``x[0]`` is looked up as movie ids, ``x[1]`` as genre ids."""
        x = self.m_g_input(x)
        memb = self.m_embedding(x[0])
        gemb = self.g_embedding(x[1])
        m_g = self.m_g_merge([memb, gemb])
        return self.m_g_fc(m_g)

class UserMovieEmbedding(tf.keras.Model):
    """Keras model that scores a (user, movie) pair from two embedding tables.

    The user and movie embeddings are combined with a ``Dot`` layer created
    with ``normalize=False`` and the result goes through a one-unit sigmoid layer.
    """
    def __init__(self, len_users, len_movies, embedding_dim):
        """Create the layers.

        Args:
            len_users: ``input_dim`` of the user embedding table.
            len_movies: ``input_dim`` of the movie embedding table.
            embedding_dim: ``output_dim`` of both embedding tables.
        """
        super(UserMovieEmbedding, self).__init__()
        self.m_u_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))
        # Embedding tables; the layer names are what get_layer() lookups use.
        self.u_embedding = tf.keras.layers.Embedding(name='user_embedding', input_dim=len_users, output_dim=embedding_dim)
        self.m_embedding = tf.keras.layers.Embedding(name='movie_embedding', input_dim=len_movies, output_dim=embedding_dim)
        # Dot product of the two embeddings (normalize=False).
        self.m_u_merge = tf.keras.layers.Dot(name='movie_user_dot', normalize=False, axes=1)
        # Output layer: single sigmoid unit.
        self.m_u_fc = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, x):
        """Score the pair: ``x[0]`` is looked up as user ids, ``x[1]`` as movie ids."""
        x = self.m_u_input(x)
        uemb = self.u_embedding(x[0])
        memb = self.m_embedding(x[1])
        m_u = self.m_u_merge([memb, uemb])
        return self.m_u_fc(m_u)

## state representation

In [None]:
class DRRAveStateRepresentation(tf.keras.Model):
    """State module that pools item embeddings and combines them with the user.

    The output concatenates the user embedding, the element-wise product of
    the user embedding with the pooled items, and the pooled items themselves.
    """

    def __init__(self, embedding_dim):
        """Create the pooling convolution and the merge layers.

        Args:
            embedding_dim: Divisor applied to the item embeddings in ``call``.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        # Conv1D with positional arguments (1, 1, 1).
        self.wav = tf.keras.layers.Conv1D(1, 1, 1)
        self.concat = tf.keras.layers.Concatenate()
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        """Build the state from ``x[0]`` (user embedding) and ``x[1]`` (item embeddings)."""
        user_eb, items_eb = x[0], x[1]
        # Swap the last two axes and scale before pooling with the convolution.
        scaled_items = tf.transpose(items_eb, perm=(0, 2, 1)) / self.embedding_dim
        pooled = self.wav(scaled_items)
        pooled = tf.squeeze(tf.transpose(pooled, perm=(0, 2, 1)), axis=1)
        user_times_pooled = tf.keras.layers.multiply([user_eb, pooled])
        merged = self.concat([user_eb, user_times_pooled, pooled])
        return self.flatten(merged)

In [None]:
# new = tf.expand_dims(new_row, axis=0)
# tf.expand_dims(new, axis=1), new

In [None]:

class DRRUStateRepresentation(tf.keras.Model):
    """State module built from pairwise products of the user and item embeddings."""

    def __init__(self, embedding_dim):
        """``embedding_dim`` is accepted for interface parity but is not used."""
        super().__init__()

    def multi_layer(self, user_item):
        """Concatenate the element-wise products of every row pair ``(i, j)``, ``i < j``.

        Pairs are visited in row-major order and the result is a single 1-D
        float32 tensor (empty when there are fewer than two rows).
        """
        n_rows = user_item.shape[0]
        pieces = [tf.constant([], dtype=tf.float32)]
        for first in range(n_rows - 1):
            for second in range(first + 1, n_rows):
                pieces.append(tf.math.multiply(user_item[first], user_item[second]))
        return tf.concat(pieces, axis=0)

    def call(self, x):
        """Stack ``x[0]`` on top of ``x[1][0]`` and return the flattened pairwise products."""
        stacked = tf.concat([x[0], x[1][0]], axis=0)
        products = self.multi_layer(stacked)
        batched = tf.expand_dims(products, axis=0)
        return tf.keras.layers.Flatten()(batched)



In [None]:
class DRRPStateRepresentation(tf.keras.Model):
    """State module: the flattened items followed by their pairwise products."""

    def __init__(self, embedding_dim):
        """``embedding_dim`` is accepted for interface parity but is not used."""
        super().__init__()

    def multi_layer(self, item):
        """Return the flattened ``item`` followed by the products of every row pair.

        Pairs ``(i, j)`` with ``i < j`` are visited in row-major order and each
        element-wise product is appended to the 1-D result.
        """
        n_rows = item.shape[0]
        pieces = [tf.reshape(item, (-1))]
        for first in range(n_rows - 1):
            for second in range(first + 1, n_rows):
                pieces.append(tf.math.multiply(item[first], item[second]))
        return tf.concat(pieces, axis=0)

    def call(self, x):
        """Build the state from ``x[1][0]`` only; ``x[0]`` is ignored."""
        features = self.multi_layer(x[1][0])
        batched = tf.expand_dims(features, axis=0)
        return tf.keras.layers.Flatten()(batched)

In [None]:
class simpleRepresentation(tf.keras.Model):
    """State module that flattens its whole input into one row vector."""

    def __init__(self, embedding_dim):
        """``embedding_dim`` is accepted for interface parity but is not used."""
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        """Reshape ``x`` to 1-D, add a leading axis of size 1 and flatten."""
        flat = tf.reshape(x, (-1))
        batched = tf.expand_dims(flat, axis=0)
        return self.flatten(batched)

# Environment

In [None]:
import numpy as np

class OfflineEnv(object):
    """Offline recommendation environment that replays logged user ratings.

    An episode follows one user. The state is the list of the last
    ``state_size`` item ids, and a recommendation is rewarded from the
    rating the user gave to that item.
    """

    def __init__(self, users_dict, users_history_lens, movies_id_to_movies, state_size, fix_user_id=None):
        """Store the data and start a first episode.

        Args:
            users_dict: Mapping of user id to a sequence of entries whose
                element 0 is the item id and element 1 the rating.
            users_history_lens: Sequence indexed with ``user - 1`` in ``step``
                and zipped with ``users_dict`` keys to pick eligible users.
            movies_id_to_movies: Stored as ``items_id_to_name``; not used here.
            state_size: Number of item ids kept in the state.
            fix_user_id: When truthy, every episode uses this user.
        """
        self.users_dict = users_dict
        self.users_history_lens = users_history_lens
        self.items_id_to_name = movies_id_to_movies

        self.state_size = state_size
        self.available_users = self._generate_available_users()

        self.fix_user_id = fix_user_id

        self._begin_episode()
        self.done_count = 3000

    def _begin_episode(self):
        """Pick the episode's user and rebuild all per-episode state."""
        if self.fix_user_id:
            self.user = self.fix_user_id
        else:
            self.user = np.random.choice(self.available_users)
        history = self.users_dict[self.user]
        self.user_items = {entry[0]: entry[1] for entry in history}
        self.items = [entry[0] for entry in history[:self.state_size]]
        self.done = False
        self.recommended_items = set(self.items)

    def _generate_available_users(self):
        """Return the users whose paired history length exceeds ``state_size``."""
        paired = zip(self.users_dict.keys(), self.users_history_lens)
        return [user for user, length in paired if length > self.state_size]

    def reset(self):
        """Start a new episode and return ``(user, items, done)``."""
        self._begin_episode()
        return self.user, self.items, self.done

    def _step_top_k(self, actions):
        """Score a list of recommended items and return the list of rewards."""
        hits = []
        rewards = []
        for act in actions:
            if act in self.user_items and act not in self.recommended_items:
                hits.append(act)
                rewards.append((self.user_items[act] - 3)/2)
            else:
                rewards.append(-0.5)
            self.recommended_items.add(act)
        # Slide the state window only when at least one reward is positive.
        if max(rewards) > 0:
            self.items = self.items[len(hits):] + hits
        return rewards

    def _step_single(self, action):
        """Score one recommended item and return its scalar reward."""
        reward = -0.5
        if action in self.user_items and action not in self.recommended_items:
            reward = self.user_items[action] -3  # reward
        if reward > 0:
            self.items = self.items[1:] + [action]
        self.recommended_items.add(action)
        return reward

    def step(self, action, top_k=False):
        """Apply a recommendation.

        Args:
            action: A single item id, or an iterable of item ids when
                ``top_k`` is truthy.
            top_k: Selects list mode when truthy.

        Returns:
            ``(items, reward, done, recommended_items)`` where ``reward`` is a
            list in list mode and a scalar otherwise.
        """
        if top_k:
            reward = self._step_top_k(action)
        else:
            reward = self._step_single(action)

        # Episode ends after 100 recommended items or when the user's history length is reached.
        n_recommended = len(self.recommended_items)
        if (n_recommended/10 >= 10) or n_recommended >= self.users_history_lens[self.user-1]:
            self.done = True

        return self.items, reward, self.done, self.recommended_items

    # def get_items_names(self, items_ids):
    #     items_names = []
    #     for id in items_ids:
    #         try:
    #             items_names.append(self.items_id_to_name[str(id)])
    #         except:
    #             items_names.append(list(['Not in list']))
    #     return items_names

# Replay buffer

## Tree
### SumTree

In [None]:
class SumTree:
    """Array-backed binary tree in which every internal node holds the sum of its children.

    Leaves occupy indices ``buffer_size - 1`` to ``2 * buffer_size - 2``;
    index 0 is the root and therefore the total of all leaf priorities.
    """

    def __init__(self, buffer_size):
        """Allocate a zero-filled tree with ``buffer_size`` leaves."""
        self.buffer_size = buffer_size
        self.tree = np.zeros((2 * buffer_size - 1))
        # Next leaf to be written by add_data.
        self.index = buffer_size - 1

    def update_tree(self, index):
        """Recompute the sums on the path from the parent of ``index`` to the root."""
        node = index
        while True:
            node = (node - 1) // 2
            first_child = 2 * node + 1
            self.tree[node] = self.tree[first_child] + self.tree[first_child + 1]
            if node == 0:
                break

    def add_data(self, priority):
        """Write ``priority`` into the next leaf, wrapping to the first leaf when full."""
        if self.index == 2 * self.buffer_size - 1:
            self.index = self.buffer_size - 1

        leaf = self.index
        self.tree[leaf] = priority
        self.update_tree(leaf)
        self.index = leaf + 1

    def search(self, num):
        """Descend from the root to the leaf selected by the cumulative value ``num``.

        Returns:
            ``(priority, tree_index, buffer_index)`` of the selected leaf.
        """
        first_leaf = self.buffer_size - 1
        node = 0
        while True:
            left = 2 * node + 1
            left_mass = self.tree[left]
            if num <= left_mass:
                node = left
            else:
                # Skip the left subtree and continue in the right one.
                num -= left_mass
                node = left + 1

            if node >= first_leaf:
                break

        return self.tree[node], node, node - first_leaf

    def update_prioirty(self, priority, index):
        """Overwrite the value at tree index ``index`` and refresh its ancestors."""
        self.tree[index] = priority
        self.update_tree(index)

    def sum_all_prioirty(self):
        """Return the root value (sum of all leaves) as a Python float."""
        return float(self.tree[0])



### MinTree

In [None]:
class MinTree:
    """Array-backed binary tree in which every internal node holds the minimum of its children.

    Leaves occupy indices ``buffer_size - 1`` to ``2 * buffer_size - 2``;
    index 0 is the root and therefore the minimum over all leaves.
    """

    def __init__(self, buffer_size):
        """Allocate the tree with ``buffer_size`` leaves.

        Every node starts at ``+inf``, the neutral element of ``min``, so
        leaves that have not been written yet never affect the reported
        minimum. (Starting them at 1.0 capped the minimum at 1.0 until the
        buffer was full, even if every stored priority was larger.)
        """
        self.buffer_size = buffer_size
        self.tree = np.full(buffer_size * 2 - 1, np.inf)
        # Next leaf to be written by add_data.
        self.index = buffer_size - 1

    def update_tree(self, index):
        """Recompute the minima on the path from the parent of ``index`` to the root."""
        while True:
            index = (index - 1) // 2
            left = (index * 2) + 1
            right = (index * 2) + 2
            self.tree[index] = min(self.tree[left], self.tree[right])
            if index == 0:
                break

    def add_data(self, priority):
        """Write ``priority`` into the next leaf, wrapping to the first leaf when full."""
        if self.index == self.buffer_size * 2 - 1:
            self.index = self.buffer_size - 1

        self.tree[self.index] = priority
        self.update_tree(self.index)
        self.index += 1

    def update_prioirty(self, priority, index):
        """Overwrite the value at tree index ``index`` and refresh its ancestors."""
        self.tree[index] = priority
        self.update_tree(index)

    def min_prioirty(self):
        """Return the root value as a Python float (``inf`` if nothing has been stored)."""
        return float(self.tree[0])

## Replay buffer

In [None]:
class PriorityExperienceReplay(object):
    """Fixed-size replay buffer with prioritized sampling (PER).

    Transitions live in pre-allocated NumPy arrays. Their priorities are kept
    in a ``SumTree`` (used for sampling) and a ``MinTree`` (used to normalize
    the importance-sampling weights).
    """

    def __init__(self, buffer_size, embedding_dim):
        """Allocate storage and both priority trees.

        Args:
            buffer_size: Maximum number of stored transitions.
            embedding_dim: Actions have ``embedding_dim`` entries and states
                ``10 * embedding_dim`` entries.
        """
        self.buffer_size = buffer_size
        self.crt_idx = 0
        self.is_full = False

        # Per-transition shapes:
        #   state / next_state : (10 * embedding_dim,)
        #   action             : (embedding_dim,)
        #   reward / done      : scalar
        self.states = np.zeros((buffer_size, 10*embedding_dim), dtype=np.float32)
        self.actions = np.zeros((buffer_size, embedding_dim), dtype=np.float32)
        self.rewards = np.zeros((buffer_size), dtype=np.float32)
        self.next_states = np.zeros((buffer_size, 10*embedding_dim), dtype=np.float32)
        # np.bool_ instead of the np.bool alias, which NumPy 1.24 removed.
        self.dones = np.zeros(buffer_size, np.bool_)

        self.sum_tree = SumTree(buffer_size)
        self.min_tree = MinTree(buffer_size)

        # Largest raw (not yet alpha-powered) priority seen so far.
        self.max_prioirty = 1.0
        self.alpha = 0.6
        self.beta = 0.55
        self.beta_constant = 0.00001

    def append(self, state, action, reward, next_state, done):
        """Store one transition with the current maximum priority."""
        self.states[self.crt_idx] = state
        self.actions[self.crt_idx] = action
        self.rewards[self.crt_idx] = reward
        self.next_states[self.crt_idx] = next_state
        self.dones[self.crt_idx] = done

        self.sum_tree.add_data(self.max_prioirty ** self.alpha)
        self.min_tree.add_data(self.max_prioirty ** self.alpha)

        self.crt_idx = (self.crt_idx + 1) % self.buffer_size
        if self.crt_idx == 0:
            self.is_full = True

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions by stratified priority sampling.

        The total priority mass is split into ``batch_size`` equal segments
        and one value is drawn uniformly from each. ``beta`` is increased by
        ``beta_constant`` (capped at 1.0) on every call.

        Returns:
            ``(states, actions, rewards, next_states, dones, weights,
            tree_indices)``. ``weights`` are importance-sampling weights
            divided by the largest possible weight, and ``tree_indices`` are
            the sum-tree indices to pass to ``update_priority``.
        """
        rd_idx = []
        weight_batch = []
        index_batch = []
        sum_priority = self.sum_tree.sum_all_prioirty()

        N = self.buffer_size if self.is_full else self.crt_idx
        min_priority = self.min_tree.min_prioirty() / sum_priority
        max_weight = (N * min_priority) ** (-self.beta)

        segment_size = sum_priority/batch_size
        for j in range(batch_size):
            min_seg = segment_size * j
            max_seg = segment_size * (j + 1)

            random_num = random.uniform(min_seg, max_seg)
            priority, tree_index, buffer_index = self.sum_tree.search(random_num)
            rd_idx.append(buffer_index)

            p_j = priority / sum_priority
            w_j = (p_j * N) ** (-self.beta) / max_weight
            weight_batch.append(w_j)
            index_batch.append(tree_index)
        self.beta = min(1.0, self.beta + self.beta_constant)

        batch_states = self.states[rd_idx]
        batch_actions = self.actions[rd_idx]
        batch_rewards = self.rewards[rd_idx]
        batch_next_states = self.next_states[rd_idx]
        batch_dones = self.dones[rd_idx]

        return batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones, np.array(weight_batch), index_batch

    def update_priority(self, priority, index):
        """Set the priority of the transition at tree index ``index``.

        Args:
            priority: Raw priority; the trees store ``priority ** alpha``.
            index: Tree index as returned by ``sample``.
        """
        powered = priority ** self.alpha
        self.sum_tree.update_prioirty(powered, index)
        self.min_tree.update_prioirty(powered, index)
        # Track the RAW priority: append() applies alpha itself, so storing the
        # powered value here would apply the exponent twice to new transitions.
        self.update_max_priority(priority)

    def update_max_priority(self, priority):
        """Raise the running maximum priority to ``priority`` if it is larger."""
        self.max_prioirty = max(self.max_prioirty, priority)

# Recommender

In [None]:
class DRRAgent:
    """Actor-critic recommender agent trained with prioritized experience replay.

    The agent embeds the item ids of the current state with a pretrained
    user/movie embedding network, turns them into a state vector, lets the
    actor produce an action vector and recommends the items whose embeddings
    have the largest dot product with that action.
    """

    def __init__(self, env, users_num, items_num, state_size, is_test=False, use_wandb=False):
        """Build the networks, load the pretrained embeddings and create the buffer.

        Args:
            env: Environment exposing ``reset``, ``step``, ``recommended_items``
                and ``user_items``.
            users_num: Size of the user embedding table.
            items_num: Size of the movie embedding table; also the candidate
                id range used by ``recommend_item``.
            state_size: Number of items in a state.
            is_test: When True, exploration noise is disabled in ``train``.
            use_wandb: Accepted for compatibility; not used.
        """
        self.env = env

        self.users_num = users_num
        self.items_num = items_num

        self.embedding_dim = 100
        self.actor_hidden_dim = 128
        self.actor_learning_rate = 0.001
        self.critic_hidden_dim = 128
        self.critic_learning_rate = 0.001
        self.discount_factor = 0.9
        self.tau = 0.001

        self.replay_memory_size = 1000000
        self.batch_size = 32

        self.actor = Actor(self.embedding_dim, self.actor_hidden_dim, self.actor_learning_rate, state_size, self.tau)
        self.critic = Critic(self.critic_hidden_dim, self.critic_learning_rate, self.embedding_dim, self.tau)

        # Build the embedding network with a dummy call, then load its pretrained weights.
        self.embedding_network = UserMovieEmbedding(users_num,items_num, self.embedding_dim)
        self.embedding_network([np.zeros((1)),np.zeros((1,100))])

        embedding_save_file_dir = 'u_i_weights.h5'

        self.embedding_network.load_weights(embedding_save_file_dir)

        self.srm_ave = simpleRepresentation(self.embedding_dim)
        self.srm_ave([np.zeros((1,state_size, 100))])

        # Prioritized experience replay
        self.buffer = PriorityExperienceReplay(self.replay_memory_size, self.embedding_dim)
        self.epsilon_for_priority = 1e-6

        # Epsilon-greedy exploration hyperparameters
        self.epsilon = 1.
        self.epsilon_decay = (self.epsilon - 0.1)/500000
        self.std = 1.5

        self.is_test = is_test

    def calculate_td_target(self, rewards, q_values, dones):
        """Return ``rewards + (1 - dones) * discount_factor * q_values`` row by row.

        The result is a NumPy copy with the shape of ``q_values``.
        """
        y_t = np.copy(q_values)
        for i in range(q_values.shape[0]):
            y_t[i] = rewards[i] + (1 - dones[i])*(self.discount_factor * q_values[i])
        return y_t

    def recommend_item(self, action, recommended_items, user_items, top_k=False, items_ids=None):
        """Pick the item(s) whose embedding has the largest dot product with ``action``.

        Args:
            action: Action tensor; it is transposed with ``perm=(1, 0)`` before
                the dot product.
            recommended_items: Set of item ids excluded from the default
                candidate pool.
            user_items: Accepted for compatibility; not used.
            top_k: Falsy for a single item, otherwise the number of items.
            items_ids: Optional explicit candidate ids. Defaults to every id
                in ``range(items_num)`` not yet recommended.

        Returns:
            One item id, or an array of ``top_k`` ids sorted by ascending
            score (the best item is last).
        """
        # ``is None`` rather than ``== None``: comparing a NumPy array with
        # ``==`` is element-wise and cannot be used as an ``if`` condition.
        if items_ids is None:
            items_ids = np.array(list(set(range(self.items_num)) - recommended_items))

        items_ebs = self.embedding_network.get_layer('movie_embedding')(items_ids)
        action = tf.transpose(action, perm=(1,0))
        if top_k:
            item_indice = np.argsort(tf.transpose(tf.keras.backend.dot(items_ebs, action), perm=(1,0)))[0][-top_k:]
            return items_ids[item_indice]
        else:
            item_idx = np.argmax(tf.keras.backend.dot(items_ebs, action))
            return items_ids[item_idx]

    def _update_from_replay(self):
        """Sample a minibatch, refresh priorities and update critic, actor and targets.

        Returns:
            The weighted critic loss of this update.
        """
        (batch_states, batch_actions, batch_rewards, batch_next_states,
         batch_dones, weight_batch, index_batch) = self.buffer.sample(self.batch_size)

        # TD targets come from the target actor and the target critic.
        # NOTE(review): the minimum of online and target Q-values used to be
        # computed here but was never used, so that dead computation was dropped.
        target_next_action = self.actor.target_network(batch_next_states)
        target_qs = self.critic.target_network([target_next_action, batch_next_states])
        td_targets = self.calculate_td_target(batch_rewards, target_qs, batch_dones)

        # New priority is |TD target| plus a small constant.
        for (p, i) in zip(td_targets, index_batch):
            self.buffer.update_priority(abs(p[0]) + self.epsilon_for_priority, i)

        # Critic update
        q_loss = self.critic.train([batch_actions, batch_states], td_targets, weight_batch)

        # Actor update from the critic's action gradient
        s_grads = self.critic.dq_da([batch_actions, batch_states])
        self.actor.train(batch_states, s_grads)

        self.actor.update_target_network()
        self.critic.update_target_network()
        return q_loss

    def _save_training_plot(self, episode, precision_history, ndcg_history, reward_history):
        """Plot the three metric histories and save them as one PNG."""
        fig, axs = plt.subplots(3, 1, figsize=(10, 30), sharex=True)

        # Precision
        axs[0].plot(precision_history)
        axs[0].set_title('Precision Over Episodes')
        axs[0].set_ylabel('Precision Value')

        # NDCG
        axs[1].plot(ndcg_history)
        axs[1].set_title('NDCG Over Episodes')
        axs[1].set_ylabel('NDCG Value')

        # Reward
        axs[2].plot(reward_history)
        axs[2].set_title('Reward Over Episodes')
        axs[2].set_xlabel('Episode')
        axs[2].set_ylabel('Reward Value')

        # Legend for every subplot
        for ax in axs:
            ax.legend(['Value'])

        # Save, then close the figure so repeated plots do not accumulate in memory.
        plt.savefig(f'episode_{episode }_training_metrics.png')
        plt.close(fig)

    def train(self, max_episode_num, top_k=False, load_model=False):
        """Run the training loop for ``max_episode_num`` episodes.

        Args:
            max_episode_num: Number of episodes to run.
            top_k: Falsy to recommend one item per step, otherwise the number
                of items recommended per step.
            load_model: Accepted for compatibility; not used.
        """
        # Soft-update the target networks once before training starts.
        self.actor.update_target_network()
        self.critic.update_target_network()

        episodic_precision_history = []
        episodic_reward_history = []
        episodic_ndcg_history = []
        total_precision = 0
        total_reward = 0
        total_ndcg = 0
        img_check = 1
        for episode in range(max_episode_num):
            # Reset the per-episode statistics
            episode_reward = 0
            correct_count = 0
            steps = 0
            q_loss = 0
            mean_action = 0
            mean_ndcg = 0

            # Reset the environment
            user_id, items_ids, done = self.env.reset()

            while not done:

                # Observe the current state and compute the action
                items_eb = self.embedding_network.get_layer('movie_embedding')(np.array(items_ids))
                state = self.srm_ave([np.expand_dims(items_eb, axis=0)])
                action = self.actor.network(state)

                # Epsilon-greedy exploration: add Gaussian noise to the action
                if self.epsilon > np.random.uniform() and not self.is_test:
                    self.epsilon -= self.epsilon_decay
                    action += np.random.normal(0,self.std,size=action.shape)

                # Recommend item(s)
                recommended_item = self.recommend_item(action, self.env.recommended_items, self.env.user_items.keys(), top_k=top_k)

                # Step the environment. ``rewards`` is a list in top-k mode and
                # a scalar otherwise; summing covers both cases. Previously
                # ``reward`` was only assigned in top-k mode, so
                # ``train(top_k=False)`` raised NameError.
                next_items_ids, rewards, done, _ = self.env.step(recommended_item, top_k=top_k)
                reward = np.sum(rewards)

                # Next state
                next_items_eb = self.embedding_network.get_layer('movie_embedding')(np.array(next_items_ids))
                next_state = self.srm_ave([np.expand_dims(next_items_eb, axis=0)])

                # Store the transition
                self.buffer.append(state, action, reward, next_state, done)

                if self.buffer.crt_idx > 1 or self.buffer.is_full:
                    q_loss += self._update_from_replay()

                items_ids = next_items_ids
                episode_reward += reward
                mean_action += np.sum(action[0])/(len(action[0]))
                steps += 1

                if top_k:
                    dcg, idcg = self.calculate_ndcg(rewards, [1 for _ in range(len(rewards))])
                    mean_ndcg += dcg/idcg

                if reward > 0:
                    correct_count += 1

                if done:
                    print()
                    precision = correct_count/steps
                    print(f'{episode}/{max_episode_num}, precision : {precision}, total_reward:{episode_reward}, q_loss : {q_loss/steps}, mean_action : {mean_action/steps}')
                    total_reward += (episode_reward/steps)
                    total_precision += precision
                    total_ndcg += (mean_ndcg/steps)
                    # Every 20th episode: record the running totals / 20 and reset them.
                    if episode%20 == 0:
                        img_check = 1

                        episodic_precision_history.append(total_precision/20)
                        episodic_ndcg_history.append(total_ndcg/20)
                        episodic_reward_history.append(total_reward/20)

                        total_reward = 0
                        total_precision = 0
                        total_ndcg = 0

            if len(episodic_precision_history)%101 == 0 and img_check:
                img_check = 0
                self._save_training_plot(episode, episodic_precision_history,
                                         episodic_ndcg_history, episodic_reward_history)

            if (episode+1)%2000 == 0 or episode == max_episode_num-1:
                self.save_model(f'actor_top10_{episode +1}_DRR-p.h5',
                                f'critic_top10_{episode + 1}_DRR-p.h5')
                # Re-mount Google Drive only when the Colab ``drive`` module has
                # been imported by the notebook; elsewhere the name does not exist.
                colab_drive = globals().get('drive')
                if colab_drive is not None:
                    colab_drive.mount('/content/gdrive')


    def save_model(self, actor_path, critic_path):
        """Save the actor and critic weights to the given paths."""
        self.actor.save_weights(actor_path)
        self.critic.save_weights(critic_path)

    def load_model(self, actor_path, critic_path):
        """Load the actor and critic weights from the given paths."""
        self.actor.load_weights(actor_path)
        self.critic.load_weights(critic_path)

    def calculate_ndcg(self, rel, irel):
        """Return ``(dcg, idcg)`` using gains ``2 ** r`` discounted by ``log2(rank + 1)``.

        NOTE(review): the gains are not binarized and ``1`` is not subtracted,
        unlike the module-level ``calculate_ndcg`` used for evaluation, so the
        two metrics are not comparable. Confirm which one is intended.
        """
        dcg = 0
        idcg = 0
        for i, (r, ir) in enumerate(zip(rel, irel)):
            dcg += (2**r )/np.log2(i+2)
            idcg += (2**ir )/np.log2(i+2)
        return dcg, idcg

# Load data

In [None]:
# Colab only: mount Google Drive and change into the project folder stored on it.
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/learn-recommendation

Mounted at /content/gdrive
/content/gdrive/My Drive/learn-recommendation


In [None]:
# Install/upgrade tensorflow-datasets, which provides the MovieLens loader imported below.
!pip install -q --upgrade tensorflow-datasets

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the "train" split of the MovieLens 100k ratings dataset through TFDS.
ratings = tfds.load("movie_lens/100k-ratings", split="train")



Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movie_lens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movie_lens/100k-ratings/0.1.1.incompleteNAK54F/movie_lens-train.tfrecord*.…

Dataset movie_lens downloaded and prepared to /root/tensorflow_datasets/movie_lens/100k-ratings/0.1.1. Subsequent calls will reuse this data.


In [None]:
# Fields copied from every dataset record into the ratings table.
rating_columns = ['user_id', 'movie_id', 'user_rating', 'timestamp']

# Convert each record's tensors to plain NumPy values, one dict per rating.
rating_data = []
for data in ratings:
    rating_data.append({column: data[column].numpy() for column in rating_columns})

# Build the table and cast all four columns to int.
rating_df = pd.DataFrame(rating_data, columns=rating_columns)
rating_df = rating_df.astype({column: int for column in rating_columns})

In [None]:
# One empty interaction list per distinct user id; filled in by the next cell.
users_dict = {user : [] for user in set(rating_df["user_id"])}
# Display user 1's list (still empty at this point).
users_dict[1]

[]

In [None]:
# Rebuild both per-user histories from scratch so that re-running this cell
# does not append every rating a second time.
#   users_dict                 : every (movie_id, rating) pair of the user
#   users_dict_for_history_len : only the pairs with rating >= 4
users_dict = {user : [] for user in set(rating_df["user_id"])}
users_dict_for_history_len = {user : [] for user in set(rating_df["user_id"])}

# itertuples yields plain (user_id, movie_id, user_rating) tuples and is much
# faster than iterrows, which builds a Series for each of the rows.
ratings_df_gen = rating_df[['user_id', 'movie_id', 'user_rating']].itertuples(index=False, name=None)
for user_id, movie_id, user_rating in ratings_df_gen:
    users_dict[user_id].append((movie_id, user_rating))
    if user_rating >= 4:
        users_dict_for_history_len[user_id].append((movie_id, user_rating))

In [None]:
# Number of ratings >= 4 per user, listed in the iteration order of the user-id set.
users_history_lens = [len(users_dict_for_history_len[u]) for u in set(rating_df["user_id"])]

In [None]:
# Embedding table sizes: largest id + 1, so that the largest id is a valid index.
users_num = max(rating_df["user_id"])+1
items_num = max(rating_df["movie_id"])+1

# Train

In [None]:
# Run configuration
STATE_SIZE = 10
MAX_EPISODE_NUM = 2000
TOP_K = 10

# Training subset: user ids 1 .. int(users_num * 0.1); every item stays available.
train_users_num = int(users_num * 0.1)
train_items_num = items_num
train_users_dict = {uid: users_dict[uid] for uid in range(1, train_users_num + 1)}
train_users_history_lens = users_history_lens[:train_users_num]
item_id_to_item = []

# Environment and agent
env = OfflineEnv(train_users_dict, train_users_history_lens, item_id_to_item, STATE_SIZE)
recommender = DRRAgent(env, users_num, items_num, STATE_SIZE, use_wandb=False)
recommender.actor.build_networks()
recommender.critic.build_networks()
# recommender.load_model('actor_top10_10000_final_fixed.h5', 'critic_top10_10000_final_fixed.h5')

# Train in top-k mode
recommender.train(MAX_EPISODE_NUM, top_k=TOP_K, load_model=False)


# Evaluate

In [None]:
def evaluate(recommender, env, check_movies = False, top_k=False):
    """Run one episode with the actor's un-noised action and return mean metrics.

    Args:
        recommender: Agent exposing ``embedding_network``, ``srm_ave``,
            ``actor.network`` and ``recommend_item``.
        env: Environment exposing ``reset``, ``step``, ``recommended_items``
            and ``user_items``.
        check_movies: When truthy, print the user and the final metrics.
        top_k: Falsy for single-item steps, otherwise the number of items
            recommended per step.

    Returns:
        ``(mean_precision, mean_ndcg)`` averaged over the episode's steps.
        Both stay 0 when ``top_k`` is falsy.
    """
    total_reward = 0
    n_steps = 0
    precision_sum = 0
    ndcg_sum = 0

    user_id, items_ids, done = env.reset()

    if check_movies:
        print(f'user_id : {user_id}, rated_items_length:{len(env.user_items)}')

    while not done:
        # Embed the user (result unused) and the items of the current state.
        embedding_net = recommender.embedding_network
        user_eb = embedding_net.get_layer('user_embedding')(np.array(user_id))
        items_eb = embedding_net.get_layer('movie_embedding')(np.array(items_ids))

        # State -> action -> recommendation
        state = recommender.srm_ave([np.expand_dims(items_eb, axis=0)])
        action = recommender.actor.network(state)
        recommended_item = recommender.recommend_item(
            action, env.recommended_items, env.user_items.keys(), top_k=top_k)

        next_items_ids, reward, done, _ = env.step(recommended_item, top_k=top_k)
        if top_k:
            hit_flags = [1 if r > 0 else 0 for r in reward]

            # NDCG against an all-hits ideal ranking
            dcg, idcg = calculate_ndcg(hit_flags, [1 for _ in range(len(reward))])
            ndcg_sum += dcg/idcg

            # Precision
            n_hits = top_k-hit_flags.count(0)
            precision_sum += n_hits/top_k

        total_reward += np.sum(reward)
        items_ids = next_items_ids
        n_steps += 1

    if check_movies:
        print(f'precision : {precision_sum/n_steps}, ngcg : {ndcg_sum/n_steps}, episode_reward : {total_reward}')
        print()

    return precision_sum/n_steps, ndcg_sum/n_steps

def calculate_ndcg(rel, irel):
    """Return ``(dcg, idcg)`` for binarized relevances.

    Args:
        rel: Relevance scores in ranked order; each is mapped to 1 when it is
            greater than 0 and to 0 otherwise.
        irel: Ideal relevance scores, used as given.

    Returns:
        The discounted cumulative gains of ``rel`` and ``irel``; position
        ``i`` (0-based) is discounted by ``log2(i + 2)``.
    """
    hits = [1 if score > 0 else 0 for score in rel]
    dcg, idcg = 0, 0
    for rank, (hit, ideal) in enumerate(zip(hits, irel), start=2):
        discount = np.log2(rank)
        dcg += hit / discount
        idcg += ideal / discount
    return dcg, idcg



# TEST

In [None]:
# Sanity check: user 81's total number of ratings next to list entry 80 of the >= 4 counts.
len(users_dict[81]), users_history_lens[80]

In [None]:
# test_items_num = int(users_num * 0.2)
# eval_users_dict = {k:users_dict[k] for k in range(users_num-test_items_num, users_num)}
# test_users_history_lens = users_history_lens[-test_items_num:]

# Evaluation subset: user ids 1 .. int(users_num * 0.1).
# NOTE(review): this is the same id range the training cell uses; the
# commented-out lines above select the last 20% of users instead.
test_items_num = int(users_num * 0.1)
eval_users_dict = {k:users_dict[k] for k in range(1, test_items_num+1)}
test_users_history_lens = users_history_lens[:test_items_num]

movies_id_to_movies = []
sum_precision = 0
sum_ndcg = 0
TOP_K = 10
STATE_SIZE = 10
MAX_EPISODE_NUM = 1
end_evaluation = 100
count = 0  # number of users actually evaluated
for i, user_id in enumerate(eval_users_dict.keys()):
    count +=1
    # One episode per user, using a fresh environment and agent with the saved weights.
    env = OfflineEnv(eval_users_dict, users_history_lens, movies_id_to_movies, STATE_SIZE, fix_user_id=user_id)
    recommender = DRRAgent(env, users_num, items_num, STATE_SIZE, )
    recommender.actor.build_networks()
    recommender.critic.build_networks()
    recommender.load_model('actor_top10_2000_DRR-p.h5', 'critic_top10_2000_DRR-p.h5')
    precision, ndcg = evaluate(recommender, env, check_movies=True, top_k=TOP_K) # check_movies=True prints the per-user metrics
    sum_precision += precision
    sum_ndcg += ndcg

    if i > end_evaluation:
        break
print(count)
# Average over the users that were evaluated, not over the whole dict: the two
# differ whenever the loop stops early.
evaluated_users = max(count, 1)
print(f'precision@{TOP_K} : {sum_precision/evaluated_users}, ndcg@{TOP_K} : {sum_ndcg/evaluated_users}')

In [1]:
# Sort a small sample list in ascending order and print it.
my_list = sorted([3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5])
print(my_list)


[1, 1, 2, 3, 3, 4, 5, 5, 5, 6, 9]
