In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# plt.rcParams['axes.grid'] = True

%matplotlib inline

In [None]:
from gym import spaces
from recsim import document, user
from recsim.choice_model import AbstractChoiceModel
from recsim.simulator import recsim_gym, environment

In [None]:
from recsim_exp import GaussNoise, WolpertingerRecommender

# RecSim environment

In this tutorial we will break a RecSim environment down into its basic components. 
![Detailed view of RecSim](https://github.com/google-research/recsim/blob/master/recsim/colab/figures/simulator.png?raw=true)

The green and blue blocks in the above diagram constitute the classes that need to be implemented within a RecSim environment. The goal of this tutorial is to explain the purpose of these blocks and how they come together in a simulation.  In the process, we will go over an example end-to-end implementation.



# Overview

A single step of a RecSim simulation can be summarized roughly as follows:


1.   The document database provides a corpus of *D* documents to the recommender. This could be a different set at each step (e.g., sampled, or produced by some "candidate generation" process), or fixed throughout the simulation. Each document is represented by a list of features. In a fully observable situation, the recommender observes all features of each document that impact the user's state and choice of document (and other aspects of the user's response), but this need not be the case in general. (In fact, most interesting scenarios involve latent features.)
2.   The recommender observes the *D* documents (and their features) together with the user's response to the last recommendation. It then makes a selection (possibly ordered) of *k* documents and presents them to the user. The ordering may or may not impact the user choice or user state, depending on our simulation goals.
3.   The user examines the list of documents and makes a choice of one document. Note that not consuming any of the documents is also a valid choice. This leads to a transition in the user's state. Finally the user emits an observation, which the recommender observes at the next iteration. The observation generally includes (noisy) information about the user's reaction to the content and potentially clues about the user's latent state. Typically, the user's state is not fully revealed. 

If we examine the diagram above carefully, we notice that the flow of information along arcs is acyclic---a RecSim environment is a dynamic Bayesian network (DBN), where the various boxes represent conditional probability distributions. We will now define a simple simulation problem and implement it. 

In [None]:
SEED = 42
DOC_NUM = 10
P_EXIT_ACCEPTED = 0.1      # chance the session ends after an accepted recommendation
P_EXIT_NOT_ACCEPTED = 0.2  # chance the session ends after a rejected recommendation

# Acceptance matrix W used to simulate the users' responses
# (based on section 7.3 of the paper https://arxiv.org/pdf/1512.07679.pdf).
# W[i, j] is the probability that a user who is currently consuming item i
# accepts a recommendation of item j.
#
# Construction: small off-diagonal probabilities in [0, P_EXIT_NOT_ACCEPTED),
# large diagonal probabilities in [1 - P_EXIT_ACCEPTED, 1), and finally a random
# column permutation so that the "good" recommendation for item i is a random item.
# The three random draws below must stay in this order to reproduce the same W.

np.random.seed(SEED)
off_diagonal_mask = np.ones((DOC_NUM, DOC_NUM)) - np.eye(DOC_NUM)
weak_acceptance = np.random.uniform(0.0, P_EXIT_NOT_ACCEPTED, (DOC_NUM, DOC_NUM))
strong_acceptance = np.random.uniform(1.0 - P_EXIT_ACCEPTED, 1.0, DOC_NUM)
W = off_diagonal_mask * weak_acceptance + np.diag(strong_acceptance)
W = W[:, np.random.permutation(DOC_NUM)]

### Document

In [None]:
class Document(document.AbstractDocument):
    """A corpus item whose only feature is its integer id."""

    def __init__(self, doc_id):
        super().__init__(doc_id)

    def __str__(self):
        return "Document #{}".format(self._doc_id)

    @staticmethod
    def observation_space():
        """Declare the observation as one of DOC_NUM discrete ids."""
        return spaces.Discrete(DOC_NUM)

    def create_observation(self):
        """Return the document id wrapped in a 1-tuple."""
        observed_id = self._doc_id
        return (observed_id,)


class DocumentSampler(document.AbstractDocumentSampler):
    """Produces documents whose ids cycle through 0 .. DOC_NUM - 1."""

    def __init__(self, doc_ctor=Document):
        super().__init__(doc_ctor)
        # Number of documents handed out so far; drives the id cycle.
        self._doc_count = 0

    def sample_document(self):
        """Build the next document in the cycle and advance the counter."""
        next_id = self._doc_count % DOC_NUM
        sampled = self._doc_ctor(next_id)
        self._doc_count += 1
        return sampled

### User

In [None]:
class UserState(user.AbstractUserState):
    """State of a simulated user: the item being consumed and a session flag."""

    def __init__(self, user_id, current, active_session=True):
        self.user_id, self.current = user_id, current
        self.active_session = active_session

    def __str__(self):
        return "User #{}".format(self.user_id)

    @staticmethod
    def observation_space():
        """Declare the observation as one of DOC_NUM discrete item ids."""
        return spaces.Discrete(DOC_NUM)

    def create_observation(self):
        """Return the currently consumed item id wrapped in a 1-tuple."""
        current_item = self.current
        return (current_item,)

    def score_document(self, doc_obs):
        """Look up W[current item, recommended item].

        `doc_obs` is a document observation; its first element is used as the
        column index into W.
        """
        recommended_item = doc_obs[0]
        return W[self.current, recommended_item]


class StaticUserSampler(user.AbstractUserSampler):
    """Creates users with consecutive ids and a random starting item."""

    def __init__(self, user_ctor=UserState):
        super().__init__(user_ctor)
        # Incremented before each user is built, so the first user gets id 1.
        self.user_count = 0

    def sample_user(self):
        """Return a new user who starts on a uniformly drawn item in [0, DOC_NUM)."""
        self.user_count += 1
        new_user_id = self.user_count
        # Drawn from the global NumPy random state.
        starting_item = np.random.randint(DOC_NUM)
        return self._user_ctor(new_user_id, starting_item)


class Response(user.AbstractResponse):
    """User reaction to one recommended document: accepted or not."""

    def __init__(self, accept=False):
        self.accept = accept

    @classmethod
    def response_space(cls):
        """Declare the response as a binary value."""
        return spaces.Discrete(2)

    def create_observation(self):
        """Return the accept flag converted to int, wrapped in a 1-tuple."""
        accept_flag = int(self.accept)
        return (accept_flag,)


class UserChoiceModel(AbstractChoiceModel):
    """Accept-or-reject choice model for a slate holding exactly one document."""

    def __init__(self):
        super().__init__()
        self._score_no_click = P_EXIT_ACCEPTED

    def score_documents(self, user_state, doc_obs):
        """Store the user's score for the single document in `doc_obs`.

        Raises:
            ValueError: if `doc_obs` does not contain exactly one observation.
        """
        if len(doc_obs) != 1:
            raise ValueError(
                "Expecting single document, but got: {}".format(doc_obs))
        per_document_scores = [
            user_state.score_document(observation) for observation in doc_obs
        ]
        self._scores = np.array(per_document_scores)

    def choose_item(self):
        """Return slate index 0 with probability scores[0], otherwise None."""
        draw = np.random.random()
        return 0 if draw < self.scores[0] else None


class UserModel(user.AbstractUserModel):
    """Simulated user that accepts or rejects one recommended document per step."""

    def __init__(self):
        # The trailing 1 is presumably the slate size (TODO confirm against
        # AbstractUserModel); every method below insists on a single document.
        super().__init__(Response, StaticUserSampler(), 1)
        self.choice_model = UserChoiceModel()

    @staticmethod
    def _require_single_document(slate_documents):
        """Raise ValueError unless the slate holds exactly one document."""
        if len(slate_documents) != 1:
            raise ValueError(
                "Expecting single document, but got: {}".format(slate_documents))

    def simulate_response(self, slate_documents):
        """Return one response per slate document, marking the chosen one accepted.

        Raises:
            ValueError: if the slate does not contain exactly one document.
        """
        self._require_single_document(slate_documents)

        responses = [self._response_model_ctor() for _ in slate_documents]

        doc_observations = [doc.create_observation() for doc in slate_documents]
        self.choice_model.score_documents(self._user_state, doc_observations)

        chosen = self.choice_model.choose_item()
        if chosen is not None:
            responses[chosen].accept = True

        return responses

    def update_state(self, slate_documents, responses):
        """Move the user to the next item and decide whether the session continues.

        An accepted recommendation becomes the current item; otherwise the user
        jumps to a random item. The session stays active with probability
        1 - P_EXIT_ACCEPTED or 1 - P_EXIT_NOT_ACCEPTED respectively.

        Raises:
            ValueError: if the slate does not contain exactly one document.
        """
        self._require_single_document(slate_documents)

        response = responses[0]
        doc = slate_documents[0]
        state = self._user_state

        if not response.accept:
            state.current = np.random.choice(DOC_NUM)
            stay_probability = 1 - P_EXIT_NOT_ACCEPTED
        else:
            state.current = doc.doc_id()
            stay_probability = 1 - P_EXIT_ACCEPTED

        state.active_session = bool(np.random.binomial(1, stay_probability))

    def is_terminal(self):
        """Returns a boolean indicating if the session is over."""
        session_active = self._user_state.active_session
        return not session_active


def clicked_reward(responses):
    """Return the number of accepted responses as a float."""
    accepted_count = sum(1 for response in responses if response.accept)
    return float(accepted_count)

### Environment

In [None]:
def make_env():
    """Build a fresh Gym-wrapped RecSim environment for the toy problem.

    The raw environment is given DOC_NUM and 1 as its two integer arguments
    (presumably candidate count and slate size -- TODO confirm) and does not
    resample documents. Rewards are aggregated with `clicked_reward`.
    """
    user_model = UserModel()
    document_sampler = DocumentSampler()
    raw_environment = environment.Environment(
        user_model,
        document_sampler,
        DOC_NUM,
        1,
        resample_documents=False
    )
    return recsim_gym.RecSimGymEnv(raw_environment, clicked_reward)

# RecSim Agent

To solve this toy environment, we'll use a variant of the DDPG algorithm adapted to discrete actions.
To use DDPG, we embed the discrete actions into a continuous space; the actor then outputs a "proto-action" in that space.
We then take the k embedded actions nearest to the proto-action and pick the one with the maximum Q-value.
This avoids taking a maximum over the entire action space, as DQN does, which can be prohibitively large in recommender systems.
In our example the embeddings are just one-hot vectors, so the nearest neighbour is simply the argmax of the proto-action.

<img src="../presets/wolpertinger_scheme.png" width=400 height=800>

In [None]:
def run_agent(
    env,
    agent,
    num_steps: int = int(3e3),
    log_every: int = int(1e3)
):
    """Run `agent` in `env` episode by episode and collect episode rewards.

    Args:
        env: environment exposing `reset()` and
            `step(action) -> (observation, reward, done, info)`.
        agent: agent exposing `begin_episode(observation)`,
            `step(reward, observation)` and `end_episode(reward, observation)`.
        num_steps: a new episode starts only while the step counter (which
            begins at 1) is below this value; a running episode is always
            finished, so slightly more steps may be executed.
        log_every: print the step counter and the mean reward of the last 50
            finished episodes whenever the counter is a multiple of this value.

    Returns:
        List with the total reward of every finished episode.
    """
    reward_history = []
    step = 1

    while step < num_steps:
        # Reset at the start of EVERY episode. Stepping an environment that
        # already returned done=True is outside the Gym contract, and the agent
        # would otherwise begin the episode from a terminal observation.
        observation = env.reset()
        action = agent.begin_episode(observation)
        episode_reward = 0
        while True:
            observation, reward, done, _ = env.step(action)
            episode_reward += reward

            if step % log_every == 0:
                recent_rewards = reward_history[-50:]
                # No finished episode yet -> report nan without triggering
                # NumPy's "mean of empty slice" warning.
                mean_recent = (
                    np.mean(recent_rewards) if recent_rewards else float("nan")
                )
                print(step, mean_recent)
            step += 1
            if done:
                break
            action = agent.step(reward, observation)

        agent.end_episode(reward, observation)
        reward_history.append(episode_reward)

    return reward_history

In [None]:
# Hyperparameters forwarded as keyword arguments to WolpertingerRecommender.
# Both the state and the action dimension equal the number of documents.
parameters = dict(
    action_dim=DOC_NUM,
    state_dim=DOC_NUM,
    noise=GaussNoise(sigma=0.05),
    critic_lr=1e-3,
    actor_lr=1e-3,
    tau=1e-3,
    hidden_dim=256,
    batch_size=128,
    buffer_size=int(1e4),
    gamma=0.8,
    actor_weight_decay=0.0001,
    critic_weight_decay=0.001,
    eps=1e-2,
)

In [None]:
# Train the Wolpertinger agent on a fresh environment.
env = make_env()
agent = WolpertingerRecommender(env=env, k_ratio=0.33, **parameters)
reward_history = run_agent(env, agent)

# Episode reward smoothed with a 50-episode rolling mean.
smoothed_rewards = pd.Series(reward_history).rolling(50).mean()
plt.plot(smoothed_rewards)

---

### Extra - 1

In [None]:
# Query the agent once per item id (Q-values) and once per one-hot state (actions).
# NOTE(review): relies on `agent` being an object with an inner `.agent` that
# offers predict_qvalues / predict_action -- run this before `agent` is rebound.
one_hot_states = np.eye(DOC_NUM)

qvalue_blocks = [
    agent.agent.predict_qvalues(item_id) for item_id in range(DOC_NUM)
]
predicted_qvalues = np.hstack(qvalue_blocks).T

action_rows = [
    agent.agent.predict_action(one_hot_states[item_id], with_noise=False)
    for item_id in range(DOC_NUM)
]
predicted_actions = np.vstack(action_rows)

In [None]:
# Heatmap of the Q-values predicted by the trained agent (rounded to 3 decimals)
fig, ax = plt.subplots(figsize=predicted_qvalues.shape)
sns.heatmap(predicted_qvalues.round(3), annot=True, ax=ax);

In [None]:
# Heatmap of the noise-free actions the agent outputs for each one-hot state
# (i.e. the learned policy), rounded to 3 decimals
fig, ax = plt.subplots(figsize=predicted_qvalues.shape)
sns.heatmap(predicted_actions.round(3), annot=True, ax=ax);

In [None]:
# Ground-truth acceptance probabilities W (row = current item, column = recommended
# item); the column with the largest value in a row is the best recommendation
fig, ax = plt.subplots(figsize=predicted_qvalues.shape)
sns.heatmap(W, annot=True, ax=ax);

### Extra - 2

In [None]:
from recsim.agent import AbstractEpisodicRecommenderAgent

class OptimalRecommender(AbstractEpisodicRecommenderAgent):
    """Agent that greedily recommends the most acceptable item according to W."""

    def __init__(self, environment, W):
        super().__init__(environment.action_space)
        self._W = W
        self._observation_space = environment.observation_space

    def _extract_state(self, observation):
        """Flatten the "user" part of the observation with its declared space."""
        user_observation = observation["user"]
        user_space = self._observation_space.spaces["user"]
        return spaces.flatten(user_space, user_observation)

    def step(self, reward, observation):
        """Return a one-item slate: the argmax of W's row for the current item.

        The row index is the argmax of the flattened user state (presumably a
        one-hot encoding of the current item -- TODO confirm). `reward` is unused.
        """
        current_item = self._extract_state(observation).argmax()
        best_recommendation = self._W[current_item, :].argmax()
        return [best_recommendation]

In [None]:
# Baseline: run the W-greedy recommender on a fresh environment.
# Note that this rebinds `env`, `agent` and `reward_history`.
env = make_env()
agent = OptimalRecommender(env, W)
reward_history = run_agent(env, agent)

# Episode reward smoothed with a 50-episode rolling mean.
smoothed_rewards = pd.Series(reward_history).rolling(50).mean()
plt.plot(smoothed_rewards)

---

In [None]:
# from recsim.agents.tabular_q_agent import TabularQAgent

# env = make_env()
# q_agent = TabularQAgent(env.observation_space, env.action_space)

# reward_history = run_agent(env, q_agent)
# plt.plot(pd.Series(reward_history).rolling(50).mean())