### Main program for estimating the error function in Off-Policy Evaluation

In [1]:
# !pip install Box2D
# !pip install 'gym[all]'
# !pip install pyyaml
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict

import time
import sys
from tqdm import tqdm
import yaml
import re

from dqnetwork import DQNetwork
from agent import Agent

import torch
import torch.nn as nn
import torch.nn.functional as F

from os import listdir, getcwd
from os.path import isabs, join

In [2]:
## Load the environment
## `env` is read as a notebook-level global by the rollout functions below
## (horizon and get_trajectories), so this cell must run before they are called.
env_id = 'CartPole-v0'
env = gym.make(env_id)

## Step 1. Load Policies

In [3]:
### get all model policies - "../model/policies/behavior_policy_x.pth"
def get_policies(path, config):
    """
    Loads every saved policy in a directory into an Agent, using the matching
    YAML file from the config directory for the network sizes.

    Policy files and config files are paired by the last integer in their
    file names. Directory entries that are not regular files, or whose names
    contain no integer, are ignored.

    @Param:
    1. path - directory holding the policy files (resolved against the
       current working directory when relative).
    2. config - directory holding the YAML configuration files.
    @Return:
    - policies: list of Agent objects, ordered by policy number.
    @Raises:
    - KeyError: a config file has no policy file with the same number.
    """
    # Local import: these names are not part of the notebook's import cell.
    from os.path import basename, isfile

    cwd = getcwd()  # relative directories are resolved against this

    def index(name):
        """Last integer in the file name, or None when the name has no digits."""
        numbers = re.findall(r'[0-9]+', basename(name))
        return int(numbers[-1]) if numbers else None

    def numbered_files(directory):
        """Map policy number -> file name for the regular, numbered files in `directory`."""
        root = join(cwd, directory)
        return {
            index(f): f
            for f in listdir(root)
            if isfile(join(root, f)) and index(f) is not None
        }

    # Policy paths stay relative to `path`; config paths are anchored at cwd.
    policy_files = {num: join(path, f) for num, f in numbered_files(path).items()}
    yaml_files = {num: join(cwd, join(config, f)) for num, f in numbered_files(config).items()}

    policies = []
    # Sort by policy number so the result does not depend on listdir() order.
    for num in sorted(yaml_files):
        if num not in policy_files:
            raise KeyError(
                "no policy file numbered {} in '{}' for config '{}'".format(num, path, yaml_files[num])
            )
        with open(yaml_files[num]) as file:
            # NOTE(review): consider yaml.safe_load if the config files only
            # hold plain data and may come from an untrusted source.
            config_data = yaml.load(file, Loader=yaml.FullLoader)
        info = config_data[0]  # first entry holds the layer sizes ('fc1', 'fc2')
        policies.append(Agent(fc1=info['fc1'], fc2=info['fc2'], path=policy_files[num]))
    return policies

In [4]:
# Build one Agent per config file found under ../model/config; the paths are
# relative to the notebook's current working directory.
agents = get_policies("../model/policies", "../model/config")

Model loaded into local and target networks!
Model loaded into local and target networks!
Model loaded into local and target networks!


## Step 2 - generate policy matrix
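<p> A policy matrix is a dictionary with K keys, one per evaluation policy, where K is the total number of policies. Each key maps to the list of the remaining K - 1 policies, which serve as the behavior policies for that evaluation policy. </p>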

In [6]:
def policy_matrix(agents):
    """
    Build the policy matrix as a dictionary.

    Each agent is used once as a key (the evaluation policy); its value is the
    list of all agents at the other positions of `agents` (the behavior
    policies), in their original order. For n agents this gives n keys with
    n - 1 entries each.
    """
    return {
        evaluation: [
            behavior
            for position, behavior in enumerate(agents)
            if position != row
        ]
        for row, evaluation in enumerate(agents)
    }

In [7]:
# Each key is an evaluation policy (an Agent); its value is the list of all
# other agents, which act as the behavior policies for that key.
policy_dict = policy_matrix(agents)

In [40]:
def get_behavior_policies(evaluation_policy):
    """
    Look up the behavior policies paired with an evaluation policy.

    Returns the list stored in `policy_dict` for `evaluation_policy`, or None
    when that policy is not a key of `policy_dict`.
    """
    return policy_dict.get(evaluation_policy)

## Step 3. Get Horizons

$$ \xi_k =  \prod_{t=1}^{H} \frac{ \pi_e({a_t}^k | {s_t}^k) }{ \pi_k({a_t}^k | {s_t}^k) } $$

In [41]:
def horizon(evaluation_agent, behavior_agent, environment=None):
    """
    Rolls out one episode with the behavior agent and accumulates, step by
    step, the product of probability ratios between the evaluation and the
    behavior agent, together with the episode return.

    NOTE: behavior_agent (π_k) generates all states and actions; the episode
    ends when the environment reports `done`.
    NOTE(review): the probability taken from the evaluation agent is the one
    returned alongside the evaluation agent's *own* chosen action, which is
    not necessarily the probability it assigns to the behavior agent's action
    as written in the formula above - confirm against Agent.get_action.

    @Param:
    1. evaluation_agent - agent class object representing evaluation policy.
    2. behavior_agent - agent class object representing behavior policy.
    3. environment - optional gym-style environment exposing reset() and
       step(action); defaults to the notebook-level `env`.
    @Returns:
    - ratio : product over the episode of prob_eval / prob_behv.
    - total_reward : return (sum of rewards) collected by behavior_agent.
    """
    if environment is None:
        environment = env  # fall back to the environment created at the top of the notebook

    ratio = 1
    total_reward = 0
    state = environment.reset()
    done = False
    while not done:
        # eps=0 is passed so both agents act without exploration (per Agent.get_action).
        action, prob_behv = behavior_agent.get_action(state, eps=0)
        _, prob_eval = evaluation_agent.get_action(state, eps=0)

        ratio *= float(prob_eval / prob_behv)

        # Only the behavior agent's action is executed in the environment.
        state, reward, done, _ = environment.step(action)
        total_reward += reward

    return ratio, total_reward

## Step 4. Get Trajectories

$$ \sigma = \sum\limits_{i=1}^N {R_k}^i \times \xi_k  $$

In [5]:
def get_trajectories(policy=None, num_episodes=None, show=None, environment=None,
                     min_mean_reward=195):
    """
    Rolls out complete episodes with a single agent and records every
    transition together with the return of each episode.

    NOTE: this function only collects rollouts; it does not compute the
    importance ratio or the weighted sum from the formula above.

    Every argument is optional. When one is omitted, the notebook-level
    global of the original implementation is used instead
    (`agent`, `N`, `render`, `env`).

    @Param:
    1. policy - agent exposing get_action(state, eps=...); defaults to `agent`.
    2. num_episodes - number of episodes to roll out; defaults to `N`.
    3. show - when truthy, the environment is rendered after every step;
       defaults to `render`.
    4. environment - gym-style environment exposing reset(), step(action)
       and render(); defaults to `env`.
    5. min_mean_reward - minimum mean episode return required of the
       policy (195 by default).
    @Returns:
    - trajectories : 1-D object array with one entry per episode; each entry
      is the list of (state, action, reward, next_state) tuples of that episode.
    - rewards : array holding the total reward of each episode.
    @Raises:
    - AssertionError: the mean episode return is below min_mean_reward.
    """
    if policy is None:
        policy = agent
    if num_episodes is None:
        num_episodes = N
    if show is None:
        show = render
    if environment is None:
        environment = env

    trajectories = []
    rewards = []

    for _ in range(num_episodes):
        transitions = []
        total_reward = 0
        state = environment.reset()
        done = False
        while not done:
            action, _prob = policy.get_action(state, eps=0)
            next_state, reward, done, _info = environment.step(action)
            total_reward += reward
            transitions.append((state, action, reward, next_state))
            state = next_state
            if show:
                environment.render()
        trajectories.append(transitions)
        rewards.append(total_reward)

    assert np.mean(rewards) >= min_mean_reward, (
        "mean episode reward is below the expert-policy threshold"
    )

    # Episodes differ in length, so np.array(trajectories) would either fail
    # (recent NumPy rejects ragged input) or change shape with the episode
    # lengths. Fill an object array explicitly: always one entry per episode.
    trajectory_array = np.empty(len(trajectories), dtype=object)
    for position, transitions in enumerate(trajectories):
        trajectory_array[position] = transitions

    return trajectory_array, np.array(rewards)