### Main program: estimating the error function for Off-Policy Evaluation (OPE)

In [1]:
# !pip install Box2D
# !pip install 'gym[all]'
# !pip install pyyaml
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict

import time
import sys
from tqdm import tqdm
import yaml
import re

from dqnetwork import DQNetwork
from agent import Agent

import torch
import torch.nn as nn
import torch.nn.functional as F

from os import getcwd, listdir
from os.path import isabs, isfile, join

In [2]:
## Load the environment
# `env` is a notebook-level global: get_trajectories() below steps this environment
# directly, so this cell must run before any trajectories are collected.
env_id = 'CartPole-v0'
env = gym.make(env_id)

## Step 1. Load Policies

In [3]:
### Load all model policies - e.g. "../model/policies/behavior_policy_x.pth"
def get_policies(path, config):
    """
    Loads all policies in a given directory into agents.

    Policy files and config files are paired by the last integer that appears
    in their file names. Directory entries that are not regular files, or whose
    names contain no digits, are ignored. Agents are returned in ascending
    order of that number, so the result does not depend on the (arbitrary)
    order in which the operating system lists the directory.

    @Param:
    1. path - directory holding the policy files (relative paths are resolved against the cwd).
    2. config - directory holding the YAML configuration of the corresponding models.
    @Return:
    - policies: list of Agent, ordered by policy number
    @Raises:
    - KeyError when a config file has no policy file carrying the same number.
    """
    cwd = getcwd()  # relative directories are resolved against the working directory

    def numbered_files(directory):
        """Maps file number -> file name for the numbered regular files in `directory`."""
        found = {}
        for name in listdir(directory):
            digits = re.findall(r'[0-9]+', name)
            # Skip sub-directories and files without a number instead of crashing on them.
            if digits and isfile(join(directory, name)):
                found[int(digits[-1])] = name
        return found

    # Policy paths stay relative to `path`; config paths are anchored at the cwd.
    policy_files = {num: join(path, name)
                    for num, name in numbered_files(join(cwd, path)).items()}
    yaml_files = {num: join(cwd, config, name)
                  for num, name in numbered_files(join(cwd, config)).items()}

    policies = []
    for num in sorted(yaml_files):
        if num not in policy_files:
            raise KeyError("no policy file numbered {} in {!r} for config {!r}".format(
                num, path, yaml_files[num]))
        with open(yaml_files[num]) as file:
            # NOTE(review): FullLoader can build more than plain data types; consider
            # yaml.safe_load if these configs can ever come from an untrusted source.
            config_data = yaml.load(file, Loader=yaml.FullLoader)
        info = config_data[0]  # first entry of the config holds the layer sizes
        # The config file is already closed here; only the parsed sizes are needed.
        policies.append(Agent(fc1=info['fc1'], fc2=info['fc2'], path=policy_files[num]))
    return policies

In [4]:
# Build one Agent per config file under ../model/config, each paired with the policy file
# of the same number under ../model/policies (both resolved against the working directory).
agents = get_policies("../model/policies", "../model/config")

Model loaded into local and target networks!
Model loaded into local and target networks!
Model loaded into local and target networks!


## Step 2. Get Trajectories

In [5]:
def get_trajectories(agent, N=10, render=False, min_return=195, eps=0.01, environment=None):
    """
    Rolls out N episodes with `agent` and collects every transition.
    The mean return over the N episodes must reach `min_return`; otherwise the
    loaded model is not considered an expert policy.
    @Param:
    1. agent - (Agent) object exposing get_action(state, eps=...).
    2. N - number of trajectories/episodes to run.
    3. render - (bool) render the environment after every step.
    4. min_return - minimum mean return required of the agent; None disables the check.
    5. eps - epsilon passed to agent.get_action for every step.
    6. environment - environment to roll out in; defaults to the notebook-level `env`.
    @Return:
    1. trajectories - 1-D object array of length N; entry i is the list of
       (state, action, reward, next_state) tuples of episode i.
    2. returns - array of length N with the total reward of each episode.
    @Raises:
    - AssertionError when the mean return is below `min_return` (or N is 0).
    """
    sim = env if environment is None else environment  # fall back to the global env

    trajectories = []
    rewards = []

    for _ in range(N):
        transitions = []
        total_reward = 0
        reset_out = sim.reset()
        # Newer Gym releases return (observation, info) from reset(); older ones
        # return the observation alone.
        if isinstance(reset_out, tuple) and len(reset_out) == 2 and isinstance(reset_out[1], dict):
            state = reset_out[0]
        else:
            state = reset_out

        done = False
        while not done:
            action = agent.get_action(state, eps=eps)
            step_out = sim.step(action)
            if len(step_out) == 5:
                # Newer Gym API: (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, _ = step_out
                done = bool(terminated or truncated)
            else:
                # Classic Gym API: (obs, reward, done, info)
                next_state, reward, done, _ = step_out
            total_reward += reward
            transitions.append((state, action, reward, next_state))  # store transition
            state = next_state
            if render:
                sim.render()  # display agent

        trajectories.append(transitions)
        rewards.append(total_reward)

    # Explicit raise (not `assert`) so the check survives `python -O` and carries a message.
    if min_return is not None:
        mean_return = float(np.mean(rewards)) if rewards else float('nan')
        if not mean_return >= min_return:  # also rejects NaN (no episodes)
            raise AssertionError(
                "mean return {:.2f} over {} episode(s) is below the required {}".format(
                    mean_return, len(rewards), min_return))

    # Episodes differ in length and each transition mixes arrays with scalars, so
    # np.array(trajectories) would be ragged; fill a 1-D object array explicitly.
    packed = np.empty(len(trajectories), dtype=object)
    for i, episode in enumerate(trajectories):
        packed[i] = episode

    return packed, np.array(rewards)

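## Step 3. Generate Policy Matrix
<p> The policy matrix has dimensions (K, K - 1), where K is the total number of policies: each of the K rows corresponds to one evaluation policy, and its K - 1 entries are the remaining policies, which serve as its behavior policies. </p>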

In [6]:
def policy_matrix(agents):
    """
    Builds the policy matrix as a dictionary with one entry per agent.

    Each agent in turn is used as a key (the evaluation policy); its value is
    the list of every agent at a different position in `agents` (the behavior
    policies), kept in their original order. For n agents this yields n keys
    with n - 1 values each.
    """
    return {
        evaluation: [behavior for col, behavior in enumerate(agents) if col != row]
        for row, evaluation in enumerate(agents)
    }

In [7]:
# Notebook-level lookup table read by get_behavior_policies():
# each key is an evaluation policy (an agent) and its value is the list of
# all the other agents, which act as the behavior policies for that key.
policy_dict = policy_matrix(agents)

In [8]:
def get_behavior_policies(evaluation_policy):
    """
    Looks up the behavior policies stored for an evaluation policy.

    Reads the notebook-level `policy_dict`; returns None when the given
    evaluation policy is not one of its keys.
    """
    if evaluation_policy not in policy_dict:
        return None
    return policy_dict[evaluation_policy]