## Analysis on the initial state selection

### Hypothesis: Initial state selection affects the performance of the PbPI algorithm

#### Analysis method:

- Design two sets of state pools with an equal number of states
    - First, generate a large pool of states by letting the agent follow a random policy for 1000 steps; if an episode terminates while executing this number of actions, the environment is reset and the remaining actions are executed.
    - Afterwards, derive three sets of initial states using the following approach:
        - From the initial pool of states, randomly sample 50 states to derive an initial state distribution in which the pendulum angle is skewed to the right.
        - From the initial pool of states, randomly sample 50 states to derive an initial state distribution in which the pendulum angle is skewed to the left.
        - From the initial pool of states, divide the captured pendulum angle (state) into 10 equidistant partitions and sample 4 states from each partition:
            - When partitioning the pendulum angle values, I define lower and upper bounds on the pendulum angle of -45 degrees (-0.77 rads) and +45 degrees (+0.77 rads).
            - This is based on the assumption that once the pendulum exceeds these thresholds, the agent will not be able to balance the pendulum again, regardless of how large the magnitude of the applied force is.

In [2]:
########################################
### importing the necessary packages ###
########################################

import gym
from gym import wrappers
import custom_cartpole  # custom cart-pole environment

import numpy as np
import pandas as pd

import random
from scipy.stats import rankdata as rd
from scipy import stats

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from IPython import display
from IPython.display import HTML

import io
import base64
import itertools
import tqdm
import os

In [2]:
########################################
########## helper functions ##########
########################################


# generate a random action from a given environment
def random_action(environment, seed=10):
    """Sample one action from the action space of `environment`.

    Args:
        environment: object exposing an `action_space` with a `sample()` method.
        seed: accepted for interface compatibility; it is currently NOT used,
            so the draw depends on whatever state the action space's random
            generator is already in.

    Returns:
        Whatever `environment.action_space.sample()` returns.
    """
    # NOTE: seeding is intentionally left to the caller; re-seeding here on
    # every call would make each draw return the same action.
    action_space = environment.action_space
    sampled_action = action_space.sample()
    return sampled_action

In [5]:
# Create the custom cart-pole environment (id 'CustomCartPole-v0').
env = gym.make('CustomCartPole-v0') 

# Start a fresh episode; the value returned by reset() is echoed as the cell output.
env.reset()

array([ 0.01495557, -0.03289365,  0.00216083,  0.04515281])

In [36]:
# Inspect the lower bound of the third observation component (index 2).
env.observation_space.low[2]

-3.1415927

In [27]:
# Seed the action space's random generator, then draw a single action
# so its value, shape and dtype can be inspected in the cell output.
env.action_space.np_random.seed(1)
env.action_space.sample()

array([[-0.16595599]], dtype=float32)

In [None]:
# We are only interested in the pole angle and the pole velocity.
# They correspond to the third and fourth values of the state vector.



In [2]:
# generate a list of initial states from a given environment
def generate_init_states_S(seed
                           , env = 'CustomCartPole-v0'
                           , sample_size = 10 # how many states to include in the sample
                          ):
    """Return a reproducible random sample of initial states from an environment.

    The environment is reset to the all-zero state and driven with random
    actions. Every non-terminal state returned by `step` is collected. When an
    episode terminates, the environment is reset to the all-zero state again,
    the 7 most recently collected states are discarded (so that states close
    to termination do not end up in the pool), and 7 extra steps are scheduled
    to compensate. Duplicate states are then removed and `sample_size` states
    are drawn without replacement.

    Args:
        seed: seed used for NumPy, the environment, its action space and the
            final sampling step, so the same seed yields the same sample.
        env: id of the gym environment to instantiate.
        sample_size: number of states to return.

    Returns:
        A list with `sample_size` distinct states, each as returned by
        `env.step`.

    Raises:
        ValueError: if fewer than `sample_size` unique states were collected.
    """

    # number of most recent states dropped whenever an episode terminates
    n_trim = 7

    # set the random seed for reproducibility
    np.random.seed(seed)

    # define how many initial states to generate altogether (201..300)
    n_states = np.random.randint(low=201, high=301)

    # This draw always equals `sample_size` (the interval holds one value);
    # it is kept so the global NumPy random stream advances exactly as before.
    n_states_sample = np.random.randint(low=sample_size, high=sample_size+1)

    # list of collected candidate initial states
    init_states_S = []

    # Use a separate name so the `env` argument (the environment id) is not
    # shadowed by the environment object.
    environment = gym.make(env)
    try:
        environment.action_space.np_random.seed(seed) # set env. seeds for reproducibility
        environment.seed(seed) # set env. seeds for reproducibility
        environment.reset(init_state = np.array([0,0,0,0]))

        # generate initial states
        s_count = 0
        while s_count < n_states:

            # step through the environment by taking random actions
            state, _, done, _ = environment.step(environment.action_space.sample())

            # On termination: restart from the all-zero state, skip the
            # terminal state itself, drop the last `n_trim` collected states
            # and raise the target by the same amount to compensate.
            if done:
                environment.reset(init_state = np.array([0,0,0,0]))
                n_states += n_trim
                init_states_S = init_states_S[:-n_trim]
                continue

            # append the observed state to the initial state list
            init_states_S.append(state)
            s_count += 1
    finally:
        # release the environment even if stepping raised
        environment.close()

    # Remove duplicate states, keeping the first occurrence of each.
    # The key joins the component values with a separator so that two
    # different states cannot produce the same concatenated string.
    unique_states = {}
    for state in init_states_S:
        key = ",".join(str(item.reshape(-1)[0]) for item in state.flatten())
        unique_states.setdefault(key, state)
    init_states_S = list(unique_states.values())

    # Sample the required number of states (uniform, without replacement)
    # with a generator seeded from `seed`; the module-level `random` state is
    # neither relied upon nor modified, so the result depends only on `seed`.
    rng = random.Random(seed)
    sampled_states = rng.sample(init_states_S, n_states_sample)

    return sampled_states
    

# partition the action space of a given environment 
def partition_action_space(env_name: str
                           , n_actions: int):
    """Partition the action space of an environment into a given number of actions.

    Args:
        env_name: id of the gym environment whose action space is partitioned.
        n_actions: number of evenly spaced action values to produce.

    Returns:
        A 1-D NumPy array of `n_actions` evenly spaced values running from the
        lower to the upper bound of the action space (both ends included).
    """

    # initialize environment
    env = gym.make(env_name)
    try:
        # the bounds are read from element [0, 0] of the action-space limits
        lower_bound = env.action_space.low[0,0]
        upper_bound = env.action_space.high[0,0]
    finally:
        # the environment is only needed for its bounds; release it right away
        env.close()

    # partition the action space to a given number of actions
    part_act_space = np.linspace(lower_bound, upper_bound, n_actions)

    return part_act_space


########################################