In [0]:
# import OpenAI gym
import gym

# create the CartPole-v0 environment; `env` is the single environment object
# that the cells below step and reset
env = gym.make('CartPole-v0')

# reset the env so a new episode is started before any env.step() call
env.reset()

### Display CartPole

In [0]:
# Drive CartPole with random actions for a fixed budget of 100 steps.
steps_remaining = 100
while steps_remaining > 0:
    # To watch the cart, uncomment the next line (optional):
    # env.render()

    # Sample uniformly from the set of actions the env accepts.
    action = env.action_space.sample()

    # Apply the action; the env hands back state, reward, done and info.
    state, reward, done, info = env.step(action)

    # A finished episode has to be reset before it can be stepped again.
    if done:
        env.reset()

    steps_remaining -= 1

### The Reinforcement Learning Loop

In [0]:
episodes = 10

# Start from a fresh episode: earlier cells may have left the env part-way
# through one, which would make the first episode's total too small.
env.reset()

# run environment for `episodes` episodes, acting randomly
for ep in range(episodes):
    # sum of the rewards returned by env.step() during this episode
    episode_reward = 0.0
    done = False
    while not done:
        # randomly choose an action from all available actions
        action = env.action_space.sample()
        # agent takes an action and interacts with the env, receiving state, reward, done and info
        state, reward, done, info = env.step(action)
        # accumulate the reward the env actually returned (not a step count)
        episode_reward += reward
    print("Episode {} done with reward: {}".format(ep, episode_reward))
    # the episode is over: reset so the next episode (or next cell) starts fresh
    env.reset()

Episode 0 done with reward: 25
Episode 1 done with reward: 10
Episode 2 done with reward: 103
Episode 3 done with reward: 24
Episode 4 done with reward: 14
Episode 5 done with reward: 18
Episode 6 done with reward: 37
Episode 7 done with reward: 48
Episode 8 done with reward: 15
Episode 9 done with reward: 11


### Episodes and Timesteps

In [0]:
episodes = 10
# upper bound on the number of steps taken in a single episode
max_timesteps = 200

# Start from a fresh episode so episode 0 does not depend on what earlier
# cells did to the env.
env.reset()

# run environment for `episodes` episodes, each capped at `max_timesteps` steps
for ep in range(episodes):
    timestep = 0
    done = False
    while not done and timestep < max_timesteps:
        # randomly choose an action from all available actions
        action = env.action_space.sample()
        # agent takes an action and interacts with the env, receiving state, reward, done and info
        state, reward, done, info = env.step(action)
        timestep += 1

    if done:
        print("Episode {} done after {} timesteps".format(ep, timestep))
    else:
        # the cap ended the loop before the env reported done
        print("Episode {} stopped at the limit of {} timesteps".format(ep, max_timesteps))

    # Reset however the episode ended; otherwise an episode cut off by the cap
    # would silently continue as the next "episode".
    env.reset()

Episode 0 done after 15 timesteps
Episode 1 done after 12 timesteps
Episode 2 done after 23 timesteps
Episode 3 done after 16 timesteps
Episode 4 done after 18 timesteps
Episode 5 done after 14 timesteps
Episode 6 done after 29 timesteps
Episode 7 done after 33 timesteps
Episode 8 done after 42 timesteps
Episode 9 done after 68 timesteps


### Interacting with the Environment: actions, done and env.step()

In [0]:
episodes = 1

# keep the run short so every step's output can be read
max_timesteps = 20

# Start from a fresh episode so the printed trace begins at an initial state.
env.reset()

for ep in range(episodes):
    timestep = 0
    done = False
    while not done and timestep < max_timesteps:
        # randomly choose an action from all available actions
        action = env.action_space.sample()
        # agent takes an action and interacts with the env, receiving state, reward, done and info
        state, reward, done, info = env.step(action)
        timestep += 1
        print("Timestep {}: agent took action {}".format(timestep, action))
        print("Timestep {}: state {}, reward {}, done {}, info {}".format(timestep, state, reward, done, info))

    # Reset whether the episode finished or was cut off by max_timesteps, so
    # the env is never left part-way through an episode for later code.
    env.reset()

Timestep 1: agent took action 0
Timestep 1: state [0.04505125 0.00220196 0.01351047 0.00187966], reward 1.0, done False, info {}
Timestep 2: agent took action 1
Timestep 2: state [ 0.04509529  0.19712758  0.01354806 -0.28651012], reward 1.0, done False, info {}
Timestep 3: agent took action 0
Timestep 3: state [0.04903784 0.00181506 0.00781786 0.01041478], reward 1.0, done False, info {}
Timestep 4: agent took action 0
Timestep 4: state [ 0.04907414 -0.19341814  0.00802616  0.30555405], reward 1.0, done False, info {}
Timestep 5: agent took action 1
Timestep 5: state [0.04520578 0.00158853 0.01413724 0.01541314], reward 1.0, done False, info {}
Timestep 6: agent took action 1
Timestep 6: state [ 0.04523755  0.19650491  0.0144455  -0.272776  ], reward 1.0, done False, info {}
Timestep 7: agent took action 1
Timestep 7: state [ 0.04916764  0.39141779  0.00898998 -0.56086799], reward 1.0, done False, info {}
Timestep 8: agent took action 1
Timestep 8: state [ 0.056996    0.58641243 -0.002