In [1]:
import gym
from gym.core import Env
from collections import deque

import torch
import torch.nn.functional as F

import math
import numpy as np
import matplotlib.pyplot as plt

from utils import DQN, ReplayBuffer, greedy_action, epsilon_greedy, update_target, loss, test_agent, fit_agent, plot_data_for_states, eps_sequence, get_n_episodes

In [2]:
# Experiment-level settings:
#   NUM_RUNS - how many times the training loop below calls fit_agent
#   BASE_EPS - forwarded to fit_agent as `base`; also determines N_EPISODES
NUM_RUNS, BASE_EPS = 10, 0.995

# Episode count is derived from BASE_EPS by the project helper.
N_EPISODES = get_n_episodes(BASE_EPS)

# Two spaces after '=' reproduce the separator of the original two-argument print.
print(f"Number of episodes in runs =  {N_EPISODES}")

Number of episodes in runs =  600


In [3]:
# Hyper-parameters; every value here is forwarded to fit_agent by keyword
# (buffer_size, batch_size, collecting_steps, update_frequency, gamma, delay,
# layers, activations, lr, p_shuffle).

# Replay buffer / sampling
BUFFER_SIZE = 3_000
BATCH_SIZE = 35
COLLECTING_STEPS = 400
P_SHUFFLE = 0.15

# Learning schedule
TARGET_UPDATE_FREQUENCY = 80
DELAY = 100
GAMMA = 1.0
LR = 1e-3

# Network description
# NOTE(review): presumably 4 state inputs -> 15 hidden units -> 2 action values,
# with one activation per layer transition - confirm against utils.DQN.
LAYERS = [4, 15, 2]
ACTIVATIONS = [F.tanh, F.leaky_relu]

In [4]:
# Per-run accumulators; index i of each list belongs to run i.
runs_results = []          # episode_durations returned by fit_agent
loss_results = []          # loss_progress returned by fit_agent
performance_results = []   # results_of_test returned by fit_agent

env = gym.make('CartPole-v1')

# All runs share the same settings, so the keyword arguments are assembled once.
fit_kwargs = {
  'env': env,
  'base': BASE_EPS,
  'delay': DELAY,
  'buffer_size': BUFFER_SIZE,
  'p_shuffle': P_SHUFFLE,
  'batch_size': BATCH_SIZE,
  'collecting_steps': COLLECTING_STEPS,
  'update_frequency': TARGET_UPDATE_FREQUENCY,
  'gamma': GAMMA,
  'layers': LAYERS,
  'activations': ACTIVATIONS,
  'lr': LR,
}

for run in range(NUM_RUNS):
  print(f"Run {run + 1} of {NUM_RUNS}")

  # fit_agent returns five values; agent and buffer are not stored per run,
  # so after the loop they refer to the last run only.
  fit_output = fit_agent(**fit_kwargs)
  agent, buffer, episode_durations, loss_progress, results_of_test = fit_output

  runs_results.append(episode_durations)
  loss_results.append(loss_progress)
  performance_results.append(results_of_test)

print('Complete')

  deprecation(
  deprecation(


Run 1 of 10
Progress: 0% 10% 20% 

KeyboardInterrupt: 

In [None]:
# Aggregate the episode durations of all runs: axis 0 indexes the run,
# so the statistics below are taken across runs for each episode position.
reward_results = np.array(runs_results)

mean_duration = reward_results.mean(axis=0)
std_duration = reward_results.std(axis=0)

# One-standard-deviation band around the mean; the lower edge is floored at 0.
upper_bound = mean_duration + std_duration
lower_bound = np.maximum(mean_duration - std_duration, 0.0)

In [None]:
# Episode numbers start at 1; one x position per recorded episode of the first run.
x_axis_length = len(runs_results[0]) + 1
x_axis = np.arange(1, x_axis_length)

# Mean duration per episode with a +/- one standard deviation band.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(x_axis, mean_duration, color='r', label='Mean number of steps')
ax.fill_between(x_axis, lower_bound, upper_bound, alpha=0.4, color='gray', label='STD in number of steps')
ax.set_xlabel('Number of episodes')
ax.set_ylabel('Livetime of the cart')
ax.legend()
ax.set_title(label='Learning progress for DQN', loc='center')
plt.show()

In [None]:
# Flatten the per-run test results into one 1-D sample.
performance_results = np.array(performance_results).reshape(-1)

# Summary statistics are computed once and reused for the three marker lines.
performance_mean = np.mean(performance_results)
performance_std = np.std(performance_results)

# This histogram gives a more or less clear picture of the performance.
plt.figure(figsize=(8, 6))
plt.title(label='Results of learning', loc='center')
plt.hist(performance_results, bins=50, color='c', edgecolor='k')

# Mean and mean +/- one standard deviation.
plt.axvline(performance_mean, color='r', linestyle='dashed', label='Mean Value')
plt.axvline(performance_mean - performance_std, color='y', linestyle='dashed', label='Standard Deviation')
plt.axvline(performance_mean + performance_std, color='y', linestyle='dashed')

plt.xlabel('Lifetime of the pole')
plt.ylabel('Frequency')
# The stop value is 501 so that the tick at 500 is included.
# NOTE(review): 500 is the documented step cap of CartPole-v1; confirm test_agent
# does not apply a different limit.
plt.xticks(np.arange(0, 501, 25))
plt.legend()
plt.show()

In [None]:
# Aggregate the loss curves of all runs; axis 0 indexes the run.
error_results = np.array(loss_results)

mean_loss = error_results.mean(axis=0)
std_loss = error_results.std(axis=0)

# NOTE: upper_bound / lower_bound are rebound here; they previously held the
# duration band, so the duration plot must not be re-run after this cell.
upper_bound = mean_loss + std_loss
lower_bound = np.maximum(mean_loss - std_loss, 0.0)

In [None]:
# Size the x axis from the plotted series itself. The original used the length
# of the episode-duration record, which only works if both have the same length.
x_axis_length = len(mean_loss) + 1
x_axis = np.arange(1, x_axis_length)

# Mean loss with a +/- one standard deviation band (lower edge floored at 0).
plt.figure(figsize=(20, 8))
plt.plot(x_axis, mean_loss, color='r', label='Mean loss over learning')
plt.fill_between(x_axis, lower_bound, upper_bound, alpha=0.4, color='gray', label='STD in loss over learning')
# NOTE(review): the label assumes fit_agent records one loss value per episode - confirm.
plt.xlabel('Number of episodes')
plt.ylabel('Loss value')
plt.legend()
plt.title(label='Loss dynamics over learning', loc='center')
plt.show()

In [None]:
# Train one more agent with the same settings as the runs above. Only the first
# value returned by fit_agent is kept, as `model`, for the evaluation and
# state-space plots that follow.
final_fit_kwargs = {
    'env': env,
    'base': BASE_EPS,
    'delay': DELAY,
    'buffer_size': BUFFER_SIZE,
    'p_shuffle': P_SHUFFLE,
    'batch_size': BATCH_SIZE,
    'collecting_steps': COLLECTING_STEPS,
    'update_frequency': TARGET_UPDATE_FREQUENCY,
    'gamma': GAMMA,
    'layers': LAYERS,
    'activations': ACTIVATIONS,
    'lr': LR,
}
model, _, _, _, _ = fit_agent(**final_fit_kwargs)

In [None]:
# Evaluate the trained model; test_agent is asked for 200 runs.
results_of_test = test_agent(model, env, n_runs=200)

# One point per entry of results_of_test, plotted against its index.
plt.scatter(range(len(results_of_test)), results_of_test, color='r', marker='.')
plt.xlabel('Test run index')
plt.ylabel('Number of steps')
plt.title(label='Test results of the trained agent', loc='center')
plt.show()

print('Average lifetime = ', round(np.mean(results_of_test), 2))

In [None]:
# Grid definition for the state-space plots below.
N_SPLIT = 500  # number of sample points along each of the two swept axes

# Symmetric sweep limits for the pole angle and the pole angular velocity.
# NOTE(review): 0.2095 rad is presumably CartPole's ~12 degree termination angle - confirm.
ANGLE_LIMIT = 0.2095
VELOCITY_LIMIT = 5.0

# The cart position is held fixed; each listed cart velocity is evaluated separately.
cart_pos = 0.0
cart_velocities = [0.0, 0.5, 1.0, 2.0]

# linspace includes both end points by default.
pole_angles = np.linspace(-ANGLE_LIMIT, ANGLE_LIMIT, N_SPLIT)
pole_velocities = np.linspace(-VELOCITY_LIMIT, VELOCITY_LIMIT, N_SPLIT)

In [None]:
# For every cart velocity, evaluate the model on a grid of (pole angle, pole
# angular velocity) states and store, keyed by cart velocity:
#   resulting_actions  -> (angle grid, velocity grid, argmax over the model outputs)
#   resulting_q_values -> (angle grid, velocity grid, max over the model outputs)
resulting_actions = {}
resulting_q_values = {}

# The grid does not depend on the cart velocity, so it is built once and the
# same arrays are shared by every dictionary entry.
angle, velocity = np.meshgrid(pole_angles, pole_velocities)
n_states = angle.size

# Inference only: no_grad avoids building an autograd graph for the whole grid.
with torch.no_grad():
  # Iterate the configured list instead of a hard-coded copy of it.
  for cart_velocity in cart_velocities:
    # One row per state: (cart position, cart velocity, pole angle, pole velocity).
    states = np.array([np.repeat(cart_pos, n_states),
                       np.repeat(cart_velocity, n_states),
                       angle.ravel(),
                       velocity.ravel()]).T

    # float32 matches what the legacy torch.Tensor(...) constructor produced.
    batch = torch.as_tensor(states, dtype=torch.float32)
    # NOTE(review): the batch is created on the CPU; assumes the model is on the CPU too - confirm.
    results = model(batch).cpu().numpy().reshape((*angle.shape, -1))

    resulting_actions[cart_velocity] = (angle, velocity, np.argmax(results, axis=2))
    resulting_q_values[cart_velocity] = (angle, velocity, np.max(results, axis=2))

In [None]:
# Plot the selected actions as a function of the state (discrete colour map requested).
plot_data_for_states(resulting_actions, title='Actions depending on states', discrete=True)

# Plot the Q-values as a function of the state.
plot_data_for_states(resulting_q_values, title='Q-Values depending on states')