In [None]:
'''
FULL RETURNS
'''

import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple


# Create the environment (Acrobot-v1 from OpenAI Gym). This module-level `env`
# is the object stepped by `env_step` and reset by the training loops.
env = gym.make("Acrobot-v1")

# Set seed for experiment reproducibility: the same value seeds the
# environment, TensorFlow and NumPy.
seed = 42
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

# Small epsilon value for stabilizing division operations: float32 machine
# epsilon as a Python float. It is added to the standard deviation when the
# returns are standardized in `get_expected_return`.
eps = np.finfo(np.float32).eps.item()

In [None]:
class ActorCritic(tf.keras.Model):
  """Actor-critic network with a shared two-layer ReLU trunk and two heads.

  The policy head emits one unnormalised logit per action; the value head
  emits a single scalar state-value estimate.
  """

  def __init__(
      self,
      num_actions: int,
      num_hidden_units1: int,
      num_hidden_units2: int):
    """Create the trunk and head layers.

    Args:
      num_actions: Number of outputs of the policy (actor) head.
      num_hidden_units1: Width of the first trunk layer (`common1`).
      num_hidden_units2: Width of the second trunk layer (`common2`).
    """
    super().__init__()

    # Attribute order is kept as-is (second trunk layer is created first);
    # the order in which the layers are *applied* is fixed in `call`.
    self.common2 = layers.Dense(units=num_hidden_units2, activation="relu")
    self.common1 = layers.Dense(units=num_hidden_units1, activation="relu")
    self.actor = layers.Dense(units=num_actions)
    self.critic = layers.Dense(units=1)

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Return `(action_logits, state_value)` for a batch of states."""
    trunk = self.common2(self.common1(inputs))
    logits = self.actor(trunk)
    value = self.critic(trunk)
    return logits, value

In [None]:
# Number of discrete actions, read from the environment's action space.
# NOTE(review): the previous inline note here said "2"; Acrobot-v1 is
# documented as having 3 actions — TODO confirm against the installed gym.
num_actions = env.action_space.n
num_hidden_units1 = 72  # width of the first shared hidden layer
num_hidden_units2 = 36  # width of the second shared hidden layer

# Single global model instance; it is the one trained by the loops below.
model = ActorCritic(num_actions, num_hidden_units1,num_hidden_units2)

In [None]:
# Wrap OpenAI Gym's `env.step` call as an operation in a TensorFlow function.
# This would allow it to be included in a callable TensorFlow graph.

def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Step the module-level `env` once and return NumPy-typed results.

  Args:
    action: Action passed unchanged to `env.step`.

  Returns:
    A `(state, reward, done)` tuple where the state is cast to float32 and
    the reward and done flag are 0-d int32 arrays. The fourth value returned
    by `env.step` is discarded.
  """
  next_state, step_reward, is_done, _info = env.step(action)

  next_state = next_state.astype(np.float32)
  step_reward = np.array(step_reward, np.int32)
  is_done = np.array(is_done, np.int32)
  return (next_state, step_reward, is_done)


def tf_env_step(action: tf.Tensor) -> List[tf.Tensor]:
  """Wrap `env_step` with `tf.numpy_function` so it can be called from a graph.

  Returns the state (float32), reward (int32) and done flag (int32) tensors.
  """
  output_dtypes = [tf.float32, tf.int32, tf.int32]
  return tf.numpy_function(env_step, [action], output_dtypes)

In [None]:
def run_episode(
    initial_state: tf.Tensor,  
    model: tf.keras.Model, 
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Runs a single episode to collect training data.

  Starting from `initial_state`, repeatedly samples an action from the model's
  policy logits, applies it through `tf_env_step`, and records per-step data
  until the environment reports done or `max_steps` steps have been taken.

  Args:
    initial_state: Unbatched state the episode starts from.
    model: Callable returning `(action_logits, value)` for a batched state.
    max_steps: Upper bound on the number of environment steps.

  Returns:
    `(action_probs, values, rewards)`: stacked per-step tensors holding the
    probability of the action that was taken (float32), the critic value
    (float32) and the reward (int32).
  """

  # Dynamically sized arrays, one entry per time step.
  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

  # Remembered so the static shape can be restored on the state that comes
  # back from `tf_env_step` inside the loop.
  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)
  
    # Run the model to get action logits and the critic value
    action_logits_t, value = model(state)
  
    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store the probability (not the log-probability) of the action chosen;
    # the logarithm is taken later in `compute_loss`.
    action_probs = action_probs.write(t, action_probs_t[0, action])
  
    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(action)
    state.set_shape(initial_state_shape)
  
    # Store reward
    rewards = rewards.write(t, reward)

    # Stop as soon as the environment reports the episode is over.
    if tf.cast(done, tf.bool):
      break

  # Convert the TensorArrays into ordinary tensors.
  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()
  
  return action_probs, values, rewards

In [None]:
def get_expected_return(
    rewards: tf.Tensor, 
    gamma: float, 
    standardize: bool = True) -> tf.Tensor:
  """Compute expected returns per timestep.

  For every time step the discounted sum of that step's reward and all later
  rewards is computed with discount factor `gamma`.

  Args:
    rewards: 1-D tensor of per-step rewards (cast to float32 internally).
    gamma: Discount factor applied once per step into the future.
    standardize: If true, subtract the mean of the returns and divide by
      their standard deviation plus the module-level `eps`.

  Returns:
    A float32 tensor with one return per entry of `rewards`.
  """

  n = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=n)

  # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
  rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  discounted_sum = tf.constant(0.0)
  discounted_sum_shape = discounted_sum.shape
  for i in tf.range(n):
    reward = rewards[i]
    discounted_sum = reward + gamma * discounted_sum
    # Re-assert the scalar shape on the loop-carried value.
    discounted_sum.set_shape(discounted_sum_shape)
    returns = returns.write(i, discounted_sum)
  # The sums were written in reversed time order; flip back to forward order.
  returns = returns.stack()[::-1]

  if standardize:
    returns = ((returns - tf.math.reduce_mean(returns)) / 
               (tf.math.reduce_std(returns) + eps))

  return returns

In [None]:
# Summed (not averaged) Huber loss, used for the critic term.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,
    values: tf.Tensor,
    returns: tf.Tensor) -> tf.Tensor:
  """Return the sum of the actor and critic losses for one episode.

  Args:
    action_probs: Probabilities of the actions that were taken.
    values: Critic estimates for the visited states.
    returns: Returns computed for the same time steps.

  Returns:
    Scalar tensor: negative sum of `log(action_probs) * (returns - values)`
    plus the summed Huber loss between `values` and `returns`.
  """
  # Actor term: log-probabilities weighted by the advantage.
  weighted_log_probs = tf.math.log(action_probs) * (returns - values)
  policy_term = -tf.math.reduce_sum(weighted_log_probs)

  # Critic term.
  value_term = huber_loss(values, returns)

  return policy_term + value_term

In [None]:
# Optimizer shared by every call to `train_step` in this section.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)


@tf.function
def train_step(
    initial_state: tf.Tensor, 
    model: tf.keras.Model, 
    optimizer: tf.keras.optimizers.Optimizer, 
    gamma: float, 
    max_steps_per_episode: int) -> tf.Tensor:
  """Runs a model training step.

  Plays one full episode with the current model, computes the combined
  actor-critic loss over that episode and applies one optimizer update.

  Args:
    initial_state: State the episode starts from.
    model: Actor-critic model to run and update.
    optimizer: Optimizer that applies the gradients.
    gamma: Discount factor passed to `get_expected_return`.
    max_steps_per_episode: Step limit passed to `run_episode`.

  Returns:
    The sum of the rewards collected during the episode.
  """

  # The whole rollout runs under the tape so the loss can be differentiated
  # with respect to the model's variables.
  with tf.GradientTape() as tape:

    # Run the model for one episode to collect training data
    action_probs, values, rewards = run_episode(
        initial_state, model, max_steps_per_episode) 

    # Calculate expected returns
    returns = get_expected_return(rewards, gamma)

    # Convert training data to appropriate TF tensor shapes
    # (adds a trailing axis of size 1 to each tensor)
    action_probs, values, returns = [
        tf.expand_dims(x, 1) for x in [action_probs, values, returns]] 

    # Calculating loss values to update our network
    loss = compute_loss(action_probs, values, returns)

  # Compute the gradients from the loss
  grads = tape.gradient(loss, model.trainable_variables)

  # Apply the gradients to the model's parameters
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  episode_reward = tf.math.reduce_sum(rewards)

  return episode_reward

In [None]:
%%time

# Length of the window used for the running mean of episode rewards.
min_episodes_criterion = 100
max_episodes = 10000
max_steps_per_episode = 1000
 
# consecutive trials
# NOTE: `reward_threshold` is only referenced by the commented-out stop test
# near the end of the loop, so it currently has no effect.
reward_threshold = -100
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

# Keep last episodes reward
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

# rewards: total reward of every episode, in order
rewards = []

with tqdm.trange(max_episodes) as t:
  for i in t:
    # One training step = one full episode followed by one optimizer update.
    initial_state = tf.constant(env.reset(), dtype=tf.float32)
    episode_reward = int(train_step(
        initial_state, model, optimizer, gamma, max_steps_per_episode))
    
    rewards.append(episode_reward)
    
    # Mean over the most recent (at most `min_episodes_criterion`) episodes.
    episodes_reward.append(episode_reward)
    running_reward = statistics.mean(episodes_reward)
  
    t.set_description(f'Episode {i}')
    t.set_postfix(
        episode_reward=episode_reward, running_reward=running_reward)
  
    # Show average episode reward every 10 episodes
    # (currently a no-op: the print is disabled)
    if i % 10 == 0:
      pass # print(f'Episode {i}: average reward: {avg_reward}')
  
    # Early stopping is disabled, so the loop always runs `max_episodes` times.
    # if running_reward > reward_threshold and i >= min_episodes_criterion:  
    #     break

# NOTE(review): with the stop test disabled this message is printed after the
# last episode regardless of whether the threshold was reached.
print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')

In [None]:
%%time

from keras.initializers import glorot_uniform  # Or your initializer of choice
import keras.backend as K



# Hyper-parameters read as globals by `actor_critic()` below. They repeat the
# values set in the previous training cell.
min_episodes_criterion = 100  # window length of the running-reward deque
max_episodes = 10000          # episodes per call to `actor_critic()`
max_steps_per_episode = 1000  # step limit handed to `train_step`
 
# consecutive trials
# NOTE: only referenced by commented-out early-stopping code.
reward_threshold = -100
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

def actor_critic():
  """Train the global `model` for `max_episodes` episodes.

  Uses the module-level `env`, `model`, `optimizer`, `gamma`,
  `max_steps_per_episode` and `min_episodes_criterion`. The model's weights
  are not re-initialised here, so repeated calls continue training the same
  network.

  Returns:
    A list with the total reward of every episode, in order.
  """
  # Sliding window over the most recent episode rewards (for the progress bar).
  recent_rewards: collections.deque = collections.deque(maxlen=min_episodes_criterion)
  all_rewards = []

  with tqdm.trange(max_episodes) as progress:
    for episode in progress:
      start_state = tf.constant(env.reset(), dtype=tf.float32)
      total = int(train_step(
          start_state, model, optimizer, gamma, max_steps_per_episode))

      all_rewards.append(total)
      recent_rewards.append(total)
      window_mean = statistics.mean(recent_rewards)

      progress.set_description(f'Episode {episode}')
      progress.set_postfix(
          episode_reward=total, running_reward=window_mean)

  return all_rewards

# `accumator` collects one reward curve (list of episode rewards) per run.
# It is created only if it does not exist yet, so the cell works on a fresh
# kernel and still keeps accumulating runs when it is executed again.
if "accumator" not in globals():
  accumator = []

for _ in range(1):
  l = actor_critic()
  accumator.append(l)

In [None]:
# Plot, for every run, the trailing mean of the episode rewards over a window
# of at most 100 episodes that ends at (and includes) the current episode.
# NOTE: `avg_rew_plots` is created by the multi-run experiment cell further
# down in the notebook; that cell must have been executed first.
for l in avg_rew_plots:
    # `max(0, ...)` shortens the window at the start of the run, so the very
    # first point averages one episode instead of an empty slice (which gave
    # NaN and a RuntimeWarning).
    avg = [np.mean(l[max(0, i - 99):i + 1]) for i in range(len(l))]
    plt.plot(avg)
    plt.show()

In [None]:
'''
SINGLE STEP RETURNS
'''

import numpy as np
import datetime
import gym
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
import threading

In [None]:
class ActorCriticModel(tf.keras.Model):
    """
    Policy and state-value networks built on two shared ReLU hidden layers.
    """
    def __init__(self, action_size, n_hidden1=24, n_hidden2=24):
        """
        action_size: number of outputs of the softmax policy head
        n_hidden1, n_hidden2: widths of the first and second hidden layers
        """
        super().__init__()
        dense = tf.keras.layers.Dense

        # Shared hidden layers
        self.fc1 = dense(n_hidden1, activation='relu')
        self.fc2 = dense(n_hidden2, activation='relu')

        # Heads: action probabilities (softmax) and a scalar state-value
        self.pi_out = dense(action_size, activation='softmax')
        self.v_out = dense(1)

    def call(self, state):
        """
        Return (pi, v): the policy distribution and the state-value for `state`
        """
        hidden = self.fc2(self.fc1(state))
        pi = self.pi_out(hidden)
        v = self.v_out(hidden)
        return pi, v

In [None]:
class Agent:
    """
    One-step (TD(0)) actor-critic agent.

    Holds a single `ActorCriticModel` whose policy head is used to sample
    actions and whose value head provides the TD error for both losses.
    """
    def __init__(self, action_size, lr=3e-5, gamma=0.99):
        """
        action_size: number of discrete actions
        lr: learning rate of the Adam optimizer
        gamma: discount factor used in the TD target
        """
        self.gamma = gamma
        self.ac_model = ActorCriticModel(action_size=action_size)
        # compile() is used only to attach the optimizer to the model
        self.ac_model.compile(tf.keras.optimizers.Adam(learning_rate=lr))

    def sample_action(self, state):
        """
        Given a state, compute the policy distribution over all actions and sample one action
        """
        pi,_ = self.ac_model(state)

        action_probabilities = tfp.distributions.Categorical(probs=pi)
        sample = action_probabilities.sample()

        return int(sample.numpy()[0])

    def actor_loss(self, action, pi, delta):
        """
        Compute Actor Loss: -log pi(action | state) weighted by the TD error
        """
        return -tf.math.log(pi[0,action]) * delta

    def critic_loss(self,delta):
        """
        Critic loss aims to minimize TD error (squared)
        """
        return delta**2

    @tf.function
    def learn(self, state, action, reward, next_state, done):
        """
        For a given transition (s,a,s',r) update the parameters by computing the
        gradient of the total loss.

        state, next_state: batched (batch size 1) observations
        action: index of the action taken in `state`
        reward: scalar reward received
        done: truthy when `next_state` ended the episode; the bootstrap term
              is then dropped from the TD target
        """
        # The tape is used for exactly one gradient call, so it does not need
        # to be persistent.
        with tf.GradientTape() as tape:
            pi, V_s = self.ac_model(state)
            _, V_s_next = self.ac_model(next_state)

            V_s = tf.squeeze(V_s)
            # The bootstrap value is a fixed target: the gradient must be
            # blocked BEFORE delta is formed, otherwise stop_gradient has no
            # effect on the loss.
            V_s_next = tf.stop_gradient(tf.squeeze(V_s_next))

            # No bootstrapping from a state that ended the episode.
            # NOTE(review): `done` presumably is also set when the episode is
            # cut by a time limit; that case is treated as terminal here too.
            not_done = 1.0 - tf.cast(done, V_s.dtype)

            # TD error: r + gamma * V(s') - V(s)
            delta = (tf.cast(reward, V_s.dtype)
                     + self.gamma * V_s_next * not_done - V_s)

            # In the actor loss the TD error is only a weight, so the policy
            # gradient must not flow into the critic through it.
            loss_a = self.actor_loss(action, pi, tf.stop_gradient(delta))
            loss_c = self.critic_loss(delta)
            loss_total = loss_a + loss_c

        gradient = tape.gradient(loss_total, self.ac_model.trainable_variables)
        self.ac_model.optimizer.apply_gradients(zip(gradient, self.ac_model.trainable_variables))

### Train the Network

In [None]:
# env = gym.make('CartPole-v1')
# NOTE: this import rebinds the name `tqdm` (a module in the first cell of the
# notebook) to the notebook progress-bar class, so earlier cells that call
# `tqdm.trange(...)` presumably stop working once this cell has run — confirm.
from tqdm.notebook import tqdm_notebook as tqdm
# NOTE: this rebinds the global `env` that the full-returns section also uses.
env = gym.make('Acrobot-v1')

# print env info: action space, number of actions, and one initial
# observation reshaped to a single-row batch
print("Action Space: ", env.action_space)
print("Action Size: ", env.action_space.n)
print(env.reset().reshape(1,-1))


In [None]:

# Initializing Agent
agent = Agent(action_size=env.action_space.n)
# Number of episodes
episodes = 1000
tf.compat.v1.reset_default_graph()

reward_list = []          # total reward of every episode
average_reward_list = []  # running mean over the last (at most) 20 episodes
begin_time = datetime.datetime.now()

for ep in tqdm(range(1, episodes + 1)):
    state = env.reset().reshape(1,-1)
    done = False
    ep_rew = 0
    while not done:
        action = agent.sample_action(state)  # Sample action
        next_state, reward, done, info = env.step(action)  # Take action
        next_state = next_state.reshape(1,-1)
        ep_rew += reward  # Updating episode reward
        agent.learn(state, action, reward, next_state, done)  # Update parameters
        state = next_state  # Updating state
    reward_list.append(ep_rew)

    # Slicing with [-20:] already copes with fewer than 20 episodes, so no
    # separate branch is needed for the start of training.
    avg_20 = np.mean(reward_list[-20:])
    average_reward_list.append(avg_20)

    # Printed every episode. `avg_rew` is the mean over the last single
    # episode, i.e. it equals `ep_rew`.
    avg_rew = np.mean(reward_list[-1:])
    print('Episode ', ep, 'Reward %f' % ep_rew, 'Average Reward %f' % avg_rew)

    # Check the stopping criterion once every 20 episodes. (The previous test
    # `if ep % 20:` was true on every episode that is NOT a multiple of 20,
    # so a single lucky early episode could end training.)
    if ep % 20 == 0 and avg_20 > -100.0:
        print('Stopped at Episode ',ep-20)
        break

time_taken = datetime.datetime.now() - begin_time
print(time_taken)

In [None]:
### Plot of the running-average reward vs episode
# `average_reward_list` is filled by the training loop with the mean over the
# last (at most) 20 episodes, so the axis label states a 20-episode window.

plt.style.use('seaborn-poster')
# plt.figure(figsize = (10,8))
plt.xlabel('Episode')
plt.ylabel('Average reward, over last 20 episodes')
plt.plot(range(len(average_reward_list)),average_reward_list, 'b')
plt.show()


In [None]:
def actor_critic(env, episodes=10000, lr = 3e-5, gamma = 0.99):
    """
    Train a fresh one-step actor-critic `Agent` on `env`.

    NOTE: this definition replaces the zero-argument `actor_critic()` defined
    in the full-returns section once this cell has been run.

    env: environment exposing `reset()`, `step(action)` and `action_space.n`
    episodes: number of episodes to run
    lr: learning rate handed to the Agent
    gamma: discount factor handed to the Agent

    Returns (avg_rew, reward_list):
        avg_rew: mean reward of the 5 most recent episodes, refreshed every
                 5th episode; NaN if fewer than 5 episodes were run
        reward_list: total reward of every episode, in order
    """
    # Initializing Agent
    agent = Agent(lr=lr, gamma=gamma, action_size=env.action_space.n)
    tf.compat.v1.reset_default_graph()

    reward_list = []
    # Defined up front so the return statement cannot hit an unbound name
    # when `episodes` is smaller than 5.
    avg_rew = float("nan")

    for ep in tqdm(range(1, episodes + 1)):
        state = env.reset().reshape(1,-1)
        done = False
        ep_rew = 0
        while not done:
            action = agent.sample_action(state)  # Sample action
            next_state, reward, done, info = env.step(action)  # Take action
            next_state = next_state.reshape(1,-1)
            ep_rew += reward  # Updating episode reward
            agent.learn(state, action, reward, next_state, done)  # Update parameters
            state = next_state  # Updating state
        reward_list.append(ep_rew)

        if ep % 5 == 0:
            avg_rew = np.mean(reward_list[-5:])

    return avg_rew, reward_list


In [None]:
import threading
import queue

# Results of the worker threads are handed back through this queue.
my_queue = queue.Queue()

def storeInQueue(f):
  """Decorator: run `f` and put its result (or the exception it raised) on `my_queue`.

  Exactly one item is enqueued per call, so a consumer that started N workers
  can always collect N items and never blocks on a worker that crashed.
  """
  def wrapper(*args, **kwargs):
    try:
      my_queue.put(f(*args, **kwargs))
    except Exception as exc:
      my_queue.put(exc)
  return wrapper


@storeInQueue
def actor_critic_mt(env,episodes,lr,gamma):
   """Thread entry point: run `actor_critic` and return only the reward list."""
   _, l = actor_critic(env,episodes,lr,gamma)
   return l


experiments = 10
episodes = 1000
gamma = 0.99
lr = 3e-5
avg_rew_plots = np.zeros((experiments,episodes))

# Start one worker thread (with its own environment) per experiment.
threads = []
for e in range(experiments):
    try:
        env = gym.make('Acrobot-v1')
        t = threading.Thread(target=actor_critic_mt, args = (env,episodes,lr,gamma))
        t.start()
    except Exception as exc:
        # Report why the thread could not be started instead of hiding it.
        print("threading failed for e = ", e, repr(exc))
        break
    threads.append(t)

# Wait for every worker that was actually started.
for t in threads:
    t.join()

# Keep only as many rows as there are results, so runs that never started do
# not contribute all-zero rows to later statistics.
avg_rew_plots = avg_rew_plots[:len(threads)]

# Rows are filled in completion order, which need not match start order.
for e in range(len(threads)):
    result = my_queue.get()
    if isinstance(result, Exception):
        raise result
    avg_rew_plots[e] = result
    


In [None]:
# Variance of the episode reward across runs (axis 0 of `avg_rew_plots`),
# one value per episode.
# NOTE(review): the variable name says "avg" but `np.var` is computed —
# confirm which statistic is intended.
avg_avg_rew_plot = np.var(avg_rew_plots, axis=0)
plt.plot(avg_avg_rew_plot)


In [None]:
# Element-wise mean over the first (at most) 10 reward curves collected in
# `accumator`, i.e. the mean reward per episode across runs.
# NOTE: this rebinds the global `rewards` (a plain list in the training cell).
rewards = np.mean(accumator[:10], axis=0)
plt.plot(rewards)

In [None]:
import matplotlib.pyplot as plt
# Trailing mean of `rewards`: each point averages the current value and up to
# 40 values before it (fewer at the start of the series).
avg_plot = [
    np.mean(rewards[max(0, idx - 40):idx + 1])
    for idx in range(len(rewards))
]
plt.plot(avg_plot)
plt.show()
# var_plot = np.var(accumator[:10], axis=0)
# avg_var_plot = []
# for i in range(len(rewards)):
#     if(i >= 100):
#         avg_var_plot.append(np.mean(var_plot[i-100:i+1]))
#     else:
#         avg_var_plot.append(np.mean(var_plot[0:i+1]))
# plt.plot(avg_var_plot)
# plt.show()

In [None]:
'''
N STEP RETURNS
----------------
FOR N STEP RETURNS VARIATION, WE USE THE FOLLOWING CODE TO GET THE N STEP RETURN
'''
import torch
def calc_nstep_returns(rewards, dones, next_v_pred, gamma, n):
    """Compute bootstrapped discounted returns for the first `n` time steps.

    Works backwards from step `n - 1`, starting from `next_v_pred`:
    `ret[t] = rewards[t] + gamma * ret[t + 1] * (1 - dones[t])`.

    Args:
        rewards: tensor of per-step rewards; the result has its shape and dtype.
        dones: tensor of per-step done flags (1 where the episode ended).
        next_v_pred: value estimate used as the return after step `n - 1`.
        gamma: discount factor.
        n: number of leading steps to fill; later entries stay zero.

    Returns:
        Tensor like `rewards` holding the returns.
    """
    rets = torch.zeros_like(rewards)
    continue_mask = 1 - dones
    running = next_v_pred
    t = n - 1
    while t >= 0:
        running = rewards[t] + gamma * running * continue_mask[t]
        rets[t] = running
        t -= 1
    return rets

In [None]:
'''
DQN ALGORITHM
'''

import numpy as np
import random
import torch
import torch.nn as nn  
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
import matplotlib.pyplot as plt
'''
Please refer to the first tutorial for more details on the specifics of environments
We've only added important commands you might find useful for experiments.
'''

'''
List of example environments
(Source - https://gym.openai.com/envs/#classic_control)

'Acrobot-v1'
'CartPole-v0'
'MountainCar-v0'
'''

# Environment used by the DQN section, seeded once with 0.
env = gym.make('Acrobot-v1')
env.seed(0)

# Size of the observation vector and number of discrete actions, as reported
# by the environment.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

print(state_shape)
print(no_of_actions)
print(env.action_space.sample())
print("----")

'''
# Understanding State, Action, Reward Dynamics

The agent decides an action to take depending on the state.

The Environment keeps a variable specifically for the current state.
- Everytime an action is passed to the environment, it calculates the new state and updates the current state variable.
- It returns the new current state and reward for the agent to take the next action

'''

# Demonstration of one interaction: reset, pick a random action, step once.
state = env.reset()   
''' This returns the initial state (when environment is reset) '''

print(state)
print("----")

action = env.action_space.sample()  
''' We take a random action now '''

print(action)
print("----")

# env.step returns four values: next state, reward, done flag and an info object.
next_state, reward, done, info = env.step(action) 
''' env.step is used to calculate new state and obtain reward based on old state and action taken  ''' 

print(next_state)
print(reward)
print(done)
print(info)
print("----")

'''
### Q Network & Some 'hyperparameters'

QNetwork1:
Input Layer - 4 nodes (State Shape) \
Hidden Layer 1 - 64 nodes \
Hidden Layer 2 - 64 nodes \
Output Layer - 2 nodes (Action Space) \
Optimizer - zero_grad()

QNetwork2: Feel free to experiment more
'''

import torch
import torch.nn as nn  
import torch.nn.functional as F
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


'''
Bunch of Hyper parameters (Which you might have to tune later **wink wink**)
'''
# Capacity of the replay buffer (maximum number of stored transitions).
BUFFER_SIZE = int(5e4)  
'''
replay buffer size 
'''
# Number of transitions per learning step; learning starts once the buffer
# holds at least this many.
BATCH_SIZE = 256         
''' 
minibatch size 
'''
# Discount factor used in the Q-learning target.
GAMMA = 0.99            
''' 
discount factor 
'''
# Learning rate of the Adam optimizer.
LR = 6e-4              
''' 
learning rate 
'''
# Every UPDATE_EVERY agent steps the target network is overwritten with the
# local network's weights.
UPDATE_EVERY = 250       
''' 
how often to update the network (When Q target is present) 
'''


class QNetwork1(nn.Module):
    """Fully connected Q-network: state -> one value per action.

    Two tanh hidden layers followed by a linear output layer.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64,fc3_units=32):
        """Seed the global torch RNG and create the three linear layers.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Number of actions (size of the output)
            seed (int): Random seed passed to torch.manual_seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            fc3_units (int): Accepted for interface compatibility; not used
        """
        super().__init__()
        # torch.manual_seed seeds the global generator and returns it.
        self.seed = torch.manual_seed(seed)

        # Layer widths: state -> fc1 -> fc2 -> actions
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Map a batch of states to a batch of action values."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.tanh(layer(hidden))
        return self.fc3(hidden)
### Replay Buffer:

The replay buffer is a fixed-size `deque` that stores past experiences (transitions); once it is full, the oldest entries are dropped. Recall why we use such a technique.
import random
import torch
import numpy as np
from collections import deque, namedtuple

# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name only accepts CUDA devices, so it is queried only
# when one is actually in use; on a CPU-only machine a plain note is printed.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))
else:
    print("CUDA not available; using CPU")

class ReplayBuffer:
    """Bounded FIFO store of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer and seed Python's `random` module.

        Params
        ======
            action_size (int): dimension of each action (stored, not used here)
            buffer_size (int): maximum number of experiences kept
            batch_size (int): number of experiences returned by `sample`
            seed (int): value passed to `random.seed`
        """
        self.action_size = action_size
        self.batch_size = batch_size
        # A deque with maxlen silently discards the oldest entry when full.
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None; the call matters for its side effect.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw `batch_size` experiences at random and return them as tensors.

        Returns a tuple `(states, actions, rewards, next_states, dones)` of
        tensors on `device`; actions are int64, everything else float32.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [e for e in drawn if e is not None]

        def stacked(field):
            # One row per experience for the given field.
            return np.vstack([getattr(e, field) for e in batch])

        states = torch.from_numpy(stacked("state")).float().to(device)
        actions = torch.from_numpy(stacked("action")).long().to(device)
        rewards = torch.from_numpy(stacked("reward")).float().to(device)
        next_states = torch.from_numpy(stacked("next_state")).float().to(device)
        done_flags = stacked("done").astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)
## Truncation:
We optionally add a step that truncates (clips) every gradient element to the range [-1, 1], in the hope that this improves the stability of the learning process.

## Tutorial Agent Code:

class TutorialAgent():
    """DQN agent with experience replay, a target network and gradient clipping."""

    def __init__(self, state_size, action_size, seed):
        """Build the networks, optimizer and replay memory.

        state_size: dimension of an observation
        action_size: number of discrete actions
        seed: seed for `random`, both Q-networks, the buffer and `self.rg`
        """
        # Agent / environment interface
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-networks: the local one is trained, the target one is a lagged copy
        self.qnetwork_local = QNetwork1(state_size, action_size, seed, 128, 128).to(device)
        self.qnetwork_target = QNetwork1(state_size, action_size, seed, 128, 128).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Step counter (modulo UPDATE_EVERY) that schedules target-network syncs
        self.t_step = 0
        self.rg = np.random.RandomState(seed)

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn from a minibatch, maybe sync the target."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as at least one full batch is available
        if len(self.memory) >= BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

        # Copy local weights into the target network every UPDATE_EVERY steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        """Choose an action epsilon-greedily from the local network's Q-values."""
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Evaluate without tracking gradients, then restore training mode
        self.qnetwork_local.eval()
        with torch.no_grad():
            q_values = self.qnetwork_local(state_tensor)
        self.qnetwork_local.train()

        # One uniform draw decides between exploring and exploiting
        explore = not (random.random() > eps)
        if explore:
            return random.choice(np.arange(self.action_size))
        return np.argmax(q_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Run one gradient step on a sampled minibatch.

        experiences: (states, actions, rewards, next_states, dones) tensors
        gamma: discount factor of the Q-learning target
        """
        states, actions, rewards, next_states, dones = experiences

        # Largest target-network Q-value of each next state (no gradient)
        next_q_max = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # TD target; the bootstrap term vanishes where the episode ended
        td_targets = rewards + (gamma * next_q_max * (1 - dones))

        # Q-value the local network assigns to the action that was taken
        q_taken = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(q_taken, td_targets)

        self.optimizer.zero_grad()
        loss.backward()

        # Gradient truncation: clamp every gradient element to [-1, 1]
        for weight in self.qnetwork_local.parameters():
            weight.grad.data.clamp_(-1, 1)

        self.optimizer.step()
### Here, we present the DQN algorithm code.
# Episode index at which each run met the stopping criterion (runs that never
# meet it add nothing), and the per-episode scores of every run.
solved_episodes = []
rewards = []
''' Defining DQN Algorithm '''
# Ten independent training runs. `dqn` is re-defined on every pass and reads
# `agent`, `reward_list`, `env` and `solved_episodes` from the enclosing
# (global) scope at call time.
# NOTE(review): every run builds its agent with seed = 0 — confirm that
# identical seeding across runs is intended.
for i in range(10) :
    state_shape = env.observation_space.shape[0]
    action_shape = env.action_space.n
    reward_list = []
    def dqn(n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.99):
        """Train the global `agent` on the global `env` with epsilon-greedy DQN.

        n_episodes: maximum number of episodes
        max_t: maximum number of steps per episode
        eps_start, eps_end, eps_decay: epsilon starts at `eps_start` and is
            multiplied by `eps_decay` after every episode, floored at `eps_end`

        Appends every episode score to the global `reward_list`. Returns a
        list: [array of 10-episode mean scores recorded every 10 episodes,
        index of the last episode minus 100].
        """

        scores = []                 
        ''' list containing scores from each episode '''

        scores_window_printing = deque(maxlen=10) 
        ''' For printing in the graph '''

        # NOTE: the string below mentions 195; the test actually applied
        # further down is a mean of at least -80.0.
        scores_window= deque(maxlen=100)  
        ''' last 100 scores for checking if the avg is more than 195 '''

        eps = eps_start                    
        ''' initialize epsilon '''

        for i_episode in range(1, n_episodes+1):
            state = env.reset()
            score = 0
            # One episode: act, step the environment, let the agent store
            # the transition and learn.
            for t in range(max_t):
                action = agent.act(state, eps)
                next_state, reward, done, _ = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break 

            reward_list.append(score)
            scores_window.append(score)       
            scores_window_printing.append(score)   
            ''' save most recent score '''           

            eps = max(eps_end, eps_decay*eps) 
            ''' decrease epsilon '''

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")  
            if i_episode % 10 == 0: 
                scores.append(np.mean(scores_window_printing))        
            if i_episode % 100 == 0: 
               print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            # Stop once the mean over the score window reaches -80.
            # NOTE(review): the window may hold fewer than 100 scores when
            # this is evaluated, and the message reports i_episode-100.
            if np.mean(scores_window)>=-80.0:
               solved_episodes.append(i_episode)
               print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
               break
        return [np.array(scores),i_episode-100]

    ''' Trial run to check if algorithm runs and saves the data '''

    # Time one complete run with a freshly constructed agent.
    begin_time = datetime.datetime.now()
    agent = TutorialAgent(state_size=state_shape,action_size = action_shape,seed = 0)


    dqn()


    time_taken = datetime.datetime.now() - begin_time

    print(time_taken)
    rewards.append(reward_list)
# Second name for the same list object (not a copy).
rewards1=rewards
# Average the per-episode reward over all runs on a common horizon. Runs that
# stopped early are padded with their final reward so every run contributes
# one value at every episode index.
horizon = 10000
num_runs = max(len(rewards), 1)  # avoids dividing by zero when nothing ran

average_reward_list = [0.0] * horizon
for run_rewards in rewards:
    if not run_rewards:
        continue  # a run with no episodes has nothing to pad with
    final_reward = run_rewards[-1]
    for i in range(horizon):
        # Padding starts AFTER the run's last episode. (Starting it at the
        # last index itself would count that episode's reward twice.)
        if i < len(run_rewards):
            average_reward_list[i] += run_rewards[i]
        else:
            average_reward_list[i] += final_reward
average_reward_list = [total / num_runs for total in average_reward_list]
plt.plot(average_reward_list)

# Trailing mean over the 100 episodes preceding each index.
average_reward_list1 = []
for i in range(100,len(average_reward_list)) :
  average_reward_list1.append(np.average(average_reward_list[i-100:i]))
plt.plot(average_reward_list1)
print(np.mean(solved_episodes))
print(solved_episodes)
print(np.mean(solved_episodes))
print(rewards)