# DQN with modifications (Acrobot-v1)

In [9]:
import gym
import random
import torch
import numpy as np 
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

## Environment - 'Acrobot-v1'

In [10]:
# Create the Acrobot-v1 environment and seed it with 0.
env = gym.make('Acrobot-v1')
env.seed(0)

# Sizes later handed to the agent: number of discrete actions and the
# length of the first observation axis.
action_shape = env.action_space.n
state_shape = env.observation_space.shape[0]

print('State shape: ', state_shape)
print('Action shape: ', action_shape)

State shape:  6
Action shape:  3


# Case 1- (+Q + E + T)

### Neural Network 
Input Layer - 6 nodes (State Shape) \
Hidden Layer 1 - 64 nodes \
Hidden Layer 2 - 64 nodes \
Output Layer - 3 nodes (Action Space) \
Optimizer - zero_grad()

### Network Update Frequency (YES)
Frequency of network switch - Every 5 episodes

###  Experience Replay (YES)
Total Replay Buffer Size - 10,000
Mini Batch Size - 64

### Loss Clipping (YES)
Gradient is clipped to the range [-1, 1]


In [None]:
from config1 import Agent

#agent = Agent(state_size=6,action_size = 3,seed = 0)
# Number of agents ("siblings") trained one after another.
no_siblings = 2
# One score array per sibling, appended after each training run.
sibling_scores = list()
# Second value returned by dqn() for each sibling; zero until that run finishes.
sibling_lives = np.zeros(no_siblings)


def dqn(n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        solved_score=-80.0):
    """Deep Q-Learning.

    Trains the module-level ``agent`` on the module-level ``env`` with
    epsilon-greedy action selection. Training stops early once the mean
    score over a full window of the last 100 episodes reaches
    ``solved_score``; the local network weights are then saved to
    'checkpoint.pth'.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        solved_score (float): 100-episode mean score at which training stops

    Returns
    =======
        list: ``[scores, episodes]`` where ``scores`` is a numpy array with the
        mean score of the last 10 episodes, sampled every 10 episodes, and
        ``episodes`` is the number of the last episode run minus 100.
    """
    scores = []                                # 10-episode means, one entry every 10 episodes
    scores_window_printing = deque(maxlen=10)  # last 10 scores, averaged into `scores` for the graph
    scores_window = deque(maxlen=100)          # last 100 scores, compared against solved_score
    eps = eps_start                            # initialize epsilon
    i_episode = 0                              # keeps the return value defined when n_episodes < 1
    for i_episode in range(1, n_episodes+1):
        # Classic gym API: reset() returns the observation, step() a 4-tuple.
        state = env.reset()
        score = 0
        for _step in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)            # save most recent score
        scores_window_printing.append(score)   # save most recent score
        eps = max(eps_end, eps_decay*eps)      # decrease epsilon
        avg_score = np.mean(scores_window)     # computed once, reused below
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        if i_episode % 10 == 0:
            scores.append(np.mean(scores_window_printing))
        if i_episode % 100 == 0:
            # Re-print with a newline so every 100th line stays in the log.
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))
        # Require a full window: otherwise a few lucky early episodes could
        # end training and report a negative "solved in" episode count.
        if len(scores_window) == scores_window.maxlen and avg_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, avg_score))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return [np.array(scores), i_episode-100]

fig = plt.figure()
ax = fig.add_subplot(111)

for i in range(no_siblings):
    # dqn() reads the agent through the module-level name `agent`, so each
    # sibling is created here under that name before training.
    agent = Agent(state_size=state_shape, action_size=action_shape, seed=0)
    temp_scores, sibling_lives[i] = dqn()
    sibling_scores.append(temp_scores)
    # dqn() records one mean score every 10 episodes, so sample k belongs
    # to episode 10 * (k + 1); plot against that so the x-axis label holds.
    episodes = 10 * np.arange(1, len(temp_scores) + 1)
    ax.plot(episodes, temp_scores)

ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

# Case 2- (+Q + E - T)

### Neural Network 
Input Layer - 6 nodes (State Shape) \
Hidden Layer 1 - 64 nodes \
Hidden Layer 2 - 64 nodes \
Output Layer - 3 nodes (Action Space) \
Optimizer - zero_grad()

### Network Update Frequency (YES)
Frequency of network switch - Every 5 episodes

###  Experience Replay (YES)
Total Replay Buffer Size - 10,000
Mini Batch Size - 64

### Loss Clipping (NO)
No gradient clipping is applied