<a href="https://colab.research.google.com/github/Tapanmittal13/100-Days-Of-ML-Code/blob/master/tree/master/Phase_2/Session_1/P2S9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt

# import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

In [0]:
class ReplayBuffer(object):
    """Bounded store of transitions with uniform random sampling.

    Each transition is a 5-item sequence ordered as
    ``(state, next_state, action, reward, done)``. Once ``max_size``
    transitions are stored, new ones overwrite existing slots in a
    round-robin fashion, starting from index 0.
    """

    # Number of fields in a stored transition.
    _NUM_FIELDS = 5

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. Floats such as
                ``1e6`` are accepted and truncated to an integer.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        max_size = int(max_size)
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        # Stored as an int so that `ptr` stays a valid list index.
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        """Store ``transition``, overwriting an existing slot when full."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            A tuple of five NumPy arrays ``(states, next_states, actions,
            rewards, dones)``. ``rewards`` and ``dones`` are reshaped to
            ``(-1, 1)`` column vectors.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")
        indices = np.random.randint(0, len(self.storage), size=batch_size)
        transitions = [self.storage[i] for i in indices]
        # np.asarray only copies when it has to, and (unlike
        # np.array(..., copy=False) on NumPy >= 2) never raises when a
        # copy is required for lists or Python scalars.
        states, next_states, actions, rewards, dones = (
            np.asarray([transition[field] for transition in transitions])
            for field in range(self._NUM_FIELDS)
        )
        return (
            states,
            next_states,
            actions,
            rewards.reshape(-1, 1),
            dones.reshape(-1, 1),
        )


In [0]:
class Actor(nn.Module):
    """Policy network mapping a state to an action.

    Three fully connected layers (400 and 300 hidden units) with ReLU
    activations; the output is passed through ``tanh`` and multiplied by
    ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, state_dims, action_dim, max_action):
        """Build the layers.

        Args:
            state_dims: Size of the input state vector.
            action_dim: Size of the output action vector.
            max_action: Scale applied to the tanh-squashed output.
        """
        super().__init__()
        self.max_action = max_action
        self.layer_1 = nn.Linear(state_dims, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)

    def forward(self, x):
        """Return the bounded action for the state batch ``x``."""
        hidden = F.relu(self.layer_1(x))
        hidden = F.relu(self.layer_2(hidden))
        squashed = torch.tanh(self.layer_3(hidden))
        return self.max_action * squashed


In [0]:

class Critic(nn.Module):
    """Pair of independent Q-networks evaluated on the same (state, action).

    Each network has two hidden layers (400 and 300 units, ReLU) and a
    final linear layer producing a single Q-value per sample.
    """

    def __init__(self, state_dims, action_dim):
        """Build both Q-networks.

        Args:
            state_dims: Size of the state vector.
            action_dim: Size of the action vector.
        """
        super(Critic, self).__init__()  # activate the inheritance
        # First Critic Network
        self.layer_1 = nn.Linear(state_dims + action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        # A Q-value is one scalar per (state, action) pair, so the head
        # has a single output unit rather than action_dim units.
        self.layer_3 = nn.Linear(300, 1)

        # Second Critic Network
        self.layer_4 = nn.Linear(state_dims + action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def _first_q(self, xu):
        """Forward pass of the first Q-network on the joined input."""
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        return self.layer_3(x1)

    def _second_q(self, xu):
        """Forward pass of the second Q-network on the joined input."""
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        return self.layer_6(x2)

    def forward(self, x, u):  # x - state, u - action
        """Return ``(Q1, Q2)``, each of shape ``(batch, 1)``.

        Args:
            x: State batch.
            u: Action batch with the same leading dimension as ``x``.
        """
        # dim=1 joins state and action features side by side per sample.
        xu = torch.cat([x, u], 1)
        return self._first_q(xu), self._second_q(xu)

    def Q1(self, x, u):
        """Return only the first network's Q-value for ``(x, u)``.

        Saves evaluating the second network when only the first estimate
        is needed.
        """
        xu = torch.cat([x, u], 1)
        return self._first_q(xu)


In [0]:

# Selecting the device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Building the whole training process into a class
class T3D(object):
    """Actor-critic agent with twin critics and delayed policy updates.

    Holds an actor and a twin critic, each with a target copy that is
    initialised to the same weights and afterwards moved towards the
    trained model by Polyak averaging. All models live on the module-level
    ``device``.
    """

    def __init__(self, state_dims, action_dim, max_action):
        """Create the models, their targets and the Adam optimizers.

        Args:
            state_dims: Size of the state vector.
            action_dim: Size of the action vector.
            max_action: Bound used by the actor and for clamping the
                noisy target action.
        """
        # Actor: trained by gradient descent; target: Polyak average.
        self.actor = Actor(state_dims, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
        # Start the target from the same weights as the model.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        # Critic: trained by gradient descent; target: Polyak average.
        self.critic = Critic(state_dims, action_dim).to(device)
        self.critic_target = Critic(state_dims, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak-average ``source`` parameters into ``target`` in place."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        state = torch.as_tensor(
            np.asarray(state).reshape(1, -1), dtype=torch.float32, device=device
        )
        # Inference only: no graph is needed, and the result goes to NumPy.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batchsize=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` training steps on batches from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(n)`` returns
                ``(states, next_states, actions, rewards, dones)``.
            iterations: Number of training steps to run.
            batchsize: Number of transitions sampled per step.
            discount: Discount factor applied to the target Q-value.
            tau: Polyak averaging rate for the target networks.
            policy_noise: Standard deviation of the Gaussian noise added
                to the target action.
            noise_clip: Absolute bound applied to that noise.
            policy_freq: The actor and both targets are updated once every
                ``policy_freq`` steps.
        """
        for it in range(iterations):
            # Step 4: sample a batch of transitions (s, s', a, r, done) from memory
            state, next_state, action, reward, done = (
                torch.as_tensor(field, dtype=torch.float32, device=device)
                for field in replay_buffer.sample(batchsize)
            )

            # The target is a constant for the critic loss, so it is
            # built without tracking gradients.
            with torch.no_grad():
                # Step 5: from the next state the Actor target plays the next action a'
                next_action = self.actor_target(next_state)

                # Step 6: add clipped Gaussian noise to a' and clamp the result
                # to the range of values supported by the environment
                noise = torch.randn_like(action) * policy_noise
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                # Step 7: the two Critic targets take (s', a') as input and
                # return two Q-values, Qt1(s', a') and Qt2(s', a')
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)

                # Step 8: keep the minimum of these two Q-values
                target_Q = torch.min(target_Q1, target_Q2)

                # Step 9: final target Qt = r + gamma * min(Qt1, Qt2);
                # done is 1 when the episode is over, which drops the
                # bootstrapped term
                target_Q = reward + (1 - done) * discount * target_Q

            # Step 10: the two Critic models take (s, a) as input and
            # return two Q-values
            current_Q1, current_Q2 = self.critic(state, action)

            # Step 11: compute the loss coming from the two Critic models
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: backpropagate the Critic loss and update the parameters
            # of the two Critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: once every policy_freq iterations, update the Actor by
            # gradient ascent on the output of the first Critic model
            if it % policy_freq == 0:
                # Only the first Q estimate drives the policy gradient.
                actor_Q1, _ = self.critic(state, self.actor(state))
                actor_loss = -actor_Q1.mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: still once every policy_freq iterations, update the
                # weights of the Actor target by Polyak averaging
                self._soft_update(self.actor, self.actor_target, tau)

                # Step 15: still once every policy_freq iterations, update the
                # weights of the Critic target by Polyak averaging
                self._soft_update(self.critic, self.critic_target, tau)
    