Dependencies

In [None]:
!pip install tensorflow==2.3.0    # TensorFlow is a Python-friendly open source library for numerical computation that makes machine learning and developing neural networks faster and easier        
!pip install gym     # Gym is a standard API for reinforcement learning, and a diverse collection of reference environments.
!pip install keras   # Keras is an open-source software library that provides a Python interface for artificial neural networks. Keras acts as an interface for the TensorFlow library.
!pip install keras-rl2   # keras-rl Reinforcement Learning framework like Stable baselines.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!apt update && apt install xvfb && pip3 install pyvirtualdisplay && pip install pyvirtualdisplay
!pip install piglet
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

#To compromise display 

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:6 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Ign:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Fetched 252 kB 

<pyvirtualdisplay.display.Display at 0x7ff74ab91050>

Step 1: Test a Random Policy in an OpenAI Gym Environment

In [None]:
import gym 
import random

In [None]:
# Create the CartPole environment and read off the problem dimensions.
env = gym.make('CartPole-v0')
actions = env.action_space.n                 # number of discrete actions
states = env.observation_space.shape[0]      # length of the observation vector

In [None]:
# A good general-purpose agent does not need to know what the observations mean: it can
# learn to map observations to actions that maximise reward without any prior knowledge.
# CartPole's four state variables: [cart position, cart velocity, pole angle, pole rotation rate].
# Its two actions: move the cart left or right.
print(states, actions, sep='\n')

4
2


In [None]:
# Baseline: play a few episodes with uniformly random actions to see how an untrained
# policy scores before any learning happens.
episodes = 10
for episode in range(1, episodes + 1):
    state = env.reset()      # every episode starts from a freshly reset environment
    done = False             # stays False until the environment reports the episode is over
    score = 0                # running sum of the rewards collected in this episode

    while not done:          # the number of time steps per episode is not fixed
        env.render()         # draw the current frame
        # Draw uniformly from all of the environment's actions rather than a hard-coded
        # [0, 1] list, so the loop keeps working if the action count changes.
        action = random.randrange(actions)

        n_state, reward, done, info = env.step(action)   # next state, reward, done flag, info
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))  # total reward for the episode

Episode:1 Score:13.0
Episode:2 Score:23.0
Episode:3 Score:19.0
Episode:4 Score:24.0
Episode:5 Score:16.0
Episode:6 Score:20.0
Episode:7 Score:16.0
Episode:8 Score:11.0
Episode:9 Score:14.0
Episode:10 Score:23.0


Step 2: Create Deep Learning Model with keras

In [None]:
import numpy as np 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten
from tensorflow.keras.optimizers import Adam

In [None]:
def build_model(states, actions):
  """Build the feed-forward Q-network.

  Args:
    states: Size of one observation vector; the network input shape is (1, states).
    actions: Number of output units (one linear output per action).

  Returns:
    An uncompiled Keras ``Sequential`` model.
  """
  layers = [
      Flatten(input_shape=(1, states)),       # collapse the (1, states) input to a flat vector
      Dense(24, activation='relu'),
      Dense(24, activation='relu'),
      Dense(actions, activation='linear'),    # unbounded outputs, one per action
  ]
  return Sequential(layers)

In [None]:
# Instantiate the Q-network for this environment's observation and action sizes.
model = build_model(states=states, actions=actions)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


Build Agent with keras-RL

In [None]:
from rl.agents import DQNAgent             # Deep Q Network Agent
from rl.policy import BoltzmannQPolicy     # Choosing Policy
from rl.memory import SequentialMemory     # For replay buffer
#Import functionalities from Keras-rl

In [None]:
def build_agent(model,actions):
  """Wrap a Q-network in a keras-rl DQN agent.

  Args:
    model: Keras model that outputs one value per action.
    actions: Number of discrete actions, passed to the agent as ``nb_actions``.

  Returns:
    A ``DQNAgent`` that has not been compiled yet.
  """
  agent_config = dict(
      model=model,
      nb_actions=actions,
      policy=BoltzmannQPolicy(),
      # Replay buffer: the agent can store at most 50000 experiences.
      memory=SequentialMemory(limit=50000, window_length=1),
      nb_steps_warmup=10,
      # A value below 1 selects soft target-network updates.
      target_model_update=1e-2,
  )
  return DQNAgent(**agent_config)

# target_model_update controls how often the target network is updated. If target_model_update >= 1, the target model is updated every target_model_update-th step.
# For example, with target_model_update = 10000 the target model is updated at steps 10 000, 20 000, and so on,
# i.e. we set target_model = model on these steps. If instead target_model_update < 1 (as here, 1e-2), soft updates are used: at every step the target weights are moved a small fraction of the way towards the current model weights.
# nb_steps_warmup: in keras-rl this is the number of environment steps collected before training updates begin, so that the replay memory holds some experience first. It is not the same thing as the learning-rate warm-up described below.
# Oftentimes in reinforcement learning the error of the first few steps is very large and may cause the parameters to oscillate.
# This is usually attributed to the lack of specificity of the deeper layers of the network. One remedy is a scheme in which the learning rate changes in a pre-determined way,
# for example constant warm-up or gradual warm-up.
# The convergence of stochastic gradient descent depends on the learning rate and the batch size. When the batch size is increased too much, the required increase in the learning rate can go beyond what the curvature of the loss function allows.
# Learning-rate warm-up is therefore a way to use large learning rates without the instability.


Training Agent

In [None]:
# Build the agent, compile it with the Adam optimizer, and train it on the environment.
dqn = build_agent(model,actions)
# `learning_rate` is the documented keyword; `lr` is only a legacy alias that newer
# TensorFlow/Keras releases deprecate.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# nb_steps (integer): number of training steps to be performed.
# Binding the returned History keeps it available for inspection and stops its bare
# repr from being echoed as the cell output.
train_history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
    1/10000 [..............................] - ETA: 7:08 - reward: 1.0000



130 episodes - episode_reward: 76.554 [9.000, 200.000] - loss: 3.463 - mae: 18.889 - mean_q: 38.225

Interval 2 (10000 steps performed)
61 episodes - episode_reward: 161.639 [102.000, 200.000] - loss: 6.791 - mae: 41.019 - mean_q: 82.534

Interval 3 (20000 steps performed)
57 episodes - episode_reward: 176.860 [115.000, 200.000] - loss: 5.132 - mae: 43.019 - mean_q: 86.444

Interval 4 (30000 steps performed)
52 episodes - episode_reward: 191.365 [119.000, 200.000] - loss: 3.927 - mae: 40.041 - mean_q: 80.437

Interval 5 (40000 steps performed)
done, took 376.545 seconds


<tensorflow.python.keras.callbacks.History at 0x7ff6f46d7a90>

Testing Agent

In [None]:
# Evaluate the trained agent over 100 episodes and report the mean episode reward.
scores = dqn.test(env, nb_episodes=100, visualize=False)
episode_rewards = scores.history['episode_reward']
print(np.mean(episode_rewards))

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

In [None]:
# Save the trained agent's weights to a file; overwrite=True replaces an existing file.
dqn.save_weights('dqn_weights.h5f',overwrite=True)

Reloading Agent from Saved Weights

In [None]:
# Delete Existing 
del model
del dqn
del env

In [None]:
# Recreate the CartPole environment after the previous instance was deleted.
env = gym.make('CartPole-v0')

In [None]:
# Rebuild the network and agent with the same architecture; at this point the weights
# are newly initialised, not the trained ones.
actions = env.action_space.n
states = env.observation_space.shape[0]
model = build_model(states,actions)
dqn = build_agent(model,actions)
# `learning_rate` is the documented keyword; `lr` is only a legacy alias that newer
# TensorFlow/Keras releases deprecate.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [None]:
# Load the weights from the file written by the save_weights cell into the rebuilt agent.
dqn.load_weights('dqn_weights.h5f')

In [None]:
# Run the reloaded agent for 10 episodes; binding the result to `_` keeps the returned
# object's repr out of the cell output.
_ = dqn.test(env, nb_episodes = 10, visualize = False)

Testing for 10 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
