In [13]:
import gym

### 1. Interacting with the Gym API

In [14]:
# Create the CartPole-v0 environment; `env` is reused by the cells below
env = gym.make('CartPole-v0')

In [3]:
# Reset the environment; the returned initial observation is shown as the cell output
env.reset()

array([-0.04704792,  0.03258928, -0.02103965,  0.02876482])

In [4]:
# Inspect the environment: the action space, how many discrete actions it has,
# the observation space, and one randomly sampled action (one item per line).
print(
    env.action_space,
    env.action_space.n,
    env.observation_space,
    env.action_space.sample(),  # randomly select one action
    sep="\n",
)

Discrete(2)
2
Box(4,)
1


In [5]:
# Drive the environment with random actions for 200 rendered steps.
env.reset()

try:
    for t in range(200):
        random_action = env.action_space.sample()
        # Perform the action; `done` tells us whether the episode just ended
        observation, reward, done, other_info = env.step(random_action)
        env.render()
        if done:
            # Stepping a finished episode is undefined behaviour in Gym,
            # so start a fresh episode before taking the next action.
            env.reset()
finally:
    # Always release the render window, even if a step/render call fails
    env.close()



### 2. Playing multiple game episodes with a random strategy

In [6]:
# Play a few episodes with a purely random policy.
# The episode count lives in one place so the loop, the per-episode score
# line and the final message can no longer disagree with each other.
n_random_episodes = 2
max_steps_per_episode = 50

try:
    for e in range(n_random_episodes):

        observation = env.reset()
        for t in range(max_steps_per_episode):
            env.render()
            random_action = env.action_space.sample()
            observation,reward,done,other_info = env.step(random_action) # Observation = new state
            print(observation,reward,done,other_info)
            if done:
                # Game episode over; the score is the index of the last step taken
                print("Game Episode %d/%d, Score: %d"%(e+1,n_random_episodes,t))
                break
finally:
    # Always release the render window, even if a step/render call fails
    env.close()

print("All %d episodes over!"%n_random_episodes)
        

[ 0.0092129  -0.24010751 -0.03520593  0.2812611 ] 1.0 False {}
[ 0.00441075 -0.43471005 -0.02958071  0.56263561] 1.0 False {}
[-0.00428345 -0.23918576 -0.018328    0.26078203] 1.0 False {}
[-0.00906717 -0.43404135 -0.01311236  0.54762819] 1.0 False {}
[-0.01774799 -0.23873766 -0.00215979  0.25084292] 1.0 False {}
[-0.02252275 -0.04358494  0.00285706 -0.04252046] 1.0 False {}
[-0.02339445 -0.23874774  0.00200666  0.25106253] 1.0 False {}
[-0.0281694  -0.0436545   0.00702791 -0.04098679] 1.0 False {}
[-0.02904249  0.15136597  0.00620817 -0.33144408] 1.0 False {}
[-2.60151708e-02  3.46399004e-01 -4.20711529e-04 -6.22162806e-01] 1.0 False {}
[-0.01908719  0.54152683 -0.01286397 -0.9149782 ] 1.0 False {}
[-0.00825665  0.3465812  -0.03116353 -0.62636586] 1.0 False {}
[-0.00132503  0.15190779 -0.04369085 -0.34365813] 1.0 False {}
[ 0.00171313 -0.04256625 -0.05056401 -0.06506646] 1.0 False {}
[ 0.0008618   0.15324282 -0.05186534 -0.37326416] 1.0 False {}
[ 0.00392666 -0.04110547 -0.05933062 -0

## 3. Q Learning
- Agent Design & Neural Model

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

In [16]:
class Agent:
    """Deep Q-learning agent with an experience-replay buffer.

    The agent keeps a bounded memory of past transitions, picks actions with
    an epsilon-greedy policy and fits a small dense network to Bellman targets
    computed from randomly sampled transitions.
    """

    # Constructor
    def __init__(self,state_size,action_size):
        """Set up hyper-parameters, the replay memory and the Q-network.

        Args:
            state_size: Number of inputs of the network (passed as ``input_dim``).
            action_size: Number of outputs of the network, one per action.
        """
        self.state_size = state_size # 4
        self.action_size = action_size # 2 - left or right
        self.memory = deque(maxlen=2000) # oldest experiences are dropped first
        self.gamma = 0.95 # Discount factor
        # Exploration vs Exploitation tradeoff
        # Exploration: good in beginning - helps you to try various random things
        # Exploitation: good in end - sample good experiences from the past (memory)
        self.epsilon = 1.0 # 100% random exploration in beginning
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        # Misspelled name kept as an alias so existing code reading it still works
        self.learninng_rate = self.learning_rate
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output)."""
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        # Use the configured learning rate instead of a second hard-coded copy
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) tuple to memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action with the epsilon-greedy rule.

        With probability ``epsilon`` a random action index is returned;
        otherwise the index of the largest value in the first row of the
        network's prediction for ``state``.
        """
        if np.random.rand()<=self.epsilon:
            # Take a random action
            return random.randrange(self.action_size)

        # Ask the network for the most valuable action
        return np.argmax(self.model.predict(state)[0])

    def train(self, batch_size=32):
        """Fit the network on a random minibatch from the replay memory.

        Does nothing (and does not decay epsilon) while the memory holds fewer
        than ``batch_size`` experiences, because ``random.sample`` would raise
        ``ValueError`` in that case. After a training pass, epsilon is
        multiplied by ``epsilon_decay`` while it is above ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return

        # Training using a 'Replay Buffer'
        minibatch = random.sample(self.memory,batch_size)

        for experience in minibatch:
            state,action,reward,next_state,done = experience

            if not done:
                # Game is not over: Bellman equation approximates the target value
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                target = reward

            # Only the entry of the action actually taken is moved to the target
            target_f = self.model.predict(state)
            target_f[0][action] = target

            # X = state, Y = target_f
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self,name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)
                
            
            

### 4. Train Deep Q-Learner Agent

In [17]:
# Training configuration
state_size = 4                  # length used when reshaping observations to [1, state_size]
batch_size = 32                 # replay experiences sampled per training call
output_dir = "carpole_model/"   # prefix of the saved weight files
n_episodes = 500                # number of training episodes

In [18]:
# Build the agent from the configured state size and the environment's own
# action count rather than repeating the literals 4 and 2.
agent = Agent(state_size=state_size,action_size=env.action_space.n)
done = False

In [19]:
# Train the agent: play one episode, store every transition, then run one
# replay-training pass; checkpoint the weights every 50 episodes.

# The checkpoint directory must exist before the first save, otherwise
# writing the weights file fails on a fresh run.
os.makedirs(output_dir, exist_ok=True)

try:
    for e in range(n_episodes):
        state = env.reset()
        state = np.reshape(state,[1,state_size])

        # `step` rather than `time`, so the stdlib module name is not shadowed
        for step in range(500):
            env.render()
            action = agent.act(state) # action is 0 or 1
            next_state,reward,done,other_info = env.step(action)
            reward = reward if not done else -10 # penalise the step that ends the episode
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done) # experience for the agent
            state = next_state

            if done:
                print("Game Episode : %d/%d, Score: %d, Exploration Rate: %.2f"%(e,n_episodes,step,agent.epsilon))
                break

        # Train only once the memory can supply a full minibatch
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        if e%50 == 0:
            agent.save(os.path.join(output_dir,"weights_%d.hdf5"%e))

    # Also keep the fully trained model; the periodic checkpoints stop
    # before the last episode.
    agent.save(os.path.join(output_dir,"weights_%d.hdf5"%n_episodes))
    print("Deep Q-Learnner Model Trained")
finally:
    # Always release the render window, even if training is interrupted
    env.close()

Game Episode : 0/500, Score: 12, Exploration Rate: 1.00
Game Episode : 1/500, Score: 18, Exploration Rate: 1.00
Game Episode : 2/500, Score: 26, Exploration Rate: 1.00
Game Episode : 3/500, Score: 32, Exploration Rate: 0.99
Game Episode : 4/500, Score: 14, Exploration Rate: 0.99
Game Episode : 5/500, Score: 14, Exploration Rate: 0.99
Game Episode : 6/500, Score: 14, Exploration Rate: 0.98
Game Episode : 7/500, Score: 16, Exploration Rate: 0.98
Game Episode : 8/500, Score: 49, Exploration Rate: 0.97
Game Episode : 9/500, Score: 51, Exploration Rate: 0.97
Game Episode : 10/500, Score: 40, Exploration Rate: 0.96
Game Episode : 11/500, Score: 8, Exploration Rate: 0.96
Game Episode : 12/500, Score: 30, Exploration Rate: 0.95
Game Episode : 13/500, Score: 17, Exploration Rate: 0.95
Game Episode : 14/500, Score: 46, Exploration Rate: 0.94
Game Episode : 15/500, Score: 22, Exploration Rate: 0.94
Game Episode : 16/500, Score: 17, Exploration Rate: 0.93
Game Episode : 17/500, Score: 15, Explorat

Game Episode : 144/500, Score: 36, Exploration Rate: 0.49
Game Episode : 145/500, Score: 41, Exploration Rate: 0.49
Game Episode : 146/500, Score: 39, Exploration Rate: 0.49
Game Episode : 147/500, Score: 53, Exploration Rate: 0.48
Game Episode : 148/500, Score: 23, Exploration Rate: 0.48
Game Episode : 149/500, Score: 32, Exploration Rate: 0.48
Game Episode : 150/500, Score: 29, Exploration Rate: 0.48
Game Episode : 151/500, Score: 38, Exploration Rate: 0.47
Game Episode : 152/500, Score: 22, Exploration Rate: 0.47
Game Episode : 153/500, Score: 25, Exploration Rate: 0.47
Game Episode : 154/500, Score: 26, Exploration Rate: 0.47
Game Episode : 155/500, Score: 87, Exploration Rate: 0.46
Game Episode : 156/500, Score: 81, Exploration Rate: 0.46
Game Episode : 157/500, Score: 108, Exploration Rate: 0.46
Game Episode : 158/500, Score: 33, Exploration Rate: 0.46
Game Episode : 159/500, Score: 137, Exploration Rate: 0.46
Game Episode : 160/500, Score: 48, Exploration Rate: 0.45
Game Episode

Game Episode : 285/500, Score: 145, Exploration Rate: 0.24
Game Episode : 286/500, Score: 199, Exploration Rate: 0.24
Game Episode : 287/500, Score: 199, Exploration Rate: 0.24
Game Episode : 288/500, Score: 32, Exploration Rate: 0.24
Game Episode : 289/500, Score: 46, Exploration Rate: 0.24
Game Episode : 290/500, Score: 199, Exploration Rate: 0.24
Game Episode : 291/500, Score: 134, Exploration Rate: 0.23
Game Episode : 292/500, Score: 110, Exploration Rate: 0.23
Game Episode : 293/500, Score: 196, Exploration Rate: 0.23
Game Episode : 294/500, Score: 199, Exploration Rate: 0.23
Game Episode : 295/500, Score: 199, Exploration Rate: 0.23
Game Episode : 296/500, Score: 169, Exploration Rate: 0.23
Game Episode : 297/500, Score: 199, Exploration Rate: 0.23
Game Episode : 298/500, Score: 127, Exploration Rate: 0.23
Game Episode : 299/500, Score: 124, Exploration Rate: 0.23
Game Episode : 300/500, Score: 182, Exploration Rate: 0.22
Game Episode : 301/500, Score: 174, Exploration Rate: 0.22

Game Episode : 425/500, Score: 155, Exploration Rate: 0.12
Game Episode : 426/500, Score: 199, Exploration Rate: 0.12
Game Episode : 427/500, Score: 199, Exploration Rate: 0.12
Game Episode : 428/500, Score: 148, Exploration Rate: 0.12
Game Episode : 429/500, Score: 131, Exploration Rate: 0.12
Game Episode : 430/500, Score: 144, Exploration Rate: 0.12
Game Episode : 431/500, Score: 131, Exploration Rate: 0.12
Game Episode : 432/500, Score: 157, Exploration Rate: 0.12
Game Episode : 433/500, Score: 110, Exploration Rate: 0.12
Game Episode : 434/500, Score: 29, Exploration Rate: 0.11
Game Episode : 435/500, Score: 150, Exploration Rate: 0.11
Game Episode : 436/500, Score: 130, Exploration Rate: 0.11
Game Episode : 437/500, Score: 123, Exploration Rate: 0.11
Game Episode : 438/500, Score: 124, Exploration Rate: 0.11
Game Episode : 439/500, Score: 116, Exploration Rate: 0.11
Game Episode : 440/500, Score: 122, Exploration Rate: 0.11
Game Episode : 441/500, Score: 121, Exploration Rate: 0.1