In [1]:
import gym

### 1. Interacting with the Gym API

In [2]:
# Create the 'CartPole-v0' environment with gym.make; later cells use `env`
env = gym.make('CartPole-v0')

In [4]:
# Reset the environment; the value returned by reset() is displayed as this cell's output
env.reset()

array([ 0.01543768, -0.01070381, -0.03705663, -0.03993271])

In [18]:
# Inspect the environment's action and observation spaces
print(env.action_space)  # the action space object
print(env.action_space.n)  # total number of actions available
print(env.observation_space)  # the observation space object
print(env.action_space.sample()) # Randomly select one action

Discrete(2)
2
Box(4,)
0


In [29]:
# Render 200 frames of a random policy.
env.reset()

try:
    for t in range(200):
        random_action = env.action_space.sample()
        # step() returns (observation, reward, done, info); only `done` is needed here
        _obs, _reward, done, _info = env.step(random_action) # Perform action
        env.render()
        if done:
            # Stepping an episode that has already ended is undefined in gym
            # (it emits a warning), so start a fresh episode instead.
            env.reset()
finally:
    # Always release the render window, even if the loop is interrupted
    env.close()

### 2. Playing multiple game episodes with a random strategy

In [36]:
# Play N_EPISODES game episodes with a random policy, at most MAX_STEPS steps each
N_EPISODES = 2
MAX_STEPS = 50

try:
    for e in range(N_EPISODES):

        observation = env.reset()
        for t in range(MAX_STEPS):
            env.render()
            random_action = env.action_space.sample()
            observation,reward,done,other_info = env.step(random_action) # Observation = new state
            print(observation,reward,done,other_info)
            if done:
                # Game episode over; the score is the 0-based index of the final step
                print("Game Episode %d/%d, Score: %d"%(e+1,N_EPISODES,t))
                break
finally:
    # Always release the render window, even if an episode is interrupted
    env.close()

print("All %d episodes over!"%N_EPISODES)
        

[-0.03840103 -0.23807675  0.02326431  0.33618221] 1.0 False {}
[-0.04316257 -0.04329347  0.02998796  0.05092545] 1.0 False {}
[-0.04402844  0.15138594  0.03100647 -0.23214723] 1.0 False {}
[-0.04100072 -0.04416502  0.02636352  0.07015266] 1.0 False {}
[-0.04188402  0.15056924  0.02776657 -0.21409725] 1.0 False {}
[-0.03887264 -0.04493845  0.02348463  0.08721357] 1.0 False {}
[-0.03977141  0.14983913  0.0252289  -0.1979684 ] 1.0 False {}
[-0.03677462 -0.04563442  0.02126953  0.10256513] 1.0 False {}
[-0.03768731 -0.24105463  0.02332084  0.40188191] 1.0 False {}
[-0.0425084  -0.04627109  0.03135847  0.11664175] 1.0 False {}
[-0.04343383 -0.24182799  0.03369131  0.41905077] 1.0 False {}
[-0.04827038 -0.04719924  0.04207232  0.1371767 ] 1.0 False {}
[-0.04921437 -0.24289775  0.04481586  0.44283032] 1.0 False {}
[-0.05407232 -0.04843766  0.05367246  0.1646045 ] 1.0 False {}
[-0.05504108  0.14587653  0.05696455 -0.11067492] 1.0 False {}
[-0.05212355  0.3401379   0.05475106 -0.3848558 ] 1.0 F

## 3. Q Learning
- Agent Design & Neural Model

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

Using TensorFlow backend.


In [39]:
class Agent:
    """Q-learning agent backed by a small dense network and a replay memory.

    The agent stores (state, action, reward, next_state, done) experiences in a
    bounded memory, picks actions with an epsilon-greedy rule, and fits its
    network on random minibatches drawn from that memory.

    NOTE(review): states are indexed as ``predict(state)[0]``, so they are
    presumably shaped ``(1, state_size)`` - confirm against the caller.
    """

    # Constructor
    def __init__(self,state_size,action_size):
        """Set up hyper-parameters, the replay memory and the network.

        Args:
            state_size: number of values in one observation (4 for CartPole).
            action_size: number of discrete actions (2 for CartPole: left or right).
        """
        self.state_size = state_size # 4
        self.action_size = action_size # 2 - left or right
        # Bounded replay memory: once full, the oldest experience is dropped
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 # Discount factor
        # Exploration vs Exploitation tradeoff
        # Exploration: good in beginning - helps you to try various random things
        # Exploitation: good in end - sample good experiences from the past (memory)
        self.epsilon = 1.0 # 100% random exploration in beginning
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        # Misspelled name kept as an alias for any code that still reads it
        self.learninng_rate = self.learning_rate
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the network mapping a state to one value per action."""
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        # Linear output: one unbounded value estimate per action
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one experience tuple to the replay memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for ``state`` with the epsilon-greedy rule.

        With probability ``self.epsilon`` a random action index is returned;
        otherwise the index of the largest network output for ``state``.
        """
        if np.random.rand() <= self.epsilon:
            # Explore: take a random action
            return random.randrange(self.action_size)

        # Exploit: ask the network for the best-valued action
        return np.argmax(self.model.predict(state)[0])

    def train(self, batch_size=32):
        """Fit the network on a random minibatch from the replay memory.

        For every sampled experience the target for the taken action is the
        reward, plus ``gamma`` times the best predicted value of the next state
        when the episode did not end. Epsilon is decayed once per call until it
        reaches ``epsilon_min``.

        Does nothing while the memory holds fewer than ``batch_size``
        experiences (``random.sample`` would raise ``ValueError`` otherwise).
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory,batch_size)

        for state,action,reward,next_state,done in minibatch:
            if not done:
                # Game is not over: Bellman equation approximates the target value
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                target = reward

            # Keep the current predictions for the other actions and overwrite
            # only the entry of the action that was actually taken
            target_f = self.model.predict(state)
            target_f[0][action] = target

            # X = state, Y = target_f
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self,name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)
                
            
            