In [1]:
import gym

In [2]:
#  Create Environment
env =  gym.make('CartPole-v0')


# Comes with certain methods/attrs
- action_space
- observation_space
- reset() : returns init state and resets the environment
- step()
- render()


In [4]:
env.reset()
for t in range(1000):
    random_action = env.action_space.sample()
    env.step(random_action) 
    env.render()

env.close()



 # Playing Games with a Random Strategy
- Game Episode
- Step() Function in more details
- Game Over?

In [8]:
for e in range(20): #Episodes
    # Play 20 games
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,other_info = env.step(action)
        
        if done:
            # Game Episode is over
            print('Game Episode :{}/{} High Score: {}'.format(e,20,t))
            break
env.close()
print('All 20 Episodes Over!!')

Game Episode :0/20 High Score: 10
Game Episode :1/20 High Score: 14
Game Episode :2/20 High Score: 10
Game Episode :3/20 High Score: 23
Game Episode :4/20 High Score: 25
Game Episode :5/20 High Score: 31
Game Episode :7/20 High Score: 21
Game Episode :8/20 High Score: 39
Game Episode :9/20 High Score: 14
Game Episode :10/20 High Score: 26
Game Episode :11/20 High Score: 20
Game Episode :12/20 High Score: 11
Game Episode :13/20 High Score: 16
Game Episode :14/20 High Score: 13
Game Episode :15/20 High Score: 11
Game Episode :16/20 High Score: 31
Game Episode :17/20 High Score: 17
Game Episode :18/20 High Score: 22
Game Episode :19/20 High Score: 18
All 20 Episodes Over!!


# Step()
- observation/new_state
- reward
- done
- info

# Q-learning
# Agent Design and Neural network

#### Importing Libraries

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import os
from keras.models import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import random


Using TensorFlow backend.


In [24]:
class Agent:
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 # Discount Factor 
        
        '''
        Exploitation v Exploration trade-off
        
        Exploration : Good in the beginning --> helps you to try various random things
        Exploitation : sample Good Experience from the past(memory)--> Good in the end 
        '''
        self.epsilon = 1.0  # 100% Exploration in the beginning
        self.epsilon_decay = 0.995 # slowly move towards Exploitation
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        
        self.model = self._create_model()
        
    def _create_model(self):
        model =  Sequential()
        model.add(Dense(24,input_dim=4,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(2,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=0.001))
        
        return model
    
    def remember(self,state,action,reward,next_state,done):
        '''Remember Past Experiences'''
        self.memory.append((state,action,reward,next_state,done))
        
    def act(self):
        # Sampling according to the Epsilon Greedy Method
        if np.random.random() <= self.epsilon:
            # Take a random Action
            return random.randrange(self.action_size)
        
        # Ask the NN to give the most suitable action
        return np.argmax(model.predict(state)[0])
    def train(self,batch_size=32):
        # Training using a replay buffer
        minibatch = random.sample(self.memory,batch_size)
        for experience in minibatch:
            state,action,reward,next_state,done = experience
             # X,Y : State,Expected Reward
            if not done:
                #  Game is not over yet, Bellman eqn to approx the target_value of reward
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            else:
                target = reward
            target_f = self.model.predict(state)
            target_f[0][action] = target
            
            # X= state ,Y = target_f
            
            self.model.fit(state,target_f,epochs=1,verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
          
    def load(self,name):
        self.model.load_weights(name)
    def save(self,name):
        self.model.save_weights(name)

# Building the Model

In [17]:
model =  Sequential()
model.add(Dense(24,input_dim=4,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(2,activation='linear'))
model.compile(loss='mse',optimizer=Adam(lr=0.001))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_5 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


### Training the DQN Agent(Deep Q Learning)

In [25]:
n_episodes = 1000
output_dir ='cartpole_model/'


In [26]:
agent = Agent(state_size=4,action_size=2)
done = False


In [27]:
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state,[1,state_size])
    batch_size=32
    for t in range(500):
        env.render()
        action = agent.act(state)  # action : 1/0
        next_state ,reward,done,other_info =env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done) # Experience for the agent
        
        if done:
            print('Game Episode :{}/{} High Score: {} Exploration Rate{:.2}'.format(e,20,t,agent.epsilon))
            break
    if len(agent.memory)>batch_size:
        agent.train(batch_size)
        
    if e%50 == 0:
        agent.save(output_dir+'weights'+'{:0.4d}'.format(e)+'.hdf5')
print('Deep Q-Learning Model Trained')
env.close()
        

AttributeError: 'CartPoleEnv' object has no attribute 'reshape'