In [None]:
!pip install gym



In [1]:
import gym

In [2]:
env = gym.make('CartPole-v0')

## ***Lets define our agent***

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam 
import random

In [4]:
class Agent:
  def __init__(self,state_size,action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = deque(maxlen=2000)
    self.gamma = 0.95  # dicount factor
    ## exploration Vs explitation tradeoff
    self.epsilon = 1.0 ## 100% exploration in begining 
    self.epsilon_decay = 0.95
    self.epsilon_min = 0.01
    self.learning_rate = 0.001
    self.model  = self._create_model()
  
  def _create_model(self):
    ## Lets define our Model
    model = Sequential()
    model.add(Dense(24,input_dim=self.state_size,activation='relu'))
    model.add(Dense(24,activation='relu'))
    model.add(Dense(self.action_size,activation='linear'))
    model.compile(loss='mse',optimizer=Adam(lr=0.001))
    return model
  
  def remember(self,state,action,reward,next_state,done):
    ## we add details to our memory
    self.memory.append((state,action,reward,next_state,done))
  
  def act(self,state):
    ## basis of epsilon greddy method
    if np.random.rand() <= self.epsilon:
      return random.randrange(self.action_size)
    ## else we will go for neural to predict the step
    return np.argmax(self.model.predict(state)[0])

  ## time to train our model
  def train(self,batch_size=32):
    ## we will train using replay buffer

    minibatch = random.sample(self.memory,batch_size)
    for exp in minibatch:
      state,action,reward,next_state,done = exp

      ## if game is not completed
      if not done:
        ## we will use bellman eq to approxx the target value of reward
        target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
      else:
        target = reward
      target_f = self.model.predict(state)
      target_f[0][action] = target 

      ## lets train our model with x as sample and result as target_f
      self.model.fit(state,target_f,epochs=1,verbose=0)

      # decay our epislon value
      if self.epsilon > self.epsilon_min:
        self.epsilon*=self.epsilon_decay
  def sav(self,name):
    self.model.save_weights(name)
    
  def load(self,name):
    self.model.load_weights(name)
  
  











### ***Let's Train our model*** (Game Time)

In [5]:
episode = 300
output_dir = 'carpole_model/'

In [7]:
!ls

carpole_model  sample_data


In [8]:
agent = Agent(state_size=4,action_size=2)
done = False
state_size=4

In [9]:
for e in range(episode):
  state = env.reset()
  state = np.reshape(state,[1,state_size])

  for t in range(5000):
    #env.render()
    action = agent.act(state)
    next_state,reward,done,other_info = env.step(action)
    reward = reward if not done else -10
    next_state = np.reshape(next_state,[1,state_size])
    agent.remember(state,action,reward,next_state,done) ## experience for the agent
    state = next_state
    
    if done:
      print("Game episode{}/{} score :{} rl :{}".format(e,episode,t,agent.epsilon))
      break

  if (len(agent.memory))>32:
      agent.train(32)

  if e%50==0:
      agent.sav(output_dir+'weights_'+'{:04d}'.format(e)+'.hdf5')
  
print("model trained")
env.close()





Game episode0/300 score :16 rl :1.0
Game episode1/300 score :18 rl :1.0
Game episode2/300 score :13 rl :0.1937114844585008
Game episode3/300 score :33 rl :0.03752413921111601
Game episode4/300 score :21 rl :0.009888364709658948
Game episode5/300 score :30 rl :0.009888364709658948
Game episode6/300 score :34 rl :0.009888364709658948
Game episode7/300 score :32 rl :0.009888364709658948
Game episode8/300 score :61 rl :0.009888364709658948
Game episode9/300 score :42 rl :0.009888364709658948
Game episode10/300 score :71 rl :0.009888364709658948
Game episode11/300 score :69 rl :0.009888364709658948
Game episode12/300 score :60 rl :0.009888364709658948
Game episode13/300 score :37 rl :0.009888364709658948
Game episode14/300 score :23 rl :0.009888364709658948
Game episode15/300 score :34 rl :0.009888364709658948
Game episode16/300 score :27 rl :0.009888364709658948
Game episode17/300 score :17 rl :0.009888364709658948
Game episode18/300 score :23 rl :0.009888364709658948
Game episode19/300 sc