In [1]:
import numpy as np
import os
from collections import deque
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random
import gymnasium as gym

In [8]:
class Agent:
    """Epsilon-greedy Q-learning agent backed by a small dense Keras network.

    Transitions are stored in a bounded replay memory and replayed in random
    minibatches by `train`. States are passed straight to `model.predict` /
    `model.fit`; the notebook supplies them reshaped to ``(1, state_size)``.

    Args:
        state_size: Number of features in one observation (network input width).
        action_size: Number of discrete actions (network output width).
    """

    def __init__(self,state_size =4,action_size = 2):
        self.state_size = state_size
        self.action_size = action_size
        # Replay memory; the oldest transitions are dropped past 2000 entries.
        self.memory = deque(maxlen = 2000)
        # Discount factor applied to the bootstrapped next-state value.
        self.gamma = 0.95
        # Exploration rate. The attribute name (sic) is part of the public
        # surface -- other cells assign to it -- so the spelling is kept.
        self.eplison = 1
        # Multiplicative decay applied once per `train` call, floored at eps_min.
        self.eps_decay = 0.995
        self.eps_min = 0.01
        self.model = self.createModel()

    def createModel(self):
        """Build and compile the Q-network: two 24-unit ReLU layers and a linear head.

        Returns:
            A compiled Keras ``Sequential`` model mapping a state to one
            Q-value per action.
        """
        # Layers are taken from the same `keras` namespace as the model so a
        # single Keras implementation is used throughout.
        model = keras.Sequential()
        model.add(keras.Input(shape=(self.state_size,)))
        model.add(keras.layers.Dense(24,activation='relu'))
        model.add(keras.layers.Dense(24,activation='relu'))
        model.add(keras.layers.Dense(self.action_size,activation='linear'))

        # Pass a loss *instance*: `compile` documents a string, a Loss instance
        # or a function; a bare class is not accepted by every Keras version.
        model.compile(loss=keras.losses.MeanSquaredError(),optimizer=keras.optimizers.Adam())
        return model

    def remember(self,state,action,next_state,reward,done):
        """Append one transition to the replay memory.

        Note the argument order: ``next_state`` comes before ``reward``.
        """
        self.memory.append((state,action,next_state,reward,done))

    def act(self,state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation in the form accepted by ``model.predict``.

        Returns:
            A Python ``int`` action index in ``range(action_size)``.
        """
        # Strict `<` so that eplison == 0 is purely greedy; np.random.random()
        # draws from [0, 1), so eplison == 1 still always explores.
        if np.random.random() < self.eplison:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state,verbose = 0)[0]
        return int(np.argmax(q_values))

    def train(self,batch_size = 32):
        """Replay a random minibatch and fit the network on each transition.

        For every sampled transition the target for the taken action is the
        reward when ``done`` is true, otherwise
        ``reward + gamma * max(Q(next_state))``. Afterwards the exploration
        rate is decayed once, never dropping below ``eps_min``.

        Args:
            batch_size: Number of transitions to sample. If the memory holds
                fewer than this, the call returns without training or decaying.
        """
        # random.sample raises ValueError when asked for more items than exist;
        # treat "not enough experience yet" as a no-op instead.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory,batch_size)
        # Each transition is predicted and fitted on its own, so later samples
        # see the weights already updated by earlier ones in the same batch.
        for state,action,next_state,reward,done in minibatch:
            target_f = self.model.predict(state,verbose=0)
            if done:
                target = reward
            else:
                target = reward + self.gamma*np.max(self.model.predict(next_state,verbose = 0)[0])
            # Only the taken action's Q-value is moved toward the target.
            target_f[0][action] = target

            self.model.fit(state,target_f,epochs=1,verbose=0)

        # Clamp so the final decay step cannot undershoot the floor. An
        # exploration rate already at or below eps_min is left untouched.
        if self.eplison > self.eps_min:
            self.eplison = max(self.eps_min,self.eplison * self.eps_decay)

In [9]:
# Seed Python's `random`, NumPy and the Keras backend before the agent (and
# therefore its network weights) is created, so action sampling, minibatch
# sampling and weight initialisation start from a fixed state.
SEED = 42
keras.utils.set_random_seed(SEED)

n_episode = 600  # number of training episodes
agent = Agent(state_size=4,action_size=2)
env = gym.make('CartPole-v1',render_mode = 'rgb_array')
# Minimum number of stored transitions before the training loop starts calling
# agent.train(); train() itself is called with its own default minibatch size.
batch_size = 500

In [10]:
# Training: play n_episode episodes, store every transition in the agent's
# replay memory, and fit once after each episode once the memory is warm.
for e in range(n_episode):
    state,info = env.reset()
    state = np.reshape(state,(1,-1))  # add a leading batch axis of 1

    for i in range(500):
        action = agent.act(state)
        next_state,reward,done,truncate,info = env.step(action)
        # Replace the reward with a large penalty on the step that reports done.
        if done:
            reward = -100
        next_state = np.reshape(next_state,(1,-1))
        agent.remember(state,action,next_state,reward,done)
        state = next_state

        if not (done or truncate):
            continue
        print(f'Episode: {e}/{n_episode} Score: {i+1}')
        break

    # Skip learning until at least `batch_size` transitions are stored.
    # train() is then called with its default minibatch size.
    if len(agent.memory) < batch_size:
        continue
    agent.train()

Episode: 0/600 Score: 13
Episode: 1/600 Score: 20
Episode: 2/600 Score: 13
Episode: 3/600 Score: 16
Episode: 4/600 Score: 17
Episode: 5/600 Score: 26
Episode: 6/600 Score: 19
Episode: 7/600 Score: 8
Episode: 8/600 Score: 15
Episode: 9/600 Score: 14
Episode: 10/600 Score: 24
Episode: 11/600 Score: 11
Episode: 12/600 Score: 38
Episode: 13/600 Score: 37
Episode: 14/600 Score: 29
Episode: 15/600 Score: 19
Episode: 16/600 Score: 55
Episode: 17/600 Score: 16
Episode: 18/600 Score: 10
Episode: 19/600 Score: 65
Episode: 20/600 Score: 26
Episode: 21/600 Score: 39
Episode: 22/600 Score: 13
Episode: 23/600 Score: 20
Episode: 24/600 Score: 14
Episode: 25/600 Score: 13
Episode: 26/600 Score: 18
Episode: 27/600 Score: 28
Episode: 28/600 Score: 15
Episode: 29/600 Score: 13
Episode: 30/600 Score: 15
Episode: 31/600 Score: 17
Episode: 32/600 Score: 10
Episode: 33/600 Score: 15
Episode: 34/600 Score: 17
Episode: 35/600 Score: 16
Episode: 36/600 Score: 13
Episode: 37/600 Score: 19
Episode: 38/600 Score: 

In [15]:
# Release the environment currently bound to `env`. Read top to bottom, that is
# the environment created with gym.make for training; the next cell rebinds `env`.
env.close()

In [14]:
# Evaluation: run one greedy episode with on-screen rendering.
env = gym.make('CartPole-v1',render_mode = 'human')

# Exploration is switched off only for this episode. The previous rate is put
# back afterwards, because Agent.train only decays while eplison > eps_min:
# a rate left at 0 would stay at 0 if training were run again.
saved_eplison = agent.eplison
agent.eplison = 0

try:
    state,info = env.reset()
    state = state.reshape((1,-1))

    for i in range(500):
        action = agent.act(state)
        next_state,reward,done,truncate,info = env.step(action)
        # Evaluation does not write to the replay memory, so no reward
        # shaping or agent.remember call is needed here.
        state = next_state.reshape((1,-1))

        if done or truncate:
            print(f'Score: {i+1}')
            break
finally:
    agent.eplison = saved_eplison
    # Close the render window even if the episode is interrupted.
    env.close()

Score: 410
