In [47]:
import gymnasium as gym
import random
import numpy as np
import pandas as pd
from collections import deque

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from keras.optimizers import Adam

In [48]:
env = gym.make("CartPole-v1", render_mode="rgb_array_list")

state_size, = env.observation_space.shape
action_size = env.action_space.n

gameover = False
episodes = 1_000


In [277]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.model = self._create_model()
        self.memory = deque(maxlen=2000)
        
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.001
        self.gamma = 0.98
        
    def _create_model(self):
        model = Sequential()
        model.add(Input(self.state_size))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='mse')
        return model
    
    def action(self, state):
        # random actions with epsilon chances
        if np.random.rand() <= self.epsilon:
            return env.action_space.sample()
        # predicted action based on NN (brain)
        else:
            return np.argmax(self.model.predict(state, 
                                                verbose=0, 
                                                use_multiprocessing=True)[0])
        
    def store(self, state, reward, gameover, next_state):
        experience = pd.DataFrame([[state, action, reward, gameover, next_state]],
                                  columns=['state', 'action', 'reward', 'gameover', 'next_state'])
        self.memory.append(experience)
        
    def train(self, buffer_size=64):
        """
        neural network model will be trained on env states (input) and predicted_qsa.
        objective - optimize the predicted_qsa to close to actual qsa
        predicted_qsa = reward + gamma * max(predicted_qs'a')
        """
        buffer = random.sample(self.memory, buffer_size) # this will return a list
        nexperience = pd.concat(buffer)
        
        # psa shape is (batch size, action size)
        qsa = np.where(nexperience.gameover, # where condition
                       nexperience.reward, # gameover is true
                       nexperience.reward + self.gamma * 
                       np.argmax(
                           self.model.predict(
                               np.vstack(nexperience.next_state),
                               verbose=0,),
                           axis=1))
    
        predicted_qsa = self.model.predict(np.vstack(nexperience.state),
                                           verbose=0, 
                                           use_multiprocessing=True)
        
        predicted_qsa[np.arange(buffer_size), np.vstack(nexperience.action)] = qsa
        
        self.model.fit(np.vstack(nexperience.state),
                       predicted_qsa,
                       epochs=1,
                       verbose=0,
                       use_multiprocessing=True)

In [275]:
agent = Agent(state_size, action_size)

In [182]:
agent.model.predict(env.reset()[0].reshape(1, -1)).shape



(1, 2)

In [69]:
np.argmax(p, axis=1)

array([0], dtype=int64)

In [276]:
for e in range(episodes):
    state = env.reset()[0].reshape((1, -1))
    for time in range(1000):
        action = agent.action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        gameover = terminated | truncated
        # penalize reward
        if gameover: reward = -1
        # store experience
        next_state = next_state.reshape((1, -1)) # reshape next_state array before storing experience
        agent.store(state, reward, gameover, next_state)
        
        if gameover:
            print("Episode #{} of {} | score: {} | exploration rate: {:.2}".format(e, episodes, time, agent.epsilon))
            break
            
    if len(agent.memory) > 100:
        agent.train()
        
print("model trained")

Episode #0 of 1000 | score: 19 | exploration rate: 1.0
Episode #1 of 1000 | score: 11 | exploration rate: 1.0
Episode #2 of 1000 | score: 17 | exploration rate: 1.0
Episode #3 of 1000 | score: 43 | exploration rate: 1.0
Episode #4 of 1000 | score: 16 | exploration rate: 1.0
done
Episode #5 of 1000 | score: 24 | exploration rate: 1.0
done
Episode #6 of 1000 | score: 14 | exploration rate: 1.0
done
Episode #7 of 1000 | score: 20 | exploration rate: 1.0
done
Episode #8 of 1000 | score: 40 | exploration rate: 1.0
done
Episode #9 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #10 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #11 of 1000 | score: 21 | exploration rate: 1.0
done
Episode #12 of 1000 | score: 21 | exploration rate: 1.0
done
Episode #13 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #14 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #15 of 1000 | score: 18 | exploration rate: 1.0
done
Episode #16 of 1000 | score: 59 | exploration rate: 1.

done
Episode #135 of 1000 | score: 17 | exploration rate: 1.0
done
Episode #136 of 1000 | score: 32 | exploration rate: 1.0
done
Episode #137 of 1000 | score: 58 | exploration rate: 1.0
done
Episode #138 of 1000 | score: 35 | exploration rate: 1.0
done
Episode #139 of 1000 | score: 24 | exploration rate: 1.0
done
Episode #140 of 1000 | score: 57 | exploration rate: 1.0
done
Episode #141 of 1000 | score: 22 | exploration rate: 1.0
done
Episode #142 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #143 of 1000 | score: 8 | exploration rate: 1.0
done
Episode #144 of 1000 | score: 20 | exploration rate: 1.0
done
Episode #145 of 1000 | score: 114 | exploration rate: 1.0
done
Episode #146 of 1000 | score: 37 | exploration rate: 1.0
done
Episode #147 of 1000 | score: 16 | exploration rate: 1.0
done
Episode #148 of 1000 | score: 38 | exploration rate: 1.0
done
Episode #149 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #150 of 1000 | score: 24 | exploration rate: 1.0
done
Epi

done
Episode #268 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #269 of 1000 | score: 26 | exploration rate: 1.0
done
Episode #270 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #271 of 1000 | score: 11 | exploration rate: 1.0
done
Episode #272 of 1000 | score: 9 | exploration rate: 1.0
done
Episode #273 of 1000 | score: 9 | exploration rate: 1.0
done
Episode #274 of 1000 | score: 16 | exploration rate: 1.0
done
Episode #275 of 1000 | score: 47 | exploration rate: 1.0
done
Episode #276 of 1000 | score: 32 | exploration rate: 1.0
done
Episode #277 of 1000 | score: 14 | exploration rate: 1.0
done
Episode #278 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #279 of 1000 | score: 14 | exploration rate: 1.0
done
Episode #280 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #281 of 1000 | score: 38 | exploration rate: 1.0
done
Episode #282 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #283 of 1000 | score: 12 | exploration rate: 1.0
done
Episo

done
Episode #401 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #402 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #403 of 1000 | score: 32 | exploration rate: 1.0
done
Episode #404 of 1000 | score: 17 | exploration rate: 1.0
done
Episode #405 of 1000 | score: 11 | exploration rate: 1.0
done
Episode #406 of 1000 | score: 18 | exploration rate: 1.0
done
Episode #407 of 1000 | score: 34 | exploration rate: 1.0
done
Episode #408 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #409 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #410 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #411 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #412 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #413 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #414 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #415 of 1000 | score: 19 | exploration rate: 1.0
done
Episode #416 of 1000 | score: 12 | exploration rate: 1.0
done
Epi

done
Episode #534 of 1000 | score: 15 | exploration rate: 1.0
done
Episode #535 of 1000 | score: 17 | exploration rate: 1.0
done
Episode #536 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #537 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #538 of 1000 | score: 19 | exploration rate: 1.0
done
Episode #539 of 1000 | score: 21 | exploration rate: 1.0
done
Episode #540 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #541 of 1000 | score: 17 | exploration rate: 1.0
done
Episode #542 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #543 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #544 of 1000 | score: 28 | exploration rate: 1.0
done
Episode #545 of 1000 | score: 14 | exploration rate: 1.0
done
Episode #546 of 1000 | score: 15 | exploration rate: 1.0
done
Episode #547 of 1000 | score: 19 | exploration rate: 1.0
done
Episode #548 of 1000 | score: 22 | exploration rate: 1.0
done
Episode #549 of 1000 | score: 18 | exploration rate: 1.0
done
Epi

done
Episode #667 of 1000 | score: 16 | exploration rate: 1.0
done
Episode #668 of 1000 | score: 24 | exploration rate: 1.0
done
Episode #669 of 1000 | score: 22 | exploration rate: 1.0
done
Episode #670 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #671 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #672 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #673 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #674 of 1000 | score: 39 | exploration rate: 1.0
done
Episode #675 of 1000 | score: 26 | exploration rate: 1.0
done
Episode #676 of 1000 | score: 20 | exploration rate: 1.0
done
Episode #677 of 1000 | score: 21 | exploration rate: 1.0
done
Episode #678 of 1000 | score: 15 | exploration rate: 1.0
done
Episode #679 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #680 of 1000 | score: 42 | exploration rate: 1.0
done
Episode #681 of 1000 | score: 24 | exploration rate: 1.0
done
Episode #682 of 1000 | score: 18 | exploration rate: 1.0
done
Epi

done
Episode #800 of 1000 | score: 10 | exploration rate: 1.0
done
Episode #801 of 1000 | score: 8 | exploration rate: 1.0
done
Episode #802 of 1000 | score: 13 | exploration rate: 1.0
done
Episode #803 of 1000 | score: 12 | exploration rate: 1.0
done
Episode #804 of 1000 | score: 38 | exploration rate: 1.0
done
Episode #805 of 1000 | score: 22 | exploration rate: 1.0
done
Episode #806 of 1000 | score: 21 | exploration rate: 1.0
done
Episode #807 of 1000 | score: 11 | exploration rate: 1.0
done
Episode #808 of 1000 | score: 27 | exploration rate: 1.0
done
Episode #809 of 1000 | score: 23 | exploration rate: 1.0
done
Episode #810 of 1000 | score: 9 | exploration rate: 1.0
done
Episode #811 of 1000 | score: 18 | exploration rate: 1.0
done
Episode #812 of 1000 | score: 24 | exploration rate: 1.0
done


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "Z:\Google\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-276-4f3c52543ba0>", line 18, in <module>
    agent.train()
  File "<ipython-input-274-cf716cc31ce5>", line 58, in train
    predicted_qsa = self.model.predict(np.vstack(nexperience.state),
  File "Z:\Google\Anaconda\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "Z:\Google\Anaconda\lib\site-packages\keras\engine\training.py", line 2220, in predict
    data_handler = data_adapter.get_data_handler(
  File "Z:\Google\Anaconda\lib\site-packages\keras\engine\data_adapter.py", line 1582, in get_data_handler
    return DataHandler(*args, **kwargs)
  File "Z:\Google\Anaconda\lib\site-packages\keras\engine\data_adapter.py", line 1262, in __init__
    self._adapter = adapter_cls(
  File "Z:\Google\Anaconda\lib\site-pa

TypeError: object of type 'NoneType' has no len()

In [247]:
f, ff = agent.train()

done


In [265]:
ff.shape

(64, 2)

In [249]:
f[0:4]

array([1.98, 1.  , 1.98, 1.98])

In [250]:
g = f
g[g!=1.] = 0
g = np.asarray(g, dtype=int)

In [259]:
ff[0, g] = 

In [281]:
np.amax(ff, axis=1)

array([ 0.00201042, -0.00918985, -0.00918985,  0.00067103,  0.00961511,
        0.00201042, -0.00918985,  0.00201042,  0.01340952,  0.00961511,
        0.01592626,  0.01340952,  0.00067103, -0.00918985, -0.00918985,
        0.00201042, -0.00918985,  0.00201042,  0.00067103, -0.00918985,
        0.00201042,  0.00961511, -0.00918985,  0.01592626, -0.00918985,
        0.00201042,  0.00201042, -0.00918985,  0.00067103,  0.00201042,
        0.00067103,  0.00067103,  0.01340952,  0.01340952,  0.01340952,
        0.01340952,  0.00201042, -0.00918985, -0.00918985, -0.00918985,
        0.01340952,  0.00201042, -0.00918985,  0.00201042,  0.00201042,
        0.00201042,  0.01592626,  0.00961511,  0.00067103,  0.00067103,
        0.00201042,  0.00201042,  0.01340952,  0.00201042,  0.00201042,
        0.01592626, -0.00918985,  0.01592626,  0.00961511, -0.00918985,
        0.00067103,  0.00201042, -0.00918985,  0.00067103], dtype=float32)

In [282]:
np.max(ff, axis=1)

array([ 0.00201042, -0.00918985, -0.00918985,  0.00067103,  0.00961511,
        0.00201042, -0.00918985,  0.00201042,  0.01340952,  0.00961511,
        0.01592626,  0.01340952,  0.00067103, -0.00918985, -0.00918985,
        0.00201042, -0.00918985,  0.00201042,  0.00067103, -0.00918985,
        0.00201042,  0.00961511, -0.00918985,  0.01592626, -0.00918985,
        0.00201042,  0.00201042, -0.00918985,  0.00067103,  0.00201042,
        0.00067103,  0.00067103,  0.01340952,  0.01340952,  0.01340952,
        0.01340952,  0.00201042, -0.00918985, -0.00918985, -0.00918985,
        0.01340952,  0.00201042, -0.00918985,  0.00201042,  0.00201042,
        0.00201042,  0.01592626,  0.00961511,  0.00067103,  0.00067103,
        0.00201042,  0.00201042,  0.01340952,  0.00201042,  0.00201042,
        0.01592626, -0.00918985,  0.01592626,  0.00961511, -0.00918985,
        0.00067103,  0.00201042, -0.00918985,  0.00067103], dtype=float32)