In [45]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

In [46]:
T.cuda.is_available()

True

In [47]:
class LinearDeepQNetwork(nn.Module):
    def __init__(self,
                 lr,
                 n_actions,
                 input_dims):
        super(LinearDeepQNetwork, self).__init__()
        
        self.fc1 = nn.Linear(*input_dims, 256)
        self.fc2 = nn.Linear(256,n_actions)
        
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    
    def forward(self, state):
        
        layer1 = F.relu(self.fc1(state))
        actions = self.fc2(layer1)
        
        return actions

In [48]:
class Agent():
    def __init__(self,
                 input_dims,
                 n_actions,
                 gamma=0.99,
                 lr=0.001,
                 initial_epsilon=1,
                 epsilon_decay=1e-4,
                 final_epsilon=0.01):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.action_space = [i for i in range(self.n_actions)]
        
        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)
        
    def choose_action(self, obs):
            if np.random.random() > self.epsilon:
                state = T.tensor(obs, dtype=T.float).to(self.Q.device)
                actions = self.Q.forward(state)
                action = T.argmax(actions).item()
                # print('All Actions: ', actions)
                # print('Selected Action: ', action.numpy())
            else:
                action = np.random.choice(self.action_space)
            
            return action
    
    def decrement_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)
        
    def learn(self, state, action, reward, next_state):
        self.Q.optimizer.zero_grad()
        state = T.tensor(state, dtype=T.float).to(self.Q.device)
        action = T.tensor(action).to(self.Q.device)
        reward = T.tensor(reward).to(self.Q.device)
        next_state = T.tensor(next_state, dtype=T.float).to(self.Q.device)
        
        
        # print(f'states {states.shape}, next_states {next_states.shape}')
        q_pred = self.Q.forward(state)[action]
        # print(f'q_pred: {q_pred}')
        q_next = self.Q.forward(next_state).max()
#         print(f'q_next_max: {q_next}')        
        q_target = reward + self.gamma * q_next
#         print(f'q_target: {q_target}')
        loss = self.Q.loss(q_target, q_pred).to(self.Q.device)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()
            
    

In [49]:
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    nb_episodes = 10000
    scores = []
    eps_history = []
    agent = Agent(input_dims= env.observation_space.shape,
                  n_actions=env.action_space.n,
                  lr=0.0001
                  )
    avg_score = 0
    for i in range (nb_episodes):
        if avg_score > 100 : 
            break
        else:
            score = 0
            done = False
            obs, _ = env.reset()
            while not done:
                action = agent.choose_action(obs)
                next_obs, reward, terminated,truncated, _ = env.step(action)
                score+= reward
                agent.learn(obs, action, reward, next_obs)
                obs = next_obs
                done = truncated or terminated
            scores.append(score)
            eps_history.append(agent.epsilon)
            
            if i % 100 == 0:
                avg_score = np.mean(scores[-100:])
                
                print('Episode', i, 'score %.1f avg score %.1f epsilon %.3f'% (score, avg_score,agent.epsilon) )
        
            

Episode 0 score 10.0 avg score 10.0 epsilon 0.999
Episode 100 score 13.0 avg score 21.3 epsilon 0.786
Episode 200 score 12.0 avg score 21.9 epsilon 0.566
Episode 300 score 17.0 avg score 24.7 epsilon 0.320
Episode 400 score 15.0 avg score 29.5 epsilon 0.025
Episode 500 score 47.0 avg score 47.6 epsilon 0.010
Episode 600 score 32.0 avg score 49.5 epsilon 0.010
Episode 700 score 58.0 avg score 52.7 epsilon 0.010
Episode 800 score 40.0 avg score 50.7 epsilon 0.010
Episode 900 score 44.0 avg score 43.2 epsilon 0.010
Episode 1000 score 46.0 avg score 44.9 epsilon 0.010
Episode 1100 score 52.0 avg score 46.1 epsilon 0.010
Episode 1200 score 28.0 avg score 43.7 epsilon 0.010
Episode 1300 score 58.0 avg score 44.8 epsilon 0.010
Episode 1400 score 30.0 avg score 44.3 epsilon 0.010
Episode 1500 score 59.0 avg score 46.3 epsilon 0.010
Episode 1600 score 38.0 avg score 55.6 epsilon 0.010
Episode 1700 score 92.0 avg score 45.7 epsilon 0.010
Episode 1800 score 46.0 avg score 50.7 epsilon 0.010
Episo

In [51]:
import matplotlib
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt
env = gym.make('CartPole-v1', render_mode = 'rgb_array')
obs, info = env.reset()

plt.ion()
fig, ax = plt.subplots(figsize=(8,8))
action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
actions = ['Left','Right']
img = ax.imshow(env.render())
rewards = 0
num_epochs= 10
for step in range(num_epochs):
    obs, info = env.reset()
    done = False
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        
        rewards += reward
        frame = env.render()
        img.set_data(frame)
        action_text.set_text(f'Step: {actions[action] }')

        fig.canvas.draw()
        fig.canvas.flush_events()
        done = terminated or truncated
        obs = next_obs

plt.ioff()  # Turn off interactive mode
# plt.show()  # Keep the window open after the animation finishes
plt.close()
env.close()

TclError: can't invoke "update" command: application has been destroyed

In [None]:
plt.plot(scores)
plt.show()