In [31]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

In [32]:
# Report whether PyTorch can see a CUDA device. LinearDeepQNetwork performs the
# same check to decide between 'cuda:0' and 'cpu'.
T.cuda.is_available()

True

In [33]:
class LinearDeepQNetwork(nn.Module):
    """Two-layer fully connected Q-network bundled with its optimizer and loss.

    The module owns an Adam optimizer over its own parameters, an MSE loss,
    and the device it was moved to, so a caller only needs this one object
    to run a training step.

    Args:
        lr: Learning rate handed to ``optim.Adam``.
        n_actions: Width of the output layer (one Q-value per action).
        input_dims: Sequence that is unpacked as the leading positional
            argument(s) of the first ``nn.Linear``; in practice a
            one-element shape such as ``(4,)``.
        hidden_dims: Width of the hidden layer. Defaults to 128, the value
            that used to be hard-coded.
    """

    def __init__(self,
                 lr,
                 n_actions,
                 input_dims,
                 hidden_dims=128):
        super().__init__()

        # Layer creation order is kept (fc1, then fc2) so weight
        # initialisation consumes the RNG in the same order as before.
        self.fc1 = nn.Linear(*input_dims, hidden_dims)
        self.fc2 = nn.Linear(hidden_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the output of fc2(relu(fc1(state))) for ``state``.

        ``state`` must already live on ``self.device``.
        """
        hidden = F.relu(self.fc1(state))
        return self.fc2(hidden)

In [34]:
class Agent():
    """Epsilon-greedy Q-learning agent that updates online, one transition at a time.

    Args:
        input_dims: Observation shape forwarded to ``LinearDeepQNetwork``.
        n_actions: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate forwarded to the network's optimizer.
        initial_epsilon: Starting exploration probability.
        epsilon_decay: Amount subtracted from epsilon after every ``learn`` call.
        final_epsilon: Lower bound for epsilon.
    """

    def __init__(self,
                 input_dims,
                 n_actions,
                 gamma=0.99,
                 lr=0.0001,
                 initial_epsilon=1,
                 epsilon_decay=1e-5,
                 final_epsilon=0.01):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.action_space = list(range(self.n_actions))

        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)

    def choose_action(self, obs):
        """Pick an action for ``obs`` with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the arg-max of the network's Q-values. The result is always
        a plain Python ``int``.
        """
        if np.random.random() > self.epsilon:
            state = T.tensor(obs, dtype=T.float).to(self.Q.device)
            # Pure inference: no autograd graph is needed to pick an action.
            with T.no_grad():
                q_values = self.Q.forward(state)
            return T.argmax(q_values).item()

        # np.random.choice yields a NumPy integer; normalise it so both
        # branches return the same type.
        return int(np.random.choice(self.action_space))

    def decrement_epsilon(self):
        """Linearly decay epsilon by ``epsilon_decay``, clamped at ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

    def learn(self, state, action, reward, next_state, done=False):
        """Run one Q-learning update on a single transition, then decay epsilon.

        Args:
            state: Observation before the action.
            action: Index of the action that was taken.
            reward: Scalar reward received.
            next_state: Observation after the action.
            done: Set to True when ``next_state`` is terminal so the target
                does not bootstrap from it. Callers should pass the
                environment's ``terminated`` flag. Defaults to False, which
                keeps the previous always-bootstrap behaviour.
        """
        device = self.Q.device
        state = T.tensor(state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.long).to(device)
        # An explicit float32 keeps the target's dtype equal to the prediction's
        # even when the environment hands back a float64 reward.
        reward = T.tensor(reward, dtype=T.float).to(device)
        next_state = T.tensor(next_state, dtype=T.float).to(device)

        self.Q.optimizer.zero_grad()
        q_pred = self.Q.forward(state)[action]

        # The TD target is a fixed regression label: computing it without
        # gradient tracking stops the update from also dragging the
        # next-state value towards the current prediction.
        with T.no_grad():
            if done:
                q_target = reward
            else:
                q_target = reward + self.gamma * self.Q.forward(next_state).max()

        loss = self.Q.loss(q_pred, q_target)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()
            
    

In [35]:
if __name__ == '__main__':
    # Train an Agent on CartPole-v1, updating after every environment step.
    # `agent`, `scores` and `eps_history` stay at module level because later
    # cells read them.
    env = gym.make('CartPole-v1')
    nb_episodes = 2000
    scores = []
    eps_history = []
    agent = Agent(input_dims=env.observation_space.shape,
                  n_actions=env.action_space.n,
                  lr=0.001)

    try:
        for i in range(nb_episodes):
            score = 0
            done = False
            obs, _ = env.reset()
            while not done:
                action = agent.choose_action(obs)
                next_obs, reward, terminated, truncated, _ = env.step(action)
                score += reward
                agent.learn(obs, action, reward, next_obs)
                obs = next_obs
                # An episode ends either on failure or on the time limit.
                done = truncated or terminated
            scores.append(score)
            eps_history.append(agent.epsilon)

            if i % 100 == 0:
                # Mean over the last (up to) 100 finished episodes.
                avg_score = np.mean(scores[-100:])
                print('Episode', i, 'score %.1f avg score %.1f epsilon %.3f'% (score, avg_score,agent.epsilon) )
    finally:
        # Release the environment even if training is interrupted.
        env.close()
    
            

Episode 0 score 10.0 avg score 10.0 epsilon 1.000
Episode 100 score 32.0 avg score 23.5 epsilon 0.976
Episode 200 score 16.0 avg score 23.9 epsilon 0.953
Episode 300 score 21.0 avg score 21.6 epsilon 0.931
Episode 400 score 15.0 avg score 24.5 epsilon 0.906
Episode 500 score 53.0 avg score 22.9 epsilon 0.884
Episode 600 score 20.0 avg score 23.2 epsilon 0.860
Episode 700 score 65.0 avg score 25.1 epsilon 0.835
Episode 800 score 13.0 avg score 24.6 epsilon 0.811
Episode 900 score 28.0 avg score 27.7 epsilon 0.783
Episode 1000 score 19.0 avg score 25.2 epsilon 0.758
Episode 1100 score 12.0 avg score 28.4 epsilon 0.729
Episode 1200 score 65.0 avg score 30.0 epsilon 0.699
Episode 1300 score 45.0 avg score 28.5 epsilon 0.671
Episode 1400 score 17.0 avg score 36.3 epsilon 0.635
Episode 1500 score 12.0 avg score 37.2 epsilon 0.597
Episode 1600 score 11.0 avg score 32.8 epsilon 0.565
Episode 1700 score 54.0 avg score 36.5 epsilon 0.528
Episode 1800 score 19.0 avg score 33.9 epsilon 0.494
Episo

In [36]:
# Replay the trained agent for a few episodes and draw each rendered frame in
# an interactive matplotlib window.
import matplotlib
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt
env = gym.make('CartPole-v1', render_mode = 'rgb_array')

# Playback should show the learned policy, not exploration noise: act
# greedily for the duration of the replay and restore epsilon afterwards.
training_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    obs, info = env.reset()

    plt.ion()
    fig, ax = plt.subplots(figsize=(8,8))
    action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
    actions = ['Left','Right']
    img = ax.imshow(env.render())
    rewards = 0  # total reward summed over all replayed episodes
    num_epochs= 10  # number of episodes to replay
    for step in range(num_epochs):
        obs, info = env.reset()
        done = False
        while not done:
            action = agent.choose_action(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)

            rewards += reward
            frame = env.render()
            img.set_data(frame)
            action_text.set_text(f'Step: {actions[action] }')

            # Push the updated frame to the window immediately.
            fig.canvas.draw()
            fig.canvas.flush_events()
            done = terminated or truncated
            obs = next_obs
finally:
    # Always restore the agent and release the window/environment, even if
    # the replay is interrupted or rendering fails.
    agent.epsilon = training_epsilon
    plt.ioff()  # Turn off interactive mode
    # plt.show()  # Keep the window open after the animation finishes
    plt.close()
    env.close()

In [41]:
# Learning curve: total reward collected in each training episode.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
ax.set_title('CartPole-v1 training scores')
plt.show()