In [43]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

In [44]:
T.cuda.is_available()

True

In [45]:
class LinearDeepQNetwork(nn.Module):
    def __init__(self,
                 lr,
                 n_actions,
                 input_dims):
        super(LinearDeepQNetwork, self).__init__()
        
        self.fc1 = nn.Linear(*input_dims, 128)
        self.fc2 = nn.Linear(128, n_actions)
        
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    
    def forward(self, state):
        layer1 = F.relu(self.fc1(state))
        actions = self.fc2(layer1)
        
        return actions

In [46]:
class Agent():
    def __init__(self,
                 input_dims,
                 n_actions,
                 gamma=0.99,
                 lr=0.1,
                 initial_epsilon=1,
                 epsilon_decay=0.9995,
                 final_epsilon=0.01):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.action_space = [i for i in range(self.n_actions)]
        
        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)
        
    def choose_action(self, obs):
            if np.random.random() > self.epsilon:
                state = T.tensor(obs, dtype=T.float).to(self.Q.device)
                actions = self.Q.forward(state)
                action = T.argmax(actions).item()
                # print('All Actions: ', actions)
                # print('Selected Action: ', action.numpy())
            else:
                action = np.random.choice(self.action_space)
            
            return action
    
    def decrement_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)
        
    def learn(self, state, action, reward, next_state):
        self.Q.optimizer.zero_grad()
        state = T.tensor(state, dtype=T.float).to(self.Q.device)
        action = T.tensor(action).to(self.Q.device)
        reward = T.tensor(reward).to(self.Q.device)
        next_state = T.tensor(next_state, dtype=T.float).to(self.Q.device)
        
        
        # print(f'states {states.shape}, next_states {next_states.shape}')
        q_pred = self.Q.forward(state)[action]
        # print(f'q_pred: {q_pred}')
        q_next = self.Q.forward(next_state).max()
#         print(f'q_next_max: {q_next}')        
        q_target = reward + self.gamma * q_next
#         print(f'q_target: {q_target}')
        loss = self.Q.loss(q_target, q_pred).to(self.Q.device)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()
            
    

In [None]:
if __name__ == '__main__':
    env = gym.make('CartPole-v1',max_episode_steps=150)
    nb_episodes = 3000
    scores = []
    eps_history = []
    agent = Agent(input_dims= env.observation_space.shape,
                  n_actions=env.action_space.n,
                  lr=0.001
                  )
    
    for i in range (nb_episodes):
        score = 0
        done = False
        obs, _ = env.reset()
        while not done:
            action = agent.choose_action(obs)
            next_obs, reward, terminated,truncated, _ = env.step(action)
            score+= reward
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
            done = truncated or terminated
        scores.append(score)
        eps_history.append(agent.epsilon)
        
        if i % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print('Episode', i, 'score %.1f avg score %.1f epsilon %.3f'% (score, avg_score,agent.epsilon) )
    
            

Episode 0 score 21.0 avg score 21.0 epsilon 0.990
Episode 100 score 11.0 avg score 23.8 epsilon 0.301
Episode 200 score 45.0 avg score 34.7 epsilon 0.053
Episode 300 score 27.0 avg score 33.7 epsilon 0.010
Episode 400 score 9.0 avg score 40.6 epsilon 0.010
Episode 500 score 40.0 avg score 45.2 epsilon 0.010


In [None]:
plt.plot(scores)