In [88]:
import gymnasium as gym
import numpy as np
from torch import nn
import torch
from collections import deque
import random

In [89]:
# Seed every RNG the notebook draws from so a fresh-kernel run is repeatable:
# torch (weight init), numpy (epsilon-greedy coin flips) and the stdlib
# `random` module (replay-buffer minibatch sampling uses random.sample).
SEED=42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [90]:
class DQN(nn.Module):
    """Fully connected Q-network with two 128-unit ReLU hidden layers.

    Args:
        state: number of input features (size of the observation vector).
        action: number of outputs (one Q-value per discrete action).
    """

    def __init__(self, state,action):
        super().__init__()
        width=128
        # Attribute names double as state_dict keys, so they are kept stable.
        self.input=nn.Linear(state,width)
        self.hidden=nn.Linear(width,width)
        self.output=nn.Linear(width,action)

    def forward(self,x):
        """Return the raw (un-activated) output of the last linear layer for ``x``."""
        features=self.input(x).relu()
        features=self.hidden(features).relu()
        return self.output(features)

In [91]:
class Replaymemory:
    """Bounded FIFO store of transitions with uniform random sampling.

    Once ``maxlen`` items are held, appending a new one drops the oldest.
    """

    def __init__(self,maxlen):
        self.maxlen=maxlen
        self.memory=deque(maxlen=maxlen)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def append(self,transition):
        """Add ``transition`` at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self,sample_size):
        """Return ``sample_size`` distinct stored transitions chosen uniformly at random."""
        picked=random.sample(self.memory,sample_size)
        return picked


In [92]:
class Hyperparameters:
    """Training constants read by ``Agent`` when it is constructed."""

    # --- optimisation ---
    learning_rate=1e-4        # step size handed to the Adam optimizer
    discount_factor=0.99      # gamma applied to the bootstrapped next-state value

    # --- replay / target network ---
    batch_size=32             # transitions drawn per gradient step
    memory_size=100_000       # capacity of the replay buffer
    target_update_freq=100    # environment steps between target-network syncs

    # --- epsilon-greedy exploration ---
    epsilon=1.0               # starting probability of taking a random action
    epsilon_end=1e-2          # floor for epsilon
    epsilon_dec=0.9954        # multiplicative decay applied once per episode


In [None]:
class Agent:
    """DQN / Double-DQN agent for Gymnasium's ``LunarLander-v3``.

    Owns the online (policy) network, the target network, the replay buffer
    and the training hyperparameters. ``run`` trains or evaluates for a number
    of episodes; ``optimize`` performs one gradient step on a sampled batch.
    """

    # Checkpoint loaded by ``run(is_train=False)`` when no ``model_path`` is
    # given. Kept relative so the notebook is not tied to one machine's drive.
    DEFAULT_MODEL_PATH="lunarlander_dqn.pt"

    def __init__(self,is_train=True,DDQN=True):
        """Build networks, replay buffer and hyperparameters.

        Args:
            is_train: accepted for backward compatibility. All state is now
                built regardless of its value, because ``run`` needs
                ``policy_net`` and ``device`` even for evaluation-only use.
            DDQN: stored on ``self.DDQN``; ``run`` takes its own ``DDQN`` flag.
        """
        hp=Hyperparameters()
        self.epsilon=hp.epsilon
        self.epsilon_dec=hp.epsilon_dec
        self.epsilon_end=hp.epsilon_end
        self.batch_size=hp.batch_size
        self.memory_size=hp.memory_size
        self.target_update_freq=hp.target_update_freq
        self.lossfn=nn.SmoothL1Loss()
        self.optimizer=None  # created in run() when training starts
        self.learning_rate=hp.learning_rate
        self.discount_factor=hp.discount_factor

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Only used here to read the observation/action sizes; run() creates
        # its own environment so that rendering can be toggled per call.
        self.env = gym.make("LunarLander-v3")
        obs_shape = self.env.observation_space.shape[0]
        n_actions = self.env.action_space.n

        self.policy_net = DQN(obs_shape, n_actions).to(self.device)
        self.target_net = DQN(obs_shape, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = Replaymemory(self.memory_size)
        self.DDQN=DDQN

    def run(self,is_train=True,render=False,episodes=100,DDQN=True,model_path=None):
        """Train or evaluate the agent and return the per-episode returns.

        Args:
            is_train: if True, act epsilon-greedily, store transitions, take a
                gradient step every environment step once the buffer holds a
                batch, and decay epsilon after each episode. If False, load a
                checkpoint and act greedily without touching training state.
            render: open the environment with ``render_mode="human"``.
            episodes: number of episodes to play.
            DDQN: use the Double-DQN target in ``optimize`` (training only).
            model_path: checkpoint to load when ``is_train`` is False;
                defaults to ``DEFAULT_MODEL_PATH``.

        Returns:
            list of total reward per episode, in order.
        """
        solved_reward=200  # minimum rolling mean before a "best" checkpoint is written
        window=10          # episodes averaged for the best-checkpoint test

        env=gym.make("LunarLander-v3",render_mode="human" if render else None)
        #env=gym.make("CartPole-v1",render_mode="human")

        policy_net=self.policy_net

        # try/finally so the environment (and any render window) is released
        # even if training is interrupted or a checkpoint fails to load.
        try:
            if is_train:
                policy_net.train()
                memory=self.memory
                Target_net=self.target_net
                Target_net.load_state_dict(policy_net.state_dict())
                self.optimizer=torch.optim.Adam(policy_net.parameters(),lr=self.learning_rate)
                stepcount=0
            else:
                checkpoint=self.DEFAULT_MODEL_PATH if model_path is None else model_path
                policy_net.load_state_dict(torch.load(checkpoint,map_location=self.device))
                policy_net.eval()

            reward_list=[]
            best_mean=float("-inf")  # best rolling mean that has been checkpointed so far

            for i in range(episodes):
                state_main=env.reset()[0]
                reward_ep=0
                done=False
                while not done:
                    state=torch.tensor(state_main,dtype=torch.float32,device=self.device).unsqueeze(0)

                    # Explore only while training; evaluation is always greedy.
                    if is_train and np.random.random()<self.epsilon:
                        action=env.action_space.sample()
                    else:
                        with torch.no_grad():
                            action=policy_net(state).argmax().item()

                    new_state,reward_move,term,trunc,_=env.step(action)
                    reward_ep+=reward_move
                    done=term or trunc
                    state_main=new_state

                    if is_train:
                        action_tens=torch.tensor([action],dtype=torch.int64,device=self.device)
                        new_state_tens=torch.tensor(new_state,dtype=torch.float32,device=self.device).unsqueeze(0)
                        reward_tens=torch.tensor(reward_move,dtype=torch.float32,device=self.device)
                        # Only true termination stops bootstrapping; a time-limit
                        # truncation still has a meaningful next-state value.
                        term_tens=torch.tensor(term,dtype=torch.float32,device=self.device)
                        memory.append((state,action_tens,new_state_tens,reward_tens,term_tens))
                        stepcount+=1

                        if len(memory)>=self.batch_size:
                            batch=memory.sample(self.batch_size)
                            self.optimize(batch,policy_net,Target_net,DDQN)
                            if stepcount%self.target_update_freq==0:
                                Target_net.load_state_dict(policy_net.state_dict())

                # Decay (clamped at the floor) only when training, so an
                # evaluation run never overwrites the exploration schedule.
                if is_train:
                    self.epsilon=max(self.epsilon_end,self.epsilon*self.epsilon_dec)
                reward_list.append(reward_ep)

                # Checkpoint when a full window of recent episodes is both
                # "solved" and at least as good as the best window saved so far.
                if is_train and len(reward_list)>=window:
                    recent_mean=float(np.mean(reward_list[-window:]))
                    if recent_mean>=solved_reward and recent_mean>=best_mean:
                        best_mean=recent_mean
                        torch.save(policy_net.state_dict(),"dqn_lunarlander_best.pth")
                        print(f"Updated in episode {i+1}")

                print(f"Episode: {i+1}, Reward: {reward_ep}, Epsilon: {self.epsilon}")
            return reward_list
        finally:
            env.close()

    def optimize(self,batch,policy_net,Target_net,DDQN=True):
        """Take one gradient step on ``policy_net`` from a batch of transitions.

        Args:
            batch: iterable of ``(state, action, next_state, reward, terminated)``
                tensor tuples as stored by ``run``.
            policy_net: online network being trained.
            Target_net: network used to evaluate next-state values.
            DDQN: if True, pick the next action with ``policy_net`` and score
                it with ``Target_net``; otherwise use ``Target_net``'s max.
        """
        states,actions,new_state,reward,term=zip(*batch)

        states=torch.cat(states)
        actions=torch.stack(actions).view(-1,1).long()
        new_state=torch.cat(new_state)
        # Everything below is kept as a (batch, 1) column so that the target
        # lines up element-wise with curr_q instead of broadcasting.
        reward=torch.stack(reward).view(-1,1)
        # stack (not torch.tensor) keeps the flags on the device they were stored on
        term=torch.stack(term).view(-1,1).float()

        curr_q=policy_net(states).gather(1,actions)

        with torch.no_grad():
            if DDQN:
                best_nxt_act=policy_net(new_state).argmax(1,keepdim=True)
                tar_q_val=Target_net(new_state).gather(1,best_nxt_act)
            else:
                tar_q_val=Target_net(new_state).max(1,keepdim=True)[0]
            # (1-term) zeroes the bootstrap term for terminal transitions
            tar_q=reward+(1-term)*self.discount_factor*tar_q_val

        loss=self.lossfn(curr_q,tar_q)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
        self.optimizer.step()


In [None]:
# Build the agent, then train it for 1000 episodes with the vanilla DQN
# target (DDQN=False) and rendering switched off.
my_agent=Agent(is_train=True)

train_rewards=my_agent.run(
    is_train=True,
    render=False,
    episodes=1000,
    DDQN=False,
)

In [95]:
# Persist the policy network's weights as they stand at this point to the
# working directory.
torch.save(my_agent.policy_net.state_dict(),"lunarlander_dqn.pt")


In [97]:
# Greedy evaluation with rendering. The checkpoint is addressed relative to
# the working directory (the same file name the save cell writes) instead of
# an absolute, machine-specific path, so the notebook runs on any machine.
test_rewards=my_agent.run(is_train=False,render=True,episodes=10,DDQN=False,model_path="lunarlander_dqn.pt")

Episode: 1, Reward: 228.9654320216352, Epsilon: 0.01
Episode: 2, Reward: 240.5855753099405, Epsilon: 0.01
Episode: 3, Reward: 273.16228421625084, Epsilon: 0.01
Episode: 4, Reward: 267.93088403329654, Epsilon: 0.01
Episode: 5, Reward: 255.16356367525353, Epsilon: 0.01
Episode: 6, Reward: 258.6962581653863, Epsilon: 0.01
Episode: 7, Reward: 229.08476278498898, Epsilon: 0.01
Episode: 8, Reward: 269.2746911755635, Epsilon: 0.01
Episode: 9, Reward: 252.4660003509502, Epsilon: 0.01
Episode: 10, Reward: 260.24100509464995, Epsilon: 0.01
