# Deep Q-Learning for Lunar Landing

## Part-0 Installing required packages and importing libraries

### Installing Gymnasium

In [None]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0
Collecting ale-py>=0.9 (from gymnasium[accept-rom-license,atari])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collec

### Importing the libraries

In [None]:
import os
import random
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque,namedtuple

## Part-1 Building the AI

### Creating the Architecture of the Neural Network

In [None]:
class Network(nn.Module):
    """Fully connected Q-network: a state vector in, one value per action out.

    Inherits from ``nn.Module`` like every PyTorch model. The architecture is
    two hidden layers of 64 units, each followed by a ReLU, and a linear
    output layer with ``action_size`` units.
    """

    def __init__(self, state_size, action_size, seed=42):
        """Create the three linear layers.

        Args:
            state_size: number of input features.
            action_size: number of outputs.
            seed: passed to ``torch.manual_seed`` before the layers are built,
                so the same seed yields the same initial weights across runs.
        """
        super().__init__()
        # Seed first, then build the layers; the return value of
        # torch.manual_seed is kept on the instance.
        self.seed = torch.manual_seed(seed)
        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)    # input -> first hidden layer
        self.fc2 = nn.Linear(hidden_units, hidden_units)  # hidden -> hidden
        self.fc3 = nn.Linear(hidden_units, action_size)   # hidden -> output

    def forward(self, state):
        """Return the output-layer activations for ``state``."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

## Part-2 Training the AI

### Setting up the environment

In [None]:
import gymnasium as gym

# Create the Lunar Lander environment and read off the sizes the agent needs.
env = gym.make('LunarLander-v3')
state_shape = env.observation_space.shape   # shape of one observation
state_size = state_shape[0]                 # number of features in one observation
number_actions = env.action_space.n         # number of discrete actions

# Report the sizes, one labelled line each.
for label, value in (
    ("State shape: ", state_shape),
    ("State size: ", state_size),
    ("Number of actions: ", number_actions),
):
    print(label, value)

State shape:  (8,)
State size:  8
Number of actions:  4


### Initializing the hyperparameters

In [None]:
# Hyperparameters shared by the replay buffer and the DQN agent.
learning_rate = 0.0005            # alpha: step size given to the optimizer
minibatch_size = 100              # number of experiences sampled from the replay buffer
discount_factor = 0.99            # gamma: weight of future rewards in the Q-target
replay_buffer_size = 100_000      # maximum number of experiences kept in the replay buffer
interpolation_parameter = 0.001   # tau: mixing weight used by the soft target-network update

### Implementing Experience Replay

In [None]:
class ReplayMemory():
    """Bounded first-in-first-out store of experiences with random sampling.

    Each stored event is indexed as
    ``(state, action, reward, next_state, done)`` when a batch is built.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` events."""
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.memory = []

    def push(self, event):
        """Append ``event`` and drop the oldest entry if capacity is exceeded."""
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct events and stack them into tensors.

        Returns:
            ``(states, next_states, actions, rewards, dones)`` -- note that
            ``next_states`` comes second. All tensors live on ``self.device``;
            ``actions`` is long, the others are float.
        """
        drawn = random.sample(self.memory, k=batch_size)
        batch = [event for event in drawn if event is not None]

        def column(index):
            # Stack field `index` of every event into one array, a row per event.
            return np.vstack([event[index] for event in batch])

        def as_float(array):
            return torch.from_numpy(array).float().to(self.device)

        states = as_float(column(0))
        actions = torch.from_numpy(column(1)).long().to(self.device)
        rewards = as_float(column(2))
        next_states = as_float(column(3))
        # Done flags mark whether the episode had ended; stored as 0.0 / 1.0.
        dones = as_float(column(4).astype(np.uint8))
        return states, next_states, actions, rewards, dones



### Implementing the DQN class

In [None]:
class Agent():
    """DQN agent with experience replay and a softly updated target network.

    The class reads the module-level hyperparameters ``learning_rate``,
    ``minibatch_size``, ``discount_factor``, ``replay_buffer_size`` and
    ``interpolation_parameter`` and uses the ``Network`` and ``ReplayMemory``
    classes defined in this notebook.
    """

    # A learning update is attempted once every `update_every` calls to step().
    update_every = 4

    def __init__(self, state_size, action_size):
        """Build the two Q-networks, the optimizer and the replay buffer.

        Args:
            state_size: number of inputs of the neural network.
            action_size: number of outputs of the neural network.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
        # Local network: estimates Q-values for the current state and is the
        # one trained by the optimizer.
        self.local_qnetwork = Network(state_size, action_size).to(self.device)
        # Target network: supplies the Q-targets during training and only
        # changes through softupdate().
        self.target_qnetwork = Network(state_size, action_size).to(self.device)
        # The optimizer updates the weights of the local network.
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr=learning_rate)
        self.memory = ReplayMemory(replay_buffer_size)
        # Counts calls to step(), wrapping around at `update_every`.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``update_every`` calls, learn from a batch."""
        self.memory.push((state, action, reward, next_state, done))
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step != 0:
            return
        # Only learn once the buffer holds more experiences than one minibatch,
        # and sample exactly that minibatch size (previously a hard-coded 100).
        if len(self.memory.memory) > minibatch_size:
            experiences = self.memory.sample(minibatch_size)
            self.learn(experiences, discount_factor)

    def act(self, state, epsilon=0.):
        """Choose an action epsilon-greedily.

        Args:
            state: NumPy array holding one observation.
            epsilon: when ``random.random()`` is not greater than this value a
                random action is returned instead of the greedy one.

        Returns:
            The chosen action index as a NumPy integer.
        """
        # unsqueeze(0) adds a leading batch dimension of size one.
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        # Switch to evaluation mode for the prediction, then back to training mode.
        self.local_qnetwork.eval()
        with torch.no_grad():  # no gradients are needed for a prediction
            action_values = self.local_qnetwork(state)
        self.local_qnetwork.train()

        if random.random() > epsilon:
            # Greedy: index of the largest predicted Q-value.
            return np.argmax(action_values.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, discout_factor):
        """Run one gradient step on a sampled batch, then soft-update the target.

        Args:
            experiences: ``(states, next_states, actions, rewards, dones)`` as
                returned by ``ReplayMemory.sample``.
            discout_factor: discount applied to the next-state value. The
                spelling is kept so existing keyword callers keep working;
                the value is now actually used instead of the global
                ``discount_factor``.
        """
        states, next_states, actions, rewards, dones = experiences
        # max(1)[0] is the largest target Q-value per row; unsqueeze(1) gives it
        # the same (batch, 1) shape as the gathered predictions below.
        next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
        # (1 - dones) removes the bootstrap term for transitions flagged done.
        q_target = rewards + discout_factor * next_q_targets * (1 - dones)
        # Q-values the local network predicts for the actions that were taken.
        q_expected = self.local_qnetwork(states).gather(1, actions)
        loss = F.mse_loss(q_expected, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.softupdate(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

    def softupdate(self, local_model, target_model, interpolation_parameter):
        """Blend ``local_model``'s weights into ``target_model`` in place.

        Each target parameter becomes
        ``tau * local + (1 - tau) * target`` with ``tau = interpolation_parameter``.
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(
                    interpolation_parameter * local_param
                    + (1.0 - interpolation_parameter) * target_param
                )




### Initializing the DQN agent

In [None]:
# Build the agent for the environment's observation and action sizes.
agent = Agent(state_size=state_size, action_size=number_actions)

### Training the DQN agent

In [None]:
number_of_episodes = 2000
maximum_number_timesteps_per_episode = 1000  # cap on the number of actions taken in one episode
epsilon_starting_value = 1.0   # start fully random
epsilon_ending_value = 0.01    # never explore less than 1% of the time
eplison_decay_value = 0.995    # epsilon is multiplied by this after every episode
epsilon = epsilon_starting_value
scores_on_100_episodes = deque(maxlen=100)  # scores of the most recent 100 episodes

for episode in range(1, number_of_episodes + 1):
    # Reset to an initial state at the start of every episode; the info dict is ignored.
    state, _ = env.reset()
    score = 0
    for t in range(maximum_number_timesteps_per_episode):
        action = agent.act(state, epsilon)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Only `terminated` is stored as the done flag, as before; `truncated`
        # is used solely to stop stepping an episode the environment has ended.
        agent.step(state, action, reward, next_state, terminated)
        state = next_state
        score += reward
        if terminated or truncated:
            break
    scores_on_100_episodes.append(score)
    epsilon = max(epsilon_ending_value, eplison_decay_value * epsilon)

    average_score = np.mean(scores_on_100_episodes)
    print('\rEpisode {}\tAverage Score: {:.2f}\t'.format(episode, average_score), end="")
    if episode % 100 == 0:
        # Re-print with a newline so every 100th line stays visible.
        print('\rEpisode {}\tAverage Score: {:.2f}\t'.format(episode, average_score))
    # Only declare success once the average covers a full 100-episode window.
    window_is_full = len(scores_on_100_episodes) == scores_on_100_episodes.maxlen
    if window_is_full and average_score >= 200.0:
        print('\nEnvironment solved in {:d} episodes!\tEpisode {:d}\tAverage Score: {:.2f}\t'.format(episode-1, episode, average_score), end="")
        torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
        break



Episode 100	Average Score: -165.41	
Episode 200	Average Score: -110.33	
Episode 300	Average Score: -35.84	
Episode 400	Average Score: -17.42	
Episode 500	Average Score: 2.74	
Episode 600	Average Score: 59.25	
Episode 700	Average Score: 193.79	
Episode 712	Average Score: 202.74	
Environment solved in 711 episodes!	Episode 712	Average Score: 202.74	

## Part-3 Visualising the results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display

def show_video_of_model(agent, env_name):
    """Run one greedy episode and save its rendered frames to ``video.mp4``.

    Args:
        agent: object whose ``act(state)`` returns an action exposing ``.item()``.
        env_name: environment id passed to ``gym.make``.

    The episode ends when the environment reports either ``terminated`` or
    ``truncated``; the environment is closed even if a step raises.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            frames.append(env.render())
            action = agent.act(state)
            state, _, terminated, truncated, _ = env.step(action.item())
            # Stop on truncation too, otherwise an episode that never
            # terminates would keep this loop running.
            done = terminated or truncated
    finally:
        env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

# Record one episode of the trained agent to video.mp4.
show_video_of_model(agent=agent, env_name='LunarLander-v3')

def show_video():
    """Embed the first ``*.mp4`` file in the working directory in the notebook.

    The file is read in binary mode, base64-encoded and displayed through an
    HTML ``<video>`` tag. Prints a message when no ``.mp4`` file is found.
    """
    mp4list = glob.glob('*.mp4')
    if not mp4list:
        print("Could not find video")
        return
    # Read-only access is enough, and the context manager closes the handle.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Display the first .mp4 file found in the working directory inline in the notebook.
show_video()

