<a href="https://colab.research.google.com/github/PandisDP/Deep-Reinforcement-Learning/blob/main/DRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
import torch
import numpy as np
import torch as th
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from itertools import count
import torch.nn.functional as F
from collections import namedtuple
from IPython import display
import torch
import random
import math

In [None]:
class Field:
    '''
    Square grid-world environment for a pickup-and-delivery task.

    The agent moves on a size x size grid, must pick an item up at
    item_pickup and drop it off at item_dropoff. Some cells ("block zones")
    are penalised when entered. Rewards and the encoded state are returned
    as one-element tensors placed on `device`.

    Actions: 0 down (y+1), 1 up (y-1), 2 left (x-1), 3 right (x+1),
    4 pickup, 5 dropoff.
    '''

    # (dx, dy) displacement applied to the position for each movement action.
    _MOVES = {0: (0, 1), 1: (0, -1), 2: (-1, 0), 3: (1, 0)}

    def __init__(self,device,size,item_pickup,item_dropoff,
                start_position,zones_blocks=None,path_predicts='Episodes'):
        '''
        Constructor of the class Field
        Parameters:
        device: torch device on which state/reward tensors are created
        size: size of the field
        item_pickup: position of the item to pickup
        item_dropoff: position of the item to dropoff
        start_position: position of the agent
        zones_blocks: list of tuples with the positions of the blocks
                      (None means "no blocks")
        path_predicts: path to save the images of the episodes
        '''
        self.device=device
        self.size = size
        self.item_pickup = item_pickup
        self.item_dropoff = item_dropoff
        self.position = start_position
        self.position_start= start_position
        # A fresh list per instance: a mutable default argument would be
        # shared by every Field created without explicit block zones.
        self.block_zones= [] if zones_blocks is None else zones_blocks
        self.item_in_car= False
        self.number_of_actions=6
        self.allposicions = []
        self.path_predicts= path_predicts
        self.done=False
        self.save_state()
        # Snapshot used by reset() to restore the starting configuration.
        self.initial_state = {
            'device': self.device,
            'position': self.position,
            'item_pickup': self.item_pickup,
            'item_dropoff': self.item_dropoff,
            'item_in_car': self.item_in_car
        }

    def reset(self):
        '''
        Reset the game to the configuration captured at construction time
        and restart the visited-positions history.
        '''
        self.device = self.initial_state['device']
        self.position = self.initial_state['position']
        self.item_pickup = self.initial_state['item_pickup']
        self.item_dropoff = self.initial_state['item_dropoff']
        self.item_in_car = self.initial_state['item_in_car']
        self.done=False
        self.allposicions = []
        self.save_state()

    def get_number_of_actions(self):
        '''
        Get the number of actions of the game
        Returns: number of actions
        '''
        return self.number_of_actions

    def get_number_of_states(self):
        '''
        Get the number of states of the game
        Returns: number of states
        '''
        return (self.size**4)*2

    def get_state(self):
        '''
        Get the state of the game
        Returns: one-element tensor holding a single integer that encodes
                 the agent position, the pickup position and whether the
                 item is in the car (mixed-radix encoding, base `size`,
                 with the item flag as the lowest bit).
        '''
        state = self.position[0]
        state = state*self.size + self.position[1]
        state = state*self.size + self.item_pickup[0]
        state = state*self.size + self.item_pickup[1]
        state = state*2 + (1 if self.item_in_car else 0)
        return torch.tensor([state],device=self.device)

    def save_state(self):
        '''
        Append the current position to the history of visited positions
        '''
        self.allposicions.append(self.position)

    def graphics(self,puntos,name_fig):
        '''
        Create a plot of the game and save it inside path_predicts
        Parameters:
        puntos: list of tuples with the positions of the points
        name_fig: name of the figure
        '''
        # Build a grid matching the field size and mark the given points.
        grid = np.zeros((self.size, self.size))
        for punto in puntos:
            grid[punto] = 1
        fig, ax = plt.subplots()
        # 'Greys' colour map: marked cells are dark, empty cells are white.
        ax.imshow(grid, cmap='Greys', origin='lower')
        # Put the ticks on the cell borders so the grid lines outline cells.
        ticks = np.arange(-.5, self.size, 1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.grid(color='black', linestyle='-', linewidth=2)
        # Fix the limits so the border cells are not clipped.
        ax.set_xlim(-0.5, self.size - 0.5)
        ax.set_ylim(-0.5, self.size - 0.5)
        for punto in self.block_zones:
            ax.scatter(punto[1], punto[0], color='red', marker='X', s=100)
        for punto in puntos:
            ax.text(punto[1], punto[0], '✔', color='white', ha='center', va='center', fontsize=10)

        lst_start=[self.position_start, self.item_pickup,self.item_dropoff]
        for punto in lst_start:
            ax.scatter(punto[1], punto[0], color='blue',marker='*', s=100)
        # Make sure the output folder exists before writing the image.
        if self.path_predicts:
            os.makedirs(self.path_predicts, exist_ok=True)
        name_fig_path = self.path_predicts + '/' +name_fig
        fig.savefig(name_fig_path)
        plt.close(fig)

    def empty_predict_data(self):
        '''
        Empty the folder of the predictions.
        Nothing is done when the folder does not exist; a failure to delete
        one entry is reported and the remaining entries are still processed.
        '''
        path=self.path_predicts
        if not os.path.isdir(path):
            return
        for nombre in os.listdir(path):
            ruta_completa = os.path.join(path, nombre)
            try:
                if os.path.isfile(ruta_completa) or os.path.islink(ruta_completa):
                    os.remove(ruta_completa)
                elif os.path.isdir(ruta_completa):
                    shutil.rmtree(ruta_completa)
            except Exception as e:
                print(f'Error {ruta_completa}. reason: {e}')

    def block_zones_evaluation(self,position):
        '''
        Evaluate if the position is in a block zone
        Parameters:
        position: position to evaluate
        Returns: True if the position is in a block zone, False otherwise
        '''
        return position in self.block_zones

    def _move(self,dx,dy):
        '''
        Try to move the agent by (dx, dy).
        Returns: -10 if the move would leave the grid (position unchanged),
                 -100 if the new cell is a block zone, -1 otherwise.
        '''
        x, y = self.position
        new_x, new_y = x + dx, y + dy
        if not (0 <= new_x < self.size and 0 <= new_y < self.size):
            return -10
        self.position = (new_x, new_y)
        self.save_state()
        if self.block_zones_evaluation(self.position):
            return -100
        return -1

    def _pickup(self):
        '''
        Try to pick the item up at the current position.
        Returns: 20 on success, -10 if the item is already in the car or
                 the agent is not on the pickup cell.
        '''
        if self.item_in_car or self.item_pickup != self.position:
            return -10
        self.item_in_car = True
        return 20

    def _dropoff(self):
        '''
        Try to drop the item at the current position; success ends the game.
        Returns: 20 on success, -10 if the car is empty or the agent is not
                 on the dropoff cell.
        '''
        if not self.item_in_car or self.item_dropoff != self.position:
            return -10
        self.item_in_car = False
        self.done=True
        return 20

    def make_action(self,action):
        '''
        Make an action in the game
        Parameters:
        action: action to make (int or one-element tensor, 0..5)
        Returns: reward (one-element tensor on self.device), done
        Raises: ValueError if the action is not one of the six known actions
        '''
        # int() accepts both plain integers and one-element tensors.
        action = int(action)
        if action in self._MOVES:
            val_return = self._move(*self._MOVES[action])
        elif action == 4:
            val_return = self._pickup()
        elif action == 5:
            val_return = self._dropoff()
        else:
            raise ValueError(
                f'Unknown action {action}; expected 0..{self.number_of_actions - 1}')
        return torch.tensor([val_return],device=self.device),self.done




In [None]:
class QValues():
    '''
    Static helpers that compute the Q-values used in the DQN loss.
    '''
    # Default device, kept for callers that read QValues.device. The methods
    # below follow the device of the tensors they receive instead.
    device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #device = th.device("mps" if th.backends.mps.is_available() else "cpu")

    @staticmethod
    def get_current(policy_net,states,actions):
        '''
        Q-value predicted by policy_net for the action taken in each state.
        Parameters:
        policy_net: callable mapping a batch of states to (batch, actions)
        states: batch of states
        actions: 1-D tensor with the index of the action taken per state
        Returns: tensor of shape (batch, 1)
        '''
        return policy_net(states).gather(dim=1,index=actions.unsqueeze(-1))

    @staticmethod
    def get_next(target_net,next_states,is_done):
        '''
        Maximum Q-value predicted by target_net for each next state.
        Terminal states (is_done True) keep a value of 0.
        Parameters:
        target_net: callable mapping a batch of states to (batch, actions)
        next_states: batch of next states
        is_done: boolean tensor, True where the transition ended the episode
        Returns: 1-D tensor with one value per next state
        '''
        # Allocate on the same device as the batch so the result never
        # mismatches the tensors it is combined with.
        device = next_states.device
        non_final_mask = ~is_done.to(device)
        next_q_values = torch.zeros(len(next_states), device=device)
        non_final_next_states = next_states[non_final_mask]
        if len(non_final_next_states)>0:
            with torch.no_grad():
                next_q_values[non_final_mask] = target_net(non_final_next_states).max(dim=1)[0]
        return next_q_values
        
class DQN(nn.Module):
    '''
    Fully connected Q-network: feature_size -> 128 -> 256 -> 128 -> num_actions,
    with ReLU after every hidden layer and a linear output layer.
    '''
    def __init__(self,feature_size, num_actions):
        '''
        Parameters:
        feature_size: number of input features per state
        num_actions: number of Q-values produced per state
        '''
        super().__init__()
        narrow, wide = 128, 256
        self.fc1 = nn.Linear(feature_size, narrow)
        self.fc2 = nn.Linear(narrow, wide)
        self.fc3 = nn.Linear(wide, narrow)
        self.out = nn.Linear(narrow, num_actions)

    def forward(self,t):
        '''
        Parameters:
        t: batch of states; a 1-D tensor is treated as a batch of
           single-feature states
        Returns: tensor with one row of Q-values per state
        '''
        # A 1-D input gets a trailing feature axis before the first layer.
        x = t.unsqueeze(1) if t.dim() == 1 else t
        x = x.float()
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden(x))
        return self.out(x)


In [None]:

class Agent():
    '''
    Epsilon-greedy agent: explores with a random action or exploits the
    policy network, according to the rate given by the strategy.
    '''
    def __init__(self,strategy,num_actions,device):
        '''
        Parameters:
        strategy: object exposing get_exploration_rate(step)
        num_actions: number of actions available
        device: device on which the selected action tensor is placed
        '''
        self.strategy = strategy
        self.num_actions = num_actions
        self.device = device
        self.step = 0  # number of actions selected so far

    def select_action(self,state,policy_net):
        '''
        Pick the next action and advance the internal step counter.
        Parameters:
        state: current state, passed to policy_net when exploiting
        policy_net: callable returning Q-values with actions on dim 1
        Returns: tensor with the selected action, on self.device
        '''
        epsilon = self.strategy.get_exploration_rate(self.step)
        self.step += 1
        exploring = random.random() < epsilon
        if not exploring:
            # Exploit: take the action with the highest predicted Q-value.
            with torch.no_grad():
                return policy_net(state).argmax(dim=1).to(self.device)
        # Explore: uniformly random action.
        chosen = random.randrange(self.num_actions)
        return torch.tensor([chosen]).to(self.device)

class EpsilonGreedyStrategy():
    '''
    Exponentially decaying exploration rate, from `start` towards `end`.
    '''
    def __init__(self,start,end,decay):
        '''
        Parameters:
        start: exploration rate at step 0
        end: value the rate decays towards
        decay: decay constant applied per step
        '''
        self.start, self.end, self.decay = start, end, decay

    def get_exploration_rate(self,step):
        '''
        Parameters:
        step: number of steps taken so far
        Returns: end + (start - end) * exp(-step * decay)
        '''
        span = self.start - self.end
        decay_factor = math.exp(-step*self.decay)
        return self.end + span*decay_factor

class ReplayMemory():
    '''
    Fixed-capacity experience buffer with uniform random sampling.
    Once full, new experiences overwrite the oldest ones in a ring.
    '''
    def __init__(self,capacity):
        '''
        Parameters:
        capacity: maximum number of experiences kept
        '''
        self.capacity= capacity
        self.memory= []
        self.count=0  # total number of experiences pushed so far

    def push(self,exp):
        '''
        Store an experience, overwriting the oldest one when full.
        '''
        if len(self.memory)< self.capacity:
            self.memory.append(exp)
        else:
            self.memory[self.count%self.capacity]=exp
        self.count+=1

    def sample(self,batch_size):
        '''
        Draw batch_size distinct experiences uniformly at random.
        Returns: (experiences, 0, 0); the two zeros stand in for the
                 indexes and weights returned by PrioritizedReplayMemory.
        '''
        return random.sample(self.memory,batch_size),0,0

    def can_provide_sample(self,batch_size):
        '''
        Returns: True when at least batch_size experiences are stored
        '''
        # random.sample only needs len(memory) == batch_size, so use >=.
        return len(self.memory)>= batch_size

class PrioritizedReplayMemory():
    '''
    Prioritized experience replay backed by a SumTree.
    Experiences are sampled with probability proportional to their priority
    (error + epsilon) ** alpha, and importance-sampling weights controlled
    by beta are returned with every batch.
    '''
    def __init__(self, capacity, alpha=0.6):
        '''
        Parameters:
        capacity: maximum number of experiences kept
        alpha: exponent applied to the error when computing the priority
        '''
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.alpha = alpha
        self.beta = 0.4
        self.beta_increment_per_sampling = 0.001
        self.epsilon = 1e-6
        # Number of experiences currently stored. The tree's write cursor
        # wraps to 0 when the buffer fills up, so it cannot be used for this.
        self.n_entries = 0

    def _get_priority(self, error):
        '''
        Convert an error (scalar or array) into a scalar priority.
        '''
        # Use the magnitude so a negative error cannot yield NaN under a
        # fractional exponent, and collapse arrays to a plain float.
        magnitude = float(np.max(np.abs(np.asarray(error, dtype=np.float64))))
        return (magnitude + self.epsilon) ** self.alpha

    def push(self, error, sample):
        '''
        Store `sample` with a priority derived from `error`.
        '''
        p = self._get_priority(error)
        self.tree.add(p, sample)
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def sample(self, batch_size):
        '''
        Draw batch_size experiences, one from each equal slice of the
        total priority mass.
        Returns: (batch, tree indexes, importance-sampling weights scaled
                 so that the largest weight is 1)
        Raises: ValueError if nothing has been stored yet
        '''
        total = self.tree.total()
        if self.n_entries == 0 or total <= 0:
            raise ValueError('Cannot sample from an empty PrioritizedReplayMemory')
        batch = []
        idxs = []
        priorities = []
        segment = total / batch_size
        self.beta = min(1., self.beta + self.beta_increment_per_sampling)
        for i in range(batch_size):
            s = random.uniform(segment * i, segment * (i + 1))
            # Clamp against floating point drift at the segment borders.
            s = min(max(s, 0), total)
            (idx, p, data) = self.tree.get(s)
            batch.append(data)
            idxs.append(idx)
            priorities.append(p)

        sampling_probabilities = np.asarray(priorities, dtype=np.float64) / total
        sampling_probabilities += self.epsilon
        # Any constant scale factor here cancels in the normalisation below.
        is_weights = np.power(total * sampling_probabilities, -self.beta)
        is_weights /= is_weights.max()
        return batch, idxs, is_weights

    def update(self, idx, error):
        '''
        Recompute the priority stored at tree index `idx` from `error`.
        '''
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def can_provide_sample(self, batch_size):
        '''
        Returns: True when at least batch_size experiences are stored
        '''
        return self.n_entries >= batch_size


class SumTree:
    '''
    Binary tree stored in a flat array in which every internal node holds
    the sum of its children and the leaves hold the priorities.

    Layout: `tree` has 2*capacity-1 nodes, the leaves occupy indexes
    capacity-1 .. 2*capacity-2, and leaf i stores the priority of data[i].
    '''
    def __init__(self, capacity):
        '''
        Parameters:
        capacity: number of leaves / stored items (must be >= 1)
        Raises: ValueError if capacity is smaller than 1
        '''
        if capacity < 1:
            raise ValueError('capacity must be at least 1')
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0  # next data slot to (over)write

    def _propagate(self, idx, change):
        '''
        Add `change` to every ancestor of node `idx`, up to the root.
        '''
        # Iterative walk: also correct when idx is the root (capacity == 1),
        # where there is no ancestor to update.
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        '''
        Descend from node `idx` to the leaf whose cumulative priority
        interval contains `s`.
        Returns: index of a leaf node
        '''
        n_nodes = len(self.tree)
        while True:
            left = 2 * idx + 1
            if left >= n_nodes:
                return idx  # reached a leaf
            right = left + 1
            left_sum = self.tree[left]
            right_sum = self.tree[right]
            # Never stop on an internal node and never walk into an empty
            # subtree while the sibling holds priority mass: rounding can
            # make `s` slightly exceed the left sum, and s == 0 must not
            # select an empty left branch.
            if right_sum <= 0 or (left_sum > 0 and s <= left_sum):
                idx = left
            else:
                s -= left_sum
                idx = right

    def total(self):
        '''
        Returns: sum of all priorities (value of the root)
        '''
        return self.tree[0]

    def add(self, p, data):
        '''
        Store `data` with priority `p` in the next slot, overwriting the
        oldest entry once the tree is full.
        '''
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        '''
        Set the priority of tree node `idx` to `p` and refresh the sums.
        '''
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        '''
        Parameters:
        s: value in [0, total()]
        Returns: (tree index, priority, data) of the selected leaf
        '''
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return idx, self.tree[idx], self.data[dataIdx]

In [None]:
from QL import DQN,QValues
import numpy as np
import torch as th
import matplotlib.pyplot as plt
from itertools import count
import torch.nn.functional as F
from collections import namedtuple
from IPython import display

# One environment transition as stored in the replay memories.
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'next_state', 'reward', 'is_done'],
)

class QDQ:
    '''
    DQN trainer. Owns a policy network and a target network and trains the
    policy network against an environment, using either a uniform replay
    memory or a prioritized replay memory.
    '''
    def __init__(self,device,enviroment,agent,memory_strategy,features):
        '''
        Parameters:
        device: torch device on which the networks live
        enviroment: environment exposing reset/get_state/make_action/
                    get_number_of_actions
        agent: object exposing select_action(state, policy_net)
        memory_strategy: replay memory matching the training method used
        features: number of input features of the networks
        '''
        self.device= device
        self.env= enviroment
        self.memory= memory_strategy
        self.features= features
        self.agent= agent
        self.policy_net= DQN(self.features,self.env.get_number_of_actions()).to(self.device)
        self.target_net= DQN(self.features,self.env.get_number_of_actions()).to(self.device)

    def training_priorized_memory(self,batch_size,gamma,target_update,learning_rate,num_episodes):
        '''
        Train with a prioritized replay memory.
        Parameters:
        batch_size: number of experiences per optimisation step
        gamma: discount factor
        target_update: optimisation steps between target network syncs
        learning_rate: Adam learning rate
        num_episodes: number of episodes to play
        Returns: list with the duration (last timestep index) of each episode
        '''
        print('Training Process with Prioritized Memory')
        optimizer = self.__start_training(learning_rate)
        episode_durations=[]
        total_timesteps = 0
        for episode in range(num_episodes):
            self.env.reset()
            episode_losses=[]
            for timestep in count():
                state= self.env.get_state()
                action= self.agent.select_action(state,self.policy_net)
                reward,done= self.env.make_action(action)
                next_state= self.env.get_state()
                # Initial priority: TD error of the new transition.
                error = self.__td_error(state,action,next_state,reward,done,gamma)
                self.memory.push(error, Experience(state, action, next_state, reward, done))
                if self.memory.can_provide_sample(batch_size):
                    experiences,idxs,is_weights= self.memory.sample(batch_size)
                    loss_value,errors = self.__optimize(experiences,gamma,optimizer,is_weights)
                    # Refresh the priorities of the sampled transitions.
                    for idx, sampled_error in zip(idxs, errors):
                        self.memory.update(idx, sampled_error)
                    episode_losses.append(loss_value)
                    total_timesteps += 1
                    self.__sync_target(total_timesteps,target_update)
                if done:
                    episode_durations.append(timestep)
                    print("Episode: ",episode," Average_Losses: ",self.__mean_loss(episode_losses),
                        " Duration: ",timestep)
                    break
        return episode_durations

    def training_replay_memory(self,batch_size,gamma,target_update,learning_rate,num_episodes):
        '''
        Train with a uniform replay memory.
        Parameters:
        batch_size: number of experiences per optimisation step
        gamma: discount factor
        target_update: optimisation steps between target network syncs
        learning_rate: Adam learning rate
        num_episodes: number of episodes to play
        Returns: list with the duration (last timestep index) of each episode
        '''
        print('Training Process with Replay Memory')
        optimizer = self.__start_training(learning_rate)
        episode_durations=[]
        total_timesteps = 0
        for episode in range(num_episodes):
            self.env.reset()
            episode_losses=[]
            for timestep in count():
                state= self.env.get_state()
                action= self.agent.select_action(state,self.policy_net)
                reward,done= self.env.make_action(action)
                next_state= self.env.get_state()
                self.memory.push(Experience(state,action,next_state,reward,done))
                if self.memory.can_provide_sample(batch_size):
                    experiences,*_= self.memory.sample(batch_size)
                    loss_value,_ = self.__optimize(experiences,gamma,optimizer)
                    episode_losses.append(loss_value)
                    total_timesteps += 1
                    self.__sync_target(total_timesteps,target_update)
                if done:
                    episode_durations.append(timestep)
                    print("Episode: ",episode," Average_Losses: ",self.__mean_loss(episode_losses),
                        " Duration: ",timestep)
                    break
        return episode_durations

    def __start_training(self,learning_rate):
        '''
        Copy the policy weights into the target network, put the target
        network in eval mode and build the optimizer for the policy network.
        '''
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        return th.optim.Adam(self.policy_net.parameters(),lr=learning_rate)

    def __sync_target(self,total_timesteps,target_update):
        '''
        Copy the policy weights into the target network every
        target_update optimisation steps.
        '''
        if total_timesteps % target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def __mean_loss(self,episode_losses):
        '''
        Average loss of the episode; NaN when no optimisation step ran.
        '''
        if not episode_losses:
            return float('nan')
        return np.mean(episode_losses)

    def __td_error(self,state,action,next_state,reward,done,gamma):
        '''
        Absolute temporal-difference error of a single transition.
        '''
        with th.no_grad():
            # Build the index on the networks' device to avoid CPU/GPU mixes.
            action_index = th.as_tensor(action,dtype=th.long,device=self.device).view(1, 1)
            current_q_value = self.policy_net(state).gather(1, action_index)
            next_q_value = self.target_net(next_state).max(1)[0].unsqueeze(1)
            not_done = 0.0 if done else 1.0
            target_q_value = reward + (gamma * next_q_value * not_done)
            return abs(current_q_value - target_q_value).item()

    def __optimize(self,experiences,gamma,optimizer,is_weights=None):
        '''
        Run one optimisation step of the policy network on a batch.
        Parameters:
        experiences: list of Experience tuples
        gamma: discount factor
        optimizer: optimizer of the policy network
        is_weights: optional per-sample importance-sampling weights
        Returns: (loss value, 1-D numpy array of absolute TD errors)
        '''
        states,actions,rewards,next_states,is_done= self.__extract_tensors(experiences)
        current_q_values= QValues.get_current(self.policy_net,states,actions)
        with th.no_grad():
            next_q_values= QValues.get_next(self.target_net,next_states,is_done)
        target_q_values= ((next_q_values*gamma)+rewards).unsqueeze(1)
        per_sample_loss = F.mse_loss(current_q_values,target_q_values,reduction='none')
        if is_weights is not None:
            weights = th.as_tensor(is_weights,dtype=th.float,
                                   device=per_sample_loss.device).unsqueeze(1)
            per_sample_loss = weights * per_sample_loss
        loss = per_sample_loss.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .cpu() is required before .numpy() when training on a GPU.
        errors = th.abs(current_q_values - target_q_values).detach().cpu().numpy().reshape(-1)
        return loss.item(),errors

    def __extract_tensors(self,experiences):
        '''
        Turn a list of Experience tuples into batched tensors.
        Returns: states, actions, rewards, next_states, final_states
        '''
        batch = Experience(*zip(*experiences))
        states = th.cat(batch.state)
        actions = th.cat(batch.action)
        rewards = th.cat(batch.reward)
        next_states = th.cat(batch.next_state)
        # Keep the mask on the same device as the states it will index.
        final_states = th.tensor(batch.is_done, dtype=th.bool, device=states.device)
        return states, actions, rewards, next_states, final_states

    def __get_moving_avg(self,values,period):
        '''
        Moving average of `values` over windows of `period` items.
        Returns: tensor as long as `values`; the first period-1 entries are
                 0, and all entries are 0 when there are fewer than
                 `period` values.
        '''
        values = th.tensor(values,dtype=th.float)
        if len(values)>=period:
            moving_avg= values.unfold(dimension=0,size=period,step=1).mean(dim=1).flatten(start_dim=0)
            moving_avg= th.cat((th.zeros(period-1),moving_avg))
            return moving_avg
        return th.zeros(len(values))

    def __plot(self,values,moving_avg_period):
        '''
        Plot the episode durations together with their moving average.
        '''
        plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(values)
        # Must go through the name-mangled private method.
        moving_avg= self.__get_moving_avg(values,moving_avg_period)
        plt.plot(moving_avg)
        plt.pause(0.001)
        display.clear_output(wait=True)



Using device: cuda
Episode:  0  Average_Losses:  135.28777499408346  Duration:  389215
Episode:  1  Average_Losses:  4.105564534663412  Duration:  122408
Episode:  2  Average_Losses:  172.5556785041706  Duration:  221875
Episode:  3  Average_Losses:  127.1155606902137  Duration:  257766
Episode:  4  Average_Losses:  289.5835316890957  Duration:  18484
Episode:  5  Average_Losses:  265.1592773839858  Duration:  16709
Episode:  6  Average_Losses:  219.13382037182205  Duration:  98785
Episode:  7  Average_Losses:  229.6904303624651  Duration:  6165
Episode:  8  Average_Losses:  157.07802212904772  Duration:  24039
Episode:  9  Average_Losses:  195.22051784399622  Duration:  86371
Episode:  10  Average_Losses:  218.35436321994868  Duration:  14417
Episode:  11  Average_Losses:  152.2117819436007  Duration:  58571
Episode:  12  Average_Losses:  181.87751747064297  Duration:  215796
Episode:  13  Average_Losses:  163.18956641947008  Duration:  48555
Episode:  14  Average_Losses:  175.4065372

In [None]:
from QDQ  import QDQ
from  Games import Field
from Agent import Agent ,EpsilonGreedyStrategy,ReplayMemory,PrioritizedReplayMemory
import torch as th

def training_process(params_env,prms_tra,type_memory):
    '''
    Build the environment, the agent and the trainer, then run the training.
    Params:
    params_env: Dictionary with the parameters of the game
    prms_tra: Dictionary with the parameters of the training
    type_memory: 0 for ReplayMemory and 1 for PrioritizedReplayMemory
    Returns: 0 when the training ran, 'Error' when type_memory is invalid
    '''
    if type_memory not in (0, 1):
        print('Error: type_memory must be 0 or 1')
        return 'Error'
    device= th.device("cuda" if th.cuda.is_available() else "cpu")
    # Keyword arguments: Field takes (item_pickup, item_dropoff,
    # start_position) in that order, so passing the positions positionally
    # as (start, pickup, dropoff) would silently swap them.
    env= Field(device,params_env['size'],
                item_pickup=params_env['item_pickup'],
                item_dropoff=params_env['item_dropoff'],
                start_position=params_env['start_position'],
                zones_blocks=params_env['zones_block'],
                path_predicts=params_env['Path'])
    eps= EpsilonGreedyStrategy(prms_tra['eps_start'],prms_tra['eps_end'],prms_tra['eps_decay'])
    agent= Agent(eps,env.get_number_of_actions(),device)
    if type_memory==0:
        memory= ReplayMemory(prms_tra['memory_size'])
        q= QDQ(device,env,agent,memory,prms_tra['features'])
        q.training_replay_memory(prms_tra['batch_size'],prms_tra['gamma'],
                            prms_tra['target_update'],prms_tra['lr'],prms_tra['num_episodes'])
    else:
        memory= PrioritizedReplayMemory(prms_tra['memory_size'])
        q= QDQ(device,env,agent,memory,prms_tra['features'])
        q.training_priorized_memory(prms_tra['batch_size'],prms_tra['gamma'],
                            prms_tra['target_update'],prms_tra['lr'],prms_tra['num_episodes'])
    return 0
if __name__ == '__main__':
    # Environment: 10x10 grid with its start, pickup and dropoff cells,
    # the penalised block zones, and the folder for episode images.
    params_game = dict(
        size=10,
        start_position=(9, 0),
        item_pickup=(1, 1),
        item_dropoff=(8, 8),
        zones_block=[
            (4, 0), (4, 1), (4, 2), (4, 3),
            (2, 6), (2, 7), (2, 8), (2, 9),
            (4, 8), (5, 8), (6, 8),
            (7, 6), (8, 6), (9, 6),
        ],
        Path='Episodes',
    )
    # Training hyper-parameters.
    params_training = dict(
        batch_size=128,
        features=1,
        gamma=0.99,
        eps_start=1,
        eps_end=0.01,
        eps_decay=0.001,
        target_update=5000,
        memory_size=100000,
        lr=0.001,
        num_episodes=10000,
    )
    # Third argument 0 selects the uniform ReplayMemory.
    training_process(params_game, params_training, 0)