In [None]:
'''This repository contains a detailed implementation of the Reinforcement Learning Environment class'''
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import *
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from typing import Any, Callable, Dict, List, Tuple, Union, Optional
from functools import wraps
import os
import random
from abc import ABC, abstractmethod
from collections import deque, namedtuple
# Monkey-patch: exposes `ndim` on every torch.Tensor as the length of its shape.
# NOTE(review): recent PyTorch releases presumably already provide `Tensor.ndim` -- confirm and drop this line if redundant.
T.Tensor.ndim = property(lambda self: len(self.shape))

In [None]:
@dataclass
class EnforceClassTyping:
    """Dataclass mixin that validates field values against their annotations.

    After the generated ``__init__`` has run, every annotated field of the
    concrete class is checked with ``isinstance``. Parameterized generics such
    as ``list[float]`` are checked against their origin (``list``) only;
    annotations that cannot be used with ``isinstance`` at all (string forward
    references, some ``typing`` constructs) are skipped instead of crashing.

    Raises:
        TypeError: if a field holds a value of the wrong type.
    """

    def __post_init__(self):
        from typing import get_origin  # local import keeps this block self-contained

        for name, field_type in self.__annotations__.items():
            value = getattr(self, name)
            try:
                matches = isinstance(value, field_type)
            except TypeError:
                # `isinstance` rejects parameterized generics; fall back to the
                # bare container type (`list[float]` -> `list`) when there is one.
                origin = get_origin(field_type)
                try:
                    matches = origin is None or isinstance(value, origin)
                except TypeError:
                    matches = True  # not checkable at runtime, so do not block construction
            if not matches:
                current_type = type(value)
                raise TypeError(f"The field `{name}` was assigned by `{current_type}` instead of `{field_type}`")
def EnforceMethodTyping(func: Callable) -> Callable:
    """Enforce the type annotations of a method's arguments at call time.

    Arguments are matched to annotations by parameter name (through the
    method's signature), so unannotated parameters are simply skipped and the
    ``return`` annotation is never compared against an argument. Only
    arguments that are actually passed are checked; defaults are not.

    Args:
        func: the method to wrap; its first parameter is the instance.

    Returns:
        ``func`` itself when it has no argument annotations, otherwise a
        wrapper that raises ``TypeError`` on a mismatching argument.
    """
    import inspect
    from typing import get_origin

    arg_annotations = {name: hint for name, hint in func.__annotations__.items() if name != 'return'}
    if not arg_annotations:
        return func

    signature = inspect.signature(func)
    variadic_kinds = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    def satisfies(value: Any, annotation: Any) -> bool:
        'True when `value` matches `annotation`, or when the annotation cannot be checked at runtime.'
        try:
            return isinstance(value, annotation)
        except TypeError:
            pass
        # Parameterized generics (`list[int]`) are checked against their origin (`list`).
        origin = get_origin(annotation)
        if origin is None:
            return True
        try:
            return isinstance(value, origin)
        except TypeError:
            return True

    @wraps(func)
    def wrapper(self, *args: Tuple[Any], **kwargs: Dict[str, Any]) -> Any:
        try:
            bound = signature.bind(self, *args, **kwargs)
        except TypeError:
            # Wrong arity/unknown keyword: let the real call raise its usual error.
            return func(self, *args, **kwargs)

        for arg_name, arg_value in bound.arguments.items():
            if arg_name not in arg_annotations:
                continue
            if signature.parameters[arg_name].kind in variadic_kinds:
                continue  # *args / **kwargs are collected into containers; not checked
            annotation = arg_annotations[arg_name]
            if satisfies(arg_value, annotation):
                continue
            if arg_name in kwargs:
                raise TypeError(f"Expected {annotation} for keyword argument {arg_name}, got {type(arg_value)}.")
            raise TypeError(f"Expected {annotation} for argument {arg_value}, got {type(arg_value)}.")

        return func(self, *args, **kwargs)

    return wrapper
def EnforceFunctionTyping(func: Callable) -> Callable:
    """Enforce the type annotations of a plain function's arguments at call time.

    Arguments are matched to annotations by parameter name (through the
    function's signature), so unannotated parameters are skipped and the
    ``return`` annotation is never compared against an argument. Only
    arguments that are actually passed are checked; defaults are not.

    Args:
        func: the function to wrap.

    Returns:
        A wrapper that raises ``TypeError`` on a mismatching argument and
        otherwise forwards the call unchanged.
    """
    import inspect
    from typing import get_origin

    arg_annotations = {name: hint for name, hint in func.__annotations__.items() if name != 'return'}
    signature = inspect.signature(func)
    variadic_kinds = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    def satisfies(value, annotation) -> bool:
        'True when `value` matches `annotation`, or when the annotation cannot be checked at runtime.'
        try:
            return isinstance(value, annotation)
        except TypeError:
            pass
        # Parameterized generics (`list[int]`) are checked against their origin (`list`).
        origin = get_origin(annotation)
        if origin is None:
            return True
        try:
            return isinstance(value, origin)
        except TypeError:
            return True

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            bound = signature.bind(*args, **kwargs)
        except TypeError:
            # Wrong arity/unknown keyword: let the real call raise its usual error.
            return func(*args, **kwargs)

        for arg_name, arg_value in bound.arguments.items():
            if arg_name not in arg_annotations:
                continue
            if signature.parameters[arg_name].kind in variadic_kinds:
                continue  # *args / **kwargs are collected into containers; not checked
            annotation = arg_annotations[arg_name]
            if satisfies(arg_value, annotation):
                continue
            if arg_name in kwargs:
                raise TypeError(f"Expected {annotation} for {arg_name}, got {type(arg_value)}.")
            raise TypeError(f"Expected {annotation} for {arg_value}, got {type(arg_value)}.")

        return func(*args, **kwargs)

    return wrapper
 

**Ornstein Uhlenbeck Noise**

In [None]:
class OUActionNoise(object):
    """Temporally correlated noise from a discretized Ornstein-Uhlenbeck process.

    Each call advances the internal state by one step of size ``dt``: the
    state is pulled towards ``mu`` with strength ``theta`` and perturbed by
    Gaussian noise scaled by ``sigma * sqrt(dt)``. ``mu`` must expose
    ``.shape`` (e.g. a NumPy array); ``x0`` optionally sets the start state.
    """

    def __init__(self, mu, sigma=0.15, theta=.2, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        previous = self.x_prev
        drift = self.theta * (self.mu - previous) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = previous + drift + diffusion
        return self.x_prev

    def reset(self):
        """Restart the process from ``x0``, or from zeros when no ``x0`` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

    def __repr__(self):
        return f'OrnsteinUhlenbeckActionNoise(mu={self.mu}, sigma={self.sigma})'
 

**Particle**

In [None]:
@dataclass
class Particle(EnforceClassTyping):
    '''Point particle described by its mass, charge, position and velocity.

    In this file instances are used both as sources of a field (`FieldSources`) and as the
    charged particle that moves through a field (`ParticleInField.ChargedParticle`).
    Field types are enforced on construction by `EnforceClassTyping`.'''
    Mass: float # kg
    Charge: float #C
    Position: T.Tensor # m
    Velocity: T.Tensor # m/s


**Particle Test**

In [None]:
# Demo particles placed symmetrically on the x-axis, both initially at rest.
# NOTE(review): both share the same mass (9.11e-8) and carry charges of -/+1.6e-9 --
# presumably rescaled, non-physical values; confirm the intended units.
Electron= Particle(Mass=9.11e-8, Charge= -1.6e-9, Position=T.tensor([1.0, 0.0]), Velocity=T.tensor([0.0, 0.0]))
Proton= Particle(Mass=9.11e-8, Charge= 1.6e-9, Position=T.tensor([-1.0, 0.0]), Velocity=T.tensor([0.0, 0.0]))
# Only the electron is used as a field source by the cells below.
Source= [Electron]

**Vector Field**

In [None]:
@dataclass
class Field:
    """Base description of a bounded vector field.

    Attributes:
        Dimensions: number of spatial dimensions of the field.
        FieldHighBound: upper bound(s) of the region, one per dimension or a
            single value shared by all dimensions.
        FieldLowBound: lower bound(s), same convention as ``FieldHighBound``.

    NOTE(review): the methods are marked ``@abstractmethod`` but the class does
    not derive from ``abc.ABC``, so the markers are not enforced and ``Field``
    itself can still be instantiated.
    """
    Dimensions: int
    FieldHighBound: list[float]
    FieldLowBound: list[float]
    def __post_init__(self):
        # `len(x) == Dimensions | 1` parsed as `len(x) == (Dimensions | 1)` (bitwise OR),
        # which rejected valid bounds for even dimensions; compare against both allowed lengths instead.
        assert len(self.FieldHighBound) in (self.Dimensions, 1), "Length of high bound and dimensions do not match"
        assert len(self.FieldLowBound) in (self.Dimensions, 1), "Length of low bound and dimensions do not match"
    @abstractmethod
    def FieldStrength(self, ObservationPosition: T.Tensor)-> T.Tensor:
        """Return the field strength at ``ObservationPosition``; implemented by subclasses."""
        pass
    @abstractmethod
    def FieldPotential(self, ObservationPosition: T.Tensor)-> float:
        """Return the field potential at ``ObservationPosition``; implemented by subclasses."""
        pass

class HomogenousField(Field):
    """Field whose strength and potential are zero everywhere."""

    def FieldStrength(self, ObservationPosition: T.Tensor)-> T.Tensor:
        """Return a float64 zero tensor with one row per entry along the first axis of ``ObservationPosition``."""
        n_rows = ObservationPosition.shape[0]
        zero_strength = T.zeros((n_rows, self.Dimensions), dtype=T.float64)
        return zero_strength

    def FieldPotential(self, ObservationPosition: T.Tensor)-> float:
        """Return the (constant) potential, which is always 0.0."""
        potential = 0.0
        return potential

@dataclass
class LJField:
    """2-D field generated by point sources, combining a 1/r^2 term with an opposing 1/r^5 term.

    NOTE(review): "LJ" presumably stands for Lennard-Jones; the exponents used
    here (r^-3 and r^-6 applied to the displacement vector) are kept exactly
    as in the original code -- confirm they are the intended ones.

    Attributes:
        FieldSources: particles generating the field.
        FieldHighBound: upper bound of the square region, used for both axes when plotting.
        FieldLowBound: lower bound of the square region, used for both axes when plotting.
    """
    FieldSources: list[Particle]
    FieldHighBound: float
    FieldLowBound: float
    def __call__(self, ObservationPosition: T.Tensor)->T.Tensor:
        'Shorthand for `ElectricFieldStrength`.'
        return self.ElectricFieldStrength(ObservationPosition)
    @EnforceMethodTyping
    def ElectricFieldStrength(self, ObservationPosition: T.Tensor)->T.Tensor:
        '''Return the field vector(s) produced by all sources at the given point(s).

        ObservationPosition holds the x components at index 0 and the y components at index 1;
        any trailing shape (a single point, a row of points, a mesh grid) is preserved in the result.
        Raises TypeError if a field source is not a Particle.'''
        CoulombConstant = 8.9875e9 #N*m^2/C^2
        for FieldSource in self.FieldSources:
            if not isinstance(FieldSource, Particle):
                raise TypeError("The input is not valid")
        assert isinstance(ObservationPosition, T.Tensor), "Invalid Reference point data type"
        ElectricFieldVector = T.zeros_like(ObservationPosition)
        for FieldSource in self.FieldSources:
            # Broadcast the source position to the shape of the observation grid.
            PositionMatrices= T.stack([T.ones_like(ObservationPosition[0])* FieldSource.Position[0].item(),
                                       T.ones_like(ObservationPosition[1])* FieldSource.Position[1].item()])
            DisplacementVector = ObservationPosition - PositionMatrices
            DisplacementMagnitude = T.sqrt(DisplacementVector[0]**2 +DisplacementVector[1]**2)
            LongRangeTerm = FieldSource.Charge / DisplacementMagnitude**3 * DisplacementVector
            ShortRangeTerm = FieldSource.Charge / DisplacementMagnitude**6 * DisplacementVector
            ElectricFieldVector += LongRangeTerm - ShortRangeTerm
        ElectricFieldVector= CoulombConstant *ElectricFieldVector
        return ElectricFieldVector #N/C or V/m
    @EnforceMethodTyping
    def WorkDoneAgainstField(self, InitialPosition: T.Tensor, FinalPosition: T.Tensor, resolution: int= 5000)-> float:
        '''Approximate the work done against the field along the straight segment between two positions.

        The segment is split into `resolution` equal steps and the field is sampled at the start of each step
        (left Riemann sum). All samples are evaluated in a single vectorized field call.
        The result is returned as a 0-dimensional tensor.
        Raises ValueError if `resolution` is smaller than 1.'''
        if resolution < 1:
            raise ValueError("resolution must be a positive integer")
        StartPosition = InitialPosition[:2]
        StepVector = (FinalPosition[:2] - StartPosition) / resolution
        StepIndices = T.arange(resolution, dtype=StepVector.dtype, device=StepVector.device)
        # Shape (2, resolution): column i is the position after i steps.
        SamplePositions = StartPosition.unsqueeze(1) + StepVector.unsqueeze(1) * StepIndices
        # The original called the non-existent `ForceFieldStrength`; the field is given by `ElectricFieldStrength`.
        FieldAlongPath = self.ElectricFieldStrength(SamplePositions)
        WorkDone = -(FieldAlongPath * StepVector.unsqueeze(1)).sum()
        return WorkDone
    @EnforceMethodTyping
    def PlotField(self):
        'Plot the direction of the 2-D field as unit arrows on a 40x40 grid, coloured by field magnitude.'
        GridAxis = T.linspace(self.FieldLowBound, self.FieldHighBound, 40)
        ObservationPosition= T.stack(T.meshgrid(GridAxis, GridAxis, indexing='ij'))
        xd, yd = self.ElectricFieldStrength(ObservationPosition)
        # Compute the magnitude once, before normalizing: the original normalized `yd`
        # with the already-normalized `xd`, which distorted the arrow directions.
        FieldMagnitude = T.sqrt(xd**2 + yd**2)
        xd = xd / FieldMagnitude
        yd = yd / FieldMagnitude
        fig, ax = plt.subplots(1,1)
        cp = ax.quiver(ObservationPosition[0],ObservationPosition[1],xd,yd,FieldMagnitude)
        fig.colorbar(cp)
        # NOTE(review): set after the figure was created, so this presumably only affects later figures.
        plt.rcParams['figure.dpi'] = 250
        plt.show()

@dataclass
class ElectricField:
    """2-D electrostatic field generated by point charges (Coulomb's law).

    Attributes:
        FieldSources: particles generating the field.
        FieldHighBound: upper bound of the square region, used for both axes.
        FieldLowBound: lower bound of the square region, used for both axes.
    """
    FieldSources: list[Particle]
    FieldHighBound: float
    FieldLowBound: float
    def __call__(self, ObservationPosition: T.Tensor)->T.Tensor:
        'Shorthand for `ElectricFieldStrength`.'
        return self.ElectricFieldStrength(ObservationPosition)
    @EnforceMethodTyping
    def ElectricFieldStrength(self, ObservationPosition: T.Tensor)->T.Tensor:
        '''Return the electric field vector(s) produced by all sources at the given point(s).

        ObservationPosition holds the x components at index 0 and the y components at index 1;
        any trailing shape (a single point, a row of points, a mesh grid) is preserved in the result.
        Raises TypeError if a field source is not a Particle.'''
        CoulombConstant = 8.9875e9 #N*m^2/C^2
        for FieldSource in self.FieldSources:
            if not isinstance(FieldSource, Particle):
                raise TypeError("The input is not valid")
        assert isinstance(ObservationPosition, T.Tensor), "Invalid Reference point data type"
        ElectricFieldVector = T.zeros_like(ObservationPosition)
        for FieldSource in self.FieldSources:
            # Broadcast the source position to the shape of the observation grid.
            PositionMatrices= T.stack([T.ones_like(ObservationPosition[0])* FieldSource.Position[0].item(),
                                       T.ones_like(ObservationPosition[1])* FieldSource.Position[1].item()])
            DisplacementVector = ObservationPosition - PositionMatrices
            DisplacementMagnitude = T.sqrt(DisplacementVector[0]**2 +DisplacementVector[1]**2)
            # k*q/r^2 along the unit displacement, written as k*q/r^3 * displacement.
            ElectricFieldVector += (CoulombConstant * FieldSource.Charge) / DisplacementMagnitude**3 * DisplacementVector
        return ElectricFieldVector #N/C or V/m
    @EnforceMethodTyping
    def WorkDoneAgainstField(self, InitialPosition: T.Tensor, FinalPosition: T.Tensor, resolution: int= 5000)-> float:
        '''Approximate the work done against the field along the straight segment between two positions.

        The segment is split into `resolution` equal steps and the field is sampled at the start of each step
        (left Riemann sum). All samples are evaluated in a single vectorized field call instead of one
        Python-level call per step. The result is returned as a 0-dimensional tensor.
        Raises ValueError if `resolution` is smaller than 1.'''
        if resolution < 1:
            raise ValueError("resolution must be a positive integer")
        StartPosition = InitialPosition[:2]
        StepVector = (FinalPosition[:2] - StartPosition) / resolution
        StepIndices = T.arange(resolution, dtype=StepVector.dtype, device=StepVector.device)
        # Shape (2, resolution): column i is the position after i steps.
        SamplePositions = StartPosition.unsqueeze(1) + StepVector.unsqueeze(1) * StepIndices
        FieldAlongPath = self.ElectricFieldStrength(SamplePositions)
        WorkDone = -(FieldAlongPath * StepVector.unsqueeze(1)).sum()
        return WorkDone
    @EnforceMethodTyping
    def PlotField(self):
        'Plot the direction of the 2-D electric field as unit arrows on a 50x50 grid, coloured by field magnitude.'
        GridAxis = T.linspace(self.FieldLowBound, self.FieldHighBound, 50)
        ObservationPosition= T.stack(T.meshgrid(GridAxis, GridAxis, indexing='ij'))
        xd, yd = self.ElectricFieldStrength(ObservationPosition)
        # Compute the magnitude once, before normalizing: the original normalized `yd`
        # with the already-normalized `xd`, which distorted the arrow directions.
        FieldMagnitude = T.sqrt(xd**2 + yd**2)
        xd = xd / FieldMagnitude
        yd = yd / FieldMagnitude
        fig, ax = plt.subplots(1,1)
        cp = ax.quiver(ObservationPosition[0],ObservationPosition[1],xd,yd,FieldMagnitude)
        fig.colorbar(cp)
        # NOTE(review): set after the figure was created, so this presumably only affects later figures.
        plt.rcParams['figure.dpi'] = 250
        plt.show()


**Vector Field Test**

In [None]:
# Field of the single electron source over the square region [-10, 10] x [-10, 10].
# The positional order is (FieldSources, FieldHighBound, FieldLowBound), so 10.0 is the high bound.
ElectricField1= ElectricField(Source, 10.0, -10.0)
# ElectricField1.PlotField()

**Environment**

In [None]:
# Should obey Newton's laws in a homogeneous vector field.
@dataclass
class Environment:
    """Interface of a reinforcement-learning environment.

    Concrete environments provide their own ``State`` type and implement the
    methods below. The base implementations raise ``NotImplementedError``.

    NOTE(review): the methods are marked ``@abstractmethod`` but the class does
    not derive from ``abc.ABC``, so the markers alone are not enforced.
    """
    @dataclass
    class State:
        """Placeholder state type; it declares no fields."""
        pass
    InitialState: State 
    CurrentState: State 
 
    def __post_init__(self):
        pass
 
    @abstractmethod
    def TransitionModel(self, State: State, Action)-> State:
        """Return the state reached from ``State`` when ``Action`` is applied."""
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def RewardModel(self, State: State, Action, NextState: State, TerminalSignal: bool)-> float:
        """Return the reward of the transition ``State`` -> ``NextState`` under ``Action``."""
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def IsTerminalCondition(self, State: State)-> bool:
        """Return whether ``State`` ends the episode."""
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def StateTransition(self, State: State, Action)-> tuple[float, State, bool]:
        """Perform one transition; the annotated result is ``(float, State, bool)``."""
        raise NotImplementedError("Subclasses should implement this method")

    @abstractmethod
    def Run(self, RunDuration: float)-> list[State]:
        """Simulate the environment for ``RunDuration`` and return the visited states."""
        raise NotImplementedError("Subclasses should implement this method")
    
@dataclass
class ParticleInField(EnforceClassTyping):
    '''Environment in which a charged particle moves through a field towards a target position.

    Field determines the force on the particle and the bounds of the viable region.
    ChargedParticle supplies the charge and mass used by the dynamics.
    Target is the position the agent should reach.
    DistanceWeight, EnergyWeight and TerminalSignalWeight weight the terms of the reward.
    InitialState defaults to a random state inside the field bounds; CurrentState starts at InitialState.

    NOTE: `__post_init__` is overridden without calling the parent implementation, so the field types of this
    class are not checked by `EnforceClassTyping`.'''
    Field: ElectricField
    ChargedParticle: Particle
    Target: T.Tensor
    DistanceWeight: float= 0.5
    EnergyWeight: float= 0.5
    TerminalSignalWeight: float= 0.5
    @dataclass 
    class State(EnforceClassTyping):
        '''Observable state of the particle: its Position, its Momentum and the elapsed Time.'''
        Position: T.Tensor # m
        Momentum: T.Tensor #kg*m/s
        Time: float # s

    InitialState: State = None
    CurrentState: State = None
    def __post_init__(self):
        if self.InitialState is None:
            self.InitialState= self.RandomState()
        self.CurrentState= self.InitialState

    @EnforceMethodTyping
    def TransitionModel(self, State: State, Action: T.Tensor= T.tensor([0.0, 0.0]), TimeInterval:float= 1.0, Resolution: int=100)-> State:
        '''Integrate the particle dynamics over TimeInterval and return the resulting state.

        The interval is split into Resolution sub-steps. In each sub-step the momentum is updated first with the
        field force (charge times field) plus the Action force, then the position is updated with the new momentum.
        The default TimeInterval is a float (1.0) so that callers forwarding it pass the float type check.'''
        CurrentMomentum= State.Momentum
        CurrentPosition= State.Position
        SubStep= TimeInterval/Resolution if Resolution > 0 else 0.0
        TimeTaken= 0.0
        for _ in range(Resolution):
            Force= (self.ChargedParticle.Charge* self.Field(CurrentPosition))+ Action
            CurrentMomentum= CurrentMomentum+ Force* SubStep
            CurrentPosition= CurrentPosition+ (CurrentMomentum/ self.ChargedParticle.Mass)* SubStep
            TimeTaken+= SubStep
        CurrentTime= State.Time+ TimeTaken
        return self.State(CurrentPosition, CurrentMomentum, CurrentTime)
    
    @EnforceMethodTyping
    def IsTerminalCondition(self, State: State)-> bool:
        '''Return True when the position has left the region bounded by the field's low and high bounds.'''
        WithinXBound= self.Field.FieldLowBound <= State.Position[0] <= self.Field.FieldHighBound
        WithinYBound= self.Field.FieldLowBound <= State.Position[1] <= self.Field.FieldHighBound
        return not bool(WithinXBound and WithinYBound)
    
    def RewardModel(self, State: State, Action , NextState: State, TerminalSignal: bool)-> float:
        '''Return the reward (negative cost) of a state transition.

        The cost is the weighted sum of the distance gained away from the target, the work done against the field
        and the terminal signal, plus a constant 1.0 per step.
        The distance term is (distance after) - (distance before): the original had the operands swapped, which
        made moving towards the target more expensive than moving away from it.'''
        DistanceGainedFromTarget= T.norm(NextState.Position-self.Target)- T.norm(State.Position-self.Target)
        EnergyConsumed= self.Field.WorkDoneAgainstField(State.Position, NextState.Position)
        Cost= self.DistanceWeight* DistanceGainedFromTarget+ self.EnergyWeight* EnergyConsumed+ self.TerminalSignalWeight* TerminalSignal+ 1.0
        return -Cost.item()
        
    def Step(self, State: State= None, Action: T.Tensor= T.tensor([0.0, 0.0]), TimeInterval: float= 1.0):
        '''Apply Action for TimeInterval and return (NextState, Reward, TerminalSignal).

        When State is omitted the environment's CurrentState is used. (The original default expression
        `CurrentState` was evaluated once at class creation, where it is None.)
        This method does not update CurrentState; the caller is responsible for that.'''
        if State is None:
            State= self.CurrentState
        NextState= self.TransitionModel(State, Action, TimeInterval)
        TerminalSignal= self.IsTerminalCondition(NextState) 
        Reward= self.RewardModel(State, Action, NextState, TerminalSignal)
        return NextState, Reward, TerminalSignal
    
    @EnforceMethodTyping
    def RandomState(self)->State:
        '''Return a state with a uniformly random position inside the field bounds, zero momentum and time 0.0.'''
        RandomPosition= T.Tensor([np.random.uniform(self.Field.FieldLowBound, self.Field.FieldHighBound), 
                                  np.random.uniform(self.Field.FieldLowBound, self.Field.FieldHighBound)])
        RandomMomentum= T.zeros_like(RandomPosition)
        return self.State(RandomPosition, RandomMomentum, 0.0)

    def Run(self, RunDuration: float, Resolution: int=100):
        '''Simulate RunDuration seconds from CurrentState in Resolution steps and return the visited positions.

        Each step applies a small random force (1e-7 * standard normal) and lasts RunDuration/Resolution seconds;
        the original ignored RunDuration and always advanced by the default interval.
        The position before each step is recorded, so the final position is not included.'''
        StepDuration= float(RunDuration)/ Resolution
        Path= []
        State= self.CurrentState
        for _ in range(Resolution):
            Path.append(State.Position)
            State= self.TransitionModel(State, 1e-7* T.randn(2), StepDuration)
        return Path
    
    def Lagrangian(self):
        'Placeholder; not implemented.'
        pass
    
    def PlotRun(self, RunDuration: float):
        '''Simulate a run and plot its path: a black dot marks the start, a red star the last recorded position.

        The axes span the field's low and high bounds.'''
        Path= T.stack(self.Run(RunDuration))
        Path= Path.transpose(dim0=0, dim1=1)
        plt.plot(Path[0], Path[1])
        plt.plot(Path[0][0], Path[1][0], 'ko')
        plt.plot(Path[0][-1], Path[1][-1], 'r*')
        plt.xlim(self.Field.FieldLowBound, self.Field.FieldHighBound)
        plt.ylim(self.Field.FieldLowBound, self.Field.FieldHighBound)
        plt.grid(True)
        plt.show()

    def Reset(self):
        'Set CurrentState back to InitialState.'
        self.CurrentState= self.InitialState

**Environment Test**

In [None]:
# Demo environment: the proton-like particle moves in the electron's field with the origin as target.
# No initial state is given, so a random one inside the field bounds is generated.
Environment1= ParticleInField(ElectricField1, Proton, T.tensor([0.0, 0.0]))
# Simulate from the initial state and plot the resulting path.
Environment1.PlotRun(20)
# -(Environment.ChargedParticle.Charge* Environment.Field(Environment.CurrentState.Position))
# Environment.Step(Environment.CurrentState,T.tensor([1e-9, 1e-9]))

**Critic Network**

In [None]:
class CriticNetwork(nn.Module):
    def __init__(self, learning_rate, state_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir='tmp/ddpg'):
        super(CriticNetwork, self).__init__() 
        self.checkpoint_file = os.path.join(chkpt_dir,name+'_ddpg')

        self.fc1 = T.nn.utils.parametrizations.weight_norm(nn.Linear(state_dims+n_actions, fc1_dims)) 
        self.bn1 = nn.LayerNorm(fc1_dims)
        self.fc2 = T.nn.utils.parametrizations.weight_norm(nn.Linear(fc1_dims, fc2_dims))
        self.bn2 = nn.LayerNorm(fc2_dims)
        self.fc3 = T.nn.utils.parametrizations.weight_norm(nn.Linear(fc2_dims, 1))

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state, action):
        x = T.cat([state, action], dim=-1)
        x = T.relu(self.bn1(self.fc1(x)))
        x = T.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file))
 

**Critic Network Test**

In [None]:
# N= CriticNetwork(0.02, 5, 10, 15, 2, name='tb')

# bt= T.tensor([[-1.4355, -0.7806,  0.3042,  1.1601, -0.1184]])
# at= T.tensor([[0.8233, 0.8126]])
# for target_param in N.parameters():
#     print(target_param) 
# bt, m(bt), N(bt)
# N.load_checkpoint()
# bt, at, N(bt, at)

**Actor Network**

In [None]:
class ActorNetwork(nn.Module):
    def __init__(self, learning_rate, state_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir='tmp/ddpg'):
        super(ActorNetwork, self).__init__()
        self.checkpoint_file = os.path.join(chkpt_dir,name+'_ddpg')

        self.fc1 = T.nn.utils.parametrizations.weight_norm(nn.Linear(state_dims , fc1_dims)) 
        self.bn1 = nn.LayerNorm(fc1_dims)
        self.fc2 = T.nn.utils.parametrizations.weight_norm(nn.Linear(fc1_dims, fc2_dims))
        self.bn2 = nn.LayerNorm(fc2_dims)
        self.fc3 = T.nn.utils.parametrizations.weight_norm(nn.Linear(fc2_dims, n_actions))

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = T.relu(self.bn1(self.fc1(state)))
        x = T.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file))


**Actor Network Test**

In [None]:
# Nt= ActorNetwork(0.02, 5, 10, 15, 2, name='btt')

# btt= T.tensor([[-0.6552,  0.0852,  2.0087, -0.6352,  0.4445]])
# att= T.randn(1, 2)

# Nt.load_checkpoint()
# btt, Nt(btt)

In [None]:
@dataclass
class Agent1:
    """Interface of a learning agent bound to an environment.

    The base implementations of `Act` and `Observe` raise
    ``NotImplementedError``; `Learn` and `LearningAlgorithm` are empty hooks.

    NOTE(review): the methods are marked ``@abstractmethod`` but the class does
    not derive from ``abc.ABC``, so the markers alone are not enforced.
    """
    AgentEnvironment: Environment

    def __post_init__(self):
        pass
    @abstractmethod
    def Act(self, Observation: T.Tensor)-> T.Tensor:
        'Return the action chosen for the given observation.'
        raise NotImplementedError("Subclasses must implement the `Act` method")
    @abstractmethod
    def Observe(self)-> T.Tensor:
        'Return the current observation of the environment.'
        # The original message named the `Act` method here.
        raise NotImplementedError("Subclasses must implement the `Observe` method")
    
    @abstractmethod
    def Learn(self):
        'Improves  the agent by updating its models'
        pass
    @abstractmethod
    def LearningAlgorithm(self):
        'Hook for the training procedure; empty in the base class.'
        pass
    
class Agent(object):
    """DDPG-style agent: actor and critic networks, target copies, replay memory and exploration noise.

    Args:
        lr_Actor, lr_Critic: learning rates of the actor and critic optimizers.
        ObservationDimensions: size of the observation vector fed to the networks.
        SoftUpdateRate: interpolation factor used to move the target networks towards the online ones.
        LearningEnvironment: environment providing `CurrentState`, `InitialState`, `Step`, `TransitionModel` and `Field`.
        gamma: discount factor.
        n_actions: size of the action vector.
        max_size: capacity of the replay memory.
        layer1_size, layer2_size: hidden-layer widths of all four networks.
        batch_size: number of transitions sampled per learning step.
        ControlInterval: duration for which each chosen action is applied.
        ActionScale: factor converting the (noisy) actor output into the force sent to the environment.
    """

    def __init__(self, lr_Actor, lr_Critic, ObservationDimensions, SoftUpdateRate, LearningEnvironment,
                 gamma=0.99, n_actions=2, max_size=1000, layer1_size=20, layer2_size=15, batch_size=16, ControlInterval= 0.5,
                 ActionScale=1e-7):
        self.Actor = ActorNetwork(lr_Actor, ObservationDimensions, layer1_size, layer2_size, n_actions=n_actions, name='Actor')
        self.Critic = CriticNetwork(lr_Critic, ObservationDimensions, layer1_size, layer2_size, n_actions=n_actions, name='Critic')
        self.TargetActor = ActorNetwork(lr_Actor, ObservationDimensions, layer1_size, layer2_size, n_actions=n_actions, name='TargetActor')
        self.TargetCritic = CriticNetwork(lr_Critic, ObservationDimensions, layer1_size, layer2_size, n_actions=n_actions, name='TargetCritic')
        self.memory = deque(maxlen=max_size)
        self.gamma = gamma
        self.SoftUpdateRate = SoftUpdateRate
        self.batch_size = batch_size
        self.LearningEnvironment: ParticleInField= LearningEnvironment
        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.ControlInterval= ControlInterval # Action duration = control interval
        self.ActionScale = ActionScale
        # A rate of 1 copies the online networks into the targets.
        self.update_network_parameters(SoftUpdateRate=1)

    def choose_action(self, observation):
        """Return the exploratory action for `observation`: ActionScale * (actor output + OU noise), on the CPU."""
        self.Actor.eval()
        # as_tensor avoids copy-constructing (and the accompanying warning) when a tensor is passed in.
        observation = T.as_tensor(observation, dtype=T.float, device=self.Actor.device)
        with T.no_grad():
            mu = self.Actor.forward(observation)
        noise = T.as_tensor(self.noise(), dtype=T.float, device=self.Actor.device)
        mu_prime = self.ActionScale * (mu + noise)
        self.Actor.train()
        return mu_prime.cpu().detach()

    def Observe(self, State= None):
        """Return the observation for a state (Position and Momentum concatenated).

        With no argument the environment's current state is observed; a tuple of states yields a stacked batch."""
        if State is None:
            State= self.LearningEnvironment.CurrentState
        if isinstance(State, tuple):
            return T.stack([self.Observe(SingleState) for SingleState in State])
        return T.cat([State.Position, State.Momentum])
    
    def DDPGAlgorithm(self, Episodes: int= 10, StepsPerEpisode: int= 50):
        """Train for `Episodes` episodes of at most `StepsPerEpisode` steps; plot and return the episode scores.

        Each episode restarts from the environment's initial state with freshly reset exploration noise and
        ends early when the environment signals a terminal state."""
        score_history = []
        for _ in range(Episodes):
            self.LearningEnvironment.CurrentState = self.LearningEnvironment.InitialState
            self.noise.reset()
            score = 0
            for _ in range(StepsPerEpisode):
                CurrentState = self.LearningEnvironment.CurrentState
                Action = self.choose_action(self.Observe())
                new_state, Reward, IsDone= self.LearningEnvironment.Step(CurrentState, Action, self.ControlInterval)
                self.memory.append((CurrentState, Action, new_state, Reward, int(IsDone)))
                self.learn()
                score += Reward
                self.LearningEnvironment.CurrentState = new_state
                if IsDone:
                    # Stop at the terminal state instead of stepping on from it.
                    break
            score_history.append(score)
        plt.plot(score_history)
        return score_history
        
    def learn(self):
        """Run one critic and one actor update on a random batch; does nothing until the memory holds a full batch."""
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)

        state = self.Observe(states).to(self.Critic.device)
        # Stored actions are the scaled forces sent to the environment; undo the scaling so the critic
        # sees actions in the same units as the actor outputs it is evaluated on below.
        action = T.stack(actions).to(self.Critic.device) / self.ActionScale
        reward = T.tensor(rewards, dtype=T.float).unsqueeze(1).to(self.Critic.device)
        new_state = self.Observe(next_states).to(self.Critic.device)
        done = T.tensor(dones, dtype=T.float).unsqueeze(1).to(self.Critic.device)
        
        self.TargetActor.eval()
        self.TargetCritic.eval()
        self.Critic.eval()
        
        # Bootstrapped target; no gradient flows through the target networks.
        with T.no_grad():
            target_actions = self.TargetActor.forward(new_state)
            Critic_value_ = self.TargetCritic.forward(new_state, target_actions)
            q_targets = reward + self.gamma * Critic_value_ * (1 - done)
        q_expected = self.Critic.forward(state, action)

        Critic_loss = nn.MSELoss()(q_expected, q_targets)
        self.Critic.train()
        self.Critic.optimizer.zero_grad()
        Critic_loss.backward()
        self.Critic.optimizer.step()

        self.Actor.eval()
        self.Critic.eval()

        # The actor is improved by ascending the critic's value of its own actions.
        mu = self.Actor.forward(state)
        Actor_loss = T.mean(-self.Critic.forward(state, mu))
        self.Actor.train()
        self.Actor.optimizer.zero_grad()
        Actor_loss.backward()
        self.Actor.optimizer.step()

        self.update_network_parameters()

    def Run(self, RunDuration: float, Resolution: int=100):
        """Roll out the (noisy) policy for RunDuration seconds in Resolution steps and return the visited positions.

        Each step lasts RunDuration/Resolution seconds; the original ignored RunDuration and always advanced by
        the transition model's default interval. The position before each step is recorded."""
        StepDuration= float(RunDuration)/ Resolution
        Path= []
        State= self.LearningEnvironment.InitialState
        for _ in range(Resolution):
            Path.append(State.Position)
            action= self.choose_action(self.Observe(State))
            State= self.LearningEnvironment.TransitionModel(State, action, StepDuration)
        return Path
    def PlotRun(self, RunDuration: float):
        """Roll out the policy and plot its path: a black dot marks the start, a red star the last recorded position.

        The axes span the low and high bounds of the environment's field."""
        Path= T.stack(self.Run(RunDuration))
        Path= Path.transpose(dim0=0, dim1=1)
        FieldBounds= self.LearningEnvironment.Field
        plt.plot(Path[0], Path[1])
        plt.plot(Path[0][0], Path[1][0], 'ko')
        plt.plot(Path[0][-1], Path[1][-1], 'r*')
        plt.xlim(FieldBounds.FieldLowBound, FieldBounds.FieldHighBound)
        plt.ylim(FieldBounds.FieldLowBound, FieldBounds.FieldHighBound)
        plt.grid(True)
        plt.show()
    def update_network_parameters(self, SoftUpdateRate=None):
        """Move the target networks towards the online networks.

        target <- SoftUpdateRate * online + (1 - SoftUpdateRate) * target, applied in place to every parameter.
        When SoftUpdateRate is None the agent's configured rate is used; a rate of 1 is a hard copy."""
        if SoftUpdateRate is None:
            SoftUpdateRate = self.SoftUpdateRate

        network_pairs = ((self.Critic, self.TargetCritic), (self.Actor, self.TargetActor))
        with T.no_grad():
            for online_network, target_network in network_pairs:
                for target_param, param in zip(target_network.parameters(), online_network.parameters()):
                    target_param.mul_(1 - SoftUpdateRate).add_(param, alpha=SoftUpdateRate)
    
    def save_models(self):
        """Save the checkpoints of all four networks."""
        self.Actor.save_checkpoint()
        self.TargetActor.save_checkpoint()
        self.Critic.save_checkpoint()
        self.TargetCritic.save_checkpoint()

    def load_models(self):
        """Load the checkpoints of all four networks."""
        self.Actor.load_checkpoint()
        self.TargetActor.load_checkpoint()
        self.Critic.load_checkpoint()
        self.TargetCritic.load_checkpoint()
 
# Positional arguments: lr_Actor, lr_Critic, ObservationDimensions, SoftUpdateRate, LearningEnvironment.
# The observation has 4 entries because Agent.Observe concatenates Position and Momentum (2 values each here).
agent = Agent(0.0025, 0.0025, 4, 0.001, Environment1)
# Train the agent; this also plots the per-episode scores.
agent.DDPGAlgorithm()

In [None]:
# Roll out the trained policy from the environment's initial state and plot the path.
agent.PlotRun(100)