# **RL implementation for inverted pendulum**

Required imports

In [None]:
%%capture
%matplotlib qt
%load_ext autoreload
%autoreload 2

from rcognita_framework.pipelines.pipeline_inverted_pendulum_AC_async import PipelineInvertedPendulumAC
from rcognita_framework.rcognita.actors import ActorProbabilisticEpisodic
from rcognita_framework.rcognita.critics import CriticActionValue
from rcognita_framework.rcognita.systems import SysInvertedPendulum
from rcognita_framework.rcognita.models import ModelGaussianConditional, ModelNN
from rcognita_framework.rcognita.scenarios import EpisodicScenario
from rcognita_framework.rcognita import models
from rcognita_framework.rcognita.utilities import rc
from rcognita_framework.rcognita.optimizers import BaseOptimizer
import numpy as np
from torch import nn
import torch
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import warnings

'''
    Note: the Qt libraries may crash,
    and sometimes the Python kernel crashes along with them.
'''

## Critic implementation

In [36]:
class ModelNNStudent(ModelNN):

    model_name = "NN"

    def __init__(self, dim_observation, dim_action, *args, weights = None, **kwargs):
        super().__init__() # dim_observation, dim_action, *args, weights=weights, **kwargs

        self.linear_1 = nn.Linear(dim_observation + dim_action, (dim_observation + dim_action)*2)
        self.linear_2 = nn.Linear((dim_observation + dim_action)*2, (dim_observation + dim_action)*2)
        self.linear_3 = nn.Linear((dim_observation + dim_action)*2, (dim_observation + dim_action)*2)
        self.linear_4 = nn.Linear((dim_observation + dim_action)*2, 1)
        
        if weights is not None:
            self.load_state_dict(weights)
        self.double()
        self.cache_weights()

    def forward(self, input_tensor, weights=None):
        if weights is not None:
            self.update(weights)
            
        x = self.linear_1(input_tensor).relu()
        x = self.linear_2(x).relu()
        x = self.linear_3(x).relu()
        x = self.linear_4(x).relu()
        x = torch.linalg.vector_norm(x)

        return x

## Q-critic (Action-Value-critic) implementation

In [37]:
class CriticActionValueStudent(CriticActionValue):
    def objective(self, data_buffer=None, weights=None):

        if data_buffer is None:
            observation_buffer = self.observation_buffer
            action_buffer = self.action_buffer
        else:
            observation_buffer = data_buffer["observation_buffer"]
            action_buffer = data_buffer["action_buffer"]

        critic_objective = 0
        for i in range(self.data_buffer_size - 1, -1, -1):
            c = self.model(observation_buffer[i - 1, :], action_buffer[i - 1, :], weights=weights)
            n = self.model(observation_buffer[i, :], action_buffer[i, :], use_stored_weights=True)
            out = (c - self.discount_factor * n - self.running_objective(observation_buffer[i - 1, :], action_buffer[i - 1, :]))
            critic_objective += out ** 2 / 2

        return critic_objective

## Actor modification

In [38]:
class ActorProbabilisticEpisodicACStudent(ActorProbabilisticEpisodic):
    def update(self, observation):
        ''' Action bounds must be redefined according to initial state!!! '''
        action = self.model.sample_from_distribution(observation)
        self.action = np.array(np.clip(action, self.action_bounds[0], self.action_bounds[1]))
        self.action_old = self.action
        gradient = self.model.compute_gradient(action)
        self.store_gradient(gradient)

## Optimizer implementation

In [39]:
class TorchOptimizerStudent(BaseOptimizer):
    engine = "Torch"

    def __init__(
        self, opt_options, iterations=1, opt_method=torch.optim.Adam, verbose=False
    ):
        self.opt_method = opt_method
        self.opt_options = opt_options
        self.iterations = iterations
        self.verbose = verbose
        self.loss_history = []

    def optimize(self, objective, model, objective_input):
        optimizer = self.opt_method(
            model.parameters(), **self.opt_options, weight_decay=0
        )
        
        for i in range(self.iterations):
            optimizer.zero_grad()
            loss = objective(objective_input)
            loss.backward()
            optimizer.step()
        self.loss_history.append(objective(objective_input))

## Scenario implementation

In [40]:
class EpisodicScenarioAsyncACStudent(EpisodicScenario):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.critic_optimizer = TorchOptimizerStudent({"lr": 0.01})
        self.squared_TD_sums_of_episodes = []
        self.square_TD_means = []

    def reset_episode(self):
        self.squared_TD_sums_of_episodes.append(self.critic.objective())
        super().reset_episode()

    def iteration_update(self):
        mean_sum_of_squared_TD = self.get_mean(self.squared_TD_sums_of_episodes) #just for visualization purposes
        self.square_TD_means.append(mean_sum_of_squared_TD.detach().numpy()) #just for visualization purposes
        
        self.critic_optimizer.optimize(
            objective=self.get_mean,
            model=self.critic.model,
            objective_input=self.squared_TD_sums_of_episodes)

        super().iteration_update()

    def reset_iteration(self):        
        self.squared_TD_sums_of_episodes = []
        super().reset_iteration()

## Testing

In [41]:
class PipelineInvertedPendulumStudentAC(PipelineInvertedPendulumAC):

    def initialize_models(self):
        print('self.dim_input', self.dim_input)
        print('self.dim_output', self.dim_output)
        self.actor_model = models.ModelGaussianConditional(
            expectation_function=self.safe_controller,
            arg_condition=self.observation_init,
            weights=self.initial_weights,
        )
        self.critic_model = ModelNNStudent(self.dim_output, self.dim_input)
        self.running_objective_model = models.ModelQuadForm(weights=self.R1)

    def initialize_actor_critic(self):
        self.critic = CriticActionValueStudent(
            dim_input=self.dim_input,
            dim_output=self.dim_output,
            data_buffer_size=self.data_buffer_size,
            running_objective=self.running_objective,
            discount_factor=self.discount_factor,
            optimizer=self.critic_optimizer,
            model=self.critic_model,
            sampling_time=self.sampling_time,
        )
        self.actor = ActorProbabilisticEpisodicACStudent(
            self.prediction_horizon,
            self.dim_input,
            self.dim_output,
            self.control_mode,
            self.action_bounds,
            action_init=self.action_init,
            predictor=self.predictor,
            optimizer=self.actor_optimizer,
            critic=self.critic,
            running_objective=self.running_objective,
            model=self.actor_model,
        )

    def initialize_scenario(self):
        self.scenario = EpisodicScenarioAsyncACStudent(
            system=self.system,
            simulator=self.simulator,
            controller=self.controller,
            actor=self.actor,
            critic=self.critic,
            logger=self.logger,
            datafiles=self.datafiles,
            time_final=self.time_final,
            running_objective=self.running_objective,
            no_print=self.no_print,
            is_log=self.is_log,
            is_playback=self.is_playback,
            N_episodes=self.N_episodes,
            N_iterations=self.N_iterations,
            state_init=self.state_init,
            action_init=self.action_init,
            learning_rate=self.learning_rate
        )

    def execute_pipeline(self, **kwargs):
        np.random.seed(42)
        super().execute_pipeline(**kwargs)

pipeline = PipelineInvertedPendulumStudentAC()
pipeline.execute_pipeline(
    no_visual=True,
    is_playback=False, 
    no_print=True,
    
    time_final=0.1,      # tunable
    speedup=150,
    N_episodes=1,        # tunable
    N_iterations=100,    # tunable
    learning_rate=1e-3,  # tunable
    initial_weights=-np.array([10,-10,10,10,0])*(10**1), # tunable
    sampling_time=0.1,
    is_fixed_critic_weights=True,
)

self.dim_input 1
self.dim_output 5
End of simulation episode
End of simulation episode
new_weights [ 1.82283941e-03  9.99995641e+01  3.68865580e-02 -1.22563796e-02
  0.00000000e+00]
End of simulation episode
End of simulation episode
new_weights [-2.00792165e-02  1.00004802e+02 -4.06318396e-01  1.92422399e-01
  0.00000000e+00]
End of simulation episode
End of simulation episode
new_weights [6.17381394e-03 9.99985237e+01 1.24931876e-01 1.50911004e-01
 0.00000000e+00]
End of simulation episode
End of simulation episode
new_weights [-1.01176533e-02  1.00002419e+02 -2.04738500e-01  3.58928567e-01
  0.00000000e+00]
End of simulation episode
End of simulation episode
new_weights [-9.63541679e-04  1.00000230e+02 -1.94980073e-02  3.65407214e-01
  0.00000000e+00]
End of simulation episode
End of simulation episode
new_weights [1.22496304e-02 9.99970708e+01 2.47880696e-01 2.83043338e-01
 0.00000000e+00]
End of simulation episode
End of simulation episode
new_weights [3.42838376e-02 9.99918018e+0

FileNotFoundError: [Errno 2] No such file or directory: './critic_plots/100-iters_1-episodes_0.1-fintime'