In [1]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import matplotlib.pyplot as plt
import gym
from gym import wrappers

In [2]:
class GenericNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
                 n_actions):
        super(GenericNetwork, self).__init__()
        self.lr = lr
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

    def forward(self, observation):
        state = torch.tensor(observation, dtype = torch.float).to(self.device)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class Agent():
    def __init__(self, alpha, beta, input_dims, gamma=0.99, n_actions=2,
                 layer1_size=64, layer2_size=64, n_outputs=1):
        self.gamma = gamma
        self.log_probs = None
        self.n_outputs = n_outputs
        self.actor = GenericNetwork(alpha, input_dims, layer1_size, layer2_size,
                                    n_actions=n_actions)
        self.critic = GenericNetwork(beta, input_dims, layer1_size, layer2_size,
                                    n_actions=1)

    def choose_actions(self, observation):
        mu, sigma = self.actor.forward(observation)
        sigma = torch.exp(sigma)
        action_probs = torch.distributions.Normal(mu, sigma)
        probs = action_probs.sample(sample_shape = torch.Size([self.n_outputs]))
        self.log_probs = action_probs.log_prob(probs).to(self.actor.device)
        action = torch.tanh(probs)

        return action.item()

    def learn(self, state, reward, new_state, done):
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()

        critic_value_ = self.critic.forward(new_state)
        critic_value = self.critic.forward(state)

        reward = torch.tensor(reward, dtype=torch.float).to(self.actor.device)
        delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value

        actor_loss = -self.log_probs * delta
        critic_loss = delta**2

        (actor_loss + critic_loss).backward() #Perchè non possiamo avere due actor_loss.backward() e un critic_loss.backward()
        self.actor.optimizer.step()
        self.critic.optimizer.step()

In [3]:
if __name__ == '__main__':
    agent = Agent(alpha = 0.000005, beta = 0.00001, input_dims = [2], gamma = 0.99,
                  layer1_size = 256, layer2_size = 256)
    env = gym.make("MountainCarContinuous-v0")
    score_history = []
    num_episodes = 100
    for i in range(num_episodes):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = np.array(agent.choose_actions(observation)).reshape((1,))
            observation_, reward, done, info = env.step(action)
            agent.learn(observation, reward, observation_, done)
            observation = observation_
            score += reward
        score_history.append(score)
        print('episode ', i, "score %.2f " %score)

episode  0 score -34.89 
episode  1 score -25.60 
episode  2 score -21.00 
episode  3 score -17.27 
episode  4 score -13.47 
episode  5 score -11.13 
episode  6 score -8.88 
episode  7 score -7.64 
episode  8 score -7.29 
episode  9 score -6.47 
episode  10 score -5.40 
episode  11 score -4.92 
episode  12 score -4.16 
episode  13 score -3.88 
episode  14 score -3.03 
episode  15 score -2.99 
episode  16 score -2.63 
episode  17 score -2.20 
episode  18 score -2.33 
episode  19 score -1.87 
episode  20 score -1.66 
episode  21 score -1.44 
episode  22 score -1.29 
episode  23 score -1.01 
episode  24 score -0.92 
episode  25 score -1.03 
episode  26 score -0.84 
episode  27 score -0.91 
episode  28 score -1.13 
episode  29 score -1.00 
episode  30 score -0.99 
episode  31 score -0.89 
episode  32 score -0.83 
episode  33 score -0.85 
episode  34 score -0.96 
episode  35 score -1.05 
episode  36 score -0.98 
episode  37 score -0.74 
episode  38 score -0.96 
episode  39 score -0.69 
epis

KeyboardInterrupt: 