In [8]:
import gym
gym.logger.set_level(40) 
import numpy as np
import copy
from collections import namedtuple, deque
import matplotlib.pyplot as plt
%matplotlib inline
import torch
torch.manual_seed(0) 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import random
from torch.utils.data.sampler import SubsetRandomSampler
import torch.utils.data as utils_data
import json

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [10]:
env = gym.make('Pendulum-v0').unwrapped
env.seed(2)
n_state=env.observation_space.shape[0] + 1
n_actions=env.action_space.shape[0]
print(f'state space {n_state} \naction space {n_actions}')

state space 4 
action space 1


In [11]:
experience_safe = namedtuple("experience_safe", field_names=["state", "action", "next_state"])
memory = deque(maxlen=1000000)

In [12]:
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    def __init__(self, state_size, action_size, fc1_units=400, fc2_units=300):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))


In [13]:
def act(state, add_noise=False):
        state = torch.from_numpy(state).float()
        actor_local.eval()
        with torch.no_grad():
            action = actor_local(state).data.numpy()
        actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

In [14]:
actor_local = Actor(3, n_actions)
actor_local.load_state_dict(torch.load('checkpoint_actor_tmp.pth'))

In [15]:
"""Random Agent"""
Num_episode=10000
max_t= 100
def random_episode(env):
    for episode in range(Num_episode):
        state = env.reset()
        state_c = np.hstack((state,np.abs(state[2])))
        for t in range(max_t):
            action=act(state)
            next_state,reward,done,_= env.step(action) 
            next_state_c = np.hstack((next_state,np.abs(next_state[2])))
            e = experience_safe(state_c, action, next_state_c)
            memory.append(e)
            state_c = next_state_c  
            state = next_state
            if np.abs(state[2]) > 6:
                break


In [16]:
random_episode(env)    

In [17]:
print(len(memory))

347832


In [18]:
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)


IndentationError: unexpected indent (<ipython-input-18-48c4aa66a1c3>, line 3)

restricting 3 varaible -3 to 3 to allow some slack
to allow some slack variable from -2.8 to 2.8
saying augument another state (mod velocity) < 3

In [19]:
class Safelayer(nn.Module):
    def __init__(self, state_size, action_size, seed, fc1_units=10):
        super(Safelayer, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, action_size)
           
    def forward(self,x):
        x = self.fc1(x)
        x = self.fc2(x)       
        return x

In [20]:
train_set_size = int(0.8 * len(memory))
train_indices = np.random.choice(np.arange(len(memory)), train_set_size, replace = False)
train_sampler = SubsetRandomSampler(train_indices)
val_indices = np.setdiff1d(np.arange(len(memory)), train_indices, assume_unique= True)
val_sampler = SubsetRandomSampler(val_indices)

In [21]:
batch_size = 256
learning_rate = 0.0001

In [22]:
trainloader = utils_data.DataLoader(memory, batch_size = batch_size, sampler=train_sampler, num_workers=1)
valloader = utils_data.DataLoader(memory, batch_size = batch_size, sampler=val_sampler, num_workers=1)

In [23]:
def Train(trainloader, phase = 'Training'):
    if phase == 'Training':
        model.train()
    if phase == 'Validation':
        model.eval()
    total_loss = 0
    for i, (state,action,next_state) in enumerate(trainloader):  
        states = state.float().to(device)
        actions = action.float().to(device)
        next_states = next_state.float().to(device)
        cs = states[:,-1].view(-1,1).to(device)
        cs_p = next_states[:,-1].view(-1,1).to(device)
        c = model(states).view(-1,1)
        #print(c)
        diff = torch.einsum('ij,ij->i', [c, actions]).view(-1,1)
        #print(diff)
        y_pred = cs+diff
        loss = criterion(y_pred,cs)
        total_loss += loss.item()
        if phase == 'Training':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
    return total_loss/len(trainloader)
    

In [24]:
train_loss_values =[]
val_loss_values =[]
num_epochs = 3
model = Safelayer(n_state,n_actions,121).to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr= learning_rate)
besval=0
for epoch in range(num_epochs):

    train_loss = Train(trainloader)
   # Training set accuracy  train_loss_values.append(train_loss)
    val_loss = Train(valloader,'Validation')  # Training set accuracy
    val_loss_values.append(val_loss)
    print(f'Epoch: {epoch}   Train Loss: {train_loss:.5f} \
    Val Loss: {val_loss:.5f}  ')
    torch.save(model.state_dict(), 'safe_layer_guided.pth')

    

Epoch: 0   Train Loss: 0.27819     Val Loss: 0.02118  
Epoch: 1   Train Loss: 0.00558     Val Loss: 0.00124  
Epoch: 2   Train Loss: 0.00045     Val Loss: 0.00006  


In [25]:
model.state_dict()['fc2.weight']

tensor([[ 0.2151,  0.1278, -0.1023, -0.1311, -0.1130, -0.1973, -0.1711,  0.0297,
          0.0918,  0.0177]], device='cuda:0')

In [26]:
torch.save(model.state_dict(), 'safe_layer_guided.pth')

In [141]:
model.state_dict()

OrderedDict([('fc1.weight', tensor([[ 0.0478, -0.1145,  0.3815,  0.1945],
                      [ 0.1848,  0.1499,  0.3260,  0.0066],
                      [-0.0001,  0.4127, -0.0213, -0.1184],
                      [-0.2749,  0.3716,  0.4599,  0.3640],
                      [ 0.0371, -0.0674,  0.6275, -0.3252],
                      [ 0.3398, -0.2197, -0.0225,  0.0680],
                      [ 0.1296, -0.1586,  0.1447,  0.2567],
                      [ 0.2024, -0.1572, -0.2074,  0.4135],
                      [ 0.1608,  0.1184,  0.4054,  0.0735],
                      [ 0.3036, -0.0972, -0.2475, -0.3242]], device='cuda:0')),
             ('fc1.bias',
              tensor([ 0.3280, -0.2559,  0.3603,  0.2388,  0.4980,  0.2227,  0.3507,  0.3689,
                      -0.2879, -0.0483], device='cuda:0')),
             ('fc2.weight',
              tensor([[ 0.2147,  0.1285, -0.0586, -0.1310, -0.1120, -0.1986, -0.1697,  0.0354,
                        0.0923,  0.0185]], device='cuda:0')),
 

In [18]:
for param in model.parameters():
    print(param.requires_grad)

True
True
True
True


In [7]:
4.0/0.1

40.0