In [14]:
import gym
gym.logger.set_level(40) 
import numpy as np
import copy
from collections import namedtuple, deque
import matplotlib.pyplot as plt
%matplotlib inline
import torch
torch.manual_seed(0) 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import random
from torch.utils.data.sampler import SubsetRandomSampler
import torch.utils.data as utils_data
import json

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [16]:
env = gym.make('Pendulum-v0').unwrapped
env.seed(2)
n_state=env.observation_space.shape[0] + 1
n_actions=env.action_space.shape[0]
print(f'state space {n_state} \naction space {n_actions}')

state space 4 
action space 1


In [17]:
experience_safe = namedtuple("experience_safe", field_names=["state", "action", "next_state"])
memory = deque(maxlen=1000000)

In [18]:
"""Random Agent"""
Num_episode=10000
max_t= 100
def random_episode(env):
    for episode in range(Num_episode):
        state = env.reset()
        state = np.hstack((state,np.abs(state[2])))
        for t in range(max_t):
            action=env.action_space.sample()
            next_state,reward,done,_= env.step(action) 
            next_state = np.hstack((next_state,np.abs(next_state[2])))
            e = experience_safe(state, action, next_state)
            memory.append(e)
            state = next_state  
            if np.abs(state[2]) > 6:
                break


In [19]:
random_episode(env)    

In [7]:
print(len(memory))

565454


In [None]:
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)


restricting 3 varaible -3 to 3 to allow some slack
to allow some slack variable from -2.8 to 2.8
saying augument another state (mod velocity) < 3

In [8]:
class Safelayer(nn.Module):
    def __init__(self, state_size, action_size, seed, fc1_units=10):
        super(Safelayer, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, action_size)
           
    def forward(self,x):
        x = self.fc1(x)
        x = self.fc2(x)       
        return x

In [9]:
train_set_size = int(0.8 * len(memory))
train_indices = np.random.choice(np.arange(len(memory)), train_set_size, replace = False)
train_sampler = SubsetRandomSampler(train_indices)
val_indices = np.setdiff1d(np.arange(len(memory)), train_indices, assume_unique= True)
val_sampler = SubsetRandomSampler(val_indices)

In [10]:
batch_size = 256
learning_rate = 0.0001

In [11]:
trainloader = utils_data.DataLoader(memory, batch_size = batch_size, sampler=train_sampler, num_workers=1)
valloader = utils_data.DataLoader(memory, batch_size = batch_size, sampler=val_sampler, num_workers=1)

In [12]:
def Train(trainloader, phase = 'Training'):
    if phase == 'Training':
        model.train()
    if phase == 'Validation':
        model.eval()
    total_loss = 0
    for i, (state,action,next_state) in enumerate(trainloader):  
        states = state.float().to(device)
        actions = action.float().to(device)
        next_states = next_state.float().to(device)
        cs = states[:,-1].view(-1,1).to(device)
        cs_p = next_states[:,-1].view(-1,1).to(device)
        c = model(states).view(-1,1)
        #print(c)
        diff = torch.einsum('ij,ij->i', [c, actions]).view(-1,1)
        #print(diff)
        y_pred = cs+diff
        loss = criterion(y_pred,cs)
        total_loss += loss.item()
        if phase == 'Training':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
    return total_loss/len(trainloader)
    

In [20]:
train_loss_values =[]
val_loss_values =[]
num_epochs = 3
model = Safelayer(n_state,n_actions,121).to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr= learning_rate)
besval=0
for epoch in range(num_epochs):

    train_loss = Train(trainloader)
   # Training set accuracy  train_loss_values.append(train_loss)
    val_loss = Train(valloader,'Validation')  # Training set accuracy
    val_loss_values.append(val_loss)
    print(f'Epoch: {epoch}   Train Loss: {train_loss:.5f} \
    Val Loss: {val_loss:.5f}  ')
    torch.save(model.state_dict(), 'safe_layer.pth')

    

Epoch: 0   Train Loss: 0.24746     Val Loss: 0.00322  
Epoch: 1   Train Loss: 0.00092     Val Loss: 0.00014  
Epoch: 2   Train Loss: 0.00004     Val Loss: 0.00000  


In [16]:
model.state_dict()['fc2.weight']

tensor([[ 0.2152,  0.1285, -0.0286, -0.1300, -0.1066, -0.1990, -0.1705,  0.0352,
          0.0914,  0.0199]], device='cuda:0')

In [140]:
torch.save(model.state_dict(), 'safe_layer.pth')

In [141]:
model.state_dict()

OrderedDict([('fc1.weight', tensor([[ 0.0478, -0.1145,  0.3815,  0.1945],
                      [ 0.1848,  0.1499,  0.3260,  0.0066],
                      [-0.0001,  0.4127, -0.0213, -0.1184],
                      [-0.2749,  0.3716,  0.4599,  0.3640],
                      [ 0.0371, -0.0674,  0.6275, -0.3252],
                      [ 0.3398, -0.2197, -0.0225,  0.0680],
                      [ 0.1296, -0.1586,  0.1447,  0.2567],
                      [ 0.2024, -0.1572, -0.2074,  0.4135],
                      [ 0.1608,  0.1184,  0.4054,  0.0735],
                      [ 0.3036, -0.0972, -0.2475, -0.3242]], device='cuda:0')),
             ('fc1.bias',
              tensor([ 0.3280, -0.2559,  0.3603,  0.2388,  0.4980,  0.2227,  0.3507,  0.3689,
                      -0.2879, -0.0483], device='cuda:0')),
             ('fc2.weight',
              tensor([[ 0.2147,  0.1285, -0.0586, -0.1310, -0.1120, -0.1986, -0.1697,  0.0354,
                        0.0923,  0.0185]], device='cuda:0')),
 

In [18]:
for param in model.parameters():
    print(param.requires_grad)

True
True
True
True
