In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
from tqdm import tqdm
from tqdm import trange
import numpy as np
import time

In [3]:
import torch as T
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler
import torch

In [4]:
import gym

env = gym.make('Ant-v2')
print(env.observation_space.shape, env.action_space.shape)

rews = 0
temp = -54
var = 0
for i in range(20):
    env.reset()
    rew = 0
    
    while True:
        _, r, done, _ = env.step(env.action_space.sample())
        
        rew += r
        
        if done==True:
            print('Ep %d: %.2f' % (i+1, rew))
            rews += rew
            var += (rew-temp)**2
            break
rews = rews/20
var = (var/20)**0.5
print(rews)
print(var)

(111,) (8,)
Ep 1: 6.22
Ep 2: -37.62
Ep 3: -47.95
Ep 4: 3.09
Ep 5: 1.93
Ep 6: -287.07
Ep 7: -10.75
Ep 8: -57.32
Ep 9: -23.57
Ep 10: 0.54
Ep 11: -5.81
Ep 12: -39.23
Ep 13: -334.52
Ep 14: -294.70
Ep 15: -8.60
Ep 16: -33.87
Ep 17: -27.03
Ep 18: -68.67
Ep 19: 4.27
Ep 20: -328.60
-79.46332320224306
120.7883100609068


In [5]:
class BCO(nn.Module):
    def __init__(self, env, policy='mlp'):
        super(BCO, self).__init__()
        
        self.policy = policy
        self.act_n = env.action_space.shape[0]
        
        if self.policy=='mlp':
            self.obs_n = env.observation_space.shape[0]
            self.inv = nn.Sequential(*[nn.Linear(self.obs_n*2, 100), nn.LeakyReLU(), 
                                       nn.Linear(100, 100), nn.LeakyReLU(), 
                                       nn.Linear(100, self.act_n)])
            self.pol = nn.Sequential(*[nn.Linear(self.obs_n, 32), nn.LeakyReLU(), 
                                       nn.Linear(32, 32), nn.LeakyReLU(),  
                                       nn.Linear(32, self.act_n)])
        
        elif self.policy=='cnn':
            pass
    
    def pred_act(self, obs):
        out = self.pol(obs)
        
        return out
    
    def pred_inv(self, obs1, obs2):
        obs = T.cat([obs1, obs2], dim=1)
        out = self.inv(obs)
        
        return out

POLICY = 'mlp'
model = BCO(env, policy=POLICY).cuda()
model.load_state_dict(torch.load('Model/model_ant_ID_train_1.pt'))

<All keys matched successfully>

In [6]:
from torch.utils.data import Dataset, DataLoader

class DS_Inv(Dataset):
    def __init__(self, trajs):
        self.dat = []
        
        for traj in trajs:
            for dat in traj:
                obs, act, new_obs = dat
                
                self.dat.append([obs, new_obs, act])
    
    def __len__(self):
        return len(self.dat)
    
    def __getitem__(self, idx):
        obs, new_obs, act = self.dat[idx]
        
        return obs, new_obs, act

class DS_Policy(Dataset):
    def __init__(self, traj):
        self.dat = []
        
        for dat in traj:
            obs, act = dat
                
            self.dat.append([obs, act])
    
    def __len__(self):
        return len(self.dat)
    
    def __getitem__(self, idx):
        obs, act = self.dat[idx]
        
        return obs, act

In [None]:
import pickle

trajs_demo = pickle.load(open('Demo/demo_ant.pkl', 'rb'))
print(len(trajs_demo))
ld_demo = DataLoader(DS_Inv(trajs_demo), batch_size=50)

print(len(ld_demo))
for obs1, obs2,act in ld_demo:
    print(obs1.shape, obs2.shape, act.shape)
    print(act[0])
    

In [None]:
loss_func = nn.MSELoss().cuda()
optim = T.optim.Adam(model.parameters(), lr=5e-4)

alpha = 0
M = 500000

EPS = 0.9
DECAY = 2e-3
random_seed = 42
epochs = 1000
patience = 100

In [None]:
def train_valid_loader(dataset, batch_size, validation_split, shuffle_dataset):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset :
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=valid_sampler)
    return train_loader, validation_loader

def train_NN(train_loader, NN):
    
    with tqdm(train_loader, desc='Training',  disable = True, position=0, leave=True) as TQ:
        ls_ep = 0
        correct = 0
        total = 0
        
        if (NN == 'inv'):
            for obs1, obs2, act in TQ:
                out = model.pred_inv(obs1.float().cuda(), obs2.float().cuda())
                ls_bh = loss_func(out, act.cuda())
                
                optim.zero_grad()
                ls_bh.backward()
                optim.step()

                ls_bh = ls_bh.cpu().detach().numpy()
                TQ.set_postfix(loss_policy='%.3f' % (ls_bh))
                ls_ep += ls_bh
                total += obs1.shape[0]
                
        elif(NN == 'pred'):
            for obs, act in TQ:
                out = model.pred_act(obs.float().cuda())
                ls_bh = loss_func(out, act.cuda())

                optim.zero_grad()
                ls_bh.backward()
                optim.step()

                ls_bh = ls_bh.cpu().detach().numpy()
                TQ.set_postfix(loss_policy='%.3f' % (ls_bh))
                ls_ep += ls_bh
                total += obs.shape[0]
            
        ls_ep /= len(TQ)
        
    return ls_ep

def validate_NN(validation_loader, NN):
    
    with tqdm(validation_loader, desc='Validate', disable = True, position=0, leave=True) as TQ:
        ls_val_ep = 0
        correct = 0
        total = 0
        
        if (NN == 'inv'):
            for obs1, obs2, act in TQ:
                out = model.pred_inv(obs1.float().cuda(), obs2.float().cuda())
                ls_bh = loss_func(out, act.cuda())
                ls_bh = ls_bh.cpu().detach().numpy()
                TQ.set_postfix(loss_policy='%.3f' % (ls_bh))
                ls_val_ep += ls_bh
                total += obs1.shape[0]
        elif (NN == 'pred'):
            for obs, act in TQ:
                out = model.pred_act(obs.float().cuda())
                ls_bh = loss_func(out, act.cuda())
                ls_bh = ls_bh.cpu().detach().numpy()
                TQ.set_postfix(loss_policy='%.3f' % (ls_bh))
                ls_val_ep += ls_bh
                total += obs.shape[0]
            
        ls_val_ep /= len(TQ)
        
        return ls_val_ep

In [None]:
trajs_inv = []
tqdm_alpha = trange(alpha+1, position=0, desc='alpha:', leave=True)
policy_best = 10
policy_patience = 5
policy_patience_cnt = 0

for e in tqdm_alpha:
    
    # step1, generate inverse samples
    if e==0:
        trajs_inv = np.load('PreDemo_Interactions_Ant_5Lakh.npy',allow_pickle=True)
    else:
        tqdm_alpha.set_description("alpha: %i, Step1: Exploration" % e,refresh=True)
        time.sleep(1)
        cnt = 0 #count
        epn = 0 #Episode number

        rews = 0 #Rewards

        while True:
            traj = []
            rew = 0
            N=0 
            obs = env.reset()
            while True:
                inp = T.from_numpy(obs).view(((1, )+obs.shape)).float().cuda()
                out = model.pred_act(inp).cpu().detach().numpy()
                if e==0:
                    act = env.action_space.sample()               
                else:
                    act = out[0]


                new_obs, r, done, _ = env.step(act)

                traj.append([obs, act, new_obs])
                obs = new_obs
                rew += r

                cnt += 1
                tqdm_alpha.set_description("alpha: %i, Step1: Exploration - %i" % (e,cnt),refresh=True)
                N+=1   
                if done==True :
                    rews += rew
                    trajs_inv.append(traj)

                    epn += 1

                    break

            if cnt >= M:
                break

        rews /= epn
        tqdm_alpha.set_description("alpha: %i, step1: Exploration, Reward: %.2f" % (e,rews),refresh=True)
        time.sleep(1)
      
    
    # step2, update inverse model

    if e!=0:
    
#         ls_val_best = 0.025
        ls_val_best = 1
        patience_cnt = 0
        tqdm_alpha.set_description("alpha: %i, Step2: Update Inverse Model" % e,refresh=True)
        time.sleep(1)
        tqdm_epoch = trange(epochs, position=0, desc='Epoch:', leave=True)
        for i in  tqdm_epoch:
            dataset=DS_Inv(trajs_inv)
            train_loader, validation_loader = train_valid_loader(dataset, batch_size=32, 
                                                                 validation_split=0.3,
                                                                 shuffle_dataset=True)
            
            ls_ep = train_NN(train_loader, NN = 'inv')
            ls_val_ep = validate_NN(validation_loader, NN = 'inv')
            
            tqdm_epoch.set_description("ID Model Update - Epoch: %i, val loss: %.3f" % (i,ls_val_ep),refresh=True)
            
            if ls_val_ep < ls_val_best:
                ls_val_best = ls_val_ep
                patience_cnt = 0
        
            else:
                patience_cnt += 1
                if patience_cnt == patience:
    #                 print("break")
                    break

#             if ls_val_ep < ls_val_best:
#                 break

    
#     T.save(model.state_dict(), 'Model/model_ant_ID_train_%d.pt' % (e+1))
    
    
    # step3, predict actions for demo trajectories
    traj_policy = []
    tqdm_alpha.set_description("alpha: %i, Step3: Predict most probable actions for expert demos" % e,refresh=True)
    obs_cnt = 0
    for obs1, obs2, _ in ld_demo:
        out = model.pred_inv(obs1.float().cuda(), obs2.float().cuda())
        obs = obs1.cpu().detach().numpy()
        out = out.cpu().detach().numpy()
        for i in range(len(obs1)):
            traj_policy.append([obs[i], out[i]])
        obs_cnt+=1
        if obs_cnt==25:
            break
    
#     pred_id_acts = np.asarray(traj_policy, dtype=object)
#     np.save('Pred_acts_demo_obs_Ant_5Lakh.npy', pred_id_acts)

    # step4, update policy via demo samples
    ls_val_best = 5
    patience_cnt = 0
    tqdm_alpha.set_description("alpha: %i, Step4: Update Policy" % e,refresh=True)
    tqdm_epoch = trange(epochs, position=0, desc='Epochs', leave=True)
    for i in  tqdm_epoch:
        dataset=DS_Policy(traj_policy)
        train_loader, validation_loader = train_valid_loader(dataset, batch_size=32, 
                                                             validation_split=0.3,
                                                             shuffle_dataset=True)
        
        ls_ep = train_NN(train_loader, NN = 'pred')
        ls_val_ep = validate_NN(validation_loader, NN = 'pred')
        
        tqdm_epoch.set_description("Policy Update - Epoch: %i, val loss: %.3f" % (i,ls_val_ep),refresh=True)
        
        if ls_val_ep < ls_val_best:
            ls_val_best = ls_val_ep
            patience_cnt = 0
    
        else:
            patience_cnt += 1
            if patience_cnt == patience:
#                 print("break")
                break

    # step5, save model
    if ls_val_ep < policy_best:
        policy_best = ls_val_ep
        policy_patience_cnt = 0
        T.save(model.state_dict(), 'Model/model_ant_best.pt')

    else:
        policy_patience_cnt += 1
    
    if policy_patience_cnt==5:
        break
        
    M *= DECAY

In [None]:
# Ipre = np.asarray(trajs_inv, dtype=object)
# np.save('PreDemo_Interactions_Ant_5Lakh.npy', Ipre)

In [None]:
# T.save(model.state_dict(), 'Model/model_test_ant_ID_train_%d.pt' % (e+1))


In [None]:
import time
import matplotlib.pyplot as plt
%matplotlib inline

reward = 0
reward_per_obs=np.array([])
episodes = 20
tqdm_episodes = trange(episodes, position=0, desc='Episode', leave=True)

# model = BCO(env, policy=POLICY).cuda()
# model.load_state_dict(torch.load('Model/model_ant_1.pt'))

for i_episode in tqdm_episodes:
    observation = env.reset()
    rews=0
    t=0
    while True:
        inp = T.from_numpy(observation).view(((1, )+observation.shape)).float().cuda()
        out = model.pred_act(inp).cpu().detach().numpy()
        act = out  ## Take actions predicted by the inverse dynamics model
#         env.render()
        observation, reward, done, info = env.step(act)
        rews+=reward
        t+=1
        tqdm_episodes.set_description("Episode: %i, Step: %i" % (i_episode+1,t),refresh=True)

        if done:
#             print("Episode finished after {} timesteps".format(t+1))
            print(rews)
            reward_per_obs=np.append(reward_per_obs,rews)
            break
print(np.mean(reward_per_obs))


In [7]:
import time
import matplotlib.pyplot as plt
%matplotlib inline

reward = 0
reward_per_obs=np.array([])
episodes = 20
tqdm_episodes = trange(episodes, position=0, desc='Episode', leave=True)

model_x = BCO(env, policy=POLICY).cuda()
model_x.load_state_dict(torch.load('Model/model_ant_best_15.pt'))

for i_episode in tqdm_episodes:
    observation = env.reset()
    rews=0
    t=0
    while True:
        inp = T.from_numpy(observation).view(((1, )+observation.shape)).float().cuda()
        out = model_x.pred_act(inp).cpu().detach().numpy()
        act = out  ## Take actions predicted by the inverse dynamics model
        env.render()
        observation, reward, done, info = env.step(act)
        rews+=reward
        t+=1
        tqdm_episodes.set_description("Episode: %i, Step: %i" % (i_episode+1,t),refresh=True)

        if done:
#             print("Episode finished after {} timesteps".format(t+1))
            print(rews)
            reward_per_obs=np.append(reward_per_obs,rews)
            break
print(np.mean(reward_per_obs))

Episode: 1, Step: 24:   0%|          | 0/20 [00:00<?, ?it/s]

Creating window glfw


Episode: 2, Step: 17:   5%|▌         | 1/20 [00:11<03:37, 11.43s/it]  

4603.344860457913


Episode: 3, Step: 17:  10%|█         | 2/20 [00:56<09:22, 31.27s/it]  

4664.65762974676


Episode: 4, Step: 16:  15%|█▌        | 3/20 [01:08<06:18, 22.29s/it]  

4627.632354215873


Episode: 5, Step: 17:  20%|██        | 4/20 [01:20<04:49, 18.11s/it]  

4852.233651763291


Episode: 5, Step: 920:  20%|██        | 4/20 [01:30<06:02, 22.65s/it]


You can access the simulator by self.sim


ModuleNotFoundError: No module named 'ipdb'

In [None]:
print(val_loss_pol)
print("hello")
print(train_loss_pol)

In [None]:
import matplotlib.pyplot as plt
plt.plot(val_loss_id,'r')
plt.plot(train_loss_id)
plt.show()

In [None]:
import matplotlib.pyplot as plt
plt.plot(val_loss_pol,'r')
plt.plot(train_loss_pol)
plt.show()