<a href="https://colab.research.google.com/github/SilentClaw27/AdvancedRL/blob/main/lander_reinforce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash

apt-get install swig

git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

apt-get install -y xvfb

pip install \
  gym==0.21\
  gym[box2d]==0.21\
  pyvirtualdisplay\
  pyglet==1.5.27 \
  colabgymrender

Reading package lists...
Building dependency tree...
Reading state information...
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 20 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 0s (2,602 kB/s)
Selecting previously unselected package swig3.0.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database .

Cloning into 'pybox2d'...
Box2D/Box2D_wrap.cpp: In function ‘PyObject* b2GetPointStates(const b2Manifold*, const b2Manifold*)’:
             if (state1[i]==b2_nullState && state1_length==0)
             ^~
Box2D/Box2D_wrap.cpp:4129:17: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’
                 if (state2_length > -1)
                 ^~
             if (state2[i]==b2_nullState && state2_length==0)
             ^~
Box2D/Box2D_wrap.cpp:4133:17: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’
                 if (state1_length > -1)
                 ^~
Box2D/Box2D_wrap.cpp: In function ‘PyObject* _wrap_b2Vec2_cross(PyObject*, PyObject*)’:
     if (!_v) goto check_1; return _wrap_b2Vec2_cross__SWIG_0(self, argc, argv);}  check_1: if (argc == 2) {
     ^~
Box2D/Box2D_wrap.cpp:6328:28: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’
  

In [2]:
import gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

In [5]:
os.makedirs("outputs",exist_ok=True)

DEVICE="cuda:0"
ACTION_SPACE = [0,1,2,3]
EPISODES = 1000
STEPS = 1000
GAMMA=0.99
RENDER=False

In [6]:
class ReinforceModel(nn.Module):
    def __init__(self,num_action,num_input):
        super(ReinforceModel,self).__init__()
        self.num_action = num_action
        self.num_input = num_input

        self.layer1 = nn.Linear(num_input,64)
        self.layer2 = nn.Linear(64,num_action)
        
    def forward(self,x):
        x = torch.tensor(x,dtype=torch.float32,device=DEVICE).unsqueeze(0)
        x = F.relu(self.layer1(x))
        actions = F.softmax(self.layer2(x))
        action = self.get_action(actions)
        log_prob_action = torch.log(actions.squeeze(0))[action]
        return action,log_prob_action
    def get_action(self,a):
        return np.random.choice(ACTION_SPACE,p=a.squeeze(0).detach().cpu().numpy())

In [7]:
env = gym.make("LunarLander-v2")
print(env.action_space,env.observation_space)

model = ReinforceModel(4,8).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)
all_rewards =[]
best_rolling = -99999

Discrete(4) Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)


In [9]:
for episode in range(EPISODES):
    done=False
    state = env.reset()
    lp=[]
    a=[]
    r=[]
    d=[]
    s=[]
    for step in range(STEPS):
        if RENDER:
            env.render()
        action,log_prob = model(state)
        state,r_,done,i_ = env.step(action)
        lp.append(log_prob)
        r_ = r_ /100
        r.append(r_)
        if done:
            all_rewards.append(np.sum(r))
            
            if episode%100 ==0:
                print(f"EPISODE {episode} SCORE: {np.sum(r)} roll{pd.Series(all_rewards).tail(100).mean()}")
                # RENDER = True
                torch.save(model.state_dict(), 'outputs/last_params_cloud.ckpt')
                if pd.Series(all_rewards).tail(100).mean()>best_rolling:
                    best_rolling = pd.Series(all_rewards).tail(100).mean()
                    print("saving...")
                    torch.save(model.state_dict(), 'outputs/best_params_cloud.ckpt')
            break
 

    discounted_rewards = []

    for t in range(len(r)):
        Gt = 0 
        pw = 0
        for r_ in r[t:]:
            Gt = Gt + GAMMA**pw * r_
            pw = pw + 1
        discounted_rewards.append(Gt)
    
    discounted_rewards = np.array(discounted_rewards)

    discounted_rewards = torch.tensor(discounted_rewards,dtype=torch.float32,device=DEVICE)
    discounted_rewards = (discounted_rewards - torch.mean(discounted_rewards))/ (torch.std(discounted_rewards))
    log_prob = torch.stack(lp)
    policy_gradient = -log_prob*discounted_rewards

    model.zero_grad()
    policy_gradient.sum().backward()
    optimizer.step()

  actions = F.softmax(self.layer2(x))


EPISODE 0 SCORE: -0.5903269697615916 roll-0.5903269697615916
saving...
EPISODE 100 SCORE: -2.44878137999429 roll-1.8245016403182188
EPISODE 200 SCORE: -2.124063857812482 roll-1.654876909289788
EPISODE 300 SCORE: -0.9928783785171148 roll-1.774826514695242
EPISODE 400 SCORE: -2.579730622205552 roll-1.5905339279840576
EPISODE 500 SCORE: -1.433729318646151 roll-1.6722529107576118
EPISODE 600 SCORE: -0.1894907737614624 roll-1.588587221957965
EPISODE 700 SCORE: -0.8450540762470876 roll-1.4101841666253419
EPISODE 800 SCORE: -1.261705157950003 roll-1.5457056372732223
EPISODE 900 SCORE: -1.4204424974781555 roll-1.474525021950828


In [13]:
import cv2
import matplotlib.pyplot as plt
import matplotlib.animation as animation

font                   = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (10,500)
fontScale              = 1
fontColor              = (255,255,255)
lineType               = 2

fig = plt.figure()
env = gym.make("LunarLander-v2")
print(env.action_space,env.observation_space)

model = ReinforceModel(4,8).to(DEVICE)
model.load_state_dict(torch.load("outputs/best_params_cloud.ckpt"))

model.eval()
ims = []
rewards = []
state = env.reset()

for step in range(STEPS):
    img = env.render(mode='rgb_array')
    action,log_prob = model(state)
        # print(action)
    state,reward,done,i_ = env.step(action)
    rewards.append(reward)
    # print(reward,done)
    cv2_im_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_im = Image.fromarray(cv2_im_rgb)

    draw = ImageDraw.Draw(pil_im)

    # Choose a font
    font = ImageFont.truetype("Roboto-Regular.ttf", 20)

    # Draw the text
    draw.text((0, 0), f"Step: {step} Action : {action} Reward: {int(reward)} Total Rewards: {int(np.sum(rewards))} done: {done}", font=font,fill="#FDFEFE")

    # Save the image
    img = cv2.cvtColor(np.array(pil_im), cv2.COLOR_RGB2BGR)
    im = plt.imshow(img, animated=True)
    ims.append([im])
    if done:
        env.close()


                
        
        break

Writer = animation.writers['pillow']
writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
im_ani = animation.ArtistAnimation(fig, ims, interval=50, repeat_delay=3000,
                                    blit=True)
im_ani.save('ll_train1.gif', writer=writer)

Discrete(4) Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)


NoSuchDisplayException: ignored

<Figure size 432x288 with 0 Axes>