<a href="https://colab.research.google.com/github/Ravio1i/ki-lab/blob/master/4_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install box2d-py
import gym
import numpy as np
import matplotlib.pyplot as plt
import random 
import torch
import torch.nn.functional as F
from torch import optim



In [2]:
class Net(torch.nn.Module):
    """Feed-forward network with a single hidden layer.

    The input goes through a linear layer, a ReLU and a second linear layer.
    The output of the second layer is returned unchanged; no softmax or other
    normalisation is applied inside the network.

    Args:
        input_dim: Number of input features of the first linear layer.
        hidden_dim: Number of units between the two linear layers.
        output_dim: Number of outputs of the second linear layer.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the output of the second linear layer for input ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.fc2(activated)

In [16]:
def generate_episodes(n_episodes: int, n_episode_steps: int):
    """Sample episodes from the environment using the current policy network.

    Reads the notebook-level globals ``env``, ``model`` and ``activation``.
    At every step the action is drawn at random from the probabilities obtained
    by applying ``activation`` to the network output for the current state.

    Args:
        n_episodes: Number of episodes to attempt.
        n_episode_steps: Maximum number of steps taken in a single episode.

    Returns:
        A tuple ``(episode_states, episode_actions, episode_rewards)`` holding,
        for every episode that reported ``done`` within ``n_episode_steps``
        steps, the list of visited states, the list of actions taken and the
        summed reward. Episodes that reach the step limit first are not
        recorded, so the lists can be shorter than ``n_episodes``.
    """
    episode_states, episode_actions, episode_rewards = [], [], []

    for _episode in range(n_episodes):
        state = env.reset()
        states, actions = [], []
        total_reward = 0
        for _step in range(n_episode_steps):
            # Pure inference: no_grad avoids building an autograd graph on every
            # step, and as_tensor converts the observation directly instead of
            # going through a Python list of arrays.
            with torch.no_grad():
                state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                action_probs = activation(model(state_tensor)).numpy()[0]
            # Sample an action index according to the action probabilities.
            action = np.random.choice(len(action_probs), p=action_probs)

            next_state, reward, done, _ = env.step(action)

            states.append(state)
            actions.append(action)
            total_reward += reward

            state = next_state
            if done:
                # Only finished episodes are handed back to the caller.
                episode_actions.append(actions)
                episode_states.append(states)
                episode_rewards.append(total_reward)
                break

    return episode_states, episode_actions, episode_rewards

In [17]:
def filter_episodes(episode_states, episode_actions, episode_rewards, best_k: int = 20):
    """Keep the steps of the episodes whose reward reaches a percentile threshold.

    The threshold is ``np.percentile(episode_rewards, best_k)``. Note that
    ``best_k`` is therefore a percentile, not a count: ``best_k=80`` keeps
    roughly the best 20% of the episodes, ``best_k=20`` roughly the best 80%.

    Episodes whose reward equals the threshold are kept. With a strict
    comparison, a batch in which the rewards tie at the threshold (for example
    when all rewards are equal) would select nothing and leave the caller
    with an empty training batch.

    Args:
        episode_states: One list of states per episode.
        episode_actions: One list of actions per episode, aligned with
            ``episode_states``.
        episode_rewards: Total reward of each episode.
        best_k: Percentile (0-100) of ``episode_rewards`` used as threshold.

    Returns:
        A tuple ``(best_k_states, best_k_actions)`` of flat lists containing the
        states and actions of all selected episodes, in episode order. Both
        lists are empty when ``episode_rewards`` is empty.
    """
    if len(episode_rewards) == 0:
        # np.percentile is not meaningful on an empty sequence.
        return [], []

    reward_threshold = np.percentile(episode_rewards, best_k)

    best_k_states, best_k_actions = [], []
    for states, actions, reward in zip(episode_states, episode_actions, episode_rewards):
        if reward >= reward_threshold:
            best_k_states.extend(states)
            best_k_actions.extend(actions)

    return best_k_states, best_k_actions

In [18]:
def train():
    """Run one training iteration: sample episodes, select the best, update once.

    Reads the notebook-level globals ``n_episodes``, ``n_episode_steps``,
    ``best_k``, ``model``, ``optimizer`` and ``criterion``.

    Returns:
        The total rewards of all sampled episodes (before filtering), as
        returned by ``generate_episodes``.
    """
    episode_states, episode_actions, episode_rewards = generate_episodes(n_episodes, n_episode_steps)

    # filter_episodes expects a percentile threshold, whereas the `best_k`
    # hyperparameter is read here as the share (in %) of top episodes to keep.
    # With best_k = 20 this is the percentile 80 that was previously hard-coded.
    best_states, best_actions = filter_episodes(
        episode_states, episode_actions, episode_rewards, best_k=100 - best_k
    )

    if len(best_states) == 0:
        # Nothing was selected (e.g. no episode finished within the step limit):
        # skip the update instead of failing on an empty batch.
        return episode_rewards

    # Stack into single arrays first; building tensors from lists of arrays is slow.
    states = torch.as_tensor(np.asarray(best_states), dtype=torch.float32)
    actions = torch.as_tensor(np.asarray(best_actions), dtype=torch.int64)

    optimizer.zero_grad()
    actions_pred = model(states)
    loss = criterion(actions_pred, actions)
    loss.backward()
    optimizer.step()

    return episode_rewards


In [19]:
#@title Hyperparameters
# Tunable settings for this notebook. The `#@title`, `#@markdown` and `#@param`
# annotations are Colab form markup and are kept exactly as written.

# Passed to Net as `hidden_dim`, i.e. the size of the layer between fc1 and fc2.
hidden_dim = 256 #@param {type:"integer"}
# Passed as `lr` to the Adam optimizer.
#@markdown Learning rate:
lr = 0.01 #@param {type:"number"}
# Number of episodes requested from generate_episodes on every train() call.
#@markdown How many episodes should be generated:
n_episodes = 100 #@param {type:"integer"}
# Upper bound on the steps of a single episode in generate_episodes.
#@markdown Limitation of steps during generation of episodes:
n_episode_steps = 500 #@param {type:"integer"}
# Target for the mean episode reward at which training is meant to stop.
#@markdown Train until mean `reward_goal` is reached:
reward_goal = 100 #@param {type:"integer"}
# NOTE(review): presumably a percentage of the best episodes to keep (20 -> top 20%);
# confirm against how train() calls filter_episodes, which takes a percentile.
#@markdown Take `best_k` amount of episodes in terms of reward:
best_k = 20 #@param {type:"integer"}

# GPU support is disabled; everything in this notebook runs on the default (CPU) device.
# device = torch.device("cuda:0")

In [20]:
# Create the LunarLander environment; generate_episodes reads `env` as a global.
env = gym.make("LunarLander-v2")

# Number of discrete actions and length of the observation vector; both are
# used to size the policy network.
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

print(
    "States: {}".format(n_states),
    "Actions: {}".format(n_actions),
    sep="\n",
)

States: 8
Actions: 4


In [21]:
# Policy network: one input per observation feature, one output per action.
model = Net(n_states, hidden_dim, n_actions)
#model.to(device)

# Softmax over dim 1 turns the network outputs into action probabilities
# when sampling actions.
activation = torch.nn.Softmax(dim=1)

# Loss and optimizer used by train() for the update step.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [23]:
# Repeat training iterations until the mean reward of a sampled batch reaches
# the `reward_goal` hyperparameter.
train_idx = 1
# Start below any possible goal so the loop also runs when reward_goal <= 0.
mean_reward = float("-inf")

while mean_reward < reward_goal:
    rewards = train()
    # With no finished episode np.mean would return nan, and `nan < reward_goal`
    # is False, which would end training as though the goal had been reached.
    mean_reward = np.mean(rewards) if len(rewards) > 0 else float("-inf")
    print('{}: Mean Reward: {:.2f}'.format(train_idx, mean_reward))
    train_idx += 1


# NOTE: this pickles the whole module object, so loading 'model.pth' later
# requires the Net class definition to be available.
torch.save(model, 'model.pth')
print("Environment has been successfullly completed!")


1: Mean Reward: -127.87
2: Mean Reward: -124.99
3: Mean Reward: -124.14
4: Mean Reward: -108.85
5: Mean Reward: -124.04
6: Mean Reward: -91.24
7: Mean Reward: -68.39
8: Mean Reward: -71.65
9: Mean Reward: -81.43
10: Mean Reward: -117.10
11: Mean Reward: -126.00
12: Mean Reward: -95.72
13: Mean Reward: -62.84
14: Mean Reward: -51.10
15: Mean Reward: -43.10
16: Mean Reward: -34.40
17: Mean Reward: -28.58
18: Mean Reward: -22.77
19: Mean Reward: -22.37
20: Mean Reward: -17.93
21: Mean Reward: -3.81
22: Mean Reward: -3.49
23: Mean Reward: -1.26
24: Mean Reward: 10.68
25: Mean Reward: 6.34
26: Mean Reward: 3.80
27: Mean Reward: 11.36
28: Mean Reward: 19.69
29: Mean Reward: 15.19
30: Mean Reward: 17.29
31: Mean Reward: 27.89
32: Mean Reward: 32.26
33: Mean Reward: 28.18
34: Mean Reward: -7.68
35: Mean Reward: 3.21
36: Mean Reward: -7.36
37: Mean Reward: -13.43
38: Mean Reward: 13.73
39: Mean Reward: 19.89
40: Mean Reward: 35.48
41: Mean Reward: 80.82
42: Mean Reward: 38.44
43: Mean Reward: 5

KeyboardInterrupt: ignored

In [24]:
# System packages for a virtual X server (xvfb), so the environment can be
# rendered on a machine without a physical display.
!apt-get install -y xvfb x11-utils

# Python packages for the virtual display wrapper and OpenGL rendering.
!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.*

# NOTE(review): gym is already imported at the top of the notebook; a version
# installed here presumably only takes effect after a kernel restart - confirm.
!pip install gym[box2d]==0.17.*

import pyvirtualdisplay

# Start an invisible 1400x900 virtual display; the handle is kept in `_display`.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
_ = _display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils xvfb
0 upgraded, 3 newly installed, 0 to remove and 34 not upgraded.
Need to get 994 kB of archives.
After this operation, 2,981 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.9 [784 kB]
Fetched 994 kB in 1s (1,127 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 160706 file

In [28]:
import time

# Play a single episode with the trained policy and record it into
# `record_folder` through gym's Monitor wrapper.
FPS = 25
record_folder="video"  

env = gym.make('LunarLander-v2')
env = gym.wrappers.Monitor(env, record_folder, force=True)

activation = torch.nn.Softmax(dim=1)
total_reward = 0.0

try:
    state = env.reset()
    done = False
    while not done:
        # Monotonic clock: frame pacing must not be affected by wall-clock changes.
        start_ts = time.monotonic()
        env.render()

        # Inference only, so no autograd graph is needed.
        with torch.no_grad():
            state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            action_probs = activation(model(state_tensor)).numpy()[0]
        # Actions are sampled from the policy, exactly as during training.
        action = np.random.choice(len(action_probs), p=action_probs)

        state, reward, done, _ = env.step(action)
        total_reward += reward

        if not done:
            # Sleep for the remainder of the frame to play back at roughly FPS.
            delta = 1 / FPS - (time.monotonic() - start_ts)
            if delta > 0:
                time.sleep(delta)

    print("Total reward: %.2f" % total_reward)
finally:
    # Always close the Monitor, even if the episode loop raised.
    env.close()

Total reward: 261.23
