<a href="https://colab.research.google.com/github/Ravio1i/ki-lab/blob/master/4_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install box2d-py
import gym
import numpy as np
import matplotlib.pyplot as plt
import random 
from time import time
import torch
import torch.nn.functional as F
from torch import optim



In [2]:
class Net(torch.nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.fc1(x))
        return F.log_softmax(self.fc2(x))

In [6]:
#@title Hyperparameters
hidden_dim = 256 #@param {type:"integer"}
#@markdown Learning rate:
lr =  0.01#@param {type:"number"}
#@markdown How many episodes should be generated:
n_episodes = 100 #@param {type:"integer"}
#@markdown Limitation of steps during generation of episodes:
n_episode_steps = 500 #@param {type:"integer"}
#@markdown Train until mean `reward_goal` is reached:
reward_goal = 100 #@param {type:"integer"}
#@markdown Take `best_k` amount of episodes in terms of reward:
best_k = 20 #@param {type:"integer"}

device = torch.device("cuda:0")

In [7]:
env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
print("States: {}".format(n_states))
print("Actions: {}".format(n_actions))

States: 8
Actions: 4


In [8]:
model = Net(
    input_dim = n_states, 
    hidden_dim = hidden_dim, 
    output_dim = n_actions
)
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

In [23]:
total_reward = 0
state = env.reset()
done = False

def generate_episodes(n_episodes: int, n_episode_steps: int):
    rewards = []
    state_action_reward = [] # [((tuple): (state, action), (int): reward_episode)]

    for episode in range(1,n_episodes+1):
        state = env.reset()
        state_action_episode = []
        reward_episode = 0
        for s in range(n_episode_steps):
            state = torch.from_numpy(state)
            out = model(state)
            action = torch.argmax(out).item()
            next_state, reward, done, _ = env.step(action)
            state = next_state
            reward_episode += reward
            state_action_episode.append((state, action))
            if done:
                break
        state_action_reward.append((state_action_episode, reward_episode))

        print('\r\tAverage Reward: {:.2f}\tEpisode {}'.format(np.mean(rewards), episode), end="")
        rewards.append(reward_episode)
        if episode % 100 == 0:
            print('\r\tAverage Reward: {:.2f}\tEpisode {}'.format(np.mean(rewards), episode))

    return state_action_reward

In [24]:
def train(best_k: int = 20):
    episodes = generate_episodes(n_episodes, n_episode_steps)
    best_k_episodes = sorted(episodes, key=lambda k: k[1], reverse=True)[:best_k]
    rewards = [episode[1] for episode in episodes]
    states = torch.FloatTensor([state_actions[0] for episode in best_k_episodes for state_actions in episode[0]])
    actions = torch.LongTensor([state_actions[1] for episode in best_k_episodes for state_actions in episode[0]]).to(device)

    actions_pred = model(torch.FloatTensor(states))
    loss = criterion(actions_pred, actions)
    loss.backward()
    optimizer.step()

    torch.save(model, 'model.pth')

In [25]:
mean_reward = 0
train_idx = 1

while mean_reward < 100:
    print("Iteration {}:".format(train_idx))
    train()
    train_idx += 1


Iteration 1:
	Average Reward: nan	Episode 1	Average Reward: -526.64	Episode 2

  # Remove the CWD from sys.path while we load stuff.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


	Average Reward: -876.98	Episode 100
Iteration 2:
	Average Reward: -838.17	Episode 100
Iteration 3:
	Average Reward: -775.76	Episode 100
Iteration 4:
	Average Reward: -780.15	Episode 100
Iteration 5:
	Average Reward: -822.90	Episode 100
Iteration 6:
	Average Reward: -835.82	Episode 100
Iteration 7:
	Average Reward: -791.30	Episode 100
Iteration 8:
	Average Reward: -830.18	Episode 100
Iteration 9:
	Average Reward: -896.66	Episode 87

KeyboardInterrupt: ignored

In [None]:
# plot the scores
def plot():
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(rewards)), rewards)
    plt.ylabel('Rewards')
    plt.xlabel('Episode #')
    plt.show()