In [2]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
import matplotlib.pyplot as plt
import time
import warnings
import numpy as np
import gym
import matplotlib.pyplot as plt
# Silence library warnings so the training log below stays readable.
warnings.filterwarnings("ignore")

# Single seed for the whole notebook. It is applied to torch (weight
# initialisation) and to numpy's global generator (action sampling via
# np.random.choice) so that a fresh "Restart & Run All" is repeatable.
random_seed = 1
torch.manual_seed(random_seed)
np.random.seed(random_seed)
torch.backends.cudnn.enabled = False


# Preferred compute device: CUDA when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
def plot_rewards(rewards, round):
    """Plot the mean reward per episode and show the figure.

    Args:
        rewards: Sequence of reward values, one per x position.
        round: Index of the last episode; the x axis runs over 0..round
            inclusive.
    """
    episode_axis = range(round + 1)
    plt.plot(episode_axis, rewards)
    plt.title("Mean Reward per Episode")
    plt.xlabel("Episodes")
    plt.ylabel("Mean Reward")
    plt.show()

In [7]:
def nn_train(gpu = False, size_hidden_layer = 100, learningrate=1e-4, momentum = 0.5, mean_reward_threshold = 100, EPISODEN = 100,  STEPS_PER_EPISODE = 500):
    """Train a policy network on LunarLander-v2 by imitating its own best episodes.

    Every round samples ``EPISODEN`` episodes with the current policy, keeps
    the 20 episodes with the highest total reward and trains the network with
    cross-entropy loss to reproduce the actions taken in those episodes.
    Rounds repeat until the mean reward over all sampled episodes of a round
    reaches ``mean_reward_threshold``. Afterwards the function waits for
    keyboard input and plays one rendered episode.

    Args:
        gpu: Run on CUDA when True and a CUDA device is available; otherwise
            run on the CPU.
        size_hidden_layer: Number of units in the single hidden layer.
        learningrate: SGD learning rate.
        momentum: SGD momentum.
        mean_reward_threshold: Training stops once the mean episode reward of
            a round is at least this value.
        EPISODEN: Number of episodes sampled per round.
        STEPS_PER_EPISODE: Maximum number of environment steps per episode.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Number of best episodes kept per round; the log messages hard-code "20".
    elite_count = 20

    # Derive the device from the `gpu` flag once so the model and every tensor
    # fed to it always live on the same device.
    run_device = torch.device("cuda" if gpu and torch.cuda.is_available() else "cpu")

    env = gym.make('LunarLander-v2')
    env = gym.wrappers.Monitor(env, "recording_lunar", force=True)
    env.seed(random_seed)

    class Net(nn.Module):
        """Two-layer perceptron mapping a state vector to one logit per action."""

        def __init__(self, D_in, H, D_out):
            super(Net, self).__init__()
            self.linear1 = nn.Linear(D_in, H)
            self.relu1 = torch.nn.ReLU()
            self.linear2 = nn.Linear(H, D_out)

        def forward(self, x):
            """Return the raw (unnormalised) action logits for state ``x``."""
            l1 = self.linear1(x)
            h_relu = self.relu1(l1)
            y_pred = self.linear2(h_relu)
            return y_pred

    def train(episodes_data):
        """Run one SGD step per (state, action) pair of the given episodes."""
        model.train()

        for episode in episodes_data:
            for state, action in episode:
                # Batch of size one: state -> (1, D_in), action -> (1,).
                state_t = torch.as_tensor(state, dtype=torch.float32, device=run_device).unsqueeze(0)
                action_t = torch.as_tensor([action], dtype=torch.long, device=run_device)

                optimizer.zero_grad()
                loss = criterion(model(state_t), action_t)
                loss.backward()
                optimizer.step()

    # This function can be used both to create training data and to test the
    # current model.
    def generate_train_data(episoden, rendering):
        """Sample episodes with the current policy and keep the best ones.

        Args:
            episoden: Number of episodes to play.
            rendering: When True every step is rendered and the episode is
                played for the full ``STEPS_PER_EPISODE`` steps.

        Returns:
            Tuple ``(top_episodes, top_mean, all_mean)``: the up to 20
            highest-reward episodes (each a list of ``(state, action)``
            tuples, best first), the mean reward of those episodes and the
            mean reward of all sampled episodes.
        """
        model.eval()
        episodes = []  # [ [(state, action), ...], [(state, action), ...], ... ]
        rewards = []
        for _ in range(episoden):
            obs = env.reset()  # reset for each new trial
            state_action_pairs_per_episode = []  # [(state, action), ...]
            episode_reward = 0

            # Run for at most STEPS_PER_EPISODE steps or until done.
            for _ in range(STEPS_PER_EPISODE):
                with torch.no_grad():
                    state_t = torch.as_tensor(obs, dtype=torch.float32, device=run_device)
                    logits = model(state_t)
                    # Softmax in float64 keeps the probabilities summing to 1
                    # within the tolerance np.random.choice checks for.
                    probs = torch.softmax(logits.double(), dim=0).cpu().numpy()
                probs = probs / probs.sum()
                # Sample the action index directly. Sampling a logit value and
                # searching for it afterwards is biased when logits tie.
                new_action = int(np.random.choice(len(probs), p=probs))

                state_action_pairs_per_episode.append((obs, new_action))

                if rendering:
                    env.render()
                obs, reward, done, info = env.step(new_action)
                episode_reward += reward
                # Rendered runs deliberately keep going after `done`.
                if done and not rendering:
                    break

            episodes.append(state_action_pairs_per_episode)
            rewards.append(episode_reward)
            # NOTE(review): closing the env after every episode looks
            # unintentional, but the recorded run was produced with it in
            # place. Presumably it also deactivates the Monitor wrapper after
            # the first episode -- confirm before moving this call.
            env.close()

        rewards = np.asarray(rewards, dtype=np.float64)
        # Indices of the best episodes, highest reward first.
        best_first = rewards.argsort()[::-1][:elite_count]
        # Select from the Python list: episodes differ in length, so turning
        # them into a numpy array would yield a ragged array.
        top_episodes = [episodes[idx] for idx in best_first]
        top_rewards = rewards[best_first]
        return top_episodes, top_rewards.mean(), rewards.mean()

    D_in = 8    # size of the state vector fed to the network
    H = size_hidden_layer
    D_out = 4   # number of action logits
    round_idx = 0

    model = Net(D_in, H, D_out).to(run_device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learningrate, momentum=momentum)

    # Get initial training episodes.
    train_data, top_rewards, all_reward = generate_train_data(EPISODEN, rendering = False)
    print("Round -1 :Initial mean reward {} --- Initial top 20 reward is {}".format(all_reward, top_rewards))
    print("Start training...")

    # Train until the mean reward hits the threshold.
    while all_reward < mean_reward_threshold:
        train(train_data)
        train_data, top_rewards, all_reward = generate_train_data(EPISODEN, rendering = False)

        print("Round {}: Current mean reward {} --- Top 20 reward is {}".format(round_idx, all_reward, top_rewards))
        round_idx += 1

    # Test with one final rendered episode; wait so the user can start recording.
    input("Press any key to continue. Please record the final episode")
    print("Start testing")

    train_data, top_rewards, all_reward = generate_train_data(1, rendering = True)
    print("Current mean reward {}".format(all_reward))
        
# Launch training with the hyper-parameters used for the recorded run below.
nn_train(
    gpu=True,
    size_hidden_layer=500,
    learningrate=0.001,
    momentum=0.9,
    mean_reward_threshold=100,
    EPISODEN=100,
    STEPS_PER_EPISODE=500,
)

Round -1 :Initial mean reward -194.01329909115947 --- Initial top 20 reward is -74.0963727311935
Start training...
Round 0: Current mean reward -126.87497605424842 --- Top 20 reward is -71.88651725057994
Round 1: Current mean reward -98.97527289392065 --- Top 20 reward is -73.95753308923808
Round 2: Current mean reward -97.614251686907 --- Top 20 reward is -67.15633479618234
Round 3: Current mean reward -107.66414338474688 --- Top 20 reward is -76.91317947978358
Round 4: Current mean reward -104.57169962751142 --- Top 20 reward is -77.70056009064288
Round 5: Current mean reward -98.58600755172583 --- Top 20 reward is -70.34231959332509
Round 6: Current mean reward -98.60927783900225 --- Top 20 reward is -73.79955782625359
Round 7: Current mean reward -92.61992838447169 --- Top 20 reward is -63.660242157514894
Round 8: Current mean reward -95.73131983908426 --- Top 20 reward is -70.9318856323034
Round 9: Current mean reward -97.31423645863963 --- Top 20 reward is -77.68201009245176
Roun

Round 91: Current mean reward 11.083479303929373 --- Top 20 reward is 100.1606477728361
Round 92: Current mean reward 12.294794651656343 --- Top 20 reward is 115.4979967149779
Round 93: Current mean reward 26.945338687454754 --- Top 20 reward is 137.1157218471562
Round 94: Current mean reward 46.17014984493163 --- Top 20 reward is 166.4880961634149
Round 95: Current mean reward 64.70935824680755 --- Top 20 reward is 201.66361957791682
Round 96: Current mean reward 87.08267807956965 --- Top 20 reward is 240.60640296510755
Round 97: Current mean reward 69.96749426534629 --- Top 20 reward is 223.81733421877212
Round 98: Current mean reward 90.99926758703332 --- Top 20 reward is 247.36754773832203
Round 99: Current mean reward 75.2245378627573 --- Top 20 reward is 217.53268111353478
Round 100: Current mean reward 56.59907278236318 --- Top 20 reward is 208.22850746677574
Round 101: Current mean reward 85.86444595160478 --- Top 20 reward is 227.40673896523964
Round 102: Current mean reward 8