In [1]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
import matplotlib.pyplot as plt
import time
import warnings
import numpy as np
import gym
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

random_seed = 1
torch.backends.cudnn.enabled = False


device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [2]:
def plot_rewards(rewards, round):
    plt.plot(range(0,round + 1), rewards)
    plt.xlabel("Episodes")
    plt.ylabel("Mean Reward")
    plt.title("Mean Reward per Episode")
    plt.show()

In [None]:
def nn_train(gpu = False, size_hidden_layer = 100, learningrate=1e-4, momentum = 0.5, mean_reward_threshold = 100, EPISODEN = 100,  STEPS_PER_EPISODE = 500):

    env = gym.make('LunarLander-v2')
    env.seed(random_seed)

    class Net(nn.Module):
        def __init__(self, D_in, H, D_out):
            super(Net, self).__init__()
            self.linear1 = nn.Linear(D_in, H)
            self.relu1 = torch.nn.ReLU()
            self.linear2 = nn.Linear(H, D_out)

        def forward(self, x):
            """
            In the forward function we accept a Tensor of input data and we must return
            a Tensor of output data. We can use Modules defined in the constructor as
            well as arbitrary operators on Tensors.
            """
            l1 = self.linear1(x) 
            h_relu = self.relu1(l1)
            #y_pred = nn.Softmax(self.linear2(h_relu))
            y_pred = self.linear2(h_relu)
            # y_pred = self.soft_max(self.linear2(h_relu))
            return y_pred

    def train(episodes_data):
        model.train()

        for episode in episodes_data:
            for data, target in episode:
                data = torch.tensor(data, device = device)
                target = torch.tensor(target, device = device)
                optimizer.zero_grad()
                y_pred = model(data)
                y_pred = torch.unsqueeze(y_pred, 0)
                target = torch.unsqueeze(target, 0)
                #y_pred = torch.Tensor(y_pred, device="cpu")
                #y_pred = int(np.argmax(y_pred.cpu().detach().numpy()))
                # print(y_pred.shape)
                loss = criterion(y_pred, target)
                loss.backward()
                optimizer.step()
            # print(f"Loss for most recent episode is {loss.item()}")

    #diese methode kann zum erstellen von trainingdaten als auch für das 
    #testen des aktuellen modells verwendet werden
    def generate_train_data(episoden, rendering):
        model.eval()
        all_scores = []
        episodes = [] # [ [(state, action), (),...()], [(state, action), (),...()], ...]
        rewards = []
        sm = nn.Softmax(dim=0)
        for i in range(episoden):
            obs = env.reset() # reset for each new trial  
            state_action_pairs_per_episode = [] #[(state, action),(), ...]
            episode_reward = 0

            for t in range(STEPS_PER_EPISODE): # run for maximum 500 timesteps or until done, whichever is first
                #get states and put into network
                
                with torch.no_grad():
                    data = torch.tensor(obs, device = device)
                    outputs = model(data)
#                     print(outputs)
                    propability_dist = sm(outputs).cpu().detach().numpy()
#                     print(propability_dist)
                    outputs = outputs.cpu().detach().numpy()
                    choice = np.random.choice(outputs, size=1, p=propability_dist)

                    new_action = np.where((outputs == choice))[0][0] # new approach
#                 new_action = int(np.argmax(outputs.cpu())) #old approach
                state_action_tupel = (obs, new_action)
                state_action_pairs_per_episode.append(state_action_tupel)
        
                if rendering:
                    env.render()
                obs, reward, done, info = env.step(new_action)
                episode_reward+=reward
                if not rendering:
                    if done:
                      #print("Episode finished after {} timesteps".format(t+1))
                        break

                #print(episode_reward/STEPS_PER_EPISODE)
                episodes.append(state_action_pairs_per_episode)
                rewards.append(episode_reward)
            env.close()


        rewards = np.array(rewards)
        episodes = np.array(episodes)
        sort_index = rewards.argsort()
        top_rewards = rewards[sort_index[::-1]][:20]
        top_episodes = episodes[sort_index[::-1]][:20]
        return top_episodes, top_rewards.mean(), rewards.mean()

    D_in = 8
    H = size_hidden_layer
    D_out = 4
    mean_reward = 0
    all_reward = 0
    round = 0 
    
    model = Net(D_in, H, D_out)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learningrate, momentum=momentum)
    if gpu: 
        model.cuda()

    """ Get initial training episodes  """
    train_data, top_rewards, all_reward = generate_train_data(EPISODEN, rendering = False)
    print("Round -1 :Initial mean reward {} --- Initial top 20 reward is {}".format(all_reward, top_rewards, round))
    print("Start training...")

    """ Traing until mean reward hits threshold  """
    while (all_reward < mean_reward_threshold):
        train(train_data)
        train_data, top_rewards, all_reward = generate_train_data(EPISODEN, rendering = False)

        print("Round {}: Current mean reward {} --- Top 20 reward is {}".format(round, all_reward, top_rewards))
        round = round + 1
    
    """ test with one final rendered epsiode  """
    waiting_mechanism = input("Press any key to continue. Please record the final episode")
    print("Start testing")

    train_data, top_rewards, all_reward = generate_train_data(1, rendering = True)
    print("Current mean reward {}".format(all_reward, top_rewards, round))
        
nn_train(gpu = True, 
         size_hidden_layer = 500, 
         learningrate=0.1, 
         momentum = 0.3, 
         mean_reward_threshold = 100, 
         EPISODEN = 100,  
         STEPS_PER_EPISODE = 500)

Round -1 :Initial mean reward -38.51850381932774 --- Initial top 20 reward is 124.91011414386193
Start training...
Round 0: Current mean reward -33.94490274949048 --- Top 20 reward is 88.46014947498502
Round 1: Current mean reward -44.58101711947396 --- Top 20 reward is 97.02556082724848
Round 2: Current mean reward -54.06519482221959 --- Top 20 reward is 90.29880349830384
Round 3: Current mean reward -46.645250066149174 --- Top 20 reward is 34.395083193248425
Round 4: Current mean reward -36.78557781778498 --- Top 20 reward is 76.63841571189381
Round 5: Current mean reward -36.02611245592557 --- Top 20 reward is 128.10370383772022
Round 6: Current mean reward -26.31300868940332 --- Top 20 reward is 175.76649915907686
Round 7: Current mean reward -36.28940858553412 --- Top 20 reward is 52.10801583569715
Round 8: Current mean reward -42.679804633705814 --- Top 20 reward is 115.21950214671224
Round 9: Current mean reward -37.120619946072935 --- Top 20 reward is 96.83774965990631
Round 10

Round 91: Current mean reward -37.642465779347035 --- Top 20 reward is 148.25741356670534
Round 92: Current mean reward -36.548842729048715 --- Top 20 reward is 146.29145576904048
Round 93: Current mean reward -28.299784285095317 --- Top 20 reward is 162.85543728304074
Round 94: Current mean reward -40.64539809005084 --- Top 20 reward is 139.2946160901497
Round 95: Current mean reward -36.33415485342934 --- Top 20 reward is 145.13829391593947
Round 96: Current mean reward -27.97968034544981 --- Top 20 reward is 167.0014379200761
Round 97: Current mean reward -33.1704996543535 --- Top 20 reward is 144.48365064186
Round 98: Current mean reward -40.72429233638389 --- Top 20 reward is 152.8400407954651
Round 99: Current mean reward -40.6370942745566 --- Top 20 reward is 129.59808099126283
Round 100: Current mean reward -39.95226172659675 --- Top 20 reward is 148.64071351723578
Round 101: Current mean reward -38.80135583943914 --- Top 20 reward is 145.9983730734172
Round 102: Current mean r

Round 183: Current mean reward -52.46897580858551 --- Top 20 reward is 148.30977830401548
Round 184: Current mean reward -51.63500649138921 --- Top 20 reward is 148.59714080294034
Round 185: Current mean reward -52.789935330220416 --- Top 20 reward is 147.15127298361148
Round 186: Current mean reward -56.8417518896267 --- Top 20 reward is 136.4630500867781
Round 187: Current mean reward -51.87729325009027 --- Top 20 reward is 137.99982599493626
Round 188: Current mean reward -52.7823977287062 --- Top 20 reward is 149.7384784175276
Round 189: Current mean reward -54.958279604468906 --- Top 20 reward is 114.38947383976702
Round 190: Current mean reward -55.91551631025822 --- Top 20 reward is 129.6880908517439
Round 191: Current mean reward -44.84338312852009 --- Top 20 reward is 173.13008968522823
Round 192: Current mean reward -47.836812801418496 --- Top 20 reward is 150.18610210178437
Round 193: Current mean reward -45.220643580342504 --- Top 20 reward is 163.19761137707331
Round 194: 

In [1]:
# import gym
# import random
# import torch
# import torchvision
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# import numpy as np
# import time
# import matplotlib.pyplot as plt

# class Net(nn.Module):
#     def __init__(self, D_in, H, D_out):
#         super(Net, self).__init__()
#         self.linear1 = torch.nn.Linear(D_in, H)
#         self.linear2 = torch.nn.Linear(H, D_out)
    
#     def forward(self, x):
#         h_relu = self.linear1(x).clamp(min=0)
#         y_pred = self.linear2(h_relu)
#         #return F.softmax(y_pred, dim=0)
#         return y_pred
    
# def train(network, optimizer, criterion, top20):
#     network.train()
#     counter = 0
    
#     for episode in top20:
#         for state, target in episode:
#             optimizer.zero_grad()
#             target = torch.tensor([target])
#             output = network(torch.from_numpy(np.array(state))).unsqueeze(0)
#             #if counter < 5:
#             #    print(F.softmax(output), target)
#             #    counter += 1
#             #print(output.shape)
#             #print(torch.tensor([target]))
#             loss = criterion(output, target)
#             loss.backward()
#             optimizer.step()
            
            
# def choose_action(output, epsilon):
#     if random.random() > epsilon:
#         return np.argmax(output.detach().numpy(), axis=0)
#     else:
#         return random.randint(0, 3)

        

# def main():
#     env = gym.make("LunarLander-v2")
#     is_monitor_active = False
    

#     no_of_actions = env.action_space.n
#     total_reward = 0
#     state = env.reset()
#     done = False
    
#     num_episodes = 100
#     num_steps = 500
#     episodes = []
#     plot_data = []
    
#     network = Net(8, 256, 4)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.SGD(network.parameters(), momentum=0.9, lr=0.00015)
    
#     while(len(episodes) == 0 or episodes[:,1].mean() <= 100):
#     #for i in range(100):
#         episodes = []
#         for e in range(num_episodes):
#             episode = []
#             reward_e = 0
#             for s in range(num_steps):
                
#             #while (not done):
#                 # probabilities for action
#                 output = F.softmax(network(torch.from_numpy(np.array(state))))
#                 action =  random.choices([0,1,2,3], output, k=1)[0]
#                 episode.append((state, action))
#                 state, reward, done, _ = env.step(action)
#                 reward_e += reward
#                 last_reward = reward

#                 if done:
#                     #reward_e /= s
#                     break

#             episodes.append((episode, reward_e, last_reward))
#             if is_monitor_active:
#                 time.sleep(0.1)
#             state = env.reset()
#             done = False
            

#         episodes = np.array(episodes)
#         mean = episodes[:,1].mean()
#         #if (mean > 50) and (not is_monitor_active):
#         #    env = gym.wrappers.Monitor(env, "recording_lunar", force=True)
#         #    time.sleep(1)
#         #    state = env.reset()
#         #    is_monitor_active = True
            
#         plot_data.append(mean)
#         print(mean)

#         episodes_sorted = episodes[episodes[:,1].argsort()]
#         top20 = episodes_sorted[80:]
#         #print(top20[:,1])
#         train(network, optimizer, criterion, top20[:,0])
    
#     plt.figure()
#     plt.xlabel("episode")
#     plt.ylabel("mean reward")
#     plt.plot(plot_data, label="100 Hidden layer, lr=0.1")
#     plt.legend()
#     plt.show()

# main()

  output = F.softmax(network(torch.from_numpy(np.array(state))))
  episodes = np.array(episodes)


-227.72416459975923
-209.233020043464
-204.56590027019527
-170.79569554088926
-152.82717445588935
-139.21471022686734
-118.38977043713042
-130.5544844234603
-134.4211325287241
-152.40297967024617
-115.02294271443172
-102.54474385845488
-125.18068119522182
-129.22903614830776
-103.88511584823539
-108.69902174343261
-132.98083401895497
-120.96077481169397
-120.57095849999929
-117.8733275001573
-98.46196502457244
-126.47690972070365
-89.78581705656221
-82.93342066098306
-83.78237987889425
-85.21313299925829
-76.81274942461646
-98.76807390744113
-130.7047993853035
-98.48865213457289
-92.80659844894105
-98.85247024502522
-99.81275231452179
-111.12236223272313
-134.9168058589075
-107.21057835404916
-98.85592396983971
-121.26695026725582
-115.78625725817638
-118.2333395447468
-115.54881781769845
-114.10614773277187
-98.14885504535388
-124.38127546428949
-137.37627626838676
-134.31500331282547
-143.55041838954614
-160.6299493188343
-136.95609998127412
-141.33794358801933
-124.43144974202355
-1

KeyboardInterrupt: 