In [61]:
import gym
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import random

In [62]:
def get_env_data(n):
    env = gym.make("ALE/Freeway-v5", render_mode="rgb_array", obs_type="ram", difficulty=1, mode=3)
    observation = env.reset()

    df = pd.DataFrame([observation])
    # Actions: 0: nichts, 1: up, 2: down

    actions = []

    for i in range(n):
        action = get_action_sample()
        if i != 0:
            df.loc[len(df)] = observation
        actions.append(action)
        observation, reward, done, info = env.step(action)
        if done:
            observation = env.reset()
        if i % (n / 100) == 0:
            print(f"{(i / n) * 100}%")
    env.close()
    df = df[[14, 103, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117]]
    df["actions"] = actions
    return df

In [63]:
def get_action_sample():
    x = random.randint(0, 101)
    if x < 90:
        return 1
    if x < 97:
        return 2
    return 0
GAME_START = [6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# NN

In [68]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(14, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 13),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        #x = torch.FloatTensor(x)
        return x

In [21]:
def get_dfs(df):
    df = df.tail(len(df) - 1)

    dfY = df.copy()
    dfY.drop(["actions"], axis=1, inplace=True)

    dfY = dfY.drop(dfY.index[[0]])
    df = df.drop(df.index[[len(df) - 1]])
    dfY.index = df.index

    df = df.reset_index(drop=True)
    dfY = dfY.reset_index(drop=True)
    return df, dfY

In [69]:
def train(X, y, model, loss_fn, optimizer):
    model.train()
    loss_sum = 0
    batch_size = 32
    for i in range(len(y)):
        X_data = list(X.iloc[i])
        y_data = list(y.iloc[i])
        X_data = torch.tensor(X_data).cuda()
        y_data = torch.tensor(y_data).cuda()

        pred = model.forward(X_data.float())

        loss = loss_fn(pred.to(torch.float32), y_data.to(torch.float32))
        loss_sum += loss.item()

        if i % batch_size == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if i % 1000 == 0:
            print(f"loss: {loss_sum / 1000}")
            loss_sum = 0


def test(X, y, model, loss_fn):
    loss_sum = 0

    model.eval()
    with torch.no_grad():
        for i in range(len(y)):
            X_data = list(X.iloc[i])
            y_data = list(y.iloc[i])
            X_data = torch.tensor(X_data).cuda()
            y_data = torch.tensor(y_data).cuda()

            pred = model.forward(X_data.float())
            loss = loss_fn(pred, y_data)
            loss_sum += loss

    loss_sum /= len(y)

    print(f"Avg loss: {loss_sum}!")
    return loss_sum

In [70]:
def cv_model(X, y):
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=69)

    model = Net().cuda()
    for layer in model.children():
        if hasattr(layer, "reset_parameters"):
            layer.reset_parameters()

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    last_test_avg = 10000
    test_avg = 0
    overfit = 0

    epochs = 20000

    avg_losses = []

    avg_losses.append(test(test_X, test_y, model, loss_fn))

    for t in range(epochs):
        print(f"Epoch {t + 1}-----------------------------")
        train(train_X, train_y, model, loss_fn, optimizer)
        test_avg = test(test_X, test_y, model, loss_fn)
        avg_losses.append(test_avg)
        if test_avg > last_test_avg:
            overfit += 1
        else:
            overfit = 0
            last_test_avg = test_avg
        if overfit >= 20:
            print(f"Epoche: {t}")
            break
    torch.save(model, f"game_model/game_model_1")
    return avg_losses
print("done")

done


In [71]:
%%time
n_data = [4096]
df_losses = pd.DataFrame()

for n in n_data:
    df = get_env_data(n)
    df, dfY = get_dfs(df)

    avg_losses = cv_model(df, dfY)

    x = []

    for i in avg_losses:
        x.append(i.item())
x

0.0%
Avg loss: 6728.08837890625!
Epoch 1-----------------------------
loss: 6.65534228515625
loss: 6667.668668241501
loss: 6618.763896484375
loss: 6539.144665039063
Avg loss: 6430.09375!
Epoch 2-----------------------------
loss: 6.404578125
loss: 6315.120605993271
loss: 6140.1093515625
loss: 5918.372184082031
Avg loss: 5678.767578125!
Epoch 3-----------------------------
loss: 5.74822265625
loss: 5468.597339950562
loss: 5098.823958251953
loss: 4707.692186279297
Avg loss: 4380.43017578125!
Epoch 4-----------------------------
loss: 4.5786376953125
loss: 4159.171546508789
loss: 3821.880580566406
loss: 3496.618873535156
Avg loss: 3311.83154296875!
Epoch 5-----------------------------
loss: 3.307545166015625
loss: 3201.3200161132813
loss: 3100.582257446289
loss: 3040.185616821289
Avg loss: 2997.701904296875!
Epoch 6-----------------------------
loss: 2.68127294921875
loss: 2950.5815877685545
loss: 2932.5497436523438
loss: 2915.6361466064454
Avg loss: 2896.9228515625!
Epoch 7--------------

KeyboardInterrupt: 

In [None]:
plt.plot(x)

# Test

In [75]:
x = [6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(x)

13

In [141]:
from gym import spaces

class CustomEnv(gym.Env):
    def __init__(self, model):
        self.model = model
        self.start = [6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.state = self.start
        self.score = 0
        self.observation_space = spaces.Box(low=0, high=210, shape=(1,13))
        self.action_space = spaces.Discrete(3)

    def step(self, action):
        self.state.append(action)
        self.state = torch.tensor(self.state).cuda()
        self.state = self.model.forward(self.state.float())
        self.state = self.state.tolist()
        reward = 0
        reward += self.state[0]
        if self.state[0] == 0:
            self.state[0] += 0.1
        reward -= 1000/self.state[0]
        if 90 <= self.state[2] <= 100:
            reward -= 10000
        if self.state[1] - 0.5 > self.score:
            self.score = self.state
            reward += 10000

        return self.state, reward, False, {}

    def reset(self):
        self.state = self.start
        self.score = 0
        return self.state

In [142]:
spaces.Box(low=0, high=210, shape=(1, 13)).sample()

array([[138.16176 ,  34.262993, 157.0522  ,  86.56509 ,  61.958645,
         78.408966,  39.83938 , 153.72751 , 208.01186 , 111.84863 ,
         92.11368 , 209.64507 ,  23.819668]], dtype=float32)

In [144]:
model = torch.load("game_model/game_model_1").cuda()

env = CustomEnv(model)
env.reset()

env.observation_space

Box([[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], [[210. 210. 210. 210. 210. 210. 210. 210. 210. 210. 210. 210. 210.]], (1, 13), float32)

In [145]:
from sb3_contrib import TRPO

model = torch.load("game_model/game_model_1").cuda()

env = CustomEnv(model)
trpo = TRPO("MlpPolicy", env, gamma=0.99, verbose=1)
trpo.learn(total_timesteps=10_000, log_interval=4)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------------------
| time/                     |           |
|    fps                    | 462       |
|    iterations             | 4         |
|    time_elapsed           | 17        |
|    total_timesteps        | 8192      |
| train/                    |           |
|    explained_variance     | -1.19e-07 |
|    is_line_search_success | 1         |
|    kl_divergence_loss     | 0.00738   |
|    learning_rate          | 0.001     |
|    n_updates              | 3         |
|    policy_objective       | 0.0112    |
|    value_loss             | 1.71e+10  |
-----------------------------------------


<sb3_contrib.trpo.trpo.TRPO at 0x18925b250a0>

In [264]:
model = torch.load("game_model/game_model_1")
x = torch.tensor(list(pd.read_csv("gamedata/start.csv").drop(["Unnamed: 0.1", "Unnamed: 0", "actions"], axis=1).loc[0]))
#model.forward(torch.tensor(x.tolist().append(2)))
x = x.tolist()
x.append(2)
x = torch.tensor(x)
x = x.type(torch.IntTensor)

model.forward(x)
x

RuntimeError: expected scalar type Int but found Float