In [None]:
# Load the Drive helper and mount
# Colab-only setup: mounts Google Drive at /content/drive. force_remount=True
# asks Colab to mount again even if a mount already exists at that path.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Switch the working directory to the project folder on Drive. The later cell
# uses paths relative to it: it reads ./demos, imports the src package, and
# writes the checkpoint files into the current directory.
%cd /content/drive/MyDrive/Documents/Fulbright\ Application\ 2020-2021/Courses/'Spring Semester 2024'/'Deep Decision and Reinforcement Learning'/project
# List the demonstration files as a quick check that the folder is reachable.
%ls demos

Mounted at /content/drive
/content/drive/MyDrive/Documents/Fulbright Application 2020-2021/Courses/Spring Semester 2024/Deep Decision and Reinforcement Learning/project
circle_clock.json         never_seen.json  recover_3.json  recover_6.json
circle_counterclock.json  recover_1.json   recover_4.json  snake_2.json
eight.json                recover_2.json   recover_5.json  snake.json


In [None]:
import os
import json
from enum import Enum

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

from src.agent import BehaviorCloningModel, Constants
from src.ppo_agent import PPOModel
from src.dataset import Dataset

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Gather every demonstration episode into two flat tensors (observations and
# actions). Per-episode tensors are collected in Python lists and concatenated
# once at the end: calling torch.cat inside the loop re-copies everything
# accumulated so far for every episode, which is quadratic in the data size.
obs_chunks = []
action_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the sample order
# stable across runs and machines.
for file in sorted(os.listdir("./demos")):
    # Skip hidden entries and names starting with "*".
    if file.startswith(("*", ".")):
        continue
    with open(f"./demos/{file}", "r") as f:
        data = json.load(f)

    for episode in data:
        # episode[0] holds the observations and episode[1] the actions; trim
        # both to the shorter length so each observation is paired with an action.
        min_length = min(len(episode[0]), len(episode[1]))
        if min_length == 0:
            continue

        obs = torch.tensor(episode[0][:min_length], dtype=torch.float32)
        action = torch.tensor(episode[1][:min_length], dtype=torch.float32)
        obs_chunks.append(obs)
        action_chunks.append(action)

# With no usable episodes, fall back to the empty tensors the accumulators
# previously started from, so downstream code sees the same values as before.
obs_list = torch.cat(obs_chunks) if obs_chunks else torch.tensor([])
action_list = torch.cat(action_chunks) if action_chunks else torch.tensor([])

# Split / batching settings, named so they can be tuned in one place.
TRAIN_FRACTION = 0.8  # share of samples used for training; the rest is held out
BATCH_SIZE = 32
SPLIT_SEED = 0  # fixes the train/held-out partition so losses are comparable across runs

# Wrap the concatenated demonstrations in the project Dataset.
dataset = Dataset(Constants.INPUT_SIZE.value, obs_list, action_list, Constants.NUM_HISTORY.value)
train_size = int(TRAIN_FRACTION * len(dataset))
test_size = len(dataset) - train_size

# A dedicated, seeded generator makes the partition reproducible without
# touching the global RNG state used elsewhere (e.g. for weight initialisation).
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training batches are shuffled; the held-out loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model, loss function, optimizer and learning-rate scheduler.
#
# Earlier experiments built the network differently; to switch back, replace
# the PPOModel construction below with one of:
#   MultiHistoryNetwork(Constants.INPUT_SIZE.value, Constants.HIDDEN_SIZE.value,
#                       Constants.OUTPUT_SIZE.value, Constants.NUM_HISTORY.value)
#   BehaviorCloningModel(Constants.NUM_HISTORY.value, Constants.INPUT_SIZE.value,
#                        Constants.OUTPUT_SIZE.value)
model = PPOModel(
    Constants.NUM_HISTORY.value,
    Constants.INPUT_SIZE.value,
    Constants.OUTPUT_SIZE.value,
)
model = model.to(device)

# Mean-squared-error criterion, moved to the training device.
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

# Adam over all model parameters, at the learning rate configured in Constants.
optimizer = torch.optim.Adam(model.parameters(), lr=Constants.lr.value)

# Reduce the learning rate when the monitored value (to be minimised) has not
# improved for more than `patience` consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=2)
# train the model
def _batch_nll(policy, obs, action):
    """Return the negative log-likelihood of `action` under `policy(obs)`.

    The value is summed over every element of the batch. `policy(obs)` must
    return an object exposing `log_prob(action)`; nothing else about it is
    relied upon here.
    """
    return -policy(obs).log_prob(action).sum()


iterator = tqdm(range(1, Constants.EPOCHS.value + 1), total=Constants.EPOCHS.value, desc="Training")

for epoch in iterator:
    model.train()
    iterator.set_description("Training")
    for obs, action in train_dataloader:
        optimizer.zero_grad()
        obs = obs.to(device)
        action = action.to(device)
        # Mean NLL per sample. The previous `.sum()` followed by `.mean()`
        # reduced to a scalar first, so the mean was a no-op and the loss
        # (and its gradient) scaled with the number of samples in the batch.
        loss = _batch_nll(model, obs, action) / action.shape[0]
        loss.backward()
        optimizer.step()

    # evaluate the model
    iterator.set_description("Evaluating")
    model.eval()
    total_nll = 0.0
    num_samples = 0
    with torch.no_grad():
        for obs, action in test_dataloader:
            obs = obs.to(device)
            action = action.to(device)
            total_nll += _batch_nll(model, obs, action).item()
            num_samples += action.shape[0]
    # Average over samples rather than over batches so a smaller final batch is
    # weighted correctly; max(..., 1) avoids dividing by zero on an empty loader.
    test_loss = total_nll / max(num_samples, 1)
    iterator.set_postfix(epoch=epoch, loss=test_loss)
    # The scheduler monitors the held-out loss to decide when to lower the learning rate.
    scheduler.step(test_loss)

# Persist the trained weights and the optimizer state in the working directory.
# The device name is embedded in each file name.
model_checkpoint_path = f"pretrained_model_dict_{device}.pt"
optimizer_checkpoint_path = f"pretrained_optimizer_dict_{device}.pt"

torch.save(model.state_dict(), model_checkpoint_path)
torch.save(optimizer.state_dict(), optimizer_checkpoint_path)

cuda


Evaluating: 100%|██████████| 40/40 [08:31<00:00, 12.78s/it, epoch=40, loss=-118]
