In [None]:
!pip install -q https://github.com/PatrickKudo/flappy-bird-gymnasium/archive/refs/heads/main.zip
!pip install -q optuna

[2K     [32m\[0m [32m41.4 MB[0m [31m14.4 MB/s[0m [33m0:00:03[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flappy-bird-gymnasium (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile run_trial.py
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

import math
import random
import time
from collections import namedtuple, deque
from itertools import count
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import pygame
import gymnasium
import flappy_bird_gymnasium
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
import argparse
from statistics import mean, stdev


class DuelingDQN(nn.Module):
    """Dueling Q-network with parallel 1D-convolutional and dense feature paths.

    The flat observation is processed twice: once by a small Conv1d stack and
    once by a fully connected layer. Both feature vectors are concatenated and
    fed to separate value and advantage heads, which are merged into Q-values
    as ``Q = V + (A - mean(A))``.

    Args:
        input_channels: Number of channels the observation is split into
            before the convolutions.
        input_length: Number of features per channel.
        action_space: Number of discrete actions (width of the Q-value output).

    Raises:
        ValueError: If ``input_length`` is too short for the two unpadded
            convolutions (kernel sizes 3 and 2) to produce any output.
    """

    def __init__(self, input_channels, input_length, action_space):
        super().__init__()

        # Two unpadded, stride-1 convolutions (kernel 3, then kernel 2) shorten
        # the sequence by 2 and then by 1.
        conv_output_length = input_length - 3
        if conv_output_length < 1:
            raise ValueError(
                f"input_length must be at least 4, got {input_length}"
            )
        self.input_channels = input_channels

        # 1D Convolutional feature extractor
        self.conv1d_features = nn.Sequential(
            nn.Conv1d(input_channels, 8, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv1d(8, 16, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # Fully Connected feature extractor
        self.dense_features = nn.Sequential(
            nn.Linear(input_length * input_channels, 128),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

        # Combined feature size: flattened conv output (16 channels) plus the
        # 128 dense features. Derived from input_length instead of assuming 12
        # (which gives 16 * 9 = 144).
        combined_feature_size = 16 * conv_output_length + 128

        # Value stream
        self.value_stream = nn.Sequential(
            nn.Linear(combined_feature_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

        # Advantage stream
        self.advantage_stream = nn.Sequential(
            nn.Linear(combined_feature_size, 128),
            nn.ReLU(),
            nn.Linear(128, action_space)
        )

    def forward(self, state):
        """Return Q-values of shape ``(batch, action_space)`` for ``state``.

        ``state`` holds one observation per row; everything after the batch
        dimension is flattened and must contain
        ``input_channels * input_length`` values.
        """
        batch_size = state.size(0)
        flat_state = state.reshape(batch_size, -1)
        # (batch, channels, length) layout expected by Conv1d; for a single
        # channel this is the same as unsqueeze(1).
        conv_input = flat_state.view(batch_size, self.input_channels, -1)

        conv_features = self.conv1d_features(conv_input)
        dense_features = self.dense_features(flat_state)
        combined_features = torch.cat((conv_features, dense_features), dim=1)

        value = self.value_stream(combined_features)
        advantages = self.advantage_stream(combined_features)
        # from the paper: Q(s, a; θ, α, β) = V (s; θ, β) + (A(s, a; θ, α) − mean(A(s, a'; θ, α))).
        qvals = value + (advantages - advantages.mean(dim=1, keepdim=True))
        return qvals


# Replay memory
class ReplayMemory:
    """Bounded FIFO store of transitions used for experience replay."""

    def __init__(self, capacity):
        # A deque with maxlen discards its oldest entry once `capacity` is hit.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from the given fields and append it."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored items chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.memory)



# Select SDL's "dummy" video and audio drivers before pygame starts, so it
# does not try to open a real window or sound device.
os.environ.update({
    "SDL_VIDEODRIVER": "dummy",
    "SDL_AUDIODRIVER": "dummy",
})
pygame.init()

# Flappy Bird environment: silent, frames rendered as RGB arrays, and the
# non-lidar observation vector.
env = gymnasium.make(
    "FlappyBird-v0",
    audio_on=False,
    render_mode="rgb_array",
    use_lidar=False,
)

# Only pull in IPython's display helper when an inline backend is active
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Turn on matplotlib interactive mode
plt.ion()

# Prefer the GPU when one is visible to torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("cuda is available: ", torch.cuda.is_available())

# One step of experience: (state, action) together with the resulting
# next state and reward
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)



# Command-line interface: every tunable hyperparameter is a required flag
parser = argparse.ArgumentParser()
for flag, arg_type, help_text in (
    ("--batch_size", int, "The batch size for the model training"),
    ("--lr", float, "Learning rate"),
    ("--gamma", float, "Gamma value used in optimizations"),
    ("--tau", float, "Target network update rate"),
):
    parser.add_argument(flag, type=arg_type, help=help_text, required=True)

args = parser.parse_args()

# Hyperparameters supplied by the caller
BATCH_SIZE, GAMMA = args.batch_size, args.gamma
LR, TAU = args.lr, args.tau

# Fixed epsilon-greedy schedule: start value, floor, and decay constant
# (measured in action-selection steps)
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 1000

# Echo the configuration so it appears in the run log
print("Batch size:", BATCH_SIZE)
print("Learning rate:", LR)
print("Gamma:", GAMMA)
print("Tau:", TAU)

# Get number of actions from gym action space
n_actions = env.action_space.n
# Reset once to read the size of the observation vector
state, info = env.reset()
n_observations = len(state)

# The observation is fed to the network as a single channel
input_channels = 1
# Take the feature count from the environment instead of hard-coding 12, so
# the network input always matches what env.reset() actually returns.
input_length = n_observations
policy_net = DuelingDQN(input_channels, input_length, n_actions).to(device)
target_net = DuelingDQN(input_channels, input_length, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
# The target network is only used to compute bootstrap targets and is never
# optimised directly; eval mode disables its dropout so those targets are
# deterministic. load_state_dict() does not change the mode, so this persists
# across the soft updates.
target_net.eval()

# Define optimizer
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
# Set replay limit
memory = ReplayMemory(2000)

# Initialize step counter (drives the epsilon decay in select_action)
steps_done = 0

def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Epsilon decays exponentially from ``EPS_START`` towards ``EPS_END`` as the
    global ``steps_done`` counter grows; the counter is incremented on every
    call. Returns a ``(1, 1)`` long tensor holding the action index.
    """
    global steps_done
    roll = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if roll <= eps_threshold:
        # Explore: let the environment's action space draw a random action
        random_action = env.action_space.sample()
        return torch.tensor([[random_action]], device=device, dtype=torch.long)

    # Exploit: take the action with the largest predicted Q-value
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1).indices.view(1, 1)


episode_durations = []

def optimize_model():
    """Run one optimisation step of the policy network on a replay minibatch.

    Does nothing until the replay memory holds at least ``BATCH_SIZE``
    transitions. Otherwise it samples a batch, builds the one-step TD target
    ``reward + GAMMA * max_a Q_target(next_state, a)`` (the bootstrap term is
    zero for transitions whose ``next_state`` is ``None``), and minimises the
    Huber loss between that target and the policy network's Q-value for the
    action actually taken.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose a list of Transitions into one Transition of tuples
    batch = Transition(*zip(*transitions))

    # Transitions stored with next_state=None ended the episode
    non_final_next_states = [s for s in batch.next_state if s is not None]
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device=device, dtype=torch.bool)

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a_t) for the actions that were actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat raises on an empty list, so skip the bootstrap term entirely
    # when every sampled transition is terminal (the values stay zero).
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(
                torch.cat(non_final_next_states)).max(1).values
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss (functional form; same defaults as nn.SmoothL1Loss())
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


# Determine episodes based on GPU availability: a full run on GPU, a short
# smoke run on CPU
num_episodes = 2000 if torch.cuda.is_available() else 50

# Start training loop
for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        # Pin the dtype: without it the tensor type follows whatever scalar
        # type env.step() returns (an int would give int64), leaving the
        # replay buffer with mixed-dtype rewards.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        done = terminated or truncated

        # A terminal transition has no successor state to bootstrap from
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            # t is zero-based, so the episode lasted t + 1 steps
            episode_durations.append(t + 1)
            break

# Training is finished; release the environment's resources
env.close()

# Summarise episode lengths over the whole training run
print("std: ", stdev(episode_durations))
duration_mean = mean(episode_durations)
print("mean: ", duration_mean)
print("maxL: ", max(episode_durations))

# Write the mean to a file for the caller to read back as this run's score
with open('dqcnn_results.txt', 'w') as results_file:
    results_file.write(str(duration_mean))

Writing run_trial.py


In [None]:
import optuna
import gc
from statistics import mean, stdev


def objective(trial):
    # Define Hyperparameters
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    BATCH_SIZE = trial.suggest_categorical('batch_size', [64, 128, 256])
    GAMMA = trial.suggest_float('gamma', 0.88, 0.99)
    TAU = trial.suggest_float('tau', 0.001, 0.01)

    # Run python script with parameters
    !python run_trial.py --lr $lr --batch_size $BATCH_SIZE --gamma $GAMMA --tau $TAU
    gc.collect()

    with open('dqcnn_results.txt', 'r') as f:
        mean_reward = float(f.read())
    print(f"Mean reward: {mean_reward}")
    return mean_reward


In [None]:
# Create an in-memory study that maximises the value returned by `objective`
# and run 20 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Report the best trial's score and the hyperparameters that produced it
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Save the full trial history as CSV, then copy it to Google Drive.
# NOTE(review): the copy assumes Drive is already mounted at /content/drive;
# no mount call is visible in this notebook section.
study.trials_dataframe().to_csv('dqcnn_tuna_results.csv')
!cp dqcnn_tuna_results.csv "/content/drive/MyDrive/MSAI/spr24/RL/"
# Visualize the optimisation history; kept as the cell's last expression so
# the notebook displays the returned figure.
optuna.visualization.plot_optimization_history(study)

[I 2024-04-24 02:59:44,160] A new study created in memory with name: no-name-65d6cdfe-58cc-45db-95a1-21559fd0c74b


pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 256
Learning rate: 1.2054082240560545e-05
Gamma: 0.9664969978823046
Tau: 0.007326869282989046
std:  14.025398377580043
mean:  58.3675
maxL:  122


[I 2024-04-24 03:16:17,531] Trial 0 finished with value: 58.3675 and parameters: {'lr': 1.2054082240560545e-05, 'batch_size': 256, 'gamma': 0.9664969978823046, 'tau': 0.007326869282989046}. Best is trial 0 with value: 58.3675.


Mean reward: 58.3675
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 128
Learning rate: 0.00019908739372021862
Gamma: 0.9854873185924804
Tau: 0.003087789523396862
std:  42.3303753123811
mean:  81.2525
maxL:  350


[I 2024-04-24 03:35:22,866] Trial 1 finished with value: 81.2525 and parameters: {'lr': 0.00019908739372021862, 'batch_size': 128, 'gamma': 0.9854873185924804, 'tau': 0.003087789523396862}. Best is trial 1 with value: 81.2525.


Mean reward: 81.2525
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 256
Learning rate: 0.00010207719094496091
Gamma: 0.9133164525292045
Tau: 0.0031290697553260152
std:  38.883551857469584
mean:  76.3645
maxL:  312


[I 2024-04-24 03:56:36,269] Trial 2 finished with value: 76.3645 and parameters: {'lr': 0.00010207719094496091, 'batch_size': 256, 'gamma': 0.9133164525292045, 'tau': 0.0031290697553260152}. Best is trial 1 with value: 81.2525.


Mean reward: 76.3645
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 64
Learning rate: 0.0009811163667618748
Gamma: 0.9585385406069935
Tau: 0.007998939936210122
std:  42.91147859544837
mean:  82.5985
maxL:  391


[I 2024-04-24 04:13:43,705] Trial 3 finished with value: 82.5985 and parameters: {'lr': 0.0009811163667618748, 'batch_size': 64, 'gamma': 0.9585385406069935, 'tau': 0.007998939936210122}. Best is trial 3 with value: 82.5985.


Mean reward: 82.5985
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 64
Learning rate: 4.343443847000902e-05
Gamma: 0.900754602159937
Tau: 0.009253021977158622
std:  26.326240772784352
mean:  69.1615
maxL:  216


[I 2024-04-24 04:27:59,927] Trial 4 finished with value: 69.1615 and parameters: {'lr': 4.343443847000902e-05, 'batch_size': 64, 'gamma': 0.900754602159937, 'tau': 0.009253021977158622}. Best is trial 3 with value: 82.5985.


Mean reward: 69.1615
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 256
Learning rate: 0.00015187711625010592
Gamma: 0.893116210055214
Tau: 0.007987394057849175
std:  31.98478559025568
mean:  73.002
maxL:  235


[I 2024-04-24 04:48:53,574] Trial 5 finished with value: 73.002 and parameters: {'lr': 0.00015187711625010592, 'batch_size': 256, 'gamma': 0.893116210055214, 'tau': 0.007987394057849175}. Best is trial 3 with value: 82.5985.


Mean reward: 73.002
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 64
Learning rate: 0.0007767394880339421
Gamma: 0.8914419299993633
Tau: 0.004163987622645837
std:  32.03157795040695
mean:  74.255
maxL:  348


[I 2024-04-24 05:04:46,201] Trial 6 finished with value: 74.255 and parameters: {'lr': 0.0007767394880339421, 'batch_size': 64, 'gamma': 0.8914419299993633, 'tau': 0.004163987622645837}. Best is trial 3 with value: 82.5985.


Mean reward: 74.255
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 128
Learning rate: 0.0005632499188606747
Gamma: 0.8837067783013438
Tau: 0.002346034173940196
std:  13.98044155679861
mean:  54.4555
maxL:  202


[I 2024-04-24 05:17:31,845] Trial 7 finished with value: 54.4555 and parameters: {'lr': 0.0005632499188606747, 'batch_size': 128, 'gamma': 0.8837067783013438, 'tau': 0.002346034173940196}. Best is trial 3 with value: 82.5985.


Mean reward: 54.4555
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 64
Learning rate: 0.0001488330027418392
Gamma: 0.9229620679133506
Tau: 0.006140939340181215
std:  38.94709627050458
mean:  79.6055
maxL:  348


[I 2024-04-24 05:33:47,734] Trial 8 finished with value: 79.6055 and parameters: {'lr': 0.0001488330027418392, 'batch_size': 64, 'gamma': 0.9229620679133506, 'tau': 0.006140939340181215}. Best is trial 3 with value: 82.5985.


Mean reward: 79.6055
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 128
Learning rate: 1.2139341639058215e-05
Gamma: 0.9767927980452539
Tau: 0.004191595375387928
std:  14.667376383149868
mean:  56.194
maxL:  199


[I 2024-04-24 05:46:50,945] Trial 9 finished with value: 56.194 and parameters: {'lr': 1.2139341639058215e-05, 'batch_size': 128, 'gamma': 0.9767927980452539, 'tau': 0.004191595375387928}. Best is trial 3 with value: 82.5985.


Mean reward: 56.194
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 64
Learning rate: 0.0004056184246135034
Gamma: 0.949541160481272
Tau: 0.009253431457415097
std:  35.938634694877685
mean:  73.927
maxL:  404


[I 2024-04-24 06:02:06,643] Trial 10 finished with value: 73.927 and parameters: {'lr': 0.0004056184246135034, 'batch_size': 64, 'gamma': 0.949541160481272, 'tau': 0.009253431457415097}. Best is trial 3 with value: 82.5985.


Mean reward: 73.927
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 128
Learning rate: 0.000258004402277914
Gamma: 0.9887658357052606
Tau: 0.0013554696839748678
std:  38.223798939962805
mean:  73.082
maxL:  407


[I 2024-04-24 06:19:15,019] Trial 11 finished with value: 73.082 and parameters: {'lr': 0.000258004402277914, 'batch_size': 128, 'gamma': 0.9887658357052606, 'tau': 0.0013554696839748678}. Best is trial 3 with value: 82.5985.


Mean reward: 73.082
pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
cuda is available:  True
Batch size: 128
Learning rate: 4.26817143166533e-05
Gamma: 0.9512479275085138
Tau: 0.005420801907344046
