In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, namedtuple

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class NeuralNetwork(nn.Module):
    def __init__(self, observation_space,action_space, learningRate):
        super().__init__()
        self.layer_1 = nn.Linear(observation_space, 256)
        self.layer_2 = nn.Linear(256, 256)
        self.layer_3 = nn.Linear(256, action_space)
        self.optimizer = optim.Adam(self.parameters(), lr=learningRate)
        self.loss = nn.MSELoss()
        self.to(device)

    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        return self.layer_3(x)


class ReplayMemory:
    def __init__(self, size):
        self.memory = deque([],maxlen=size)

    def add(self, transition):
        self.memory.append(transition)

    def sample(self, batchSize):
        return random.sample(self.memory,batchSize)

    def __len__(self):
        return len(self.memory)


class DQN:
    def __init__(self, env, hyperparams, nnModel):
        self.env = env
        self.epsilon = hyperparams.epsilon
        self.epsilonMax = self.epsilon
        self.epsilonMin = hyperparams.epsilonMin
        self.epsilonDecay = hyperparams.epsilonDecay
        self.discountFactor = hyperparams.discountFactor
        self.updateFrequency = hyperparams.targetNetworkUpdateFrequency
        self.batchSize = hyperparams.batchSize
        self.episodes = hyperparams.episodes
        self.action_space = env.action_space.n
        self.learningRate = hyperparams.learningRate
        self.observation_space = env.observation_space.shape[0]
        self.memory = ReplayMemory(hyperparams.memorySize)
        self.policyNetwork = nnModel(self.observation_space, self.action_space, self.learningRate)
        self.targetNetwork = nnModel(self.observation_space, self.action_space, self.learningRate)
        self.targetNetwork.load_state_dict(self.policyNetwork.state_dict())
        self.iterations = 0

    def getAction(self, state):
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        state = torch.tensor(state).float().detach()
        state = state.to(device)
        state = state.unsqueeze(0)
        qValues = self.policyNetwork(state)
        return torch.argmax(qValues).item()

    def optimize(self):
        batchSize = self.batchSize
        if len(self.memory) > batchSize:
            minibatch = np.array(self.memory.sample(batchSize))
            states = minibatch[:, 0].tolist()
            actions = minibatch[:, 1].tolist()
            rewards = minibatch[:, 2].tolist()
            nextStates = minibatch[:, 3].tolist()
            dones = minibatch[:, 4].tolist()

            states = torch.tensor(states, dtype=torch.float32).to(device)
            actions = torch.tensor(actions, dtype=torch.long).to(device)
            rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
            nextStates = torch.tensor(
                nextStates, dtype=torch.float32).to(device)
            dones = torch.tensor(dones, dtype=torch.bool).to(device)
            indices = np.arange(batchSize, dtype=np.int64)

            qValues = self.policyNetwork(states)
            qDotValues = None
            with torch.no_grad():
                qDotValues = self.targetNetwork(nextStates)

            predictedValues = qValues[indices, actions]
            predictedQDotValues = torch.max(qDotValues, dim=1)[0]

            targetValues = rewards + self.discountFactor * predictedQDotValues * dones

            loss = self.policyNetwork.loss(targetValues, predictedValues)
            self.policyNetwork.optimizer.zero_grad()
            loss.backward()
            self.policyNetwork.optimizer.step()

        

    def train(self):
        env = self.env
        observation_space = self.observation_space
        bestReward = 0
        bestAverageReward = 0
        rewards = []
        averageRewards = []
        for i in range(1, self.episodes):
            state, info = env.reset()
            state = np.reshape(state, [1, observation_space])
            totalRewardPerEpisode = 0
            steps = 0
            while True:
                action = self.getAction(state)
                nextState, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                nextState = np.reshape(nextState, [1, observation_space])
                self.memory.add((state[0], action, reward, nextState[0], 1 - done))
                self.optimize()
                state = nextState
                totalRewardPerEpisode += reward
                
                diff = self.epsilonMax - self.epsilonMin
                decayed_epsilon = self.epsilonMin + diff * \
                    np.exp((-1 * self.iterations) / self.epsilonDecay)
                self.iterations += 1
                self.epsilon = max(self.epsilonMin, decayed_epsilon)

                steps += 1
                if steps % 10 == 0:
                    self.targetNetwork.load_state_dict(
                        self.policyNetwork.state_dict())

                if done:
                    rewards.append(totalRewardPerEpisode)
                    if totalRewardPerEpisode > bestReward:
                        bestReward = totalRewardPerEpisode

                    averageReward = np.mean(np.array(rewards)[-100:])
                    if averageReward > bestAverageReward:
                        bestAverageReward = averageReward
                    print('-'*80)
                    print(
                        f"\nEpisode {i} \
                          \nAverage Reward of last 100 {averageReward} \
                          \nBest Average Reward of last 100 {bestAverageReward} \
                          \nBest Reward {bestReward} \
                          \nCurrent Reward {totalRewardPerEpisode} \
                          \nEpsilon {self.epsilon}\n"
                    )
                    averageRewards.append(averageReward)

                    break

                

        plt.plot(averageRewards)
        plt.show()


Hyperparams = namedtuple('Hyperparams', (
    'epsilon',
    'epsilonMin',
    'epsilonDecay',
    'learningRate',
    'batchSize',
    'discountFactor',
    'targetNetworkUpdateFrequency',
    'episodes',
    'memorySize'
))

dqnCartPole = DQN(
    env=gym.make('CartPole-v1'),
    hyperparams=Hyperparams(
        epsilon=0.99,
        epsilonMin=0.001,
        epsilonDecay=10000,
        memorySize=10000,
        learningRate=1e-4,
        batchSize=128,
        discountFactor=0.99,
        targetNetworkUpdateFrequency=20,
        episodes=700
    ),
    nnModel=NeuralNetwork
)

# dqnCartPole.train()

In [3]:
dqnLunarLander = DQN(
    env=gym.make(
    "LunarLander-v2",
    continuous = False,
    gravity = -10.0,
    enable_wind = False,
    wind_power = 15.0,
    turbulence_power = 1.5,
    ),
    hyperparams=Hyperparams(
        epsilon=0.99,
        epsilonMin=0.001,
        epsilonDecay=10000,
        memorySize=10000,
        learningRate=1e-4,
        batchSize=128,
        discountFactor=0.99,
        targetNetworkUpdateFrequency=20,
        episodes=700
    ),
    nnModel=NeuralNetwork
)
dqnLunarLander.train()

DependencyNotInstalled: box2D is not installed, run `pip install gym[box2d]`

In [9]:
dqn.policyNetwork.save('ConvergedModel.pt')

AttributeError: 'NeuralNetwork' object has no attribute 'save'

In [10]:
torch.save(dqn.policyNetwork,'ConvergedModel.pt')