<a href="https://colab.research.google.com/github/lastgiftofsummer/Deep-Reinforcement-Learning-for-stock-trading/blob/main/CliffWalking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install x11-utils > /dev/null 2>&1
!pip install pyglet > /dev/null 2>&1
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install pyvirtualdisplay > /dev/null 2>&1

import numpy as np
import gym
import random
import tensorflow as tf
from tensorflow.keras import layers, models
from collections import deque
import matplotlib.pyplot as plt
from IPython.display import clear_output
from gym import wrappers
import pyvirtualdisplay


In [None]:

# Define DQNAgent class
class DQNAgent:
    """Deep Q-Network agent with a replay buffer and a separate target network.

    The online network (``model``) is the one being trained and used to pick
    actions; ``target_model`` is a copy used to compute bootstrap targets and
    is only refreshed when ``update_target_model`` is called.

    Args:
        state_shape: Shape of a single state (without the batch axis), e.g.
            ``(1,)``.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_shape, action_size):
        self.state_shape = state_shape
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Replay buffer; oldest entries drop out
        self.gamma = 0.99  # Discount rate
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network: two hidden ReLU layers, linear output."""
        model = models.Sequential()
        model.add(layers.Dense(24, input_shape=self.state_shape, activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='mse',
        )
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Args:
            state: A single state, reshapeable to ``state_shape``.

        Returns:
            The chosen action index as an ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Add the batch axis explicitly instead of relying on Keras to infer it.
        batch = np.reshape(state, (1, *self.state_shape))
        # verbose=0: predict() otherwise prints a progress bar on every call.
        q_values = self.model.predict(batch, verbose=0)
        return int(np.argmax(q_values[0]))

    def replay(self, batch_size):
        """Run one training step on a random minibatch from the replay buffer.

        Does nothing when the buffer holds fewer than ``batch_size``
        transitions. Targets are ``reward`` for terminal transitions and
        ``reward + gamma * max_a Q_target(next_state, a)`` otherwise. The
        whole minibatch is evaluated and fitted in a single batched call
        rather than one predict/fit round trip per sample. After the update,
        ``epsilon`` is multiplied by ``epsilon_decay`` while it is above
        ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        batch_shape = (len(minibatch), *self.state_shape)

        states = np.reshape(
            np.asarray([t[0] for t in minibatch], dtype=np.float32), batch_shape
        )
        next_states = np.reshape(
            np.asarray([t[3] for t in minibatch], dtype=np.float32), batch_shape
        )
        actions = np.asarray([t[1] for t in minibatch], dtype=int)
        rewards = np.asarray([t[2] for t in minibatch], dtype=float)
        dones = np.asarray([t[4] for t in minibatch], dtype=bool)

        # Start from the online network's current estimates so that only the
        # taken action's Q-value contributes to the loss.
        targets = self.model.predict(states, verbose=0)
        next_q = self.target_model.predict(next_states, verbose=0)
        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        targets[np.arange(len(minibatch)), actions] = np.where(
            dones, rewards, bootstrapped
        )

        self.model.fit(
            states, targets, batch_size=len(minibatch), epochs=1, verbose=0
        )
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, name):
        """Save the online network to the path ``name``."""
        self.model.save(name)

    def load_model(self, name):
        """Load the online network from ``name`` and sync the target network."""
        self.model = models.load_model(name)
        self.update_target_model()

# Define function to train DQN agent
def train_dqn(agent, env, episodes=10, batch_size=32, max_steps=None, log_every=100):
    """Train ``agent`` on ``env`` and return the total reward of each episode.

    Each step stores the transition in the agent's memory and, once the
    memory holds more than ``batch_size`` transitions, calls
    ``agent.replay``. Exploration decay is left entirely to ``agent.replay``
    so epsilon is not decayed twice. The target network is synced with the
    online network at the end of every episode; without this the bootstrap
    targets would come from the untrained initial weights for the whole run.

    Args:
        agent: Object providing ``state_shape``, ``memory``, ``epsilon``,
            ``act``, ``remember``, ``replay`` and ``update_target_model``.
        env: Gym-style environment. Both the 4-tuple ``step`` / bare
            ``reset`` API and the 5-tuple ``step`` / ``(obs, info)``
            ``reset`` API are accepted.
        episodes: Number of episodes to run.
        batch_size: Minibatch size passed to ``agent.replay``.
        max_steps: Optional cap on steps per episode; ``None`` means no cap.
        log_every: Print progress every this many episodes; a falsy value
            disables logging.

    Returns:
        A list with one summed reward per episode.
    """
    scores = []
    for e in range(episodes):
        reset_result = env.reset()
        # Newer Gym returns (observation, info). Check the shape strictly so
        # that tuple-valued observations from the old API are not unpacked.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result
        state = np.reshape(state, agent.state_shape)
        done = False
        score = 0
        steps = 0
        while not done:
            action = agent.act(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: (obs, reward, terminated, truncated, info).
                next_state, reward, terminated, truncated, _ = step_result
                done = bool(terminated or truncated)
            else:
                next_state, reward, done, _ = step_result
            next_state = np.reshape(next_state, agent.state_shape)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            score += reward
            steps += 1
            if done or (max_steps is not None and steps >= max_steps):
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        scores.append(score)
        # Refresh the bootstrap targets with what was learned this episode.
        agent.update_target_model()
        if log_every and (e + 1) % log_every == 0:
            print(f"Episode {e + 1}/{episodes}, Score: {score}, Epsilon: {agent.epsilon:.2f}")
    return scores

# Initialize CliffWalking-v0 environment
# Build the CliffWalking-v0 environment and read off the sizes the agent needs.
env = gym.make('CliffWalking-v0')
# Each observation is one integer, so a single state has shape (1,).
state_shape = (1,)
action_size = env.action_space.n

# Create the DQN agent for this state/action layout.
agent = DQNAgent(state_shape=state_shape, action_size=action_size)

# Run training with the default settings and keep the per-episode scores.
scores = train_dqn(agent=agent, env=env)


  and should_run_async(code)
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Plotting scores
# Draw the per-episode scores as a line chart.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
ax.set_title('Training Progress')
plt.show()

# Persist the trained online network to disk.
agent.save_model('cliffwalking_dqn.h5')
