<a href="https://colab.research.google.com/github/Teme1999/AI_2024_Harjoitustyo/blob/main/projekti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing required libraries



In [3]:
# Install Gym for environment simulation
!pip install gym
# Install Gym's Atari environments (e.g., Pong)
!pip install gym[atari]
# Install TensorFlow for neural networks
!pip install tensorflow

Collecting ale-py~=0.7.5 (from gym[atari])
  Using cached ale_py-0.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Using cached ale_py-0.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Installing collected packages: ale-py
  Attempting uninstall: ale-py
    Found existing installation: ale-py 0.8.1
    Uninstalling ale-py-0.8.1:
      Successfully uninstalled ale-py-0.8.1
Successfully installed ale-py-0.7.5


Setting up the environment for CartPole and Pong

In [4]:
import gym

# Set up CartPole-v1 environment
env_cartpole = gym.make('CartPole-v1')

# Set up Pong-v4 environment
env_pong = gym.make('Pong-v4')

# Check environment details
print("CartPole Observation Space: ", env_cartpole.observation_space)
print("CartPole Action Space: ", env_cartpole.action_space)
print("Pong Observation Space: ", env_pong.observation_space)
print("Pong Action Space: ", env_pong.action_space)


CartPole Observation Space:  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
CartPole Action Space:  Discrete(2)
Pong Observation Space:  Box(0, 255, (210, 160, 3), uint8)
Pong Action Space:  Discrete(6)


  deprecation(
  deprecation(


Build DQN Model using TensorFlow

In [5]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the DQN network
def create_dqn(input_shape, num_actions):
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(num_actions, activation='linear'))
    return model

# Get the observation space and action space
input_shape = (env_cartpole.observation_space.shape[0],)
num_actions = env_cartpole.action_space.n

# Create DQN model
dqn_model = create_dqn(input_shape, num_actions)

# Compile the model with MSE loss and Adam optimizer
dqn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse')

dqn_model.summary()


  from jax import xla_computation as _xla_computation
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Experience Replay Buffer

In [6]:
from collections import deque
import numpy as np

class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def store(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[idx] for idx in indices]
        return map(np.array, zip(*batch))

# Initialize the replay buffer
replay_buffer = ReplayBuffer(max_size=10000)


Train the agent, this is for CartPole

In [8]:
gamma = 0.99     # Discount factor
epsilon = 1.0    # Initial exploration probability
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
episodes = 1000

def epsilon_greedy_action(state, epsilon, num_actions):
    if np.random.rand() <= epsilon:
        return np.random.choice(num_actions)  # Random action (exploration)
    q_values = dqn_model.predict(state)  # Predict Q-values for all actions
    return np.argmax(q_values[0])  # Choose the action with the highest Q-value

for episode in range(episodes):
    state = env_cartpole.reset()
    state = np.reshape(state, [1, env_cartpole.observation_space.shape[0]])  # Ensure correct shape
    done = False
    total_reward = 0

    while not done:
        # Choose action based on epsilon-greedy policy
        action = epsilon_greedy_action(state, epsilon, num_actions)

        # Take the action, observe the next state and reward
        next_state, reward, done, _ = env_cartpole.step(action)
        next_state = np.reshape(next_state, [1, env_cartpole.observation_space.shape[0]])  # Ensure correct shape

        # Store experience in the replay buffer
        replay_buffer.store((state, action, reward, next_state, done))

        # Update current state
        state = next_state
        total_reward += reward

        # Sample a mini-batch and train the model
        if len(replay_buffer.buffer) > batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

            # Ensure the states and next_states have correct shape (batch_size, input_dim)
            states = np.array(states).reshape(batch_size, env_cartpole.observation_space.shape[0])
            next_states = np.array(next_states).reshape(batch_size, env_cartpole.observation_space.shape[0])

            # Predict Q-values for current and next states
            q_values = dqn_model.predict(states)
            next_q_values = dqn_model.predict(next_states)

            # Compute the target Q-values using the Bellman equation
            for i in range(batch_size):
                q_values[i][actions[i]] = rewards[i] + (1 - dones[i]) * gamma * np.max(next_q_values[i])

            # Train the DQN model
            dqn_model.fit(states, q_values, epochs=1, verbose=0)

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    print(f"Episode: {episode+1}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4m

KeyboardInterrupt: 

Save results

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Plot learning curves

In [None]:
import matplotlib.pyplot as plt

rewards = [...]  # Store total rewards per episode during training

plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Learning Curve')
plt.show()
