In [18]:
import numpy as np
import pandas
import pandas_profiling
import matplotlib.pyplot as plt
import gymnasium
import random
import time
from gymnasium.wrappers import RecordVideo
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MSE
from collections import namedtuple, deque
from tensorflow.keras.layers import Conv2D, Flatten

In [19]:
#  sets the precision to 3 decimal places.
np.set_printoptions(precision=3)

# Enviroment Setup (Mode 2 is the real game)
Game Mode 0 (default) - 15

Changing the Game Mode (Good For Testing Generalisation of the model) 

In [20]:
mode = 2
env = gymnasium.make("ALE/SpaceInvaders-v5", render_mode='rgb_array', mode=mode)
output_dir = "test_video"
video_output_frequency = 20
env = RecordVideo(env, output_dir, episode_trigger=lambda episode_id: episode_id == 1 or (episode_id != 0 and (episode_id) % video_output_frequency == 0))
env.metadata['render_fps'] = 24

  logger.warn("Unable to save last video! Did you call close()?")
  logger.warn(


# Training the Agent with Deep Q-Learning

We discovered with Q-learning that it was not possible to train the agent with the Q-Learning algorithm due to the limitations of the Q-Table. The state returned by env.reset() and env.step(action) in the Atari environments is a full image from the game screen. Pixel values range from 0 to 255, and most of them will not correspond to valid indices in your Q-table.

Therefore we will utilize Deep Q-Learning to solve this issue.

In [21]:
BUFFER_SIZE = 100_000
GAMMA = 0.995 # discount factor
ALPHA = 0.001 # learning rate
TAU = 0.001 # soft update factor

STEPS_BETWEEN_LEARNING = 4
MINI_BATCH_SIZE = 64

In [22]:
state_space_size = env.observation_space.shape
action_space_size = env.action_space.n
print(state_space_size)
print(action_space_size)

(210, 160, 3)
6


In [23]:
q_net = Sequential([
    Input(state_space_size),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(action_space_size, activation='linear')
])

In [24]:
target_q_net = Sequential([
    Input(state_space_size),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(action_space_size, activation='linear')
])

In [25]:
optimizer = Adam(learning_rate=ALPHA)



In [26]:
experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "termitrun"])

In [27]:
def loss(experiences, gamma, q_net, target_q_net):
    '''
    y_j = R_j if episode terminates at T_{j+1}, else
    y_j = R_j + gamma max_{a'} Q^(s_{j+1}, a')
    '''
    # unpack experiences into its components
    states, actions, rewards, next_states, termitrun = experiences
    
    # compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_net(next_states), axis=-1)
    
    # y = R if episode terminates, else y = R + y max Q^(s,a)
    # note: done_vals is boolean, and (1 - done_vals) == 0 if done_vals == True
    y_targets = rewards + (gamma * max_qsa * (1 - termitrun))
    
    q_values = q_net(states)
    # tf.gather_nd(params, indices) returns `indices` slices of `params`
    q_values = tf.gather_nd(q_values, 
                            tf.stack([tf.range(q_values.shape[0]),
                                      tf.cast(actions, tf.int32)],
                                     axis=1
                                    ))
    
    # compute MSE loss
    return MSE(y_targets, q_values)

In [28]:
print(env.action_space)

Discrete(6)


In [29]:
@tf.function
def learn(experiences, gamma):
    """
    Fit the Q and Q target networks
    """
    
    # compute loss
    with tf.GradientTape() as tape:
        computed_loss = loss(experiences, gamma, q_net, target_q_net)
        
    # update q_net
    gradients = tape.gradient(computed_loss, q_net.trainable_variables)
    optimizer.apply_gradients(zip(gradients, q_net.trainable_variables))
    
    # update target_q_net
    for target_weights, q_net_weights in zip(target_q_net.weights, q_net.weights):
        target_weights.assign(TAU * q_net_weights + (1.0 - TAU) * target_weights)

In [30]:
time_begin = time.time()

num_episodes = 100 # how many times we play the game

scores = pandas.DataFrame(columns=['score'])

epsilon = 1.0 # initial epsilon for greedy policy (exploration/exploitation)
epsilon_decay = 0.995
epsilon_min = 0.01

buffer = deque(maxlen=BUFFER_SIZE)

scores = pandas.DataFrame(columns=['score'])

# clone q_net into target_q_net
target_q_net.set_weights(q_net.get_weights())

for episode in range(1, num_episodes + 1):
    total_reward = 0
    state, info = env.reset()  # Getting start stats from the game
    lives = info['lives']  # Initialize lives inside the loop

    while lives > 0:
        # For demonstration purposes, we use a random agent
        action = env.action_space.sample()   # Just a random agent
        obs, reward, terminated, truncated, info = env.step(action)
        lives = info['lives']
        total_reward += reward

        # Update highest score if current total reward is higher
        #if total_reward > highest_score:
        #    highest_score = total_reward

        scores = pandas.concat([scores, pandas.DataFrame({'score': [total_reward]})], ignore_index=True)

env.close()

Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4
Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4




In [None]:
scores.describe()

Unnamed: 0,score
count,50807.0
mean,74.63381
std,87.237915
min,0.0
25%,10.0
50%,50.0
75%,105.0
max,605.0


In [None]:
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50807 entries, 0 to 50806
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   score   50807 non-null  float64
dtypes: float64(1)
memory usage: 397.1 KB


## Implementing Deep Q-Learning (DQL)

## Agent Setup (this need to point the trained agent)

In [None]:
def agent(env):
    """Random agent that samples actions from the environment's action space."""
    return env.action_space.sample()

# Agent Testing

In [None]:
""" def get_agent_scores(num_episodes = 1):
    scores = pandas.DataFrame(columns=['score'])
    total_reward = 0
    #highest_score = 0
    
    for _ in range(0, num_episodes + 1):
        #if(_ > 1 and total_reward > 0):
        #    print(f'Game Episode {_ - 1}: Score: {total_reward}')
        env.close()
        total_reward = 0
        observation, info = env.reset()  # Getting start stats from the game
        lives = info['lives']  # Initialize lives inside the loop
        while lives != 0:
            # For demonstration purposes, we use a random agent
            action = agent(env)   # Just a random agent
            obs, reward, terminated, truncated, info = env.step(action)
            lives = info['lives']
            total_reward = total_reward + reward

        # Update highest score if current total reward is higher
        #if total_reward > highest_score:
        #    highest_score = total_reward

        scores = pandas.concat([scores, pandas.DataFrame({'score': [total_reward]})], ignore_index=True)


    """ # Print the highest score achieved
    print(f"Highest Score: {highest_score}") """

    # Close the environment to finalize the video recording
    scores = scores.drop(scores.index[0])
    return scores
 """

IndentationError: unexpected indent (1960065885.py, line 28)

In [None]:
num_episodes = 100

scores = get_agent_scores(num_episodes = num_episodes)



Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-1.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4
Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-20.mp4
Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-40.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-40.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-40.mp4
Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-40.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-40.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-40.mp4
Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-60.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-60.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-60.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-60.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-60.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-60.mp4
Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-80.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-80.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-80.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-80.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-80.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/test_video/rl-video-episode-80.mp4




# Descriptive Statistics of Agent Performance

In [None]:
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 1 to 100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   score   100 non-null    float64
dtypes: float64(1)
memory usage: 932.0 bytes


In [None]:
scores.head()

Unnamed: 0,score
1,110.0
2,105.0
3,190.0
4,155.0
5,35.0


In [None]:
scores.tail()

Unnamed: 0,score
96,30.0
97,210.0
98,60.0
99,140.0
100,120.0


In [None]:
scores.describe()

Unnamed: 0,score
count,100.0
mean,133.55
std,88.652354
min,30.0
25%,70.0
50%,120.0
75%,171.25
max,545.0


In [None]:

index_start = 0
index_end = num_episodes-1

print(f"Game Episode 1: Score {scores.iloc[index_start]['score']}\nGame Episode {num_episodes}: Score {scores.iloc[index_end]['score']}")

Game Episode 1: Score 110.0
Game Episode 100: Score 120.0


In [None]:
print(f"Lowest Score: {scores['score'].min()}\nHigh Score: {scores['score'].max()}")

Lowest Score: 30.0
High Score: 545.0


## Mode (most frequent element)
It is possible for a sample to have multiple modes. In statistics, a mode refers to the value or values in a dataset that occur most frequently. If there are multiple values with the same highest frequency, the dataset is considered multimodal.

In [None]:
scores.mode()

Unnamed: 0,score
0,155.0


## Median (middle value of the data set)

In [None]:
scores.median()

score    120.0
dtype: float64

## Mean (average)

In [None]:
scores.mean()

score    133.55
dtype: float64

## Variance (Dissimilarity between samples)

For samples degrees of freedom (ddof) is 0.

Note: The number is high due there is a lot of Dissimilarities between each score.

$$\sigma=\frac{\sum\left(x_i-\mu\right)^2}{n}\text{ for samples (degrees of freedom is 0)}$$ 

In [None]:
scores.var(ddof=0)

score    7780.6475
dtype: float64

## Standard Deviation ( variability within a sample)
For samples degrees of freedom (ddof) is 0.

$$SD_0=\sqrt{\frac{\sum\left(x_i-\mu\right)^2}{n}}\text{ for samples (degrees of freedom is 0)}$$

In [None]:

scores.std(ddof=0)

score    88.207979
dtype: float64

# Report Generation

In [None]:
report = pandas_profiling.ProfileReport(scores).to_file('test_agent_scores_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]