In [1]:
import numpy as np
import pandas
import pandas_profiling
import matplotlib.pyplot as plt
import gymnasium
from gymnasium.wrappers import RecordVideo
from gymnasium.utils.play import play
from gymnasium.experimental.wrappers import GrayscaleObservationV0
import random
import time

In [2]:
# Print NumPy arrays with 3 digits after the decimal point (display only; stored values are unchanged).
np.set_printoptions(precision=3)

# Environment Setup

## Game Mode (Mode 2 is the real game)

Available game modes: 0 (default) to 15.

Changing the game mode is a good way to test how well the model generalises.

In [3]:
# Game mode passed to gymnasium.make() when the environment is created.
mode = 2

In [4]:

# Create the Space Invaders environment with the selected game mode.
# render_mode='rgb_array' makes rendered frames available as arrays instead of opening a window.
env = gymnasium.make("ALE/SpaceInvaders-v5", render_mode='rgb_array', mode=mode)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


# Training Agent with Q-Learning

In [5]:
# Number of discrete actions the agent can choose from.
action_space_size = env.action_space.n
print(action_space_size)
# NOTE: despite its name this holds the observation *space object* (a Box), not a state count;
# the printed value below shows its bounds, shape and dtype.
state_space_size = env.observation_space
print(state_space_size)


6
Box(0, 255, (210, 160, 3), uint8)


The env.observation_space is of type Box, which describes an n-dimensional array with per-element bounds. Here it is Box(0, 255, (210, 160, 3), uint8): every observation is a 3-dimensional array of shape (210, 160, 3) (height, width, RGB channels) whose elements are 8-bit unsigned integers (uint8) ranging from 0 to 255.

Because the observation space is a Box rather than a Discrete space, it has no attribute n giving a fixed number of states. Each pixel value is an integer between 0 and 255, so the set of possible observations is technically finite, but with 210 × 160 × 3 such values per frame it is far too large to enumerate.

If your state space is continuous, it is not practical to create a Q-table, as it would require an excessive amount of memory to store values for every possible state. In such cases, you typically use function approximation methods, such as neural networks, to approximate the Q-values or the value function.

To reduce the size of the observation space, I will convert the frames to greyscale.

In [6]:
# Wrap the environment so observations are single-channel grayscale frames
# (the observation shape printed in the next cell is (210, 160)).
# NOTE(review): `env` is rebound, so re-running this cell wraps the already-wrapped env again.
env = GrayscaleObservationV0(env)

In [7]:
# Read the sizes that the next cell uses to allocate the Q-table.
n_actions = env.action_space.n
# NOTE: `n_states` is the observation *shape* (height, width) of one frame, not a number of states.
n_states = env.observation_space.shape
print(n_actions)
print(n_states)

6
(210, 160)


In [8]:
# Allocate a zero-initialised Q-table of shape (n_states[0], n_states[1], n_actions).
# NOTE(review): no down-scaling or discretisation is applied here — the first two table
# dimensions are simply the frame's height and width, so the table does not enumerate
# distinct game states.
state_shape = [n_states[0], n_states[1]]
q_table = np.zeros((n_states[0], n_states[1], n_actions))
print(q_table)

[[[0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  ...
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  ...
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  ...
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]

 ...

 [[0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  ...
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  ...
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  ...
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]]


In [9]:
# Zero-initialised counter array with the same 2-D shape as the Q-table's first two axes.
# NOTE(review): nothing in the visible cells updates or reads it afterwards.
state_counts = np.zeros(state_shape)
print(state_counts)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
# NOTE(review): duplicate of the previous cell (re-creates the same zero array) — candidate for removal.
state_counts = np.zeros(state_shape)

## Q-Learning Algorithm

### Hyper Parameters

In [11]:
# Initialize parameters
# Step size of each Q-value update, and discount applied to the bootstrapped future value.
learning_rate = 0.01
discount_rate = 0.98

# Counters for exploratory vs. greedy steps.
# NOTE(review): the visible training loop never increments them.
exploration_count = 0
exploitation_count = 0  

# Epsilon-greedy schedule. The rate is 1 for the first episode only; after every episode the
# training loop recomputes it as min + (max - min) * exp(-decay * episode), so from the second
# episode on it starts from max_exploration_rate.
# NOTE(review): with decay 0.0002 over 500 episodes the rate only falls to roughly 0.82,
# i.e. the agent acts almost entirely at random during training.
exploration_rate = 1
max_exploration_rate = 0.9
min_exploration_rate = 0.05
exploration_decay_rate = 0.0002
# Number of training episodes.
num_episodes = 500

In [12]:
# Per-episode total rewards. They are collected in a plain list and converted to a DataFrame
# once after training (growing a DataFrame with concat inside the loop re-copies it every episode).
episode_scores = []


def _greedy_action(q_values):
    """Return the index of the best action among the Q-values selected for a state.

    `q_values` has shape (n_actions,) for a scalar state index, or (k, n_actions) when the
    state index is array-valued. A bare np.argmax on the 2-D case returns an index into the
    flattened array, which can fall outside the action space, so the rows are first reduced
    to one value per action.
    """
    per_action = np.reshape(q_values, (-1, np.shape(q_values)[-1])).max(axis=0)
    return int(np.argmax(per_action))


# Q-learning algorithm
for episode in range(num_episodes):
    state, info = env.reset()
    lives = info['lives']
    total_reward = 0
    done = lives == 0

    while not done:
        # Exploration-exploitation trade-off (epsilon-greedy)
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = _greedy_action(q_table[state[0], state[1]])
        else:
            action = env.action_space.sample()

        # Perform the action and get the reward and new state
        next_state, reward, terminated, truncated, info = env.step(action)
        lives = info['lives']

        # End the episode on the environment's own signals as well as on running out of
        # lives; stepping an environment after it reported termination is not supported.
        episode_over = terminated or lives == 0
        done = episode_over or truncated

        # Update Q-table. A finished game has no future return, so only bootstrap from the
        # next state while the game is still running (a truncated episode still bootstraps).
        # NOTE(review): state[0] and state[1] are the first two pixel rows of the frame, so
        # the table is indexed by their pixel intensities rather than by a game state; the
        # trained table printed below has non-zero values only at index [0, 0].
        best_next_value = 0.0 if episode_over else np.max(q_table[next_state[0], next_state[1]])
        q_table[state[0], state[1], action] = q_table[state[0], state[1], action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * best_next_value)

        # Update the state
        state = next_state

        total_reward += reward

    episode_scores.append(total_reward)

    # Exploration rate decay
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

# One row per training episode, in episode order.
scores = pandas.DataFrame({'score': episode_scores}, dtype=float)

# NOTE(review): `env` is wrapped and used again by the recording and evaluation cells further down.
env.close()

In [None]:
scores.describe()

Unnamed: 0,score
count,100.0
mean,137.9
std,91.298655
min,5.0
25%,75.0
50%,120.0
75%,180.0
max,425.0


In [None]:
print(q_table)

[[[20.925 20.87  21.16  20.901 21.794 20.62 ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  ...
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]]

 [[ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  ...
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]]

 [[ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  ...
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]]

 ...

 [[ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  [ 0.     0.     0.     0.     0.     0.   ]
  ..

    
    The Space Invaders environment has a very large state space: each observation is an image of size (210, 160, 3). Q-learning with a Q-table is typically not suitable for environments with continuous or very large state spaces, because the table would need a number of entries that is exponential in the number of state dimensions, which is not feasible with most computational resources.
    
    Even with greyscale observations, the state space is still far too large for tabular learning to be effective.
    
    The fundamental issue here is that we are trying to apply tabular Q-learning to a problem with a very high-dimensional state space (the screen images). Tabular methods are not suitable for such problems because they would require a Q-table whose number of entries is exponential in the number of dimensions.

# Agent Setup (this needs to point to the trained agent)

In [None]:
def agent(state, table=None):
    """Greedy agent: return the action with the highest learned Q-value for `state`.

    Parameters
    ----------
    state : sequence or array
        Observation whose first two entries (`state[0]`, `state[1]`) index the first two
        axes of the Q-table. They may be scalars or arrays.
    table : numpy.ndarray, optional
        Q-table of shape (..., ..., n_actions). Defaults to the notebook-level `q_table`.

    Returns
    -------
    int
        Action index in the range [0, n_actions).
    """
    q_source = q_table if table is None else table
    q_values = q_source[state[0], state[1]]
    # With array-valued indices the selection is 2-D; np.argmax would then return an index
    # into the flattened array, which can exceed the number of actions. Reduce to one value
    # per action first so the result is always a valid action.
    per_action = q_values.reshape(-1, q_source.shape[-1]).max(axis=0)
    return int(np.argmax(per_action))

# Recording Setup

In [None]:
output_dir = "q-learning_video"
video_output_frequency = 20


def _record_this_episode(episode_id):
    """Tell RecordVideo whether the episode with this id should be captured."""
    # Episode 1 is always captured; otherwise every `video_output_frequency`-th episode is,
    # except episode 0.
    if episode_id == 1:
        return True
    return episode_id != 0 and episode_id % video_output_frequency == 0


env = RecordVideo(env, output_dir, episode_trigger=_record_this_episode)
env.metadata['render_fps'] = 24

  logger.warn(


# Testing Agent Performance

In [None]:
def get_agent_scores(num_episodes = 1):
    """Play the agent in the notebook-level `env` and return its score per episode.

    One extra episode is played first and its score is discarded, so `num_episodes + 1`
    episodes run in total.

    Parameters
    ----------
    num_episodes : int
        Number of scored episodes.

    Returns
    -------
    pandas.DataFrame
        A single 'score' column holding the total reward of each scored episode, indexed
        from 1 to `num_episodes`.
    """
    episode_scores = []

    for _ in range(num_episodes + 1):
        # NOTE(review): closing before every reset presumably finalises the video that
        # RecordVideo may still have open from the previous episode — confirm.
        env.close()
        total_reward = 0
        observation, info = env.reset()  # Getting start stats from the game
        done = info['lives'] == 0
        while not done:
            # The agent decides from the latest observation (not from the environment object).
            action = agent(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward = total_reward + reward
            # Stop on the environment's own end-of-episode signals as well as on zero lives.
            done = terminated or truncated or info['lives'] == 0

        episode_scores.append(total_reward)

    # Build the frame once instead of concatenating inside the loop.
    scores = pandas.DataFrame({'score': episode_scores}, dtype=float)
    # Discard the first (extra) episode; the remaining index runs from 1 to num_episodes.
    scores = scores.drop(scores.index[0])
    return scores


In [None]:
# Number of evaluation episodes.
# NOTE(review): this rebinds `num_episodes` (500 during training) and, below, `scores`
# (the training scores); the statistics cells that follow describe the evaluation run.
num_episodes = 100

scores = get_agent_scores(num_episodes = num_episodes)



Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/q-learning_video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/q-learning_video/rl-video-episode-1.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/q-learning_video/rl-video-episode-1.mp4




Moviepy - Building video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/q-learning_video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/q-learning_video/rl-video-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/archit3ct/Code/Assignments/AI_Mandatory_2/Timothy/q-learning_video/rl-video-episode-1.mp4


# Descriptive Statistics of Agent Performance

In [None]:
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 1 to 100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   score   100 non-null    float64
dtypes: float64(1)
memory usage: 932.0 bytes


In [None]:
scores.head()

Unnamed: 0,score
1,90.0
2,20.0
3,65.0
4,160.0
5,20.0


In [None]:
scores.tail()

Unnamed: 0,score
96,120.0
97,100.0
98,120.0
99,75.0
100,135.0


In [None]:
scores.describe()

Unnamed: 0,score
count,100.0
mean,155.05
std,111.524095
min,10.0
25%,87.5
50%,120.0
75%,200.0
max,545.0


In [None]:

# Positions (not index labels) of the first and last evaluation episode in `scores`.
index_start = 0
index_end = num_episodes - 1

first_episode_score = scores.iloc[index_start]['score']
last_episode_score = scores.iloc[index_end]['score']

print(f"Game Episode 1: Score {first_episode_score}\nGame Episode {num_episodes}: Score {last_episode_score}")

Game Episode 1: Score 90.0
Game Episode 100: Score 135.0


In [None]:
print(f"Lowest Score: {scores['score'].min()}\nHigh Score: {scores['score'].max()}")

Lowest Score: 10.0
High Score: 545.0


## Mode (most frequent element)
It is possible for a sample to have multiple modes. In statistics, a mode refers to the value or values in a dataset that occur most frequently. If there are multiple values with the same highest frequency, the dataset is considered multimodal.

In [None]:
scores.mode()

Unnamed: 0,score
0,105.0


## Median (middle value of the data set)

In [None]:
scores.median()

score    120.0
dtype: float64

## Mean (average)

In [None]:
scores.mean()

score    155.05
dtype: float64

## Variance (Dissimilarity between samples)

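The variance is computed with `ddof=0`, i.e. the squared deviations are divided by $n$ (the population variance). The sample variance, which is the pandas default (`ddof=1`), would divide by $n-1$.

Note: The value is high because the scores differ widely from one episode to the next.

$$\sigma^2=\frac{\sum\left(x_i-\mu\right)^2}{n}\text{ (delta degrees of freedom is 0)}$$ 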

In [None]:
# ddof=0 divides by n (population variance); the pandas default ddof=1 divides by n - 1.
scores.var(ddof=0)

score    12313.2475
dtype: float64

## Standard Deviation ( variability within a sample)
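The standard deviation is likewise computed with `ddof=0`, i.e. it is the square root of the population variance (squared deviations divided by $n$).

$$SD_0=\sqrt{\frac{\sum\left(x_i-\mu\right)^2}{n}}\text{ (delta degrees of freedom is 0)}$$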

In [None]:

# ddof=0 divides by n (population standard deviation); the pandas default ddof=1 divides by n - 1.
scores.std(ddof=0)

score    110.965073
dtype: float64

# Report Generation

In [None]:
# Build the profiling report first and keep a reference to it: to_file() writes the HTML
# file and returns None, so chaining it into the assignment left `report` bound to None.
report = pandas_profiling.ProfileReport(scores)
report.to_file('q-learning_agent_scores_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]