<a href="https://colab.research.google.com/github/Svar7769/RL_game/blob/main/RL_game.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install gymnasium

  and should_run_async(code)




In [13]:
# Imports
import gymnasium as gym  # Updated import to use gymnasium instead of gym
from gymnasium import spaces  # Import spaces from gymnasium
import random  # To randomly place the player, win, and lose positions
import numpy as np  # For numerical operations and array handling
from IPython.display import clear_output  # To clear the output display
import os  # For operating system-related functions

from stable_baselines3 import PPO  # Import the PPO algorithm from stable_baselines3
from stable_baselines3.common.env_checker import check_env  # For checking the custom environment

# Global constants for game board values
NOTHING = 0  # Represents an empty space on the board
PLAYER = 1  # Represents the player's position on the board
WIN = 2  # Represents the win position on the board
LOSE = 3  # Represents the lose position on the board

# Action values
UP = 0
DOWN = 1
LEFT = 2
RIGHT = 3

# Helper function to clear the screen
def clear_screen():
    clear_output(wait=True)  # Clear the output display in Jupyter Notebook
    os.system("cls" if os.name == "nt" else "clear")  # Clear the terminal screen for Windows or Unix-based systems

# Function to print the environment state in a visually appealing way
def pretty_print(state_array, cumulative_reward):
    clear_screen()  # Clear the screen before printing the new state
    print(f'Cumulative Reward: {cumulative_reward}')  # Print the cumulative reward
    print()
    for i in range(6):  # Loop through each row
        for j in range(6):  # Loop through each column
            print('{:4}'.format(state_array[i*6 + j]), end="")  # Print each cell value with a width of 4
        print()  # Newline at the end of each row

# Define the custom environment class inheriting from gymnasium.Env
class BasicEnv(gym.Env):
    def __init__(self):
        super(BasicEnv, self).__init__()

        # Class variable for cumulative reward
        self.cumulative_reward = 0

        # Set initial state for a flattened 6x6 grid
        self.state = [NOTHING] * 36

        # Randomly place the player, win, and lose positions
        self.player_position = random.randrange(0, 36)
        self.win_position = random.randrange(0, 36)
        self.lose_position = random.randrange(0, 36)

        # Ensure win and lose positions do not overlap with each other or with the player position
        while self.win_position == self.player_position:
            self.win_position = random.randrange(0, 36)
        while self.lose_position == self.win_position or self.lose_position == self.player_position:
            self.lose_position = random.randrange(0, 36)

        # Update the state array with the player, win, and lose positions
        self.state[self.player_position] = PLAYER
        self.state[self.win_position] = WIN
        self.state[self.lose_position] = LOSE

        # Convert the state list to a numpy array
        self.state = np.array(self.state, dtype=np.int16)

        # Define the observation space (valid range for observation in the state)
        self.observation_space = spaces.Box(0, 3, [36,], dtype=np.int16)

        # Define the action space (valid actions: UP, DOWN, LEFT, RIGHT)
        self.action_space = spaces.Discrete(4)

    def step(self, action):
        # Placeholder for debugging information
        info = {}
        truncated = False

        # Set default values for done, reward, and the previous player position
        done = False
        reward = -0.01
        previous_position = self.player_position

        # Debug print for action
        print(f"Action taken: {action}")

        # Check for invalid actions
        if action == UP and (self.player_position - 6) < 0:
            action = -1  # Invalid action
        elif action == DOWN and (self.player_position + 6) >= 36:
            action = -1  # Invalid action
        elif action == LEFT and (self.player_position % 6) == 0:
            action = -1  # Invalid action
        elif action == RIGHT and (self.player_position % 6) == 5:
            action = -1  # Invalid action

        # If the action is valid, move the player
        if action == UP:
            self.player_position -= 6
        elif action == DOWN:
            self.player_position += 6
        elif action == LEFT:
            self.player_position -= 1
        elif action == RIGHT:
            self.player_position += 1
        else:
            print("Invalid action taken, skipping this step")
            return self.state, reward, done, truncated, info

        # Check win or lose condition and set reward
        if self.state[self.player_position] == WIN:
            reward = 1.0
            self.cumulative_reward += reward
            done = True
            clear_screen()
            print(f'Cumulative Reward: {self.cumulative_reward}')
            print('WIN !!!')
        elif self.state[self.player_position] == LOSE:
            reward = -1.0
            self.cumulative_reward += reward
            done = True
            clear_screen()
            print(f'Cumulative Reward: {self.cumulative_reward}')
            print("Lose :B")

        # Update the environment state
        if not done:
            self.state[previous_position] = NOTHING
            self.state[self.player_position] = PLAYER

        self.cumulative_reward += reward

        return self.state, reward, done, truncated, info

    def reset(self, seed=None, options=None):
        self.cumulative_reward = 0

        # Set the initial state to a flattened 6x6 grid
        self.state = [NOTHING] * 36

        # Randomly place the player, win, and lose positions
        self.player_position = random.randrange(0, 36)
        self.win_position = random.randrange(0, 36)
        self.lose_position = random.randrange(0, 36)

        # Ensure win and lose positions do not overlap with each other or with the player position
        while self.win_position == self.player_position:
            self.win_position = random.randrange(0, 36)
        while self.lose_position == self.win_position or self.lose_position == self.player_position:
            self.lose_position = random.randrange(0, 36)

        # Update the state array with the player, win, and lose positions
        self.state[self.player_position] = PLAYER
        self.state[self.win_position] = WIN
        self.state[self.lose_position] = LOSE

        # Convert the state list to a numpy array
        self.state = np.array(self.state, dtype=np.int16)

        return self.state, info

    def render(self):
        pretty_print(self.state, self.cumulative_reward)

# Instantiate the environment
env = BasicEnv()

# Check if the environment follows the gymnasium API
check_env(env)

# Create the PPO agent
model = PPO("MlpPolicy", env, verbose=1)

# Train the agent
model.learn(total_timesteps=10000)

# Save the trained agent
model.save("ppo_basic_env")

# Load the trained agent
model = PPO.load("ppo_basic_env")

# Visualize the current state of the environment
env.render()

# Test the agent
obs, info = env.reset()
for i in range(50):
    action, _states = model.predict(obs)  # Predict the action to take based on the current observation
    obs, rewards, dones, truncate, info = env.step(action)  # Take the action in the environment
    env.render()  # Render the environment to visualize the current state
    if dones:  # If the episode is done, break the loop
        break


Cumulative Reward: -2.0700000000000003

   0   0   0   0   0   2
   0   0   0   0   0   0
   0   0   0   0   0   0
   0   0   0   0   0   0
   0   0   0   0   0   0
   1   3   0   0   0   0


In [25]:
# Test the agent
def play_game(model, env, episodes=1, steps=50):
    for episode in range(episodes):
        obs, info = env.reset()
        env.render()
        done = False
        step = 0
        while not done and step < steps:
            action, _states = model.predict(obs)  # Predict the action to take based on the current observation
            obs, rewards, done, truncate, info = env.step(action)  # Take the action in the environment
            env.render()  # Render the environment to visualize the current state
            step += 1
            if done:
                print(f"Episode {episode + 1} finished after {step} steps")
                break

# Play the game
play_game(model, env)

Cumulative Reward: 1.74

   0   0   0   0   0   0
   0   0   0   0   0   0
   2   1   0   0   0   0
   3   0   0   0   0   0
   0   0   0   0   0   0
   0   0   0   0   0   0
Episode 1 finished after 33 steps
