In [None]:
!pip3 install torch torchvision torchaudio

In [None]:
!pip3 install stable-baselines3[extra] protobuf==3.20.*

In [None]:
!sudo apt install tesseract-ocr -y

In [None]:
!sudo apt-get install python3-tk python3-dev -y

In [None]:
!pip install mss pyautogui pytesseract

In [2]:
from mss import mss
import pyautogui
import cv2
import numpy as np
from matplotlib import pyplot as plt
import time
from gymnasium import Env
from gymnasium.spaces import Box, Discrete

import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [None]:
#just testing dimensions

# game_location = {'top': 125, 'left': 50, 'width': 60, 'height': 50}
# img = mss().grab(game_location)
# img = cv2.resize(np.array(img)[:, :, :3], (0, 0), fx=8, fy=8)
# gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# thr = cv2.cvtColor(cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1], cv2.COLOR_BGR2RGB)
# plt.imshow(cv2.cvtColor(thr, cv2.COLOR_BGR2RGB))
# pytesseract.image_to_string(thr, config="--psm 10 -c page_separator=''")

In [3]:
class WebGame(Env):
    """Gymnasium environment that plays an on-screen game via screenshots and key presses.

    Observations are single-channel 100x100 ``uint8`` captures of
    ``game_location``. The four discrete actions are sent to the game as arrow
    key presses with ``pyautogui``. The score and the game-over state are read
    from fixed screen regions with Tesseract OCR.

    All ``*_location`` dictionaries are hard-coded screen coordinates in the
    format expected by ``mss().grab``.
    """

    def __init__(self):
        super().__init__()
        # Setup spaces
        self.observation_space = Box(low=0, high=255, shape=(1,100,100), dtype=np.uint8)
        self.action_space = Discrete(4)
        # Capture game frames
        self.cap = mss()
        self.game_location = {'top': 100, 'left': 0, 'width': 600, 'height': 600}
        self.done_location = {'top': 600, 'left': 635, 'width': 75, 'height': 50}
        self.score_location = {'top': 125, 'left': 50, 'width': 60, 'height': 50}
        # Episode bookkeeping. reset() sets the same values; initialising them
        # here as well means step() cannot fail with AttributeError when it is
        # called before the first reset().
        self.frame = 2
        self.previous_score = "0\n"

    def step(self, action):
        """Press the key for ``action`` and return the Gymnasium 5-tuple.

        The two OCR reads are spread over a three-step cycle driven by
        ``self.frame`` so that they never run in the same step: one step
        checks for game over, the next does no OCR, the third reads the score.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``reward``
            is 1 on a score-check step whose OCR text differs from the
            previously stored text, otherwise 0. ``truncated`` is always
            ``False`` and ``info`` is always empty.
        """
        action_map = {
            0: 'up',
            1: 'down',
            2: 'left',
            3: 'right'
        }
        # int() lets NumPy integer scalars and 0-d arrays be used as the key.
        pyautogui.press(action_map[int(action)])

        reward, done = 0, False
        observation = self.get_observation()
        if self.frame == 2:
            # Game-over check step; the raw capture is not needed here.
            done, _ = self.get_done()
            self.frame = 0
        else:
            self.frame += 1
            if self.frame == 2:
                # Score check step: any change in the OCR text counts as a point.
                score = self.get_score()
                if score != self.previous_score:
                    reward = 1
                    self.previous_score = score

        info = {}
        return observation, reward, done, False, info

    def reset(self, *, seed=None, options=None, **kwargs):
        """Restart the game and return ``(observation, info)``.

        ``seed`` is forwarded to ``Env.reset`` so ``self.np_random`` is seeded
        as the Gymnasium API expects. ``options`` and any extra keyword
        arguments are accepted and ignored.
        """
        super().reset(seed=seed)
        self.previous_score = "0\n"
        time.sleep(0.5)
        # Presumably clicks the in-game restart control at this screen position.
        pyautogui.click(x=600, y=600)
        time.sleep(0.5)
        pyautogui.press('up')
        # frame == 2 makes the first step() after a reset a game-over check.
        self.frame = 2
        return self.get_observation(), {}

    def get_observation(self):
        """Grab the game region and return it as a ``(1, 100, 100)`` uint8 array."""
        raw = np.array(self.cap.grab(self.game_location))[:,:,:3].astype(np.uint8)
        gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (100,100))
        channel = np.reshape(resized, (1,100,100))
        return channel

    def _capture_for_ocr(self, location):
        """Grab ``location`` and return ``(raw_capture, ocr_image)``.

        The OCR image is the capture upscaled 8x, converted to grayscale and
        binarised with Otsu's threshold.
        """
        capture = np.array(self.cap.grab(location))[:, :, :3]
        enlarged = cv2.resize(capture, (0, 0), fx=8, fy=8)
        gray = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # threshold() yields a single-channel image, so it is expanded with
        # GRAY2RGB; BGR2RGB expects a 3- or 4-channel input.
        return capture, cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

    def get_score(self):
        """Return the raw OCR text of the score region (not parsed to a number)."""
        _, thr = self._capture_for_ocr(self.score_location)
        # --psm 10 asks Tesseract to treat the image as a single character.
        return pytesseract.image_to_string(thr, config="--psm 10 -c page_separator=''")

    def get_done(self):
        """Return ``(done, done_cap)``.

        ``done`` is True when the OCR text of the done region contains "play"
        (case-insensitive). ``done_cap`` is the unprocessed capture of that
        region.
        """
        done_cap, thr = self._capture_for_ocr(self.done_location)
        res = pytesseract.image_to_string(thr)
        done = 'play' in res.lower()
        return done, done_cap

In [None]:
# play the game but can be ignored for now

# env = WebGame()

# for episode in range(1):
#     obs = env.reset()
#     terminated = False
#     total_reward = 0
# #     obs, reward, terminated, truncated, info =  env.step(3)
# #     obs, reward, terminated, truncated, info =  env.step(1)
#     while not terminated:
#         action = model.predict(obs[0].reshape((1, 100, 100)))
#         # print(action)
#         obs, reward, terminated, truncated, info =  env.step(int(action[0]))
#         total_reward  += reward
# #         if reward != 0:
# #             print(terminated, env.get_score(), reward)
#     print('Total Reward for episode {} is {}'.format(episode, total_reward))   

In [4]:
# Import os for file path management
import os 
# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback
# Check Environment    
from stable_baselines3.common import env_checker

In [None]:
# NOTE(review): `env` is only assigned in a later cell (`env = WebGame()`); on a
# fresh kernel this cell raises NameError unless that cell has been run first.
env_checker.check_env(env)

In [5]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` callback calls.

    Args:
        check_freq: Save interval, measured in calls to ``_on_step``.
        save_path: Directory for the checkpoints. ``None`` disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call; always continue training."""
        # Same None guard as _init_callback: without it os.path.join(None, ...)
        # would raise TypeError on the first save.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # Returning True tells the caller to keep training.
        return True

In [6]:
CHECKPOINT_DIR = './train/post'
LOG_DIR = './logs/post'

In [7]:
callback = TrainAndLoggingCallback(check_freq=5000, save_path=CHECKPOINT_DIR)

In [8]:
from stable_baselines3 import PPO, DQN

In [9]:
env = WebGame()

In [None]:
# show observation
# plt.imshow(cv2.cvtColor(env.get_observation()[0], cv2.COLOR_BGR2RGB))

In [3]:
# Experimental ResNet-style feature extractor. I have also tried a wider and denser variant of the NatureCNN from the SB3 documentation.

from torch import nn
import torch as th
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CNNPart(nn.Module):
    """Three 128-channel conv blocks with average pooling and an additive skip path.

    Each block is Conv2d(kernel 5, stride 1, padding 2) -> BatchNorm2d -> ReLU,
    so the blocks keep the spatial size; only the 2x2 average pool shrinks it.
    ``forward`` returns the flattened feature map, one row per batch element.
    """

    def __init__(self, n_input_channels):
        super().__init__()

        width = 128

        def conv_bn_relu(in_channels):
            # One size-preserving convolution stage.
            return nn.Sequential(
                nn.Conv2d(in_channels, width, kernel_size=5, stride=1, padding=2),
                nn.BatchNorm2d(width),
                nn.ReLU(),
            )

        # Attribute names and creation order are kept: they determine the
        # state-dict keys and the order returned by children().
        self.convblock1 = conv_bn_relu(n_input_channels)
        self.convblock2 = conv_bn_relu(width)
        self.convblock3 = conv_bn_relu(width)

        self.pool = nn.AvgPool2d(2, 2)

    def forward(self, x):
        # Spatial size is halved after block 1 and again after block 2.
        half = self.pool(self.convblock1(x))
        quarter = self.pool(self.convblock2(half))
        # Block 3 output plus two skip paths: its own input, and the block-1
        # features pooled once more so the shapes match.
        merged = self.convblock3(quarter) + quarter + self.pool(half)
        return merged.flatten(1)
    

class CustomNatureCNN(BaseFeaturesExtractor):
    """Feature extractor: ``CNNPart`` followed by a three-stage MLP head.

    The head narrows ``4 * features_dim -> 2 * features_dim -> features_dim``;
    every stage is Linear -> BatchNorm1d -> ReLU.

    Args:
        observation_space: Channel-first image space; ``shape[0]`` is used as
            the number of input channels.
        features_dim: Size of the returned feature vector.
    """

    def __init__(self, observation_space, features_dim=512):
        super().__init__(observation_space, features_dim)

        n_input_channels = observation_space.shape[0]

        self.cnn = CNNPart(n_input_channels)

        # Infer the flattened CNN output size with one dummy forward pass.
        # The pass runs in eval mode: in training mode the BatchNorm layers
        # would fold this random sample into their running statistics.
        probe = th.as_tensor(observation_space.sample()[None]).float()
        self.cnn.eval()
        with th.no_grad():
            n_flatten = self.cnn(probe).shape[1]
        # Back to the default training mode of a freshly built module.
        self.cnn.train()

        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim * 4),
            nn.BatchNorm1d(features_dim * 4),
            nn.ReLU(),

            nn.Linear(features_dim * 4, features_dim * 2),
            nn.BatchNorm1d(features_dim * 2),
            nn.ReLU(),

            nn.Linear(features_dim * 2, features_dim),
            nn.BatchNorm1d(features_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        """Return the ``features_dim``-sized feature vector for a batch of images."""
        x = self.cnn(x)
        return self.linear(x)

In [1]:
# PPO on the CNN policy, with CustomNatureCNN plugged in as the feature extractor.
model = PPO(
    'CnnPolicy',
    env,
    tensorboard_log=LOG_DIR,
    verbose=1,
    learning_rate=0.0003,
    n_steps=128,
    ent_coef=0.01,
    gamma=0.97,
    policy_kwargs=dict(features_extractor_class=CustomNatureCNN),
)
# model = DQN('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, buffer_size=10000)

NameError: name 'PPO' is not defined

In [20]:
model.learn(total_timesteps=11000, callback=callback)

Logging to ./logs/post/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 15.5     |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 3        |
|    iterations      | 1        |
|    time_elapsed    | 40       |
|    total_timesteps | 128      |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.5         |
|    ep_rew_mean          | 0            |
| time/                   |              |
|    fps                  | 1            |
|    iterations           | 2            |
|    time_elapsed         | 166          |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 0.0015186016 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 19.7        |
|    ep_rew_mean          | 0.0714      |
| time/                   |             |
|    fps                  | 1           |
|    iterations           | 11          |
|    time_elapsed         | 1322        |
|    total_timesteps      | 1408        |
| train/                  |             |
|    approx_kl            | 0.028793875 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.34       |
|    explained_variance   | -1.94       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.095      |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0632     |
|    value_loss           | 0.00189     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 19.9  

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 18.7      |
|    ep_rew_mean          | 0.12      |
| time/                   |           |
|    fps                  | 1         |
|    iterations           | 21        |
|    time_elapsed         | 2611      |
|    total_timesteps      | 2688      |
| train/                  |           |
|    approx_kl            | 0.0312887 |
|    clip_fraction        | 0.462     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.35     |
|    explained_variance   | -0.0547   |
|    learning_rate        | 0.0003    |
|    loss                 | -0.106    |
|    n_updates            | 200       |
|    policy_gradient_loss | -0.0504   |
|    value_loss           | 0.0168    |
---------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 18.6       |
|    ep_rew_mean          | 0.11     

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 19          |
|    ep_rew_mean          | 0.15        |
| time/                   |             |
|    fps                  | 1           |
|    iterations           | 31          |
|    time_elapsed         | 3896        |
|    total_timesteps      | 3968        |
| train/                  |             |
|    approx_kl            | 0.073794484 |
|    clip_fraction        | 0.474       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.986      |
|    explained_variance   | 0.223       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0817     |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.0679     |
|    value_loss           | 0.0592      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 19.1    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 19.9        |
|    ep_rew_mean          | 0.2         |
| time/                   |             |
|    fps                  | 1           |
|    iterations           | 41          |
|    time_elapsed         | 5177        |
|    total_timesteps      | 5248        |
| train/                  |             |
|    approx_kl            | 0.038018443 |
|    clip_fraction        | 0.378       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.889      |
|    explained_variance   | 0.257       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.049      |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0201     |
|    value_loss           | 0.0217      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 19.8    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 17.1        |
|    ep_rew_mean          | 0.13        |
| time/                   |             |
|    fps                  | 1           |
|    iterations           | 51          |
|    time_elapsed         | 6473        |
|    total_timesteps      | 6528        |
| train/                  |             |
|    approx_kl            | 0.050667465 |
|    clip_fraction        | 0.261       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.453      |
|    explained_variance   | -1.5        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0571     |
|    n_updates            | 500         |
|    policy_gradient_loss | -0.0174     |
|    value_loss           | 0.00662     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.4  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 16         |
|    ep_rew_mean          | 0.05       |
| time/                   |            |
|    fps                  | 1          |
|    iterations           | 61         |
|    time_elapsed         | 7773       |
|    total_timesteps      | 7808       |
| train/                  |            |
|    approx_kl            | 0.34706905 |
|    clip_fraction        | 0.448      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.567     |
|    explained_variance   | 0.281      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0578    |
|    n_updates            | 600        |
|    policy_gradient_loss | -0.0435    |
|    value_loss           | 0.0247     |
----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 16        |
|    ep_rew_mean   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.1        |
|    ep_rew_mean          | 0.09        |
| time/                   |             |
|    fps                  | 1           |
|    iterations           | 71          |
|    time_elapsed         | 9072        |
|    total_timesteps      | 9088        |
| train/                  |             |
|    approx_kl            | 0.050124153 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.444      |
|    explained_variance   | -42.9       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00151     |
|    n_updates            | 700         |
|    policy_gradient_loss | -0.011      |
|    value_loss           | 0.000289    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.2  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 15.8       |
|    ep_rew_mean          | 0          |
| time/                   |            |
|    fps                  | 0          |
|    iterations           | 81         |
|    time_elapsed         | 10372      |
|    total_timesteps      | 10368      |
| train/                  |            |
|    approx_kl            | 0.06621829 |
|    clip_fraction        | 0.256      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.342     |
|    explained_variance   | -7.73      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0726    |
|    n_updates            | 800        |
|    policy_gradient_loss | -0.0274    |
|    value_loss           | 0.000478   |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 15.9        |
|    ep_rew_m

<stable_baselines3.ppo.ppo.PPO at 0x7f259b121790>

In [None]:
model = PPO.load('./train/post/best_model_10000.zip', env)

In [None]:
# from torchvision.models import mobilenet_v3_large

# model = mobilenet_v3_large()

In [None]:
model.policy.pi_features_extractor.cnn

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np

# Function to capture intermediate output
def get_intermediate_output(model, input_data, layer_num):
    """Run ``input_data`` through the first ``layer_num + 1`` direct children of ``model``.

    The children are chained in a plain ``torch.nn.Sequential``, so any custom
    logic in ``model.forward`` is not reproduced. Gradients are not tracked.
    """
    leading_layers = list(model.children())[: layer_num + 1]
    truncated = torch.nn.Sequential(*leading_layers)
    with torch.no_grad():
        return truncated(input_data)

# Function to visualize the most "important" feature map
def visualize_important_feature(features):
    """Plot the feature map with the largest total absolute activation.

    Args:
        features: Tensor shaped ``(channels, height, width)`` or
            ``(batch, channels, height, width)``. For a 4-D tensor only the
            first batch element is shown.
    """
    # Drop only the batch axis. A bare squeeze() would also remove a size-1
    # channel or spatial axis and break the axis=(1, 2) reduction below.
    single = features[0] if features.dim() == 4 else features
    # detach() lets tensors that still track gradients be converted to NumPy.
    feature_maps = single.detach().cpu().numpy()

    # Importance of a map = sum of absolute activations over its pixels.
    importance = np.sum(np.abs(feature_maps), axis=(1, 2))

    most_important_idx = np.argmax(importance)

    plt.imshow(feature_maps[most_important_idx], cmap='viridis')
    plt.axis('off')
    plt.show()

# Assume you have loaded or trained a Stable Baselines 3 PPO model
# model = PPO.load("your_model_path")
# Also assume 'obs' is a single observation from your environment, shaped correctly

# WebGame.reset() returns an (observation, info) tuple, so obs[0] below is the frame.
obs = env.reset()

# Get intermediate features
# NOTE(review): if this cnn is the CNNPart defined in this notebook, it has only
# four direct children, so layer 11 simply keeps all of them — confirm the intended layer.
layer_num = 11  # Replace with the layer number you're interested in
# NOTE(review): the frame is passed as raw 0-255 values; presumably the policy
# rescales images before its feature extractor — confirm and match that here if so.
features = get_intermediate_output(model.policy.features_extractor.cnn, torch.tensor(obs[0][np.newaxis, :, :, :], dtype=torch.float32), layer_num)

# Visualize the most "important" feature map
visualize_important_feature(features)


In [None]:
env.reset()

In [None]:
import torch as th
from gymnasium import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomMobileNet(BaseFeaturesExtractor):
    """Feature extractor built on torchvision's MobileNetV3-Large.

    The first convolution is replaced so the network accepts the observation
    space's channel count, and the last classifier layer is replaced so the
    output has ``features_dim`` units.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)
        # Imported here because no cell imports it at module level (the only
        # import in the notebook is commented out), which made this
        # constructor raise NameError.
        from torchvision.models import mobilenet_v3_large

        n_input_channels = observation_space.shape[0]

        # Called without arguments, as in the original cell.
        self.model = mobilenet_v3_large()

        # Same geometry as the stem convolution it replaces, but with the
        # observation's channel count as input.
        self.model.features[0][0] = th.nn.Conv2d(n_input_channels, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)

        # Swap the final classifier layer for one that emits features_dim units.
        self.model.classifier[3] = th.nn.Linear(self.model.classifier[3].in_features, features_dim, bias=True)

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the ``features_dim``-sized output of the adapted network."""
        return self.model(observations)

In [None]:
!tensorboard --logdir=logs/post/PPO_1

TensorFlow installation not found - running with reduced feature set.
/home/vini_suaiden/.local/lib/python3.9/site-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /home/vini_suaiden/.local/lib/python3.9/site-packages/tensorboard_data_server/bin/server)
/home/vini_suaiden/.local/lib/python3.9/site-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /home/vini_suaiden/.local/lib/python3.9/site-packages/tensorboard_data_server/bin/server)
/home/vini_suaiden/.local/lib/python3.9/site-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /home/vini_suaiden/.local/lib/python3.9/site-packages/tensorboard_data_server/bin/server)
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.14.1 at http://localhost:6006/ (Press CTRL+C to quit)


In [None]:
!ls logs/post40

In [None]:
!rm -r train/post31_2

In [None]:
!ls /home/vini_suaiden/.local/lib/python3.9/site-packages

In [None]:
!pip install pygad

In [None]:
def mutate_model(model, mutation_rate=0.5):
    """Add Gaussian noise, scaled by ``mutation_rate``, to every parameter in place.

    Only tensors returned by ``model.parameters()`` are changed. Nothing is returned.
    """
    for weights in model.parameters():
        perturbation = th.randn_like(weights.data) * mutation_rate
        weights.data.add_(perturbation)

In [None]:
def fitness(model):
    """Play one episode on the global ``env`` with ``model`` and return the summed reward.

    The action at each step is the argmax of ``model``'s first output row.
    The episode total is also printed.
    """
    with th.no_grad():
        state = env.reset()
        episode_return = 0
        done = False
        while not done:
            # state[0] is the frame after reset() (an (obs, info) tuple) and the
            # single channel after step(); both reshape to the same batch.
            batch = th.from_numpy(state[0].reshape((1, 1, 100, 100))).float()
            output = model(batch)
            chosen = int(th.argmax(output[0]))
            state, reward, done, _truncated, _info = env.step(chosen)
            episode_return += reward
        print(episode_return)
    return episode_return

In [None]:
import copy
import numpy as np

# 0.5 0.5 0.1
# best_net = NatureCNNAction(env.observation_space, env.action_space)

# Hill-climbing search over network weights, 20 rounds.
# NOTE(review): the line that creates `best_net` is commented out above, so it
# must already exist in the kernel — confirm where it comes from.
for i in range(20):
    # Slot 0 always holds the current best; every fitness() call plays one full episode.
    models, fitnesses = [best_net], np.array([fitness(best_net)])

    # Five mutated copies of the current best, each scored with one episode.
    for j in range(5):
        new_net = copy.deepcopy(best_net)
        mutate_model(new_net, 0.005)
        models += [new_net]
        fitnesses = np.append(fitnesses, fitness(new_net))

    # A mutant only replaces the best if it also wins a second head-to-head
    # evaluation; this plays two more episodes.
    if np.argmax(fitnesses) != 0:
        if fitness(models[np.argmax(fitnesses)]) > fitness(models[0]):
            best_net = models[np.argmax(fitnesses)]

In [None]:
def nfitness(model):
    """Play one episode on the global ``env`` with ``model``, printing every model output.

    The action at each step is the argmax of ``model``'s first output row.
    Returns the summed reward of the episode.
    """
    with th.no_grad():
        state = env.reset()
        episode_return = 0
        done = False
        while not done:
            # state[0] is the frame after reset() (an (obs, info) tuple) and the
            # single channel after step(); both reshape to the same batch.
            batch = th.from_numpy(state[0].reshape((1, 1, 100, 100))).float()
            output = model(batch)
            print(output)
            chosen = int(th.argmax(output[0]))
            state, reward, done, _truncated, _info = env.step(chosen)
            episode_return += reward
    return episode_return

In [None]:
nfitness(best_net)

In [None]:
def encode_weights(net):
    """Concatenate all parameters of ``net`` into one 1-D tensor, in ``parameters()`` order."""
    flat_chunks = []
    for tensor in net.parameters():
        flat_chunks.append(tensor.flatten())
    return th.cat(flat_chunks)

def decode_weights(vector, net):
    """Copy consecutive slices of the flat ``vector`` into the parameters of ``net``.

    Slices are taken in ``parameters()`` order and reshaped to each
    parameter's size. The parameters are overwritten in place.
    """
    offset = 0
    for tensor in net.parameters():
        count = tensor.numel()
        chunk = vector[offset:offset + count]
        tensor.data.copy_(chunk.view(tensor.size()))
        offset += count

def fitness_func(ga_instance, solution, solution_idx):
    """PyGAD fitness callback: load ``solution`` into the global ``net`` and play one episode.

    Args:
        ga_instance: The running GA object (unused).
        solution: Flat weight vector for ``net``.
        solution_idx: Index of the solution in the population (unused).

    Returns:
        The summed reward of the episode, which is also printed.
    """
    decode_weights(th.as_tensor(solution, dtype=th.float32), net)
    total_reward = 0
    # Inference only: no_grad() avoids building an autograd graph on every
    # step, matching fitness() and nfitness().
    with th.no_grad():
        obs = env.reset()
        terminated = False
        while not terminated:
            # obs[0] is the frame after reset() (an (obs, info) tuple) and the
            # single channel after step(); both reshape to the same batch.
            action = net(th.from_numpy(obs[0].reshape((1, 1, 100, 100))).float())
            obs, reward, terminated, truncated, info = env.step(int(th.argmax(action[0])))
            total_reward += reward
    print(total_reward)
    return total_reward

In [None]:
# Genetic-algorithm settings: mutation only, no crossover, one elite kept per generation.
# NOTE(review): no visible cell imports `pygad` (it is only pip-installed) or
# defines `net`; both must exist in the kernel for this cell to run — confirm.
num_generations = 100
num_parents_mating = 1
#keep_parents = 1
sol_per_pop = 5

# Only the column count of this array is used below (as num_genes); the array
# itself is never passed to pygad.GA.
# NOTE(review): presumably the GA then draws its own starting population from
# init_range_low/high — confirm whether initial_population was meant to be passed.
initial_population = th.randn((sol_per_pop, encode_weights(net).shape[0])).numpy()

ga_instance = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func,
    sol_per_pop=sol_per_pop,
    num_genes=initial_population.shape[1],
    init_range_low=-1,
    init_range_high=1,
    #parent_selection_type="rank",
    #keep_parents=keep_parents,
    keep_elitism=1,
    crossover_type=None,
    mutation_type="random",
    mutation_probability=1.0
)

In [None]:
ga_instance.run()

In [None]:
obs = env.reset()

In [None]:
obs[0].reshape((1, 100, 100))

In [None]:
!rm -r train/post36