In [1]:
# Imports
import gymnasium as gym
from typing import Callable

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import VecMonitor
import torch as th
from stable_baselines3.common.callbacks import EvalCallback

import pickle

# Import Our environment
from dev_env import tradingEng



In [None]:
# Load the two path sets used to build the environments.
# NOTE(review): pickle.load executes arbitrary code embedded in the file --
# only point this at pickles produced by a trusted source.
def _read_pickle(filename):
    """Return the object stored in the pickle file at ``filename``."""
    with open(filename, "rb") as handle:
        return pickle.load(handle)


paths1 = _read_pickle("../0.7corr5001.pkl")
paths2 = _read_pickle("../0.7corr5002.pkl")


In [None]:
## LR schedule
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Build a learning-rate schedule proportional to the remaining progress.

    :param initial_value: learning rate returned when the remaining
      progress equals 1.
    :return: callable that maps the remaining progress to
      ``progress_remaining * initial_value``
    """
    def scaled_rate(progress_remaining: float) -> float:
        """
        Scale the initial learning rate by the remaining progress.

        :param progress_remaining: fraction of training left, going from
          1 (beginning) down to 0.
        :return: learning rate for that point of training
        """
        current_rate = progress_remaining * initial_value
        return current_rate

    return scaled_rate


# Def Env
def start_and_release(set, action = 'small', obs = 'xs'):
    """
    Create a ``tradingEng`` environment with the reward fixed to ``'L2'``.

    Despite the name, nothing is released here: the input is only passed
    through to the environment constructor.

    :param set: object forwarded as the first positional argument of
      ``tradingEng`` (the name shadows the builtin ``set``; kept so that
      existing keyword callers keep working).
    :param action: forwarded as ``tradingEng``'s ``action`` keyword.
    :param obs: forwarded as ``tradingEng``'s ``obs`` keyword.
    :return: the newly constructed ``tradingEng`` instance
    """
    return tradingEng(set, action=action, obs=obs, reward='L2')

# --- Training configuration ---------------------------------------------
ENV_ACTION = 'small-More-Trust'
ENV_OBS = 'xs'
# The eval log reports an episode length of 1260 (= 252 * 5) steps, so the
# rollout below spans 4 episodes per environment and a minibatch spans 2.
N_STEPS = 252 * 4 * 5
BATCH_SIZE = 252 * 2 * 5      # divides N_STEPS * n_envs (2 envs -> 4 minibatches)
TOTAL_TIMESTEPS = 3_000_000   # int, as expected by model.learn
EVAL_FREQ = 5000              # callback calls, i.e. EVAL_FREQ * n_envs timesteps


def build_envs():
    """
    Build a monitored vectorized environment with one env per path set.

    :return: ``VecMonitor`` wrapping a ``DummyVecEnv`` of two environments,
      the first created from ``paths1`` and the second from ``paths2``.
    """
    return VecMonitor(DummyVecEnv([
        lambda: start_and_release(paths1, action=ENV_ACTION, obs=ENV_OBS),
        lambda: start_and_release(paths2, action=ENV_ACTION, obs=ENV_OBS),
    ]))


envs = build_envs()
# Evaluation must run on its own environment instances: evaluating on the
# training VecEnv resets and steps it in the middle of a rollout, so training
# would resume from a stale observation and the monitor statistics would mix
# training and evaluation episodes.
# NOTE(review): this creates a second pair of tradingEng objects; confirm the
# extra memory is acceptable for these path sets.
eval_envs = build_envs()

eval_callback = EvalCallback(
    eval_envs,
    best_model_save_path='./logs/best_model',
    log_path='./logs/eval_logs/',
    eval_freq=EVAL_FREQ,
    deterministic=True,
    render=False
)

# Instantiate the agent
# optimizer_class belongs at the top level of policy_kwargs; inside net_arch
# (and spelled "optimizers_class") it was silently ignored.
policy_kwargs = dict(
    activation_fn=th.nn.LeakyReLU,
    net_arch=dict(
        pi=[512, 512, 256, 128, 64, 64, 64, 64, 36, 18],
        vf=[512, 512, 256, 128, 64, 64, 64, 64, 36, 18],
    ),
    optimizer_class=th.optim.Adam,
)
model = PPO(
    "MlpPolicy",
    envs,
    batch_size=BATCH_SIZE,
    learning_rate=linear_schedule(0.005),
    policy_kwargs=policy_kwargs,
    n_steps=N_STEPS,
    normalize_advantage=True,
    gamma=0.9,
    verbose=1,
)

model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=2, callback=eval_callback)
# Save the agent
model.save("0.7basfall")



  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Using cpu device
Eval num_timesteps=10000, episode_reward=-0.00 +/- 0.00
Episode length: 1260.00 +/- 0.00
----------------------------------
| eval/              |           |
|    mean_ep_length  | 1.26e+03  |
|    mean_reward     | -6.83e-06 |
| time/              |           |
|    total_timesteps | 10000     |
----------------------------------
New best mean reward!
Eval num_timesteps=20000, episode_reward=-0.00 +/- 0.00
Episode length: 1260.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1.26e+03     |
|    mean_reward          | -0.000233    |
| time/                   |              |
|    total_timesteps      | 20000        |
| train/                  |              |
|    approx_kl            | 0.008831629  |
|    clip_fraction        | 0.0575       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.8         |
|    explained_variance   | -0.020309329 |
|    learning_rate    

KeyboardInterrupt: 

In [None]:
# Read the evaluation history from the directory used as EvalCallback's
# log_path. The context manager closes the underlying .npz file handle once
# the arrays have been read into memory.
with np.load('./logs/eval_logs/evaluations.npz') as data:
    timesteps = data['timesteps'].flatten()
    mean_rewards = data['results'].mean(axis=1)  # average reward per eval
    std_rewards = data['results'].std(axis=1)    # standard deviation per eval

SMOOTHING_WINDOW = 10  # number of consecutive evaluations averaged together

# Convert to pandas Series for rolling
timesteps_s = pd.Series(timesteps)
mean_s = pd.Series(mean_rewards)
std_s = pd.Series(std_rewards)

# Rolling averages; min_periods=1 keeps the first points instead of NaN
mean_smoothed = mean_s.rolling(window=SMOOTHING_WINDOW, min_periods=1).mean()
std_smoothed = std_s.rolling(window=SMOOTHING_WINDOW, min_periods=1).mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(timesteps_s, mean_smoothed, label='Mean Eval Reward (Smoothed)', linewidth=2)
ax.fill_between(timesteps_s, mean_smoothed - std_smoothed, mean_smoothed + std_smoothed,
                alpha=0.3, label='±1 Std. Dev.', color='tab:blue')

ax.set_xlabel("Timesteps", fontsize=12)
ax.set_ylabel("Evaluation Reward", fontsize=12)
ax.set_title("PPO Convergence Over Time", fontsize=14)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend()
# Fixed view: only timesteps 1M-3M and this reward band are shown, so a run
# that stopped before 1M timesteps produces an empty plot.
ax.set_xlim(1_000_000, 3_000_000)
ax.set_ylim(-0.005, 0.001)
fig.tight_layout()
plt.show()
