In [None]:
import gym
import numpy as np
import pandas as pd
import yfinance as yf
from gym.spaces import Box, Discrete
from gym import Env
import torch.nn as nn
from stock_trading_env import StockTradingEnv
from ppo_agent import PPO
import matplotlib.pyplot as plt
import torch.optim as optim
import torch

In [None]:
# HYPERPARAMETERS:
# Presumably the intended number of training episodes.
# NOTE(review): no visible cell reads EPOCHS -- the training call passes a
# literal `epochs=500` instead; confirm which value is intended.
EPOCHS = 1000

In [None]:
def train_ppo(env, model, optimizer, epochs=10, gamma=0.99, clip_epsilon=0.2, batch_size=32, lambda_gae=0.95, verbose=True):
    """Train an actor-critic model with one clipped-surrogate update per episode.

    Each iteration rolls out one full episode by sampling actions from the
    policy head, computes GAE advantages from the critic's value estimates,
    and takes a single optimizer step on ``policy_loss + 0.5 * value_loss``.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        model: Callable mapping a float32 state tensor to
            ``(action_probs, value)``.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of episodes to roll out (one update per episode).
        gamma: Discount factor.
        clip_epsilon: Clipping range for the probability ratio.
        batch_size: Accepted for interface compatibility; currently unused.
        lambda_gae: GAE smoothing factor.
        verbose: When True (default), print the action probabilities and the
            sampled action at every environment step.

    Returns:
        None. ``model`` is updated in place through ``optimizer``.
    """
    for episode in range(epochs):
        state = env.reset()
        log_probs = []
        values = []
        rewards = []

        done = False
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            # Keep the critic output: it feeds both GAE and the value loss.
            action_probs, value = model(state_tensor)

            if verbose:
                print(f"Action Probabilities: {action_probs.detach().numpy()}")

            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()

            if verbose:
                print(f"PPO Chose Action: {action.item()}")

            log_prob = action_dist.log_prob(action)

            next_state, reward, done, _ = env.step(action.item())

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)

            state = next_state

        # Flatten to shape (T,). reshape(-1) (unlike squeeze()) keeps a 1-D
        # tensor even for a one-step episode, and prevents accidental (T, T)
        # broadcasting if the model emits a trailing singleton dimension.
        values = torch.stack(values).reshape(-1)
        log_probs = torch.stack(log_probs).reshape(-1)

        # Compute GAE advantages on detached value estimates; advantages are
        # targets, not something to backpropagate through.
        baseline = values.detach().tolist()
        advantages = [0.0] * len(rewards)
        gae = 0.0
        next_value = 0.0  # the rollout ends at `done`, so bootstrap from zero
        for t in reversed(range(len(rewards))):
            delta = float(rewards[t]) + gamma * next_value - baseline[t]
            gae = delta + gamma * lambda_gae * gae
            advantages[t] = gae
            next_value = baseline[t]

        advantages = torch.tensor(advantages, dtype=torch.float32)
        returns = advantages + values.detach()

        # PPO Update
        # NOTE: with a single update per rollout the "old" log-probs equal the
        # current ones, so `ratio` is identically 1 and the clip is inactive;
        # the gradient reduces to the plain advantage-weighted policy gradient.
        ratio = torch.exp(log_probs - log_probs.detach())
        surrogate1 = ratio * advantages
        surrogate2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages
        policy_loss = -torch.min(surrogate1, surrogate2).mean()
        value_loss = (returns - values).pow(2).mean()

        loss = policy_loss + 0.5 * value_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 10 == 0:
            print(f"Episode {episode}, Loss: {loss.item()}, Policy Loss: {policy_loss.item()}, Value Loss: {value_loss.item()}")



In [None]:
# Function to evaluate the trained PPO agent
def evaluate_ppo(env, model, episodes=10):
    """Run the policy greedily and record rewards and portfolio value.

    At every step the action with the highest probability is taken (no
    sampling). The forward pass runs under ``torch.no_grad()`` because no
    gradients are needed during evaluation.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, and the attributes
            ``initial_balance``, ``balance``, ``shares_held``, ``stock_data``
            and ``current_step``.
        model: Callable mapping a float32 state tensor to
            ``(action_probs, value)``.
        episodes: Number of evaluation episodes.

    Returns:
        ``(all_rewards, portfolio_values)``: the summed reward of each episode,
        and for each episode the list of portfolio values, starting with
        ``env.initial_balance`` followed by one entry per step.
    """
    all_rewards = []
    portfolio_values = []

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        portfolio_value = [env.initial_balance]

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            # Inference only: avoid building an autograd graph on every step.
            with torch.no_grad():
                action_probs, _ = model(state_tensor)
            action = torch.argmax(action_probs).item()

            state, reward, done, _ = env.step(action)
            total_reward += reward
            # Cash plus holdings valued at column 3 of the current data row
            # (presumably the close price -- confirm against StockTradingEnv).
            portfolio_value.append(env.balance + (env.shares_held * env.stock_data[env.current_step][3]))

        all_rewards.append(total_reward)
        portfolio_values.append(portfolio_value)

    return all_rewards, portfolio_values

In [None]:
# Train the agent
# Build the environment, the actor-critic network and its Adam optimizer.
# NOTE(review): no RNG seed is set in the visible cells, so action sampling
# (and therefore the training run) is not reproducible across kernel restarts.
env = StockTradingEnv()
# presumably 9 observation features and 3 discrete actions -- confirm against
# StockTradingEnv's observation/action spaces.
model = PPO(input_dim=9, output_dim=3)
optimizer = optim.Adam(model.parameters(), lr=0.0003)
env.render()
# NOTE(review): the literal 500 is used here although EPOCHS is defined in the
# hyperparameter cell -- confirm which value is intended.
train_ppo(env, model, optimizer, epochs=500)


# Evaluate the agent
# Evaluation reuses the same `env` instance that was used for training, so
# these numbers are in-sample unless the environment switches data internally.
rewards, portfolio_values = evaluate_ppo(env, model, episodes=10)

In [None]:


# # Visualization
# plt.figure(figsize=(10, 5))
# for i, values in enumerate(portfolio_values):
#     plt.plot(values, label=f"Episode {i+1}")
# plt.xlabel("Time Steps")
# plt.ylabel("Portfolio Value ($)")
# plt.title("Portfolio Value Over Time for Different Episodes")
# plt.legend()
# plt.show()

# plt.figure(figsize=(10, 5))
# plt.plot(rewards, marker="o", linestyle="-", color="b")
# plt.xlabel("Evaluation Episode")
# plt.ylabel("Total Reward")
# plt.title("Total Reward per Evaluation Episode")
# plt.show()


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

[[7.17210114e+01 7.27765906e+01 7.14668048e+01 ... 1.35480400e+08
  7.40863770e+01 7.41229780e+01]
 [7.19413358e+01 7.27717523e+01 7.17839694e+01 ... 1.46322800e+08
  7.40863770e+01 7.41229780e+01]
 [7.11278585e+01 7.26216386e+01 7.08760678e+01 ... 1.18387200e+08
  7.40863770e+01 7.41229780e+01]
 ...
 [1.28179661e+02 1.29524031e+02 1.24423341e+02 ... 8.54384000e+07
  1.31916209e+02 1.41923730e+02]
 [1.26518978e+02 1.28980357e+02 1.26261971e+02 ... 7.57037000e+07
  1.30571840e+02 1.41648888e+02]
 [1.26934142e+02 1.28456435e+02 1.25965402e+02 ... 7.70342000e+07
  1.29922391e+02 1.41378200e+02]]



  df.fillna(method='bfill', inplace=True)
