In [None]:
import numpy as np
import math
import gym
from gym import spaces
from random import random
from os import path
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout

In [None]:
class TradingEnvironment(gym.Env):
    """Gym environment that replays daily price changes of one randomly chosen symbol.

    On every step the agent picks LONG or SHORT.  Repeating the current
    direction stacks one more unit position; picking the opposite direction
    closes every open position (which produces the step's reward) and opens
    a single position in the new direction.
    """

    # Multiplier applied to every open position on each step (1.0 leaves them unchanged).
    PENALTY_FACTOR = 1.0

    def __init__(self, data_dir, target_symbols, input_symbols, start_date, end_date, window_size=60, stop_loss=-1.0, use_cumulative_reward=False):
        """Load ``<data_dir>/<symbol>.csv`` for every symbol and start the first episode.

        Args:
            data_dir: Directory holding one CSV per symbol; each row is
                ``date,open,high,low,close,volume``.
            target_symbols: Symbols an episode may be played on.
            input_symbols: Additional symbols to load alongside the targets.
            start_date: Rows whose date compares lower (as strings) are skipped.
            end_date: An episode ends once the current date reaches this value
                (string comparison).
            window_size: Number of past days contained in each observation.
            stop_loss: Per-position threshold; closing positions for a smaller
                reward ends the episode.
            use_cumulative_reward: When true, a realised reward is divided by
                the number of positions that were closed.
        """
        self.start_date = start_date
        self.end_date = end_date
        self.window_size = window_size
        self.stop_loss = stop_loss
        self.use_cumulative_reward = use_cumulative_reward

        # Start from empty lists so that only symbols whose data actually
        # loaded are kept, and so the caller's lists are never mutated.
        self.input_symbols = []
        self.target_symbols = []
        self.data_store = {}

        # Load data for each symbol
        for symbol in (list(target_symbols) + list(input_symbols)):
            file_path = f"{data_dir}/{symbol}.csv"
            try:
                symbol_data = self._read_symbol_file(file_path, start_date)
            except Exception as e:
                # Best effort: a missing or malformed file only drops that symbol.
                print(f"Error loading file: {file_path}\nException: {e}")
                continue

            # Store data if enough records exist
            if len(symbol_data) > window_size:
                self.data_store[symbol] = symbol_data
                if symbol in target_symbols and symbol not in self.target_symbols:
                    self.target_symbols.append(symbol)
                if symbol in input_symbols and symbol not in self.input_symbols:
                    self.input_symbols.append(symbol)

        self.actions = ["LONG", "SHORT"]
        self.action_space = spaces.Discrete(len(self.actions))
        self.observation_space = spaces.Box(np.ones(window_size * (len(input_symbols) + 1)) * -1, np.ones(window_size * (len(input_symbols) + 1)))

        self.reset()
        self._seed()

    @staticmethod
    def _read_symbol_file(file_path, start_date):
        """Parse one CSV into ``{date: (high_change, low_change, close_change, volume_change)}``."""
        symbol_data = {}
        last_close = 0
        last_volume = 0

        with open(file_path, "r") as file:
            for line in file:
                row = line.strip()
                if not row:
                    continue
                # A row with the wrong number of columns raises here and
                # aborts the whole file (handled by the caller).
                dt, open_price, high, low, close, volume = row.split(",")
                try:
                    if dt >= start_date:
                        high = float(high) if high else float(close)
                        low = float(low) if low else float(close)
                        close = float(close)
                        volume = int(volume)

                        # Changes need a valid previous day to be relative to.
                        if last_close > 0 and close > 0 and last_volume > 0:
                            close_change = (close - last_close) / last_close
                            high_change = (high - close) / close
                            low_change = (low - close) / close
                            volume_change = (volume - last_volume) / last_volume
                            symbol_data[dt] = (high_change, low_change, close_change, volume_change)

                        last_close = close
                        last_volume = volume
                except Exception as e:
                    # Unparseable values (e.g. a header row) only skip that row.
                    print(f"Error parsing line: {row.split(',')}\nException: {e}")

        return symbol_data

    def step(self, action):
        """Apply ``action`` (index into ``self.actions``) and advance one day.

        Returns:
            ``(state, reward, done, info)``.  ``info`` carries the current date
            (``dt``), the compounded price change of the symbol since the
            episode started (``cum``) and the symbol (``code``); it is empty
            when the episode had already finished.
        """
        if self.done:
            return self.state, self.reward, self.done, {}

        self.reward = 0
        if self.actions[action] == "LONG":
            if sum(self.positions) < 0:
                # Close the open short positions (stored as negative values).
                for pos in self.positions:
                    self.reward += -(pos + 1)
                if self.use_cumulative_reward:
                    self.reward /= max(1, len(self.positions))
                if self.stop_loss * len(self.positions) > self.reward:
                    self.done = True
                self.positions = []
            self.positions.append(1.0)
        elif self.actions[action] == "SHORT":
            if sum(self.positions) > 0:
                # Close the open long positions.
                for pos in self.positions:
                    self.reward += pos - 1
                if self.use_cumulative_reward:
                    self.reward /= max(1, len(self.positions))
                if self.stop_loss * len(self.positions) > self.reward:
                    self.done = True
                self.positions = []
            self.positions.append(-1.0)

        # Update state and cumulative profit/loss
        price_change = self.current_target[self.target_dates[self.current_index]][2]
        self.cumulative_profit *= (1 + price_change)

        # The direction is decided once for the whole book, then every
        # position is scaled by the same factor.
        direction = -1 if sum(self.positions) < 0 else 1
        growth = TradingEnvironment.PENALTY_FACTOR * (1 + price_change * direction)
        self.positions = [pos * growth for pos in self.positions]

        # NOTE(review): update_state() runs before current_index is advanced,
        # so the market window in the returned state ends one day before the
        # price change that was just applied -- confirm this lag is intended.
        self.update_state()
        self.current_index += 1

        if self.current_index >= len(self.target_dates) or self.end_date <= self.target_dates[self.current_index]:
            self.done = True

        if self.done:
            # Liquidate whatever is still open at the end of the episode.
            for pos in self.positions:
                self.reward += (pos * (1 if sum(self.positions) > 0 else -1)) - 1
            if self.use_cumulative_reward:
                self.reward /= max(1, len(self.positions))
            self.positions = []

        # When the episode ended by running past the last row, current_index
        # is out of range; report the last available date instead of raising.
        date_index = min(self.current_index, len(self.target_dates) - 1)
        return self.state, self.reward, self.done, {"dt": self.target_dates[date_index], "cum": self.cumulative_profit, "code": self.current_symbol}

    def reset(self):
        """Pick a random target symbol, rewind to the first full window and return the state.

        Raises:
            ValueError: If no target symbol has usable data.
        """
        if not self.target_symbols:
            raise ValueError("No target symbol with enough data was loaded; check data_dir, the symbol list and the date range.")

        self.current_symbol = self.target_symbols[int(random() * len(self.target_symbols))]
        self.current_target = self.data_store[self.current_symbol]
        self.target_dates = sorted(self.current_target.keys())
        self.current_index = self.window_size
        self.positions = []
        self.cumulative_profit = 1.0
        self.done = False
        self.reward = 0
        self.update_state()
        return self.state

    def render(self, mode='human', close=False):
        """Return the current state, or ``None`` when ``close`` is set."""
        if close:
            return
        return self.state

    def _seed(self):
        """Return a random integer in ``[0, 100)``; no generator is actually seeded."""
        return int(random() * 100)

    def update_state(self):
        """Rebuild ``self.state`` from the open positions and the latest price window.

        The state is a list of two arrays: ``[[budget, size, direction]]`` and
        ``[[price_changes, volume_changes]]``, the latter covering the
        ``window_size`` days before ``current_index``, most recent first.
        """
        temp_state = []
        position_budget = (sum(self.positions) / len(self.positions)) if len(self.positions) > 0 else 1.0
        position_size = math.log(max(1.0, len(self.positions)), 100)
        position_direction = 1.0 if sum(self.positions) > 0 else 0.0
        temp_state.append([[position_budget, position_size, position_direction]])

        price_changes = []
        volume_changes = []
        for i in range(self.window_size):
            try:
                day = self.current_target[self.target_dates[self.current_index - 1 - i]]
                price_changes.append([day[2]])
                volume_changes.append([day[3]])
            except Exception as e:
                # Not enough history for a full window: end the episode.
                print(f"Error updating state: {e}")
                self.done = True
        temp_state.append([[price_changes, volume_changes]])

        temp_state = [np.array(i) for i in temp_state]
        self.state = temp_state


In [None]:
# This is an abstract class. You need to implement your own model.
class BaseModelBuilder:
    """Base for model builders: subclasses supply ``build_model``, this class adds weight loading."""

    def __init__(self, model_weights_path=None):
        # Optional path of a weights file to restore in get_model().
        self.model_weights_path = model_weights_path

    def get_model(self):
        """
        Build the model and, when a weights file is configured and present,
        try to restore the weights from it.

        A failure while loading is reported on stdout and the freshly built
        model is returned anyway.
        """
        model = self.build_model()
        weights_file = self.model_weights_path

        # Nothing to restore: no path configured, or the file is not there.
        if not weights_file or not path.isfile(weights_file):
            return model

        try:
            model.load_weights(weights_file)
        except Exception as err:
            print(f"Error loading weights: {err}")
        return model

    def build_model(self):
        """
        Construct and return the model. Subclasses must override this;
        the base implementation always raises NotImplementedError.
        """
        raise NotImplementedError("You must implement your own model by overriding the 'build_model' method.")

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Conv2D, Flatten, Dropout, LeakyReLU, concatenate

In [None]:
class MarketPolicyGradientModelBuilder(BaseModelBuilder):
    """Builds the two-input network that outputs a softmax over the two actions."""

    def build_model(self):
        """
        Build the (uncompiled) policy network.

        Inputs are a 3-value feature vector and a ``[2, 60, 1]`` history
        tensor.  Two convolutional branches read the history tensor; their
        dense summaries are concatenated with the feature branch and passed
        through three dense layers to a 2-unit softmax.

        NOTE(review): earlier revisions also created convolutions of height
        3, 5, 10 and 20 on the history input but never connected them to the
        output, so they were not part of the model; they are no longer
        built.  If a multi-scale merge was intended, those branches have to
        be appended to ``merged_layers`` explicitly.

        NOTE(review): 'valid' kernels of height 40 and 60 need a first
        spatial axis of at least that size.  With Keras' default
        channels_last layout the ``[2, 60, 1]`` input has a height of 2, so
        this presumably expects the channels_first image data format --
        confirm before use.
        """
        # Input for basic features
        basic_input = Input(shape=(3,))
        basic_dense = Dense(5, activation="relu")(basic_input)

        # Input for sequential data (e.g., market history)
        seq_input = Input(shape=[2, 60, 1])

        # Branch 1: height-40 convolution summarised by a dense layer
        conv_wide = Conv2D(2048, (40, 1), padding='valid')(seq_input)
        conv_wide = LeakyReLU(0.001)(conv_wide)
        dense_wide = Dense(512)(Flatten()(conv_wide))
        dense_wide = LeakyReLU(0.001)(dense_wide)

        # Branch 2: height-60 convolution summarised by a dense layer
        conv_full = Conv2D(2048, (60, 1), padding='valid')(seq_input)
        conv_full = LeakyReLU(0.001)(conv_full)
        dense_full = Dense(512)(Flatten()(conv_full))
        dense_full = LeakyReLU(0.001)(dense_full)

        merged_layers = [basic_dense, dense_wide, dense_full]

        # Merge all layers and finalize the model
        merged_output = concatenate(merged_layers, axis=1)
        dense1 = Dense(1024)(merged_output)
        dense1 = LeakyReLU(0.001)(dense1)
        dense2 = Dense(512)(dense1)
        dense2 = LeakyReLU(0.001)(dense2)
        dense3 = Dense(256)(dense2)
        dense3 = LeakyReLU(0.001)(dense3)
        output = Dense(2, activation='softmax')(dense3)

        return Model(inputs=[basic_input, seq_input], outputs=output)


class MarketModelBuilder(BaseModelBuilder):
    """Builds the two-input network with two linear outputs, one per action."""

    def build_model(self):
        """
        Build the (uncompiled) network.

        Inputs are a 3-value feature vector and a ``[2, 60, 1]`` history
        tensor.  Two convolutional branches read the history tensor; their
        dense summaries are concatenated with the feature branch and passed
        through three dense layers to a 2-unit linear output.

        NOTE(review): earlier revisions also created 32/64/128/256-filter
        convolutions on the history input but never connected them to the
        output, so they were not part of the model; they are no longer
        built.  If they were meant to be stacked or merged, that has to be
        wired explicitly.
        """
        # A rate of 0.0 makes the Dropout layers pass-through; they are kept
        # so the rate can be tuned in one place.
        dropout_rate = 0.0

        # Input for basic features
        basic_input = Input(shape=(3,))
        basic_dense = Dense(5, activation="relu")(basic_input)

        # Input for sequential data (e.g., market history)
        seq_input = Input(shape=[2, 60, 1])  # Adjust input size if needed

        # Branch 1: 512-filter convolution summarised by a dense layer
        conv_small = Conv2D(512, (2, 1), padding='same')(seq_input)
        conv_small = LeakyReLU(0.001)(conv_small)
        dense_small = Dense(1024)(Flatten()(conv_small))
        dense_small = LeakyReLU(0.001)(dense_small)
        dense_small = Dropout(dropout_rate)(dense_small)

        # Branch 2: 1024-filter convolution summarised by a dense layer
        conv_large = Conv2D(1024, (2, 1), padding='same')(seq_input)
        conv_large = LeakyReLU(0.001)(conv_large)
        dense_large = Dense(1024)(Flatten()(conv_large))
        dense_large = LeakyReLU(0.001)(dense_large)
        dense_large = Dropout(dropout_rate)(dense_large)

        merged_layers = [basic_dense, dense_small, dense_large]

        # Merge all layers and finalize the model
        merged_output = concatenate(merged_layers, axis=1)
        dense1 = Dense(512)(merged_output)
        dense1 = LeakyReLU(0.001)(dense1)
        dense1 = Dropout(dropout_rate)(dense1)
        dense2 = Dense(256)(dense1)
        dense2 = LeakyReLU(0.001)(dense2)
        dense2 = Dropout(dropout_rate)(dense2)
        dense3 = Dense(128)(dense2)
        dense3 = LeakyReLU(0.001)(dense3)
        dense3 = Dropout(dropout_rate)(dense3)
        output = Dense(2, activation='linear')(dense3)

        return Model(inputs=[basic_input, seq_input], outputs=output)

In [None]:
from keras.optimizers import SGD
import sys
import codecs

class TerminalColors:
    """ANSI escape sequences used to colorize console output."""

    # Foreground colors
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets color and attributes
    ENDC = '\033[0m'


class ExperienceReplay:
    """Fixed-size memory of transitions that produces Q-learning training batches."""

    def __init__(self, max_memory=100, discount=0.9):
        """
        Args:
            max_memory: Maximum number of transitions kept; the oldest is
                dropped when the limit is exceeded.
            discount: Factor applied to the best next-state value when
                building targets.
        """
        self.max_memory = max_memory
        self.memory = []
        self.discount = discount

    def remember(self, states, game_over):
        """Store a transition in the replay memory.

        Args:
            states: ``[state_t, action_t, reward_t, state_tp1]``.
            game_over: Whether the transition ended the episode.
        """
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10):
        """Sample transitions (with replacement) and build inputs and targets for training.

        The model is queried twice for the whole sample -- once for the
        current states and once for the next states -- instead of twice per
        transition.

        Args:
            model: Object exposing ``output_shape`` and
                ``predict(inputs, verbose=0)`` (a Keras model).
            batch_size: Upper bound on the number of sampled transitions.

        Returns:
            ``(inputs, targets)``: ``inputs`` is a list with one array per
            model input; ``targets`` has shape ``(samples, num_actions)`` and
            equals the model's own prediction except for the action taken,
            which is set to ``reward`` (terminal) or
            ``reward + discount * max(next prediction)``.

        Raises:
            IndexError: If the memory is empty.
        """
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        dim = len(self.memory[0][0][0])
        sample_count = min(len_memory, batch_size)

        if sample_count <= 0:
            return [np.array([]) for _ in range(dim)], np.zeros((0, num_actions))

        picks = np.random.randint(0, len_memory, size=sample_count)
        transitions = [self.memory[idx] for idx in picks]

        # Every stored state holds one array per model input with a leading
        # batch axis of 1; strip it and stack the samples instead.
        inputs = [
            np.array([states[0][j][0] for states, _ in transitions])
            for j in range(dim)
        ]
        next_inputs = [
            np.array([states[3][j][0] for states, _ in transitions])
            for j in range(dim)
        ]

        targets = np.array(model.predict(inputs, verbose=0), dtype=float)
        best_next = np.max(model.predict(next_inputs, verbose=0), axis=1)

        for i, (states, game_over) in enumerate(transitions):
            _, action_t, reward_t, _ = states
            if game_over:
                targets[i, action_t] = reward_t
            else:
                targets[i, action_t] = reward_t + self.discount * best_next[i]

        return inputs, targets


if __name__ == "__main__":
    # Train the Q-network with epsilon-greedy exploration and experience replay.
    code_list_file = "inputs.csv"
    model_file = None

    # The first column of every non-blank line is a symbol code; lines are
    # comma-separated when they contain a comma, tab-separated otherwise.
    code_map = []
    with codecs.open(code_list_file, "r", "utf-8") as f:
        for line in f:
            if line.strip():
                tokens = line.strip().split(",") if "," in line else line.strip().split("\t")
                code_map.append(tokens[0])

    env = TradingEnvironment(
        data_dir="data/",
        target_symbols=code_map,
        input_symbols=[],
        start_date="2013-08-26",
        end_date="2015-08-25"
    )

    # Parameters
    epsilon = 0.5  # Exploration rate
    min_epsilon = 0.1
    epochs = 100000
    max_memory = 5000
    batch_size = 128
    discount = 0.8

    # Build and compile the model
    model = MarketModelBuilder(model_file).get_model()
    model.compile(loss='mse', optimizer='rmsprop')

    # Initialize experience replay
    experience_replay = ExperienceReplay(max_memory=max_memory, discount=discount)

    # Training loop
    win_count = 0
    for epoch in range(epochs):
        loss = 0.0
        game_over = False
        cum_reward = 0
        state = env.reset()

        while not game_over:
            previous_state = state
            is_random_action = False
            q_values = None

            # Choose action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, env.action_space.n)
                is_random_action = True
            else:
                q_values = model.predict(previous_state, verbose=0)

                # A NaN prediction means training has diverged; stop before
                # acting on it.
                if np.isnan(q_values).any():
                    print("Encountered NaN in Q-values!")
                    sys.exit(1)

                action = np.argmax(q_values[0])

            # Take action, observe reward and next state
            state, reward, game_over, info = env.step(action)
            cum_reward += reward

            # Print action info: date, colored action, running reward,
            # compounded price change, and the Q-values for greedy actions.
            action_name = env.actions[action]
            if is_random_action:
                color = TerminalColors.WARNING if action_name == "LONG" else TerminalColors.OKGREEN
                q_text = ""
            else:
                color = TerminalColors.FAIL if action_name == "LONG" else TerminalColors.OKBLUE
                q_text = "\t".join(f"{name}:{q:.2f}" for name, q in zip(env.actions, q_values[0].tolist()))
            print(f"{info.get('dt', '')}:\t{color}{action_name}{TerminalColors.ENDC}\t{cum_reward:.2f}\t{info.get('cum', env.cumulative_profit):.2f}\t{q_text}")

            # Store experience
            experience_replay.remember([previous_state, action, reward, state], game_over)

            # Train model on the experience batch
            inputs, targets = experience_replay.get_batch(model, batch_size=batch_size)
            loss += model.train_on_batch(inputs, targets)

        if cum_reward > 0 and game_over:
            win_count += 1

        print(f"Epoch {epoch:03d}/{epochs} | Loss {loss:.4f} | Win count {win_count} | Epsilon {epsilon:.4f}")

        # Save the model after each epoch.  The default name ends in
        # ".weights.h5" because newer Keras versions reject other suffixes
        # in save_weights(); older versions still write HDF5 for it.
        model.save_weights(model_file or "model.weights.h5", overwrite=True)

        # Decay epsilon
        epsilon = max(min_epsilon, epsilon * 0.99)


2013-11-27:	[93mLONG[0m	0.00	0.98	
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 378ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
2013-11-28:	[94mSHORT[0m	-0.02	0.98	LONG:-0.15	SHORT:-0.03
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
2013-11-29:	[93mLONG[0m	-0.02	1.00	
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/st



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
2013-12-04:	[94mSHORT[0m	-0.03	1.06	LONG:-0.03	SHORT:0.03
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/ste



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
2013-12-05:	[91mLONG[0m	-0.04	1.04	LONG:0.02	SHORT:0.01
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/st

In [None]:
from market_model_builder import MarketPolicyGradientModelBuilder

class TerminalColors:
    """ANSI escape sequences used to colorize console output."""

    # Foreground colors
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets color and attributes
    ENDC = '\033[0m'


class PolicyGradientAgent:
    """Trains the policy network on complete episodes of the given environment."""

    def __init__(self, environment, discount_rate=0.99, model_file=None, history_file=None):
        """
        Args:
            environment: Environment exposing ``reset``, ``step``, ``actions``
                and ``action_space.n``.
            discount_rate: Factor used by ``discount_rewards``.
            model_file: Weights file to restore from and save to; saving is
                skipped when it is ``None``.
            history_file: Optional file that receives one summary line per
                episode.
        """
        self.env = environment
        self.discount_rate = discount_rate
        self.model_file = model_file
        self.history_file = history_file

        self.model = MarketPolicyGradientModelBuilder(self.model_file).get_model()
        self.model.compile(loss='mse', optimizer='rmsprop')

    def discount_rewards(self, rewards):
        """Propagate rewards backwards in time with ``discount_rate``.

        Walking from the last step to the first, the running sum is reset
        whenever a step has a non-zero reward, so a reward is only spread
        over the zero-reward steps that precede it.

        Args:
            rewards: Array of per-step rewards (any shape; read in flat order).

        Returns:
            Float array with the same shape as ``rewards``.
        """
        rewards = np.asarray(rewards)
        flat_rewards = rewards.flatten()
        discounted = np.zeros(flat_rewards.size, dtype=float)
        running_sum = 0

        for t in reversed(range(flat_rewards.size)):
            if flat_rewards[t] != 0:
                running_sum = 0

            running_sum = running_sum * self.discount_rate + flat_rewards[t]
            discounted[t] = running_sum

        return discounted.reshape(rewards.shape)

    @staticmethod
    def _stack_observations(observations):
        """Turn a list of multi-input observations into one batch array per model input."""
        # Each observation is a list with one array per model input, each
        # carrying a leading batch axis of 1 that is dropped before stacking.
        input_count = len(observations[0])
        return [
            np.array([obs[j][0] for obs in observations])
            for j in range(input_count)
        ]

    def train(self, max_episodes=1000000, max_steps_per_episode=200, verbosity=0):
        """Play episodes and fit the model on each finished episode.

        Args:
            max_episodes: Number of episodes to play.
            max_steps_per_episode: Currently unused; episodes run until the
                environment reports completion.
            verbosity: ``> 0`` prints one line per step, ``> 1`` additionally
                prints the per-step training targets.
        """
        env = self.env
        model = self.model
        avg_reward_sum = 0.0

        for episode in range(max_episodes):
            observation = env.reset()
            game_over = False
            reward_sum = 0
            info = {}

            observations = []
            actions = []
            predicted_probs = []
            rewards = []

            while not game_over:
                action_probabilities = model.predict(observation, verbose=0)[0]
                observations.append(observation)
                predicted_probs.append(action_probabilities)

                if action_probabilities.shape[0] > 1:
                    # Sample an action from the predicted distribution.
                    action = np.random.choice(self.env.action_space.n, 1, p=action_probabilities / np.sum(action_probabilities))[0]

                    one_hot_action = np.zeros([self.env.action_space.n])
                    one_hot_action[action] = 1.0
                    actions.append(one_hot_action)
                else:
                    action = 0 if np.random.uniform() < action_probabilities else 1
                    actions.append([float(action)])

                observation, reward, game_over, info = self.env.step(action)
                reward_sum += float(reward)
                rewards.append(float(reward))

                if verbosity > 0:
                    action_name = env.actions[action]
                    color = TerminalColors.FAIL if action_name == "LONG" else TerminalColors.OKBLUE
                    prob_text = "\t".join(f"{name}:{p:.2f}" for name, p in zip(env.actions, action_probabilities.tolist()))
                    print(f"{info.get('dt', '')}:\t{color}{action_name}{TerminalColors.ENDC}\t{reward_sum:.2f}\t{info.get('cum', float('nan')):.2f}\t{prob_text}")

            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            output_string = f"{episode}\t{info.get('code', '')}\t{(TerminalColors.FAIL if reward_sum >= 0 else TerminalColors.OKBLUE)}" + \
                            f"{reward_sum:.2f}{TerminalColors.ENDC}\t{info.get('cum', float('nan')):.2f}\t{avg_reward_sum:.2f}"
            print(output_string)

            if self.history_file:
                with open(self.history_file, 'a') as history:
                    history.write(f"{output_string}\n")

            # The model has several inputs of different shapes, so the
            # episode is batched per input rather than flattened into one array.
            model_inputs = self._stack_observations(observations)
            actions_reshaped = np.vstack(actions)
            predicted_probs_reshaped = np.vstack(predicted_probs)
            rewards_reshaped = np.vstack(rewards)

            discounted_rewards = self.discount_rewards(rewards_reshaped)
            # Normalise only when there is spread; an all-equal episode would
            # otherwise divide by zero and fill the targets with NaN.
            reward_std = np.std(discounted_rewards)
            if reward_std > 0:
                discounted_rewards /= reward_std

            for i, (reward, discounted_reward) in enumerate(zip(rewards, discounted_rewards)):
                if verbosity > 1:
                    print(actions_reshaped[i], end=' ')

                # A negative return pushes the target towards the actions
                # that were not taken.
                if discounted_reward < 0:
                    actions_reshaped[i] = 1 - actions_reshaped[i]
                    actions_reshaped[i] /= sum(actions_reshaped[i])

                # Move the prediction towards the target in proportion to
                # the size of the return.
                actions_reshaped[i] = np.clip(predicted_probs_reshaped[i] + (actions_reshaped[i] - predicted_probs_reshaped[i]) * abs(discounted_reward), 0, 1)

                if verbosity > 1:
                    print(predicted_probs_reshaped[i], actions_reshaped[i], reward, discounted_reward)

            model.fit(model_inputs, actions_reshaped, epochs=1, verbose=0, shuffle=True)
            if self.model_file:
                model.save_weights(self.model_file)


if __name__ == "__main__":
    # Entry point: train the policy-gradient agent on the symbols listed in inputs.csv.
    import sys
    import codecs

    code_list_file = "inputs.csv"
    model_file = None
    history_file = None

    # Collect the first column of every non-blank line; a line is split on
    # commas when it contains one and on tabs otherwise.
    code_map = []
    with codecs.open(code_list_file, "r", "utf-8") as f:
        for raw_line in f:
            row = raw_line.strip()
            if not row:
                continue
            separator = "," if "," in raw_line else "\t"
            code_map.append(row.split(separator)[0])

    env_settings = dict(
        data_dir="data/",
        target_symbols=code_map,
        input_symbols=[],
        start_date="2010-08-26",
        end_date="2015-08-25",
    )
    env = TradingEnvironment(**env_settings)

    policy_gradient_agent = PolicyGradientAgent(
        env,
        discount_rate=0.9,
        model_file=model_file,
        history_file=history_file,
    )
    policy_gradient_agent.train(verbosity=1)