In [2]:
import os
# Keep using keras-2 (tf-keras) rather than keras-3 (keras).
os.environ['TF_USE_LEGACY_KERAS'] = '1'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.policies import policy_saver
from utils import load_and_prepare_data, SYMBOLS, DATA_FILEPATH, POLICY_SAVE_PATH, CONTEXT_LENGTH, NUM_TRAINING_STEPS, ALPHA
from environment import CryptoTradingEnvironment

# --- Main Training Script ---
print("--- Starting Bandit Training Script ---")

# 1. Load Data
all_data = load_and_prepare_data(DATA_FILEPATH, SYMBOLS)
training_data = all_data.iloc[:NUM_TRAINING_STEPS]

# 2. Setup Environment
tf_env = tf_py_environment.TFPyEnvironment(
    CryptoTradingEnvironment(data=training_data, symbols=SYMBOLS, context_len=CONTEXT_LENGTH)
)

# 3. Setup Agent
agent = lin_ucb_agent.LinearUCBAgent(
    time_step_spec=tf_env.time_step_spec(),
    action_spec=tf_env.action_spec(),
    alpha=ALPHA,
    dtype=tf.float32
)

# 4. Setup Metrics and Oracle
def optimal_reward_oracle(observation):
    py_env = tf_env.pyenv.envs[0]
    current_step = py_env._current_step_index
    if current_step >= len(py_env._data) - 2: return 0.0
    
    rewards = []
    for action in range(py_env.action_spec().maximum + 1):
        idx, is_buy = action // 2, action % 2 == 0
        col = f'{py_env._symbols[idx]}_close'
        p_curr = py_env._data.iloc[current_step][col]
        p_next = py_env._data.iloc[current_step + 1][col]
        reward = (p_next - p_curr) / p_curr if is_buy else (p_curr - p_next) / p_curr
        rewards.append(reward)
    return np.max(rewards).astype(np.float32)

regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_oracle)

class ShowProgress:
    def __init__(self, total, interval=50):
        self.counter = 0
        self.total = total
    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % interval == 0:
            print("\r{}/{} Reward: {}".format(self.counter, self.total, trajectory.reward), end="")

# 5. Setup Driver
def train_step(trajectory):
    if not trajectory.is_last():
        time_axised_trajectory = tf.nest.map_structure(lambda t: tf.expand_dims(t, 1), trajectory)
        agent.train(time_axised_trajectory)
        
num_steps = len(training_data) - CONTEXT_LENGTH - 5

driver = dynamic_step_driver.DynamicStepDriver(
    env=tf_env,
    policy=agent.policy,
    num_steps=num_steps,
    observers=[train_step, regret_metric, ShowProgress(num_steps)]
)

# 6. Run Training
print(f"\nStarting training for {driver._num_steps} steps...")
driver.run()
print("Training finished.")

# 7. Save Policy
print(f"\nSaving the trained policy to: {POLICY_SAVE_PATH}")
saver = policy_saver.PolicySaver(agent.policy)
saver.save(POLICY_SAVE_PATH)
print("Policy saved successfully.")

# 8. Report Results
cumulative_regret = regret_metric.result().numpy()
print(f"\nCumulative Regret vs. Perfect Foresight Oracle: {cumulative_regret:.4f}")

2025-07-14 17:34:09.900150: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-14 17:34:09.927362: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-14 17:34:09.927381: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-14 17:34:09.928061: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-14 17:34:09.932298: I tensorflow/core/platform/cpu_feature_guar

--- Starting Bandit Training Script ---
Loading data from data/ohlcv.csv.gz...
Processing symbols: ['BTC', 'DOGE', 'XRP', 'ETH', 'SOL']
Data prepared. Shape: (13093, 15)

Starting training for 985 steps...


2025-07-14 17:34:11.550478: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-07-14 17:34:11.550495: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: d7a4e586fbec
2025-07-14 17:34:11.550498: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: d7a4e586fbec
2025-07-14 17:34:11.550541: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 570.158.1
2025-07-14 17:34:11.550549: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 570.158.1
2025-07-14 17:34:11.550552: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:241] kernel version seems to match DSO: 570.158.1


900/985 Reward: [0.00427779]]Training finished.

Saving the trained policy to: policy




INFO:tensorflow:Assets written to: policy/assets


INFO:tensorflow:Assets written to: policy/assets


Policy saved successfully.

Cumulative Regret vs. Perfect Foresight Oracle: 0.0063


In [3]:
import matplotlib.pyplot as plt
# plt.axhline(y=0.0, color='r', linestyle='-')
# plt.plot(rewards)
# plt.ylabel('Rewards')
# plt.xlabel('Number of Iterations')