In [None]:
import os
# Keep using keras-2 (tf-keras) rather than keras-3 (keras).
# NOTE(review): this assignment lives in its own cell ahead of the
# `import tensorflow` cell; presumably the flag is only read when TensorFlow
# is first imported, so keep this cell first -- confirm against the
# TensorFlow / tf-keras documentation.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Environment sanity check: show which GPU devices TensorFlow reports.
gpus = tf.config.list_physical_devices('GPU')
print(f"gpus: {gpus}")

# Left as the last expression of the cell so the notebook displays the result
# of the CUDA-build query.
tf.test.is_built_with_cuda()

In [None]:
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.environments import bandit_py_environment
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

# --- 1. Data Simulation ---
def generate_dummy_data(num_steps, num_cryptos):
    """Generates a DataFrame with dummy crypto prices.

    Every asset follows an independent multiplicative random walk: row 0 is a
    fixed starting price and each later row is the previous price times
    ``1 + r`` with ``r ~ Normal(0, 0.02)``. The draws come from NumPy's global
    random state, so call ``np.random.seed(...)`` beforehand for reproducible
    data.

    Args:
        num_steps: Number of rows (time steps) to generate. ``0`` yields an
            empty frame that still has the price columns.
        num_cryptos: How many of the built-in assets to simulate, taken in the
            order BTC, ETH, ADA.

    Returns:
        pandas.DataFrame with ``num_steps`` rows and one ``'<NAME>_close'``
        float column per simulated asset.

    Raises:
        ValueError: If ``num_steps`` is negative or ``num_cryptos`` is not
            between 0 and the number of built-in assets.
    """
    names = ('BTC', 'ETH', 'ADA')
    initial_prices = (40000.0, 2000.0, 1.5)

    if not 0 <= num_cryptos <= len(names):
        raise ValueError(
            f"num_cryptos must be between 0 and {len(names)}, got {num_cryptos}"
        )
    if num_steps < 0:
        raise ValueError(f"num_steps must be non-negative, got {num_steps}")

    data = {}
    for name, start_price in zip(names[:num_cryptos], initial_prices):
        if num_steps == 0:
            prices = np.zeros(0)
        else:
            # One growth factor per transition, drawn in the same order a
            # step-by-step loop would draw them.
            growth = 1.0 + np.random.normal(0.0, 0.02, size=num_steps - 1)
            # cumprod over [p0, g1, g2, ...] gives p0, p0*g1, (p0*g1)*g2, ...
            # i.e. exactly the sequential random-walk recurrence.
            prices = np.cumprod(np.concatenate(([start_price], growth)))
        data[f'{name}_close'] = prices
    return pd.DataFrame(data)

CRYPTO_NAMES = ['BTC', 'ETH', 'ADA']
# Derived from the ticker list so the two values can never disagree.
NUM_CRYPTOS = len(CRYPTO_NAMES)
NUM_STEPS = 2000

# Seed NumPy's global generator (the one generate_dummy_data draws from) so a
# Restart & Run All reproduces the same simulated prices.
SEED = 42
np.random.seed(SEED)
data = generate_dummy_data(NUM_STEPS, NUM_CRYPTOS)

# --- 2. Create the Bandit Environment ---
class CryptoTradingEnvironment(bandit_py_environment.BanditPyEnvironment):
    """Contextual-bandit environment that walks forward through a price table.

    Context: the flattened percentage returns of every ``'<NAME>_close'``
    column (one per entry of ``CRYPTO_NAMES``) over the trailing lookback
    window, zero-padded to a fixed length.

    Actions: integers in ``[0, 2 * len(CRYPTO_NAMES) - 1]``. ``action // 2``
    selects the asset; an even action is a buy, an odd action is a sell.

    Reward: the fractional price change of the chosen asset from the current
    row to the next one, sign-flipped for sell actions.

    ``_observe`` and ``_apply_action`` are the hooks the
    ``BanditPyEnvironment`` base class is expected to call -- see the
    TF-Agents documentation for the exact reset/step contract.
    """

    def __init__(self, data, lookback_window=10):
        """Builds the environment.

        Args:
            data: DataFrame holding a ``'<NAME>_close'`` price column for every
                name in ``CRYPTO_NAMES``; rows are consecutive time steps.
            lookback_window: Number of price rows in each context window. It
                yields ``lookback_window - 1`` returns per asset.

        Raises:
            ValueError: If ``lookback_window`` is below 2, which would leave
                no returns to observe.
        """
        if lookback_window < 2:
            raise ValueError(
                f"lookback_window must be at least 2, got {lookback_window}"
            )

        self._data = data
        self._lookback = lookback_window
        self._num_cryptos = len(CRYPTO_NAMES)
        self._num_actions = self._num_cryptos * 2
        # Start far enough in that a full window of past prices exists.
        self._current_step = self._lookback

        # Computed once instead of on every observation.
        self._price_columns = [f'{name}_close' for name in CRYPTO_NAMES]
        self._observation_size = self._num_cryptos * (self._lookback - 1)

        observation_spec = array_spec.ArraySpec(
            shape=(self._observation_size,), dtype=np.float32, name='context')
        action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0,
            maximum=self._num_actions - 1, name='action')

        super().__init__(observation_spec, action_spec)

    def _observe(self):
        """Returns the float32 context vector for the current step.

        NOTE(review): the window covers rows
        ``[_current_step - lookback, _current_step)``, so the price at
        ``_current_step`` (the entry price used by ``_apply_action``) is not
        part of the context -- confirm this one-step lag is intended.
        """
        window_end = self._current_step
        window_start = window_end - self._lookback
        # Restrict to the price columns BEFORE pct_change/dropna so unrelated
        # or NaN-bearing columns in `data` cannot break or shorten the context.
        price_window = self._data.iloc[window_start:window_end][self._price_columns]
        returns = price_window.pct_change().dropna()
        flat_returns = returns.values.flatten()

        # Near or past the end of the data the window holds fewer rows; pad
        # with trailing zeros so the observation always matches the spec.
        padding_size = self._observation_size - len(flat_returns)
        if padding_size > 0:
            flat_returns = np.pad(flat_returns, (0, padding_size), 'constant')
        return flat_returns.astype(np.float32)

    def _apply_action(self, action):
        """Advances one row and returns the reward earned by ``action``.

        Once no next price is available the step counter still advances but
        the reward is a constant ``0.0``.
        """
        if self._current_step >= len(self._data) - 1:
            self._current_step += 1
            return 0.0

        crypto_index = action // 2
        is_buy_action = action % 2 == 0

        prices = self._data[self._price_columns[crypto_index]]
        current_price = prices.iloc[self._current_step]
        next_price = prices.iloc[self._current_step + 1]

        if is_buy_action:
            reward = (next_price - current_price) / current_price
        else:
            reward = (current_price - next_price) / current_price

        self._current_step += 1
        return reward

# --- 3. Instantiate Environment and Agent ---
LOOKBACK_WINDOW = 10

# Build the Python environment first, then wrap it for TensorFlow.
crypto_py_env = CryptoTradingEnvironment(data, lookback_window=LOOKBACK_WINDOW)
tf_env = tf_py_environment.TFPyEnvironment(crypto_py_env)

# Specs the agent is constructed from, both taken from the wrapped environment.
agent_time_step_spec = ts.time_step_spec(tf_env.observation_spec())
agent_action_spec = tf_env.action_spec()

agent = lin_ucb_agent.LinearUCBAgent(
    time_step_spec=agent_time_step_spec,
    action_spec=agent_action_spec,
    alpha=1.0,  # Exploration parameter
    dtype=tf.float32,
)

# --- 4. Define the Training Loop ---
def train_step(trajectory):
    """Trains the agent on a single trajectory handed over by the driver.

    Every tensor in `trajectory` gets a new axis inserted at position 1 before
    the structure is passed to `agent.train`; the result of that call is
    returned unchanged.
    """
    def _insert_axis_one(tensor):
        return tf.expand_dims(tensor, 1)

    experience = tf.nest.map_structure(_insert_axis_one, trajectory)
    loss_info = agent.train(experience)
    return loss_info

# Rewards gathered while the driver runs; filled in by the observer below.
training_rewards = []


def collect_reward_observer(trajectory):
    """Appends the first entry of the trajectory's reward to `training_rewards`."""
    step_rewards = trajectory.reward.numpy()
    training_rewards.append(step_rewards[0])

# Number of environment steps the training driver collects (its `num_steps`).
NUM_TRAINING_STEPS = 1000

def get_action_name(action):
    """Returns the human-readable label for a bandit action index.

    Even indices are BUY and odd indices are SELL; ``action // 2`` selects the
    ticker from ``CRYPTO_NAMES`` (so 0 -> "BUY BTC", 1 -> "SELL BTC", ...).

    The function only builds and returns the label. It no longer prints it,
    because callers already embed the returned value in their own output and
    the extra print duplicated every line.

    Args:
        action: Integer action index (a Python int or NumPy integer scalar).

    Returns:
        A string of the form "<BUY|SELL> <TICKER>".
    """
    crypto_index, side = divmod(action, 2)
    action_type = "BUY" if side == 0 else "SELL"
    return f"{action_type} {CRYPTO_NAMES[crypto_index]}"
    
# Each collected step is first used for training, then logged for reporting.
training_observers = [train_step, collect_reward_observer]

driver = dynamic_step_driver.DynamicStepDriver(
    env=tf_env,
    policy=agent.policy,
    num_steps=NUM_TRAINING_STEPS,
    observers=training_observers,
)

print("Starting training...")
driver.run()
print("Training finished.")

# Summarize the rewards recorded by collect_reward_observer.
total_training_reward = sum(training_rewards)
mean_training_reward = np.mean(training_rewards)
print(f"Total reward earned during training: {total_training_reward}")
print(f"Average reward per step: {mean_training_reward:.4f}")

# --- 5. Evaluate the Results ---
NUM_EVAL_STEPS = 50

print("\n--- Evaluation Loop ---")
# Fetch a fresh time step to start evaluating from.
# NOTE(review): CryptoTradingEnvironment never rewinds `_current_step`, so
# evaluation presumably continues on the rows right after the ones used for
# training -- confirm against BanditPyEnvironment's reset semantics.
time_step = tf_env.reset()
cumulative_reward = 0
for i in range(NUM_EVAL_STEPS):
    action_step = agent.policy.action(time_step)
    # Step with the policy's batched action tensor: the TF environment wrapper
    # expects a leading batch dimension, which a bare scalar does not have.
    time_step = tf_env.step(action_step.action)
    # Scalar view of the single batch entry, used only for reporting.
    action = action_step.action.numpy()[0]
    reward = time_step.reward.numpy()[0]
    cumulative_reward += reward
    print(f"Step {i+1}: Chose action '{get_action_name(action)}', received reward {reward:.4f}")

print(f"\nFinal cumulative reward over last {NUM_EVAL_STEPS} steps: {cumulative_reward:.4f}")