In [None]:
import numpy as np
import pandas as pd
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

In [None]:
# Environment setup
class BitcoinTradingEnv(gym.Env):
    """Single-asset, all-in / all-out trading environment over a price table.

    Each step corresponds to one row of ``data``. The agent chooses one of
    three actions: 0 = hold, 1 = buy (convert the whole cash balance into
    holdings at the current close price), 2 = sell (convert all holdings back
    into cash). The reward is 0 on every step except the one that ends the
    episode, where it is the portfolio value minus ``initial_balance``.

    Args:
        data: pandas DataFrame with one row per time step. It must contain a
            close-price column labelled ``close``; the label is matched
            case-insensitively, so ``Close`` is accepted as well. Every
            column of a row is exposed as an observation feature.
        initial_balance: Cash balance at the start of every episode.

    Raises:
        KeyError: If ``data`` has no close-price column.
    """

    def __init__(self, data, initial_balance=1000):
        super().__init__()
        self.data = data
        self.initial_balance = initial_balance
        self.current_step = 0
        self.balance = initial_balance
        self.holdings = 0
        self.done = False

        # Resolve the close-price column once, so step() works whether the
        # frame uses 'close' or a differently-cased label such as 'Close'.
        self._close_column = self._resolve_close_column(data)

        # Observation and action spaces.
        # NOTE(review): the Box bounds are [0, 1]; unscaled prices/volumes would
        # fall outside them -- confirm whether `data` is meant to be normalized.
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(len(data.columns),), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(3)  # 0: hold, 1: buy, 2: sell

    @staticmethod
    def _resolve_close_column(data):
        """Return the label of the close-price column of ``data``.

        An exact ``'close'`` label wins; otherwise the first column whose
        lower-cased name equals ``'close'`` is used.
        """
        if 'close' in data.columns:
            return 'close'
        for column in data.columns:
            if str(column).lower() == 'close':
                return column
        raise KeyError("data must contain a 'close' column (matched case-insensitively)")

    def reset(self):
        """Restart the episode at the first row and return its observation."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.holdings = 0
        self.done = False
        return self._next_observation()

    def _next_observation(self):
        """Return the row at ``current_step`` as a float32 feature vector."""
        # Cast to float32 so the observation matches observation_space.dtype.
        return self.data.iloc[self.current_step].to_numpy(dtype=np.float32)

    def step(self, action):
        """Apply ``action`` at the current row, then advance one row.

        Args:
            action: 0 = hold, 1 = buy with the full balance, 2 = sell all
                holdings. Buying with no cash or selling with no holdings
                is a no-op.

        Returns:
            Tuple ``(observation, reward, done, info)``. ``reward`` is 0 until
            the episode ends; on the final step it is the portfolio value
            (cash plus holdings valued at the price this step traded at)
            minus ``initial_balance``. ``info`` is always an empty dict.
        """
        current_price = self.data[self._close_column].iloc[self.current_step]
        reward = 0

        if action == 1:  # buy
            if self.balance > 0:
                self.holdings += self.balance / current_price
                self.balance = 0
        elif action == 2:  # sell
            if self.holdings > 0:
                self.balance += self.holdings * current_price
                self.holdings = 0

        # Advance to the next row; the episode ends on reaching the last row.
        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            self.done = True
            reward = self.balance + self.holdings * current_price - self.initial_balance

        return self._next_observation(), reward, self.done, {}

In [None]:
# DQN model definition
class DQN(nn.Module):
    """Fully connected network with two 128-unit ReLU hidden layers.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Number of values produced per input.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created in input-to-output order and wrapped in a
        # Sequential stored under `fc`.
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the output tensor."""
        outputs = self.fc(x)
        return outputs

In [None]:
# Agent training
def train_agent(env, episodes=1000):
    """Train a DQN agent on ``env`` with epsilon-greedy exploration and replay.

    In each episode the agent acts until the environment reports ``done``,
    storing every transition in a bounded replay buffer (the most recent
    2000). After the episode, if the buffer holds more than 32 transitions,
    32 are sampled at random and the network takes one optimizer step per
    sample toward the one-step target
    ``reward + gamma * max_a Q(next_state, a)`` (just ``reward`` for terminal
    transitions). Epsilon is multiplied by 0.995 after every episode, with a
    floor of 0.01. One progress line is printed per episode.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``action_space.sample()``, ``reset()``
            returning an observation, and ``step(action)`` returning
            ``(observation, reward, done, info)``.
        episodes: Number of episodes to run.

    Returns:
        The trained ``DQN`` model, on the CUDA device when one is available
        and on the CPU otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DQN(env.observation_space.shape[0], env.action_space.n).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    memory = deque(maxlen=2000)
    batch_size = 32
    gamma = 0.99
    epsilon = 1.0
    epsilon_decay = 0.995
    epsilon_min = 0.01

    for episode in range(episodes):
        state = env.reset()
        state = torch.tensor(state, dtype=torch.float32).to(device)
        total_reward = 0

        while True:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                # Greedy action: pure inference, so skip autograd bookkeeping.
                with torch.no_grad():
                    action = torch.argmax(model(state)).item()

            next_state, reward, done, _ = env.step(action)
            next_state = torch.tensor(next_state, dtype=torch.float32).to(device)

            memory.append((state, action, reward, next_state, done))
            total_reward += reward

            if done:
                break

            state = next_state

        if len(memory) > batch_size:
            # Separate names here so the replay loop does not rebind the
            # episode-level state/action/reward variables above.
            for obs, chosen, gain, next_obs, terminal in random.sample(memory, batch_size):
                target_value = gain
                if not terminal:
                    # The bootstrap term is a constant for this update.
                    with torch.no_grad():
                        target_value += gamma * torch.max(model(next_obs)).item()

                # One forward pass: the prediction keeps its graph, the target
                # is a detached copy with only the chosen action's entry replaced.
                predicted = model(obs)
                target = predicted.detach().clone()
                target[chosen] = target_value

                optimizer.zero_grad()
                loss = criterion(predicted, target)
                loss.backward()
                optimizer.step()

        epsilon = max(epsilon * epsilon_decay, epsilon_min)
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward:.2f}")

    # Hand the trained network back; without this the result of training is lost.
    return model


In [None]:
# Load the data and run training
csv_path = '/workspace/BTCUSDT/BTCUSDT-1h-2023.csv'
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# Read the CSV with its first column as the index, keeping only the
# selected columns in the listed order.
data = pd.read_csv(csv_path, index_col=0)[feature_columns]
env = BitcoinTradingEnv(data)

train_agent(env, episodes=500)