In [None]:
import numpy as np
import pandas as pd
import gymnasium as gym
import gym_trading_env
from gym_trading_env.wrapper import DiscreteActionsWrapper
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import wandb
import glob

# --- CONFIGURATION GPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Partie 2 : Cerveau branché sur : {device}")

# --- FONCTION DE PRÉTRAITEMENT AVANCÉE ---
def preprocess_v2(df):
    df = df.sort_index().dropna().drop_duplicates()
    
    # 1) Log Returns (La base)
    df["feature_close"] = np.log(df["close"]).diff()
    
    # 2) RSI (Relative Strength Index) - Détecte Sur-achat / Sur-vente
    # Fenêtre standard de 14 périodes
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df["feature_rsi"] = 100 - (100 / (1 + rs))
    # On normalise le RSI entre 0 et 1 pour le réseau de neurones
    df["feature_rsi"] = df["feature_rsi"] / 100.0

    # 3) MACD (Moving Average Convergence Divergence) - Détecte la Tendance
    ema12 = df['close'].ewm(span=12, adjust=False).mean()
    ema26 = df['close'].ewm(span=26, adjust=False).mean()
    df["feature_macd"] = ema12 - ema26
    df["feature_signal"] = df["feature_macd"].ewm(span=9, adjust=False).mean()
    
    # 4) BANDES DE BOLLINGER (Volatilité)
    # Fenêtre de 20 heures, écart type de 2
    sma20 = df['close'].rolling(window=20).mean()
    std20 = df['close'].rolling(window=20).std()
    
    # On donne la distance par rapport aux bandes (normalisée)
    # Si > 1 : Le prix a percé la bande haute (souvent signal de vente)
    # Si < 0 : Le prix a percé la bande basse (souvent signal d'achat)
    upper_band = sma20 + (2 * std20)
    lower_band = sma20 - (2 * std20)
    
    # Position du prix relative aux bandes (0.5 = au milieu, 1.0 = bande haute)
    df["feature_bollinger"] = (df['close'] - lower_band) / (upper_band - lower_band)

    # Nettoyage des NaN générés par les moyennes mobiles
    df = df.dropna()
    
    # Remplacement des infinis par 0 au cas où
    df = df.replace([np.inf, -np.inf], 0)
    
    return df

print("Fonction preprocess_v2 (Indicators) chargée.")

Partie 2 : Cerveau branché sur : cuda
Fonction preprocess_v2 (Indicators) chargée.


In [None]:
# --- CHARGEMENT DES DONNÉES ---
dataset_path_str = "./data/*.pkl" # dossier local
files = glob.glob(dataset_path_str)
print(f"Chargement de {len(files)} fichiers...")

# --- CRÉATION DE L'ENVIRONNEMENT AVANCÉ ---
env_raw = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir=dataset_path_str,
    preprocess=preprocess_v2,     
    portfolio_initial_value=1_000,
    trading_fees=0.1/100,
    borrow_interest_rate=0.02/100/24,
    windows=30,                   # Il voit 30 heures en arrière
    verbose=1
)

def metric_portfolio_valuation(history):
    return history['portfolio_valuation', -1]

env_raw.add_metric('Valuation Finale', metric_portfolio_valuation)

env = DiscreteActionsWrapper(env_raw, positions=[-1, 0, 1])

# Calcul de la nouvelle taille d'entrée
# L'observation est maintenant une matrice (30, nb_features) qu'on va aplatir
nb_features = env.observation_space.shape[1] 
window_size = env.observation_space.shape[0]
input_dim = nb_features * window_size

print(f"\n--- CONFIGURATION PARTIE 2 ---")
print(f"Fenêtre historique : {window_size} heures")
print(f"Nombre de features : {nb_features} (RSI, MACD, Bollinger...)")
print(f"Taille totale entrée cerveau : {input_dim} neurones")

Chargement de 9 fichiers...

--- CONFIGURATION PARTIE 2 ---
Fenêtre historique : 30 heures
Nombre de features : 7 (RSI, MACD, Bollinger...)
Taille totale entrée cerveau : 210 neurones


In [None]:
class DQNAgentV2:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        
        self.memory = deque(maxlen=5000) # On augmente un peu la mémoire
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99995 
        self.learning_rate = 0.001
        
        # Le réseau est un peu plus gros car l'entrée est plus riche
        self.model = nn.Sequential(
            nn.Linear(state_size, 128),  # 150 -> 128
            nn.ReLU(),
            nn.Dropout(0.2),             # Ajout de Dropout pour éviter le par coeur (Overfitting)
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, action_size)
        ).to(device)
        
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        state = torch.FloatTensor(state).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size: return
        minibatch = random.sample(self.memory, batch_size)
        
        # Vectorisation (Turbo)
        states = torch.FloatTensor(np.array([t[0] for t in minibatch])).to(device) # Pas de squeeze ici car c'est déjà plat
        actions = torch.LongTensor(np.array([t[1] for t in minibatch])).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(np.array([t[2] for t in minibatch])).to(device)
        next_states = torch.FloatTensor(np.array([t[3] for t in minibatch])).to(device)
        dones = torch.FloatTensor(np.array([t[4] for t in minibatch])).to(device)

        current_q = self.model(states).gather(1, actions).squeeze(1)
        next_q = self.model(next_states).max(1)[0].detach()
        target_q = rewards + (self.gamma * next_q * (1 - dones))

        loss = self.criterion(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [4]:
# --- CONFIG WANDB ---
wandb.init(
    project="Projet-Trading-RL",
    name="Partie2-Indicators-Window30",
    config={"features": "RSI, MACD, BB", "window": 30}
)

[34m[1mwandb[0m: Currently logged in as: [33mravi-sabra[0m ([33mravi-sabra-cpe-lyon[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:

EPISODES = 30
BATCH_SIZE = 64 # On augmente un peu car on a plus d'infos

# Initialisation
# Attention : input_dim est calculé à l'étape 2 (nb_features * window)
agent = DQNAgentV2(input_dim, env.action_space.n)

print(f"Lancement Partie 2 (Vision Augmentée) sur {device}...")

try:
    for e in range(1, EPISODES + 1):
        state, info = env.reset()
        if isinstance(state, tuple): state = state[0]
        
        # APLATISSEMENT IMPORTANT : (30, 5) -> (150,)
        state = state.flatten()
        
        done = False
        total_reward = 0
        step_count = 0
        
        while not done:
            action = agent.act(state)
            next_state, reward, terminated, truncated, info = env.step(action)

            if info['portfolio_valuation'] < 50:
                terminated = True # On force la fin
                reward = -1 # Grosse punition

            done = terminated or truncated
            
            # APLATISSEMENT ICI AUSSI
            next_state = next_state.flatten()
            
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            step_count += 1
            
            if len(agent.memory) > BATCH_SIZE:
                agent.replay(BATCH_SIZE)
            
            if step_count % 1000 == 0:
                print(f"   Ep {e} | Step {step_count} | Val: {info['portfolio_valuation']:.0f}$ | Eps: {agent.epsilon:.2f}", end='\r')

        # Logs finaux
        metrics = env.unwrapped.get_metrics()
        final_val = metrics['Valuation Finale']
        print(f"Ep {e}/{EPISODES} | Score: {final_val:.2f}$ | Epsilon: {agent.epsilon:.3f}")
        
        wandb.log({
            "Episode": e, "Valuation": final_val, "Epsilon": agent.epsilon,
            "Portfolio Return": float(str(metrics['Portfolio Return']).strip('%'))
        })

except KeyboardInterrupt:
    print("Arrêt manuel.")
finally:
    wandb.finish()
    env.close()

Lancement Partie 2 (Vision Augmentée) sur cuda...
Market Return : 10.03%   |   Portfolio Return : -97.78%   |   Valuation Finale : 22.237956175922236   |   
Ep 1/30 | Score: 22.24$ | Epsilon: 0.106
Market Return : 692.70%   |   Portfolio Return : -100.00%   |   Valuation Finale : 8.016763137120116e-06   |   
Ep 2/30 | Score: 0.00$ | Epsilon: 0.010
Market Return : 967.38%   |   Portfolio Return : -100.00%   |   Valuation Finale : 0.001389824959994784   |   
Ep 3/30 | Score: 0.00$ | Epsilon: 0.010
Market Return : 449.56%   |   Portfolio Return : -100.00%   |   Valuation Finale : 4.0565033488905555e-07   |   
Ep 4/30 | Score: 0.00$ | Epsilon: 0.010
Arrêt manuel.


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
Episode,▁▃▆█
Epsilon,█▁▁▁
Portfolio Return,█▁▁▁
Valuation,█▁▁▁

0,1
Episode,4.0
Epsilon,0.01
Portfolio Return,-100.0
Valuation,0.0
