# Setting up env

## Importing libs

In [17]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


from transformers import AutoModel
from transformers import AutoTokenizer

## Setting up paths

In [2]:
from paths import DATASET_DIR, DATASET_OUTPUT_FILE, SP500_OUTPUT_FILE, OUT_FILTER_FILE, PRE_DATASET_FILE

# Importing Dataset

In [19]:
import pandas as pd

# Charger ton dataset parquet
dataset_raw = pd.read_parquet(PRE_DATASET_FILE)
print(dataset_raw.shape)  # (9516, 3)

# Conversion en listes
rf_texts = dataset_raw["rf"].astype(str).tolist()
mgmt_texts = dataset_raw["mgmt"].astype(str).tolist()
labels = torch.tensor(dataset_raw["return"].values, dtype=torch.float)

(9516, 3)


# Tokenize and Chunk

In [18]:
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")

def tokenize_and_chunk(texts, tokenizer, max_length=512, n_chunks=10, desc="Chunking"):
    """
    Tokenize une liste de textes en chunks de taille fixe.
    
    Retourne:
        input_ids: [N, n_chunks, max_length]
        attention_mask: [N, n_chunks, max_length]
    """
    all_input_ids, all_attention = [], []

    for text in tqdm(texts, desc=desc):
        tokens = tokenizer.encode(text, add_special_tokens=True)

        # Split en morceaux de max_length
        chunks = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length)]

        ids, masks = [], []
        for chunk in chunks[:n_chunks]:
            attn = [1] * len(chunk)
            if len(chunk) < max_length:
                pad_len = max_length - len(chunk)
                chunk = chunk + [tokenizer.pad_token_id] * pad_len
                attn = attn + [0] * pad_len
            ids.append(chunk)
            masks.append(attn)

        # Padding si moins de n_chunks
        while len(ids) < n_chunks:
            ids.append([tokenizer.pad_token_id] * max_length)
            masks.append([0] * max_length)

        all_input_ids.append(ids)
        all_attention.append(masks)

    return torch.tensor(all_input_ids), torch.tensor(all_attention)

In [6]:
rf_ids, rf_masks = tokenize_and_chunk(
    dataset_raw["rf"].astype(str).tolist(),
    tokenizer,
    max_length=256,
    n_chunks=10,
    desc="Tokenizing RF"
)

mgmt_ids, mgmt_masks = tokenize_and_chunk(
    dataset_raw["mgmt"].astype(str).tolist(),
    tokenizer,
    max_length=256,
    n_chunks=10,
    desc="Tokenizing MGMT"
)

Tokenizing RF: 100%|██████████| 9516/9516 [00:42<00:00, 226.27it/s]
Tokenizing MGMT: 100%|██████████| 9516/9516 [02:20<00:00, 67.50it/s] 


In [7]:
print(rf_ids.shape)      # torch.Size([9516, 100, 512])
print(rf_masks.shape)    # torch.Size([9516, 100, 512])
print(mgmt_ids.shape)    # torch.Size([9516, 100, 512])
print(mgmt_masks.shape)  # torch.Size([9516, 100, 512]
print(labels.shape)

i = 0  # premier document
print(rf_ids[i, 0, :20])   # premiers tokens du premier chunk
print(rf_masks[i, 0, :20]) # mask -> devrait être 1 pour les tokens réels, 0 pour les PAD

print(rf_ids[i, -1, :20])   # premiers tokens du dernier chunk (100e)
print(rf_masks[i, -1, :20]) # mask -> devrait être que des 0 si chunk vide

torch.Size([9516, 10, 256])
torch.Size([9516, 10, 256])
torch.Size([9516, 10, 256])
torch.Size([9516, 10, 256])
torch.Size([9516])
tensor([    3,   366,  4338,    48,   177,   298,  2443,   366,  4338,    48,
          177,   298,   181,  1945, 25671,     8,    38,    54,   585,    40])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([1242, 1914,    8,  111,   28,  117,  104,  140,  450,  585,   16,   60,
          73,    9,  104, 1804,   10,  287, 1228,  174])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


# Save Dataset

In [8]:
torch.save({
    "rf_input_ids": rf_ids,
    "rf_attention_mask": rf_masks,
    "mgmt_input_ids": mgmt_ids,
    "mgmt_attention_mask": mgmt_masks,
    "labels": labels
}, "finbert_chunks.pt")

In [9]:
# from datasets import load_dataset

# dataset = load_dataset("Arthurmaffre34/predataset", data_files="pre_dataset.parquet")
# print(dataset.shape)
# # Charger en DataFrame Pandas
# dataset_raw = dataset["train"].to_pandas()
# print(dataset_raw.shape)


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class FinBERTChunksDataset(Dataset):
    def __init__(self, data_path):
        # Charger directement le dict de tensors
        self.data = torch.load(data_path)

    def __len__(self):
        return self.data["labels"].shape[0]

    def __getitem__(self, idx):
        return (
            self.data["mgmt_input_ids"][idx],        # [n_chunks, 512]
            self.data["mgmt_attention_mask"][idx],   # [n_chunks, 512]
            self.data["rf_input_ids"][idx],          # [n_chunks, 512]
            self.data["rf_attention_mask"][idx],     # [n_chunks, 512]
            self.data["labels"][idx]                 # scalaire
        )

In [11]:
# Charger le dataset
dataset = FinBERTChunksDataset("finbert_chunks.pt")



In [12]:
# Créer un DataLoader
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Test : vérifier les shapes
for mgmt_ids, mgmt_masks, rf_ids, rf_masks, labels in loader:
    print("mgmt_ids:", mgmt_ids.shape)       # [B, 100, 512]
    print("mgmt_masks:", mgmt_masks.shape)   # [B, 100, 512]
    print("rf_ids:", rf_ids.shape)           # [B, 100, 512]
    print("rf_masks:", rf_masks.shape)       # [B, 100, 512]
    print("labels:", labels.shape)           # [B]
    break

mgmt_ids: torch.Size([1, 10, 256])
mgmt_masks: torch.Size([1, 10, 256])
rf_ids: torch.Size([1, 10, 256])
rf_masks: torch.Size([1, 10, 256])
labels: torch.Size([1])


In [13]:
import torch
import torch.nn as nn
from transformers import AutoModel

class MultiStageFinBERT(nn.Module):
    def __init__(self, bert_model="yiyanghkust/finbert-pretrain",
                 hidden_dim=768, n_heads=4, n_layers=1):
        super().__init__()
        # BERT partagé (reste en FP32, AMP gérera le cast)
        self.bert = AutoModel.from_pretrained(bert_model)
        self.hidden_dim = hidden_dim

        # Agrégateur Transformer (reste en FP32)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=n_heads, batch_first=True
        )
        self.chunk_transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # MLP final
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, mgmt_ids, mgmt_masks, rf_ids, rf_masks):
        B, N, L = mgmt_ids.shape  # [batch, n_chunks, seq_len]

        def run_bert(ids, masks):
            ids = ids.view(B*N, L).to(dtype=torch.long)   # IDs restent int64
            masks = masks.view(B*N, L).to(dtype=torch.long)

            outputs = self.bert(input_ids=ids, attention_mask=masks)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [B*N, hidden_dim]

            return cls_embeddings.view(B, N, self.hidden_dim)

        mgmt_emb = run_bert(mgmt_ids, mgmt_masks)
        rf_emb = run_bert(rf_ids, rf_masks)

        # Agrégation des chunks avec Transformer
        mgmt_repr = self.chunk_transformer(mgmt_emb).mean(dim=1)  # [B, hidden_dim]
        rf_repr = self.chunk_transformer(rf_emb).mean(dim=1)      # [B, hidden_dim]

        # Concat + MLP
        combined = torch.cat([mgmt_repr, rf_repr], dim=1)  # [B, 2*hidden_dim]
        out = self.fc(combined)                            # [B, 1]
        return out.squeeze(-1)

In [14]:
device = "mps"
model = MultiStageFinBERT().to(device)

for mgmt_ids, mgmt_masks, rf_ids, rf_masks, labels in loader:
    mgmt_ids, mgmt_masks = mgmt_ids.to(device), mgmt_masks.to(device)
    rf_ids, rf_masks = rf_ids.to(device), rf_masks.to(device)
    labels = labels.to(device)
    print(labels)
    preds = model(mgmt_ids, mgmt_masks, rf_ids, rf_masks)
    print(preds.shape)  # [B]
    break

tensor([0.3869], device='mps:0')
torch.Size([1])


In [15]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Nombre total de paramètres : {total_params:,}")
print(f"Paramètres entraînables : {trainable_params:,}")

Nombre total de paramètres : 116,446,977
Paramètres entraînables : 116,446,977


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.cuda.amp import autocast, GradScaler

device = "cuda" if torch.cuda.is_available() else "cpu"

# Dataset / DataLoader
dataset = FinBERTChunksDataset("finbert_chunks.pt")
loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)

# Modèle + optim
model = MultiStageFinBERT().to(device)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# AMP
scaler = GradScaler()

n_epochs = 3

for epoch in range(n_epochs):
    model.train()
    total_loss = 0.0

    for step, (mgmt_ids, mgmt_masks, rf_ids, rf_masks, labels) in enumerate(loader):
        mgmt_ids, mgmt_masks = mgmt_ids.to(device), mgmt_masks.to(device)
        rf_ids, rf_masks = rf_ids.to(device), rf_masks.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # AMP forward
        with autocast():
            preds = model(mgmt_ids, mgmt_masks, rf_ids, rf_masks)
            loss = criterion(preds, labels)

        # backward AMP
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        if step % 10 == 0:
            print(f"Epoch {epoch+1} Step {step}/{len(loader)} - Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1} finished - Avg Loss: {avg_loss:.4f}")

  scaler = GradScaler()
  with autocast():


KeyboardInterrupt: 

In [None]:
def evaluate_model(model, dataloader, criterion, device="mps", n_show=5):
    model.eval()
    total_loss = 0
    preds_list, labels_list = [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader):
            rf_ids = batch["rf_input_ids"].to(device)
            rf_mask = batch["rf_attention_mask"].to(device)
            mgmt_ids = batch["mgmt_input_ids"].to(device)
            mgmt_mask = batch["mgmt_attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(rf_ids, rf_mask, mgmt_ids, mgmt_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds_list.extend(outputs.cpu().numpy())
            labels_list.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)

    # Corrélation Pearson
    preds_arr = np.array(preds_list)
    labels_arr = np.array(labels_list)
    if len(np.unique(labels_arr)) > 1:  # éviter division par zéro
        corr = np.corrcoef(preds_arr, labels_arr)[0, 1]
    else:
        corr = float("nan")

    print(f"✅ Evaluation - Loss moyenne: {avg_loss:.4f}, Corrélation: {corr:.4f}")

    # Exemples
    print("\nExemples de prédictions vs labels:")
    for i in range(min(n_show, len(preds_list))):
        print(f"  Pred: {preds_list[i]:.4f} | Label: {labels_list[i]:.4f}")

    return avg_loss, corr, preds_list, labels_list

In [None]:
# Supposons que tu aies déjà : model, criterion, loader_small, device
loss, corr, preds, labels = evaluate_model(model, eval_loader, criterion, device="mps")

100%|██████████| 50/50 [01:39<00:00,  1.99s/it]

✅ Evaluation - Loss moyenne: 0.0186, Corrélation: -0.0369

Exemples de prédictions vs labels:
  Pred: 0.0494 | Label: 0.0628
  Pred: 0.0594 | Label: 0.1351
  Pred: 0.0405 | Label: 0.0051
  Pred: 0.0284 | Label: -0.1686
  Pred: 0.0545 | Label: 0.2264



