In [None]:
!pip install einops 
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from einops import rearrange
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

from utils import *
from tft import *
from preprocessing import *

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Utilisation de {device}")

# Chargement des données d'entraînement et prétraitement

In [None]:
# Locations of the raw training inputs and of their labels.
X_train_path = "/kaggle/input/dataset/X_train_N1UvY30.csv"
y_train_path = "/kaggle/input/dataset/y_train_or6m3Ta.csv"

# Read both tables, attach the labels to every row through obs_id (left join),
# then run the project's feature-engineering helper on the merged frame.
df_train = pd.read_csv(X_train_path)
y_train = pd.read_csv(y_train_path)
df_train = create_features(df_train.merge(y_train, on='obs_id', how='left'))

# Training-mode preprocessing: besides the transformed frame it hands back the
# feature column list, the encoders/scaler, and the categorical/numerical
# column lists that later cells reuse.
(
    df_train,
    features,
    encoders,
    scaler,
    categorical_features,
    numerical_features,
) = pre_processing(df_train, is_train=True)

# Build one sequence per obs_id, keeping only observations with exactly 100 rows.
grouped = df_train.groupby('obs_id')
train_sequences, train_labels = [], []

for obs_id, group in tqdm(grouped):
    if len(group) != 100:
        continue
    train_sequences.append(group[features].values)
    # The label is read from the first row of the observation.
    train_labels.append(group['eqt_code_cat'].iloc[0])

train_sequences = np.array(train_sequences)
train_labels = np.array(train_labels)

# Stratified 90/10 split over sequence positions, with a fixed seed.
all_indices = np.arange(len(train_labels))
train_indices, val_indices = train_test_split(
    all_indices,
    test_size=0.1,
    random_state=42,
    stratify=train_labels,
)

class MyDataset(Dataset):
    """Index-based view over shared sequence and label arrays.

    ``X`` and ``y`` are stored as given; ``indices`` selects which positions
    of them this dataset exposes, so several datasets can share the same
    underlying arrays.
    """

    def __init__(self, X, y, indices):
        self.X, self.y, self.indices = X, y, indices

    def __len__(self):
        # Size of the selected subset, not of the underlying arrays.
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for the ``idx``-th selected sample."""
        source_pos = self.indices[idx]
        features_t = torch.tensor(self.X[source_pos], dtype=torch.float32)
        label_t = torch.tensor(self.y[source_pos], dtype=torch.long)
        return features_t, label_t

# Both splits view the same arrays; only the index sets differ.
train_dataset, val_dataset = (
    MyDataset(train_sequences, train_labels, split_indices)
    for split_indices in (train_indices, val_indices)
)

# Chargement des données de test et prétraitement

In [None]:
X_test_path = "/kaggle/input/dataset/X_test_m4HAPAP.csv"
df_test = pd.read_csv(X_test_path)
df_test = create_features(df_test)
# Inference-mode preprocessing: reuse the encoders and scaler obtained from the
# training call instead of fitting new ones.
df_test, features_test, _, _, _, _ = pre_processing(df_test, encoders=encoders, scaler=scaler, is_train=False)

# Sequences reach the model as plain arrays, so columns are identified only by
# position: the test columns must be exactly the training ones, in the same order.
assert list(features_test) == list(features), (
    "Test feature columns differ from the training feature columns"
)

grouped_test = df_test.groupby('obs_id')

test_sequences = []
test_obs_ids = []
n_skipped = 0  # observations dropped because they do not have exactly 100 rows
for obs_id, group in tqdm(grouped_test):
    if len(group) == 100:
        test_sequences.append(group[features_test].values)
        test_obs_ids.append(obs_id)
    else:
        n_skipped += 1

# Skipped observations get no prediction and therefore no row in the submission
# built from test_obs_ids; report them instead of dropping them silently.
# (print rather than warnings.warn: this notebook globally silences warnings.)
if n_skipped:
    print(f"Attention : {n_skipped} obs_id de test ignorés (longueur de séquence différente de 100)")

test_sequences = np.array(test_sequences)

class TestDataset(Dataset):
    """Label-free dataset: each item is one entry of ``X`` as a float32 tensor."""

    def __init__(self, X):
        self.X = X

    def __len__(self):
        # One item per entry along the first axis of X.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        return torch.tensor(sample, dtype=torch.float32)

test_dataset = TestDataset(test_sequences)

# Settings shared by the three loaders; only the training loader shuffles.
batch_size = 250
loader_kwargs = dict(batch_size=batch_size, num_workers=4)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
# Test order must stay fixed so predictions line up with test_obs_ids.
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Chargement du modèle

In [None]:
# Cardinality passed to the model for each categorical feature: largest code + 1.
# Assumes the preprocessed categorical columns hold 0-based integer codes — TODO confirm.
# Cast to a plain int so the model receives Python ints rather than numpy scalars.
cat_dims = [int(df_train[c].max()) + 1 for c in categorical_features]
embedding_size = 8  # arbitrary embedding size

num_numerical = len(numerical_features)
num_classes = 24

# TFTClassifier model
model = TFTClassifier(
    cat_dims=cat_dims,
    num_numerical=num_numerical,
    embedding_size=embedding_size,  # use the variable above instead of repeating the literal
    hidden_size=600,        # hidden dimension
    lstm_layers=3,          # more LSTM layers for more capacity
    dropout=0.4,            # dropout rate
    attn_heads=60,          # number of attention heads
    max_seq_len=100,        # sequence length (100 events)
    num_classes=num_classes
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003, weight_decay=4.6e-05)
# Scale the learning rate by 0.1 once the monitored value (mode 'min') has
# stopped decreasing for more than 3 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=3)


# Number of trainable parameters in the model
print('Nombre de paramètres entraînables :', sum(p.numel() for p in model.parameters() if p.requires_grad))

# Training

In [None]:
# Start below any comparable accuracy so that the first epoch with a valid
# validation accuracy always writes a checkpoint; with an initial value of 0 and
# a strict '>' test, a run stuck at 0 accuracy would never save
# "best_model_tft.pth" and the later load would fail or pick up a stale file.
best_val_acc = float("-inf")
patience = 10   # non-improving epochs tolerated before stopping
counter = 0     # consecutive epochs without a new best validation accuracy
num_epochs = 21  # upper bound on the number of training epochs

for epoch in range(num_epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = eval_model(model, val_loader, criterion, device)
    # The scheduler is driven by the validation loss.
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}: Train loss {train_loss:.4f}, Train Acc {train_acc:.4f}, Val loss {val_loss:.4f}, Val Acc {val_acc:.4f}")

    if val_acc > best_val_acc:
        # New best: reset the early-stopping counter and checkpoint the weights.
        best_val_acc = val_acc
        counter = 0
        torch.save(model.state_dict(), "best_model_tft.pth")
    else:
        counter += 1
        # Stop once more than `patience` consecutive epochs brought no improvement.
        if counter > patience:
            print("Early stopping!")
            break

# Prédictions du modèle sur les données de test

In [None]:
# Reload the best checkpoint written by the training loop. map_location remaps
# the saved tensors onto the current device, so a checkpoint saved from a GPU
# session can still be loaded when only a CPU is available.
model.load_state_dict(torch.load("best_model_tft.pth", map_location=device))
model.eval()
all_preds = []
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        out = model(X)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)

# Predictions are paired with obs_ids by position (the test loader does not shuffle).
# NOTE(review): the predicted values are class indices as used during training —
# confirm whether they must be mapped back to original labels before submitting.
result_df = pd.DataFrame({"obs_id": test_obs_ids, "eqt_code_cat": all_preds})
result_df.to_csv("submission.csv", index=False)
print("Prédictions saved dans submission.csv")