In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pygwalker as pyg
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model
import numpy as np 

2024-11-12 16:16:54.961845: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the training features, the training targets and the test features
# from their CSV files (paths are relative to the notebook's working directory).
X_train = pd.read_csv("X_train_N1UvY30.csv")
y_train = pd.read_csv("y_train_or6m3Ta.csv")
X_test = pd.read_csv("X_test_m4HAPAP.csv")

In [3]:
# Categorical columns that get label-encoded to integer codes
categorical_columns = ['venue', 'action', 'side']

# Numeric columns (kept as a reference list)
numeric_columns = ['price', 'bid', 'ask', 'bid_size', 'ask_size', 'flux']

# Label encoding: the mapping is learned on the training frame only and the
# very same mapping is then applied to both the training and the test frame.
for col in categorical_columns:
    le = LabelEncoder().fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# Cast the 'trade' column to integers in both frames
for frame in (X_train, X_test):
    frame['trade'] = frame['trade'].astype(int)

def _add_log_features(frame):
    """Add the log-transformed size/flux columns to `frame` in place.

    Adds three columns computed from the frame's own values:
      * 'log_bid_size' = log(|bid_size + 1|)
      * 'log_ask_size' = log(|ask_size + 1|)
      * 'log_flux'     = sign(flux) * log(|flux| + 1)  (signed log)

    Using one helper for both the training and the test frame guarantees that
    each frame is transformed with its OWN 'flux' sign. The previous copy-pasted
    version multiplied the test magnitudes by the sign of the *training* flux,
    which pandas aligned by row index and therefore silently produced wrong
    signs for the test set.

    Returns the same frame object for convenience.
    """
    frame['log_bid_size'] = np.log(np.abs(frame['bid_size'] + 1))
    frame['log_ask_size'] = np.log(np.abs(frame['ask_size'] + 1))
    frame['log_flux'] = np.log(np.abs(frame['flux']) + 1) * np.sign(frame['flux'])
    return frame


# Log transforms of 'bid_size', 'ask_size' and 'flux', applied identically
# to the training and test frames.
_add_log_features(X_train)
_add_log_features(X_test)

# Columns fed to the model, in the order the model expects them
selected_columns = ['venue', 'action', 'trade',  'bid', 'ask', 'price', 'log_bid_size', 'log_ask_size', 'log_flux']


In [4]:
def group_by_observation(X_train, selected_columns, sequence_length=100):
    """Stack the events of each observation into a fixed-length 3-D array.

    Rows of `X_train` are grouped by their 'obs_id' value (groups are visited
    in the order yielded by `DataFrame.groupby`). For every group, the first
    `sequence_length` rows of `selected_columns` are copied into the output;
    groups with fewer rows are right-padded with zeros.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame containing an 'obs_id' column and every column in `selected_columns`.
    selected_columns : list
        Column names to extract, in output feature order.
    sequence_length : int, default 100
        Number of time steps kept per observation.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of groups, sequence_length, len(selected_columns)).
    """
    per_observation = X_train.groupby('obs_id')

    # Zero-initialised output: untouched rows act as padding.
    stacked = np.zeros((len(per_observation), sequence_length, len(selected_columns)))

    for row, (_obs_id, events) in enumerate(per_observation):
        # Keep at most `sequence_length` leading events of this observation.
        window = events[selected_columns].values[:sequence_length]
        # A single slice assignment covers both the full and the short case.
        stacked[row, :len(window)] = window

    return stacked

# Build the (observations, time steps, features) array for the training frame
sequences = group_by_observation(X_train, selected_columns=selected_columns)

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.models import Model

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and shuffling are reproducible across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Base configuration
embedding_dim = 8
gru_units = 64

# The input shape is read from the actual training tensor instead of being
# hard-coded, so it cannot drift from `group_by_observation` / `selected_columns`.
sequence_length, num_features = sequences.shape[1:]

# The slicing below relies on 'venue' and 'action' being features 0 and 1.
assert selected_columns[:2] == ['venue', 'action'], \
    "the model expects 'venue' and 'action' as the first two selected columns"

# Embedding vocabulary sizes, derived from the label-encoded training columns.
# Codes are 0..n-1, so the vocabulary size is max code + 1; the zero padding
# rows produced by `group_by_observation` map to index 0, which is always valid.
num_unique_venues = int(X_train['venue'].max()) + 1   # number of distinct 'venue' codes
num_unique_actions = int(X_train['action'].max()) + 1  # number of distinct 'action' codes
num_unique_trades = 2  # 'trade' is a 0/1 flag; it is fed as a numeric feature, not embedded

# Single input tensor of shape (batch, sequence_length, num_features)
input_tensor = Input(shape=(sequence_length, num_features), name='input_tensor')

# Only the first two features are embedded as categories
venue_embedding = Embedding(input_dim=num_unique_venues, output_dim=embedding_dim)(input_tensor[:, :, 0])  # 'venue'
action_embedding = Embedding(input_dim=num_unique_actions, output_dim=embedding_dim)(input_tensor[:, :, 1])  # 'action'

# Every remaining feature ('trade', 'bid', 'ask', ...) is passed through as a numeric value
continuous_features = input_tensor[:, :, 2:]

# Concatenate the embeddings with the numeric features along the feature axis
features = tf.keras.layers.Concatenate(axis=-1)([
    venue_embedding, action_embedding,  continuous_features
])

# Bidirectional GRU that returns only the final state of each direction
gru_output = Bidirectional(GRU(gru_units, return_sequences=False))(features)

# First dense block: dense -> batch normalisation -> dropout
dense_output1 = Dense(64, activation='selu')(gru_output)
dense_output1 = BatchNormalization()(dense_output1)
dense_output1 = Dropout(0.5)(dense_output1)

# Output layer: softmax over the 24 target classes
output = Dense(24, activation='softmax')(dense_output1)


# Build and compile the model (one-hot targets -> categorical cross-entropy)
model = Model(inputs=input_tensor, outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Model summary
model.summary()


In [None]:
# NOTE(review): self-assignment, has no effect; this cell looks like a leftover and can be removed.
model = model

In [6]:
from tensorflow.keras.utils import to_categorical

# Number of training observations (first axis of the sequence tensor)
num_observations = sequences.shape[0]

# One-hot encode the 'eqt_code_cat' target column over the 24 classes
target_codes = y_train['eqt_code_cat']
Y = to_categorical(target_codes, num_classes=24)

In [7]:
# Entraînement du modèle
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.fit(sequences, Y, batch_size=1000, epochs=(10_000 * 1000) // num_observations, validation_split=0.1, callbacks=[early_stopping])


Epoch 1/62
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 2s/step - accuracy: 0.0993 - loss: 3.1862 - val_accuracy: 0.2007 - val_loss: 2.4170
Epoch 2/62
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 2s/step - accuracy: 0.2418 - loss: 2.2837 - val_accuracy: 0.2096 - val_loss: 2.4851
Epoch 3/62
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 1s/step - accuracy: 0.2968 - loss: 2.0835 - val_accuracy: 0.3224 - val_loss: 1.9704
Epoch 4/62
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 1s/step - accuracy: 0.3360 - loss: 1.9595 - val_accuracy: 0.3027 - val_loss: 2.0658
Epoch 5/62
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 1s/step - accuracy: 0.3562 - loss: 1.8998 - val_accuracy: 0.3412 - val_loss: 1.9833
Epoch 6/62
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 1s/step - accuracy: 0.3716 - loss: 1.8460 - val_accuracy: 0.3984 - val_loss: 1.7588
Epoch 7/62
[1m145/145

<keras.src.callbacks.history.History at 0x12f676000>

In [8]:
from joblib import dump, load

In [9]:
# Persist the trained model to disk with joblib.
# NOTE(review): joblib serialises through pickle, so only load this file from a trusted source.
dump(value=model, filename='model_ML_ENS,3.modele')

['model_ML_ENS,3.modele']

In [10]:
# Build the (observations, time steps, features) array for the test frame
sequences_test = group_by_observation(X_test, selected_columns=selected_columns)

In [11]:
# Run the trained model on the test sequences to get the softmax outputs
predictions = model.predict(x=sequences_test)

[1m2550/2550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 40ms/step


In [12]:
# For each row of `predictions`, keep the index of the largest score as the predicted class
y_predict_adjusted = predictions.argmax(axis=1)

# Wrap the predicted class indices in a single-column frame
df_prediction = pd.DataFrame({'eqt_code_cat': y_predict_adjusted})

In [13]:
# Write the predictions to CSV (the frame's index is written as the first column)
df_prediction.to_csv(path_or_buf='y_prediction7.csv')

In [None]:
# Scratch data: a small list of booleans
list_ = [
    True,
    False,
]

In [None]:
# Lazy iterator converting each boolean of `list_` to an int (nothing is computed until it is consumed)
a = map(int, list_)