In [1]:
import tensorflow as tf
from tensorflow.keras.layers import GRU, Dense, Input, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np

X_train =pd.read_csv('C:\\Users\\Dell\\Desktop\\op\\data\\X_train.csv')
Y_train =pd.read_csv("C:\\Users\\Dell\\Desktop\\op\\data\\Y_train.csv")

X_test =pd.read_csv("C:\\Users\\Dell\\Desktop\\op\\data\\X_test.csv")
Y_test =pd.read_csv("C:\\Users\\Dell\\Desktop\\op\\data\\Y_test.csv")




In [2]:
# Embedding de 'venue', 'action', et 'trade'
def one_hot_encode(df, column, num_categories):
    return pd.get_dummies(df[column], prefix=column).reindex(columns=[f"{column}_{i}" for i in range(num_categories)], fill_value=0)

df1 = one_hot_encode(X_train, 'venue', 8)
df2 = one_hot_encode(X_train, 'action', 8)
df3 = one_hot_encode(X_train, 'trade', 8)

In [3]:
# Log Transform
X_train['flux'] = X_train['flux'] - X_train['flux'].min() + 1
data = {
    'log(bid_size+1)': np.log(X_train['bid_size'] + 1),
    'log(ask_size+1)': np.log(X_train['ask_size'] + 1),
    'log(flux)': np.log(X_train['flux'])
}
dfa = pd.DataFrame(data)
X_selected = X_train[['bid', 'ask', 'price']]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [4]:
# Combiner les features
df_combined = pd.concat([df1, df2, df3, X_selected, dfa], axis=1)

# Reshape the data to create sequences of (100, 30)
# We need to ensure we have enough rows to create full sequences
num_sequences = len(df_combined) // 100
X_reshaped = df_combined.iloc[:num_sequences * 100].values.reshape(num_sequences, 100, 30)

# Convertir les labels en tenseurs
Y_labels = pd.get_dummies(Y_train['eqt_code_cat']).values
Y_reshaped = Y_labels[:num_sequences]

# Conversion en tenseurs
X_tensor = tf.convert_to_tensor(X_reshaped, dtype=tf.float32)
Y_tensor = tf.convert_to_tensor(Y_reshaped, dtype=tf.float32)

In [5]:
# Définir la taille d'une observation
sequence_length = 100
feature_dim = 30

# Définir l'entrée du modèle
input_layer = Input(shape=(sequence_length, feature_dim))

# Ajouter des couches GRU bidirectionnelles
gru_1 = Bidirectional(GRU(64, return_sequences=True))(input_layer)
gru_2 = Bidirectional(GRU(64))(gru_1)

# Ajouter des couches Denses
dense_1 = Dense(64, activation='selu')(gru_2)
output_layer = Dense(24, activation='softmax')(dense_1)

# Créer le modèle
model = Model(inputs=input_layer, outputs=output_layer)




In [6]:
# Compiler le modèle
optimizer = Adam(learning_rate=3e-3)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Afficher un résumé du modèle
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 30)]         0         
                                                                 
 bidirectional (Bidirection  (None, 100, 128)          36864     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               74496     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 24)                1560      
                                                                 
Total params: 121176 (473.34 KB)
Trainable params: 121176 (47

In [7]:
# Entraîner le modèle
model.fit(X_tensor, Y_tensor, batch_size=1000, epochs=3)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2a79e5b9e90>

In [12]:
# Appliquer le one-hot encoding aux colonnes 'venue', 'action', et 'trade' pour X_test
df1_test = one_hot_encode(X_test, 'venue', 8)
df2_test = one_hot_encode(X_test, 'action', 8)
df3_test = one_hot_encode(X_test, 'trade', 8)

# Log Transform pour X_test
X_test['flux'] = X_test['flux'] - X_test['flux'].min() + 1
data_test = {
    'log(bid_size+1)': np.log(X_test['bid_size'] + 1),
    'log(ask_size+1)': np.log(X_test['ask_size'] + 1),
    'log(flux)': np.log(X_test['flux'])
}
dfa_test = pd.DataFrame(data_test)
X_selected_test = X_test[['bid', 'ask', 'price']]

# Combiner les features pour X_test
df_combined_test = pd.concat([df1_test, df2_test, df3_test, X_selected_test, dfa_test], axis=1)

# Reshape the data to create sequences of (100, 30)
# We need to ensure we have enough rows to create full sequences for X_test
num_sequences_test = len(df_combined_test) // 100

# Only take enough rows to form complete sequences
X_combined_test = df_combined_test.iloc[:num_sequences_test * 100]

# Reshape the data to the required shape
X_reshaped_test = X_combined_test.values.reshape(num_sequences_test, 100, 30)

# Convertir les labels en tenseurs pour Y_test
Y_labels_test = pd.get_dummies(Y_test['eqt_code_cat']).values

# Ensure Y_reshaped_test has the same number of sequences as X_reshaped_test
Y_reshaped_test = Y_labels_test[:num_sequences_test]

# Conversion en tenseurs pour X_test et Y_test
X_tensor_test = tf.convert_to_tensor(X_reshaped_test, dtype=tf.float32)
Y_tensor_test = tf.convert_to_tensor(Y_reshaped_test, dtype=tf.float32)

# Évaluer le modèle sur l'ensemble de test
loss, accuracy = model.evaluate(X_tensor_test, Y_tensor_test, verbose=0)
print(f'Précision sur l\'ensemble de test: {accuracy}')

# Faire des prédictions et évaluer les performances
y_pred = model.predict(X_tensor_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(Y_reshaped_test, axis=1)

from sklearn.metrics import classification_report, confusion_matrix

print('Rapport de classification:')
print(classification_report(y_true, y_pred_classes))
print('Matrice de confusion:')
print(confusion_matrix(y_true, y_pred_classes))

# Visualiser l'historique d'entraînement (optionnel si vous avez l'historique)
import matplotlib.pyplot as plt

# Assurez-vous d'avoir l'historique d'entraînement pour les graphiques
history = model.fit(X_tensor, Y_tensor, batch_size=1000, epochs=3)

# Précision
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Précision du modèle')
plt.ylabel('Précision')
plt.xlabel('Époque')
plt.legend(['Entraînement', 'Validation'], loc='upper left')
plt.show()

# Perte
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Perte du modèle')
plt.ylabel('Perte')
plt.xlabel('Époque')
plt.legend(['Entraînement', 'Validation'], loc='upper left')
plt.show()

MemoryError: Unable to allocate 662. MiB for an array with shape (18, 4824000) and data type int64

In [8]:
# Appliquer le one-hot encoding aux colonnes 'venue', 'action', et 'trade' pour X_test
df1_test = one_hot_encode(X_test, 'venue', 8)
df2_test = one_hot_encode(X_test, 'action', 8)
df3_test = one_hot_encode(X_test, 'trade', 8)

# Log Transform pour X_test
X_test['flux'] = X_test['flux'] - X_test['flux'].min() + 1
data_test = {
    'log(bid_size+1)': np.log(X_test['bid_size'] + 1),
    'log(ask_size+1)': np.log(X_test['ask_size'] + 1),
    'log(flux)': np.log(X_test['flux'])
}
dfa_test = pd.DataFrame(data_test)
X_selected_test = X_test[['bid', 'ask', 'price']]

# Combiner les features pour X_test
df_combined_test = pd.concat([df1_test, df2_test, df3_test, X_selected_test, dfa_test], axis=1)

# Reshape the data to create sequences of (100, 30)
# We need to ensure we have enough rows to create full sequences for X_test
num_sequences_test = len(df_combined_test) // 100
X_reshaped_test = df_combined_test.iloc[:num_sequences_test * 100].values.reshape(num_sequences_test, 100, 30)

# Convertir les labels en tenseurs pour Y_test
Y_labels_test = pd.get_dummies(Y_test['eqt_code_cat']).values
Y_reshaped_test = Y_labels_test[:num_sequences_test]

# Conversion en tenseurs pour X_test et Y_test
X_tensor_test = tf.convert_to_tensor(X_reshaped_test, dtype=tf.float32)
Y_tensor_test = tf.convert_to_tensor(Y_reshaped_test, dtype=tf.float32)

In [10]:
# 6. Évaluer le modèle sur l'ensemble de test
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f'Précision sur l\'ensemble de test: {accuracy}')

ValueError: Data cardinality is ambiguous:
  x sizes: 4824000
  y sizes: 48240
Make sure all arrays contain the same number of samples.

In [None]:
# Faire des prédictions et évaluer les performances
y_pred = model.predict(X_tensor_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(Y_reshaped_test, axis=1)

from sklearn.metrics import classification_report, confusion_matrix

print('Rapport de classification:')
print(classification_report(y_true, y_pred_classes))
print('Matrice de confusion:')
print(confusion_matrix(y_true, y_pred_classes))

# Visualiser l'historique d'entraînement (optionnel si vous avez l'historique)
import matplotlib.pyplot as plt

# Assurez-vous d'avoir l'historique d'entraînement pour les graphiques
history = model.fit(X_tensor, Y_tensor, batch_size=1000, epochs=3)

# Précision
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Précision du modèle')
plt.ylabel('Précision')
plt.xlabel('Époque')
plt.legend(['Entraînement', 'Validation'], loc='upper left')
plt.show()

# Perte
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Perte du modèle')
plt.ylabel('Perte')
plt.xlabel('Époque')
plt.legend(['Entraînement', 'Validation'], loc='upper left')
plt.show()