In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from keras import Sequential, layers, Input, regularizers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical



from scripts.data_cleaner import filter_top_cpv_categories
from scripts.preprocess_pipeline import create_pipeline


2025-06-10 21:51:20.882083: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-10 21:51:20.884567: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-10 21:51:20.935095: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-10 21:51:20.935152: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-10 21:51:20.935178: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
# Load the cleaned contracts and keep the top 150 CPV categories (top_n=150).
# NOTE(review): the saved output shows a warning raised on the read_csv line --
# possibly a mixed-dtype warning; confirm and pin column dtypes if so.
df = pd.read_csv('../data/data_clean.csv')
df = filter_top_cpv_categories(df, top_n=150, cpv_column='codeCPV_3')

# Remove contracts whose amount exceeds 999,999. Rebinding `df` instead of
# using inplace=True avoids mutating the frame in place.
df = df.drop(index=df[df['montant'] > 999_999].index)

# Fill missing buyer attributes BEFORE the feature matrix is derived.
# `DataFrame.drop` returns a new frame, so filling `df` after `X` has been
# built (as this cell used to do) never reaches `X`.
# NOTE(review): if create_pipeline already imputes these columns, confirm that
# the explicit 'null' category is the intended encoding.
buyer_columns = ['acheteur_tranche_effectif', 'acheteur_categorie']
df[buyer_columns] = df[buyer_columns].fillna('null')

# Discretise the log-amount into price ranges: 10 evenly spaced edges, then the
# second edge is removed so the first two intervals merge (9 edges -> 8 classes).
log_montant = np.log1p(df['montant'])
bins = np.linspace(log_montant.min(), log_montant.max(), 10)
bins = np.delete(bins, 1)

# One integer label per interval.
labels = list(range(len(bins) - 1))

# Add the price-range column; include_lowest=True keeps the minimum amount
# inside the first interval.
df['bins'] = pd.cut(
    log_montant, bins=bins, labels=labels, include_lowest=True, right=True
).astype('int')
df = df.drop(columns='montant')

# Features exclude the target and the two contractor ("titulaire") columns.
X = df.drop(columns=['bins', 'titulaire_tranche_effectif', 'titulaire_categorie'])
y = df['bins']


  df = pd.read_csv('../data/data_clean.csv')


Filtered from 392 to 150 CPV categories, keeping 279174 rows out of 286850


In [3]:
# Hold out 20% of the rows. The split is stratified on the CPV category column
# (not on the target), with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=X['codeCPV_3'])

# Column groups handed to the preprocessing pipeline.
numerical_columns = ['dureeMois', 'offresRecues', 'annee']

binary_columns = [
    'sousTraitanceDeclaree', 'origineFrance',
    'marcheInnovant', 'idAccordCadre',
]

categorical_columns = [
    'procedure', 'nature', 'formePrix', 'ccag',
    'typeGroupementOperateurs', 'tauxAvance_cat',
    'codeCPV_3', 'acheteur_tranche_effectif', 'acheteur_categorie',
]

pipeline = create_pipeline(numerical_columns, binary_columns, categorical_columns)

# The pipeline is fitted on the training split only.
X_train_preproc = pipeline.fit_transform(X_train)

# Pin the one-hot width to the number of price bins. Without num_classes the
# width is inferred from the largest label present in y_train, so a split that
# happens to miss the top bin would silently yield a narrower target.
y_train_cat = to_categorical(y_train, num_classes=len(labels))

In [4]:
# Sanity check: shape of the one-hot target, (n_training_rows, n_classes).
y_train_cat.shape

(190530, 8)

In [5]:
def build_mlp_model(input_dim=220, num_classes=8, use_dropout=True, l2_factor=1e-4,
                    hidden_units=(1028, 512, 256), dropout_rates=(0.4, 0.3, 0.2)):
    """Build and compile a fully connected softmax classifier.

    Each hidden stage is Dense(relu, L2) -> BatchNormalization -> optional
    Dropout. The output layer is a Dense softmax over ``num_classes``, also
    L2-regularised. The model is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output classes (width of the softmax layer).
        use_dropout: When True, a Dropout layer follows each hidden stage.
        l2_factor: L2 penalty applied to the kernel of every Dense layer.
        hidden_units: Width of each hidden Dense layer, in order. The default
            reproduces the original three-stage architecture.
            NOTE(review): 1028 may have been meant as 1024; left unchanged.
        dropout_rates: Dropout rate for each hidden stage; must have the same
            length as ``hidden_units``. Ignored when ``use_dropout`` is False.

    Returns:
        The compiled Sequential model.

    Raises:
        ValueError: If ``hidden_units`` and ``dropout_rates`` differ in length.
    """
    if len(hidden_units) != len(dropout_rates):
        raise ValueError(
            "hidden_units and dropout_rates must have the same length, got "
            f"{len(hidden_units)} and {len(dropout_rates)}"
        )

    model = Sequential()
    model.add(Input(shape=(input_dim,)))

    # Hidden stages: one iteration per (width, dropout rate) pair.
    for units, rate in zip(hidden_units, dropout_rates):
        model.add(layers.Dense(
            units, activation='relu',
            kernel_regularizer=regularizers.l2(l2_factor)
        ))
        model.add(layers.BatchNormalization())
        if use_dropout:
            model.add(layers.Dropout(rate))

    # Output layer
    model.add(layers.Dense(
        num_classes, activation='softmax',
        kernel_regularizer=regularizers.l2(l2_factor)
    ))

    # Compilation
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


In [6]:
# Size the network from the arrays it will be trained on instead of relying on
# the hard-coded defaults of build_mlp_model, so a change in the preprocessing
# output width or in the number of bins cannot cause a shape mismatch at fit time.
model = build_mlp_model(
    input_dim=X_train_preproc.shape[1],
    num_classes=y_train_cat.shape[1],
)

# Both callbacks watch the validation loss (spelled out; same as the default).
# The learning rate is halved after 5 stagnant epochs, and training stops after
# 8, restoring the best weights seen.
es = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, verbose=1, min_lr=1e-6)

# 20% of the training data is held out by Keras for validation.
history = model.fit(
    X_train_preproc, y_train_cat,
    validation_split=0.2,
    batch_size=128,
    epochs=150,
    callbacks=[es, lr_scheduler],
    verbose=1,
)

Epoch 1/150
Epoch 2/150

KeyboardInterrupt: 

In [None]:
# Pull the per-epoch training and validation curves out of the fit history.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

# One figure, two side-by-side panels: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax_acc, acc, val_acc, 'Accuracy', 'Val Accuracy',
     'Training and Validation Accuracy'),
    (ax_loss, loss, val_loss, 'Loss', 'Val Loss',
     'Training and Validation Loss'),
)

# Each panel overlays the training curve and its validation counterpart.
for ax, train_curve, val_curve, train_label, val_label, panel_title in panels:
    ax.plot(train_curve, label=train_label)
    ax.plot(val_curve, label=val_label)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(train_label)
    ax.set_title(panel_title)
    ax.legend()

# Display
plt.tight_layout()
plt.show()