In [29]:
import pandas as pd
import numpy as np
import os
import sys

from sklearn.model_selection import train_test_split

from keras import Sequential, layers
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('../data/data_clean.csv')

In [3]:
from scripts.data_cleaner import filter_top_cpv_categories

df = filter_top_cpv_categories(df, top_n=150, cpv_column='codeCPV_3')

Filtered from 392 to 150 CPV categories, keeping 279174 rows out of 286850


In [4]:
df.drop(df[df['montant'] > 999999].index, inplace=True)
df.shape

(238163, 38)

In [5]:
y = np.log1p(df['montant'])
y.head()

0    11.292279
1    10.882925
4    13.235694
5    11.350889
6    12.044412
Name: montant, dtype: float64

In [6]:
bins = np.linspace(y.min(), y.max(), 10)

# Créer les étiquettes correspondantes
labels = list(range(1, len(bins)))

# Ajouter la colonne fourchette_de_prix
df['bins'] = pd.cut(y, bins=bins, labels=labels, include_lowest=True, right=True)

df.shape

(238163, 39)

In [7]:
df['bins'].isna().sum()

0

In [8]:
df['bins'] = df['bins'].astype('int')

In [9]:
df['bins'].unique

<bound method Series.unique of 0         4
1         3
4         8
5         4
6         5
         ..
286844    3
286845    5
286846    2
286848    9
286849    2
Name: bins, Length: 238163, dtype: int64>

In [10]:
df = df.drop(columns='montant')
df.columns

Index(['uid', 'id', 'nature', 'acheteur_id', 'acheteur_nom', 'acheteur_siren',
       'titulaire_id', 'titulaire_typeIdentifiant', 'titulaire_nom',
       'titulaire_siren', 'objet', 'codeCPV', 'procedure', 'dureeMois',
       'dateNotification', 'datePublicationDonnees', 'formePrix',
       'attributionAvance', 'offresRecues', 'marcheInnovant', 'ccag',
       'sousTraitanceDeclaree', 'typeGroupementOperateurs', 'tauxAvance',
       'origineUE', 'origineFrance', 'lieuExecution_code',
       'lieuExecution_typeCode', 'idAccordCadre', 'source_open_data',
       'codeCPV_FR', 'codeCPV_2', 'codeCPV_3', 'codeCPV_4', 'codeCPV_5',
       'codeCPV_2_3', 'annee', 'bins'],
      dtype='object')

In [11]:
X = df.drop(columns='bins')
y = df['bins']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=X['codeCPV_3'])

In [13]:
from scripts.preprocess_pipeline import create_pipeline_cat

cat_list = ['marche_sim', 'pred_montant', 'anomalie']
cat = cat_list[1]
pipeline = create_pipeline_cat(cat)

In [14]:
X_train_preproc = pipeline.fit_transform(X_train)

In [34]:
X_train_preproc.shape

(190530, 184)

In [37]:
from keras.utils import to_categorical
y_train_cat = to_categorical(y_train)

In [39]:
y_train_cat.shape

(190530, 10)

In [17]:
#Création du model --> couches Dense

In [40]:
from keras import metrics

# Modèle
model = Sequential()

# Entrée
model.add(layers.Input(shape=(184,)))

# Couches cachées
model.add(layers.Dense(512, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.3))

model.add(layers.Dense(256, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.3))

model.add(layers.Dense(128, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.3))

# Couche de sortie : 10 classes
model.add(layers.Dense(10, activation='softmax'))

# Compilation du modèle
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # ou 'sparse_categorical_crossentropy' si tes labels sont des entiers
    metrics=[
        'accuracy',
        metrics.Precision(name='precision'),
        metrics.Recall(name='recall'),
        metrics.AUC(name='auc'),
        metrics.TopKCategoricalAccuracy(k=5, name='top_5_accuracy')
    ]
)

# Affichage du résumé
model.summary()

In [31]:
# model.compile(
#     optimizer='adam',
#     loss='categorical_crossentropy',
#     metrics=['accuracy', 'precision', 'recall'])

In [41]:
es = EarlyStopping(patience=10, restore_best_weights=True)

history = model.fit(
        X_train_preproc, y_train_cat,
        validation_split=0.2,
        batch_size=32,
        epochs=100,
        callbacks=[es],
        verbose=1)

Epoch 1/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.1800 - auc: 0.6768 - loss: 2.2172 - precision: 0.1694 - recall: 0.0089 - top_5_accuracy: 0.7406 - val_accuracy: 0.2113 - val_auc: 0.7369 - val_loss: 1.9677 - val_precision: 0.5000 - val_recall: 7.8728e-05 - val_top_5_accuracy: 0.8159
Epoch 2/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.2071 - auc: 0.7330 - loss: 1.9787 - precision: 0.1068 - recall: 2.0463e-05 - top_5_accuracy: 0.8129 - val_accuracy: 0.2112 - val_auc: 0.7409 - val_loss: 1.9573 - val_precision: 0.4634 - val_recall: 9.9722e-04 - val_top_5_accuracy: 0.8232
Epoch 3/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.2088 - auc: 0.7388 - loss: 1.9636 - precision: 0.1550 - recall: 4.0979e-05 - top_5_accuracy: 0.8187 - val_accuracy: 0.2153 - val_auc: 0.7423 - val_loss: 1.9544 - val_precision: 0.6495 - val_recall: 0.0017 - val_top_5