In [1]:
import pandas as pd
import numpy as np
import os
import sys

from sklearn.model_selection import train_test_split

from keras import Sequential, layers, Input
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('../data/data_clean.csv')

In [3]:
from scripts.data_cleaner import filter_top_cpv_categories

df = filter_top_cpv_categories(df, top_n=150, cpv_column='codeCPV_3')

Filtered from 392 to 150 CPV categories, keeping 279174 rows out of 286850


In [4]:
df.drop(df[df['montant'] > 999999].index, inplace=True)
df.shape

(238163, 38)

In [5]:
y = np.log1p(df['montant'])
y.head()

0    11.292279
1    10.882925
4    13.235694
5    11.350889
6    12.044412
Name: montant, dtype: float64

In [6]:
bins = np.linspace(y.min(), y.max(), 10)

# Créer les étiquettes correspondantes
labels = list(range(1, len(bins)))

# Ajouter la colonne fourchette_de_prix
df['bins'] = pd.cut(y, bins=bins, labels=labels, include_lowest=True, right=True)

df.shape

(238163, 39)

In [7]:
df['bins'].isna().sum()

0

In [8]:
df['bins'] = df['bins'].astype('int')

In [9]:
df['bins'].unique

<bound method Series.unique of 0         4
1         3
4         8
5         4
6         5
         ..
286844    3
286845    5
286846    2
286848    9
286849    2
Name: bins, Length: 238163, dtype: int64>

In [10]:
df = df.drop(columns='montant')
df.columns

Index(['uid', 'id', 'nature', 'acheteur_id', 'acheteur_nom', 'acheteur_siren',
       'titulaire_id', 'titulaire_typeIdentifiant', 'titulaire_nom',
       'titulaire_siren', 'objet', 'codeCPV', 'procedure', 'dureeMois',
       'dateNotification', 'datePublicationDonnees', 'formePrix',
       'attributionAvance', 'offresRecues', 'marcheInnovant', 'ccag',
       'sousTraitanceDeclaree', 'typeGroupementOperateurs', 'tauxAvance',
       'origineUE', 'origineFrance', 'lieuExecution_code',
       'lieuExecution_typeCode', 'idAccordCadre', 'source_open_data',
       'codeCPV_FR', 'codeCPV_2', 'codeCPV_3', 'codeCPV_4', 'codeCPV_5',
       'codeCPV_2_3', 'annee', 'bins'],
      dtype='object')

In [11]:
X = df.drop(columns='bins')
y = df['bins']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=X['codeCPV_3'])

In [13]:
from scripts.preprocess_pipeline import create_pipeline_cat

cat_list = ['marche_sim', 'pred_montant', 'anomalie']
cat = cat_list[1]
pipeline = create_pipeline_cat(cat)

In [14]:
X_train_preproc = pipeline.fit_transform(X_train)

In [15]:
X_train_preproc.shape

(190530, 184)

In [16]:
y_train.shape

(190530,)

In [17]:
#Création du model --> couches Dense

In [18]:
model = Sequential()

model.add(Input(shape=(184,)))

model.add(layers.Dense(200, activation='relu'))
model.add(layers.Dense(150, activation='relu'))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(20, activation='relu'))

model.add(layers.Dense(1, activation='softmax'))

model.summary()

In [19]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [20]:
es = EarlyStopping(patience=10, restore_best_weights=True)

history = model.fit(
        X_train_preproc, y_train,
        validation_split=0.2,
        batch_size=32,
        epochs=100,
        callbacks=[es],
        verbose=1)

Epoch 1/100


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m4732/4764[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 918us/step - accuracy: 0.0267 - loss: 0.0000e+00

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.0267 - loss: 0.0000e+00 - val_accuracy: 0.0265 - val_loss: 0.0000e+00
Epoch 2/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 847us/step - accuracy: 0.0267 - loss: 0.0000e+00 - val_accuracy: 0.0265 - val_loss: 0.0000e+00
Epoch 3/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 830us/step - accuracy: 0.0264 - loss: 0.0000e+00 - val_accuracy: 0.0265 - val_loss: 0.0000e+00
Epoch 4/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 845us/step - accuracy: 0.0266 - loss: 0.0000e+00 - val_accuracy: 0.0265 - val_loss: 0.0000e+00
Epoch 5/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 831us/step - accuracy: 0.0260 - loss: 0.0000e+00 - val_accuracy: 0.0265 - val_loss: 0.0000e+00
Epoch 6/100
[1m4764/4764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 848us/step - accuracy: 0.0262 - loss: 0.0000e+00 - val_accu