In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys


# Sklearn preprocessing
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


# Load Data

In [2]:
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')

In [3]:
data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'data', 'data_cpv.csv'))
data.head()

Unnamed: 0,uid,id,nature,acheteur_id,acheteur_nom,acheteur_siren,titulaire_id,titulaire_typeIdentifiant,titulaire_nom,titulaire_siren,...,origineFrance,lieuExecution_code,lieuExecution_typeCode,idAccordCadre,source_open_data,codeCPV_FR,codeCPV_2,codeCPV_3,codeCPV_4,codeCPV_5
0,210601209000132022_M013,2022_M013,Marché,21060120900013,COMMUNE DE SAINT ETIENNE DE TINEE,210601209.0,38177692100029,SIRET,SERMATECH,381776921.0,...,,6660,Code postal,,data.gouv.fr decp-2024.json,Travaux de construction de bâtiments,45000000,45200000,45210000,45210000
1,217100759000182024RENOCHARP,2024RENOCHARP,Marché,21710075900018,COMMUNE DE CHALMOUX,217100759.0,75203574100038,SIRET,MACON ETANCHEITE,752035741.0,...,0.0,71140,Code postal,,data.gouv.fr decp-2025-01.json,Travaux de charpente et de couverture et trava...,45000000,45200000,45260000,45261000
2,200066231000162022033INFOL00,2022033INFOL00,Marché,20006623100016,CC DES PORTES D'ARIEGE PYRENEES,200066231.0,49459697600014,SIRET,EQUADEX,494596976.0,...,,9100,Code postal,2022033INFOL00,data.gouv.fr decp-2024.json,Services de gestion d'installations pour le dé...,72000000,72500000,72510000,72514000
3,243100518001702024M05,2024M05,Marché,24310051800170,TOULOUSE METROPOLE,243100518.0,59278023300017,SIRET,RIVES & EAUX DU SUD-OUEST,592780233.0,...,0.0,31000,Code postal,,data.gouv.fr decp-2025-04.json,Services d'ingénierie,71000000,71300000,71300000,71300000
4,21590544900017202402401,202402401,Marché,21590544900017,COMMUNE DE SAINT SAULVE,215905449.0,32683156700010,SIRET,ALTOMARE ALTALU,326831567.0,...,,59800,Code postal,,data.gouv.fr decp-2024.json,Serrurerie,44000000,44300000,44310000,44316000


# Preprocess

In [4]:
X = data[['montant', 'procedure', 'dureeMois', 'formePrix', 'attributionAvance',
          'sousTraitanceDeclaree', 'typeGroupementOperateurs', 'tauxAvance','codeCPV_3']]

In [5]:
cpv_3 = data['codeCPV_3'].value_counts()
cpv_3_list = cpv_3[cpv_3 > 200].reset_index()['codeCPV_3']

X = X[X['codeCPV_3'].isin(cpv_3_list)]

X = X[~X['dureeMois'].isna()].copy()

In [6]:
X_train, X_test = train_test_split(X, test_size=0.2, random_state=0)

In [7]:
X_train.head()

Unnamed: 0,montant,procedure,dureeMois,formePrix,attributionAvance,sousTraitanceDeclaree,typeGroupementOperateurs,tauxAvance,codeCPV_3
235924,68695.95,Procédure adaptée,11.0,Unitaire,,,Pas de groupement,0.0,45400000
161978,2093.0,Procédure adaptée,12.0,Unitaire,0.0,0.0,Pas de groupement,0.0,90400000
298419,3333333.33,Procédure adaptée,48.0,Mixte,,,Pas de groupement,0.0,72500000
127992,248300.64,Procédure adaptée,36.0,Forfaitaire,0.0,0.0,Solidaire,0.0,45400000
291910,87265.55,Procédure adaptée,14.0,Mixte,,,Pas de groupement,,44300000


In [8]:
features_imputer_2 = ['attributionAvance', 'sousTraitanceDeclaree']

features_imputer_0 = ['tauxAvance']

# AttributionAvance
imputer_2 = SimpleImputer(strategy='constant', fill_value=2)

# TauxAvance
imputer_0 = SimpleImputer(strategy='constant', fill_value=0)

# All ohe features
ohe_features = ['procedure', 'formePrix', 'typeGroupementOperateurs','codeCPV_3']

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='if_binary')

imputer_2_pipe = make_pipeline(imputer_2, ohe)

# Numerical features
numerical_features = ['montant', 'dureeMois']
robust_scaler = RobustScaler()

imputer_0_pipe = make_pipeline(imputer_0, robust_scaler)



preproc_baseline = ColumnTransformer([
                    ('imputer_2', imputer_2_pipe, features_imputer_2),
                    ('imputer_0', imputer_0_pipe, features_imputer_0),
                    ('scale', robust_scaler, numerical_features),
                    ('ohe', ohe, ohe_features)
                    ]).set_output(transform='pandas')


preproc_baseline


In [11]:
X_train_preproc = preproc_baseline.fit_transform(X_train)
X_test_preproc = preproc_baseline.transform(X_test)

In [10]:
model_if = IsolationForest(n_estimators=100, random_state=0)

pipeline = make_pipeline(preproc_baseline, model_if)

pipeline

In [14]:
pipeline.fit(X_train)

anomalies = pipeline.predict(X_test)

X_test['anomaly'] = anomalies

X_test[X_test['anomaly'] == -1]

Unnamed: 0,montant,procedure,dureeMois,formePrix,attributionAvance,sousTraitanceDeclaree,typeGroupementOperateurs,tauxAvance,codeCPV_3,anomaly


In [16]:
X_test['anomaly'].value_counts()

anomaly
1    57473
Name: count, dtype: int64