In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys
import logging
import pickle

# Data
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf

# Graph
import tensorflow_gnn as tfgnn

# Add the project root to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from scripts.data_cleaner import filter_top_cpv_categories
#from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer
from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer

from scripts.synthetic_anomaly_generator import SyntheticAnomalyGenerator

from scripts.preprocess_pipeline import create_pipeline_cat


%load_ext autoreload
%autoreload 2


2025-06-08 11:01:29.028618: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-08 11:01:29.030807: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-08 11:01:29.070735: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-08 11:01:29.070780: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-08 11:01:29.070796: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# The data folder is expected one level above the current working directory.
parent_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(parent_dir, 'data')

# Read the cleaned dataset and preview its first rows.
clean_csv_path = os.path.join(data_path, 'data_clean.csv')
df = pd.read_csv(clean_csv_path)
df.head()

Unnamed: 0,uid,id,nature,acheteur_id,acheteur_nom,acheteur_siren,titulaire_id,titulaire_typeIdentifiant,titulaire_nom,titulaire_siren,...,lieuExecution_typeCode,idAccordCadre,source_open_data,codeCPV_FR,codeCPV_2,codeCPV_3,codeCPV_4,codeCPV_5,codeCPV_2_3,annee
0,210601209000132022_M013,2022_M013,Marché,21060120900013,COMMUNE DE SAINT ETIENNE DE TINEE,210601209.0,38177692100029,SIRET,SERMATECH,381776921.0,...,Code postal,,data.gouv.fr decp-2024.json,Travaux de construction de bâtiments,45000000,45200000,45210000,45210000,45200000,2021
1,217100759000182024RENOCHARP,2024RENOCHARP,Marché,21710075900018,COMMUNE DE CHALMOUX,217100759.0,75203574100038,SIRET,MACON ETANCHEITE,752035741.0,...,Code postal,,data.gouv.fr decp-2025-01.json,Travaux de charpente et de couverture et trava...,45000000,45200000,45260000,45261000,45200000,2023
2,200066231000162022033INFOL00,2022033INFOL00,Marché,20006623100016,CC DES PORTES D'ARIEGE PYRENEES,200066231.0,49459697600014,SIRET,EQUADEX,494596976.0,...,Code postal,2022033INFOL00,data.gouv.fr decp-2024.json,Services de gestion d'installations pour le dé...,72000000,72500000,72510000,72514000,72000000,2023
3,243100518001702024M05,2024M05,Marché,24310051800170,TOULOUSE METROPOLE,243100518.0,59278023300017,SIRET,RIVES & EAUX DU SUD-OUEST,592780233.0,...,Code postal,,data.gouv.fr decp-2025-04.json,Services d'ingénierie,71000000,71300000,71300000,71300000,71300000,2024
4,21590544900017202402401,202402401,Marché,21590544900017,COMMUNE DE SAINT SAULVE,215905449.0,32683156700010,SIRET,ALTOMARE ALTALU,326831567.0,...,Code postal,,data.gouv.fr decp-2024.json,Serrurerie,44000000,44300000,44310000,44316000,44000000,2024


In [32]:
# Seeded generator for the synthetic anomaly rows.
generator = SyntheticAnomalyGenerator(random_seed=42)

# Hand the generator a copy of the loaded frame rather than `df` itself.
df_sample = df.copy()

# Anomaly scenarios requested from the generator.
anomaly_types = [
    'single_bid_competitive',
    'price_inflation',
    'price_deflation',
    'procedure_manipulation',
    'suspicious_modifications',
    'high_market_concentration',
    'temporal_clustering',
    'excessive_subcontracting',
    'short_contract_duration',
    'suspicious_buyer_supplier_pairs',
]

# Generate anomalies (10% anomaly share); returns the augmented frame together
# with per-type labels that are used as row masks in the cells below.
df_with_anomalies, anomaly_labels = generator.generate_anomalies(
    df_sample,
    anomaly_percentage=0.10,
    anomaly_types=anomaly_types,
)

df_with_anomalies.head()

INFO:scripts.synthetic_anomaly_generator:Generating 28685 total synthetic anomaly rows
INFO:scripts.synthetic_anomaly_generator:Approximately 2868 anomalies per type
INFO:scripts.synthetic_anomaly_generator:Generating single_bid_competitive anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 2868 single bid competitive anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating price_inflation anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 2868 price inflation anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating price_deflation anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 2868 price deflation anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating procedure_manipulation anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 2868 procedure manipulation anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating suspicious_modifications anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 2

Unnamed: 0,uid,id,nature,acheteur_id,acheteur_nom,acheteur_siren,titulaire_id,titulaire_typeIdentifiant,titulaire_nom,titulaire_siren,...,codeCPV_FR,codeCPV_2,codeCPV_3,codeCPV_4,codeCPV_5,codeCPV_2_3,annee,anomaly_type,source_type,is_synthetic_anomaly
0,210601209000132022_M013,2022_M013,Marché,21060120900013,COMMUNE DE SAINT ETIENNE DE TINEE,210601209.0,38177692100029,SIRET,SERMATECH,381776921.0,...,Travaux de construction de bâtiments,45000000,45200000,45210000,45210000,45200000,2021,,,0
1,217100759000182024RENOCHARP,2024RENOCHARP,Marché,21710075900018,COMMUNE DE CHALMOUX,217100759.0,75203574100038,SIRET,MACON ETANCHEITE,752035741.0,...,Travaux de charpente et de couverture et trava...,45000000,45200000,45260000,45261000,45200000,2023,,,0
2,200066231000162022033INFOL00,2022033INFOL00,Marché,20006623100016,CC DES PORTES D'ARIEGE PYRENEES,200066231.0,49459697600014,SIRET,EQUADEX,494596976.0,...,Services de gestion d'installations pour le dé...,72000000,72500000,72510000,72514000,72000000,2023,,,0
3,243100518001702024M05,2024M05,Marché,24310051800170,TOULOUSE METROPOLE,243100518.0,59278023300017,SIRET,RIVES & EAUX DU SUD-OUEST,592780233.0,...,Services d'ingénierie,71000000,71300000,71300000,71300000,71300000,2024,,,0
4,21590544900017202402401,202402401,Marché,21590544900017,COMMUNE DE SAINT SAULVE,215905449.0,32683156700010,SIRET,ALTOMARE ALTALU,326831567.0,...,Serrurerie,44000000,44300000,44310000,44316000,44000000,2024,,,0


In [4]:
# Count the values of `offresRecues` among rows labelled single_bid_competitive.
single_bid_mask = anomaly_labels['single_bid_competitive'] == True
df_with_anomalies.loc[single_bid_mask, 'offresRecues'].value_counts()

offresRecues
1.0    3187
Name: count, dtype: int64

In [5]:
# Preview the buyer / supplier / CPV triple for the first ten rows labelled
# high_market_concentration.
concentration_mask = anomaly_labels['high_market_concentration'] == True
concentration_columns = ['acheteur_id', 'titulaire_id', 'codeCPV_3']
df_with_anomalies.loc[concentration_mask, concentration_columns].head(10)

Unnamed: 0,acheteur_id,titulaire_id,codeCPV_3
290037,25380510500017,45254204600019,45200000
290038,25380510500017,45254204600019,45200000
290039,25380510500017,45254204600019,45200000
290040,20006810400013,82082937200094,71200000
290041,20006810400013,82082937200094,71200000
290042,20006810400013,82082937200094,71200000
290043,20003019500115,52783327100044,33700000
290044,20003019500115,52783327100044,33700000
290045,20003019500115,52783327100044,33700000
290046,21350034100010,39907312100028,45400000


In [6]:
# Rows for one specific buyer within one specific CPV group, with the
# synthetic-anomaly flag shown alongside the identifiers.
buyer_mask = df_with_anomalies['acheteur_id'] == 26972058700014
cpv_mask = df_with_anomalies['codeCPV_3'] == 92300000
buyer_cpv_columns = ['uid', 'acheteur_id', 'titulaire_id', 'codeCPV_3', 'is_synthetic_anomaly']
df_with_anomalies.loc[buyer_mask & cpv_mask, buyer_cpv_columns]

Unnamed: 0,uid,acheteur_id,titulaire_id,codeCPV_3,is_synthetic_anomaly
28921,2697205870001420232023062,26972058700014,92157965200014,92300000,False
53057,2697205870001420232023061,26972058700014,92157965200014,92300000,False
160488,26972058700014202420240102,26972058700014,52847284800019,92300000,False
255952,269720587000142024ACC2024010,26972058700014,52847284800019,92300000,False
256051,26972058700014202220221105,26972058700014,52847284800019,92300000,False


In [21]:
# Preview uid and amount for the first ten rows labelled price_inflation.
price_inflation_rows = df_with_anomalies[anomaly_labels['price_inflation'] == True]
price_inflation_rows[['uid', 'montant']].head(10)

Unnamed: 0,uid,montant
286850,21560052900014202424122,297064.1
286851,2251000150001820242024-34,285978.0
286852,229501275000152024AO190029,7650176.0
286853,226400018008762022161,3091628.0
286854,200054807000172022Z220024A00,1009584.0
286855,20003357900018202310888,321926.3
286856,3902899400001225-1086239,994529.6
286857,200067551000162023ENV233901,125264.3
286858,212902126000112024PT1004,260211.7
286859,21971123100072202323MA02LT06,437559.8


In [26]:
# Every row sharing one uid, so amounts can be compared across rows with
# different is_synthetic_anomaly flags.
uid_mask = df_with_anomalies['uid'] == '20003357900018202310888'
amount_columns = ['uid', 'montant', 'is_synthetic_anomaly', 'titulaire_id', 'acheteur_id']
df_with_anomalies.loc[uid_mask, amount_columns]

Unnamed: 0,uid,montant,is_synthetic_anomaly,titulaire_id,acheteur_id
21621,20003357900018202310888,64000.0,False,84199629100016,20003357900018
131453,20003357900018202310888,64000.0,False,43154181200068,20003357900018
286855,20003357900018202310888,321926.301585,True,43154181200068,20003357900018
288289,20003357900018202310888,16812.913479,True,43154181200068,20003357900018


In [28]:
# Preview uid and procedure for the first ten rows labelled procedure_manipulation.
procedure_mask = anomaly_labels['procedure_manipulation'] == True
df_with_anomalies.loc[procedure_mask, ['uid', 'procedure']].head(10)

Unnamed: 0,uid,procedure
295454,212505531000132024-MOE-TVX-2,Marché négocié sans publicité
295455,21440041800098202402,Procédure adaptée
295456,26590776600017202071002,Marché négocié sans publicité
295457,28750005200082202520250047,Procédure adaptée
295458,21720181300011202413042,Marché négocié sans publicité
295459,225900018012442023119300200,Marché négocié sans publicité
295460,254001399001232024MPIENR1_4C,Procédure adaptée
295461,218708501000182024V3F43806GF,Marché négocié sans publicité
295462,200030013000112460,Procédure adaptée
295463,231300021000122023231876,Marché négocié sans publicité


In [31]:
# Every row sharing one uid, to compare the procedure recorded on each of them.
same_uid_rows = df_with_anomalies[df_with_anomalies['uid'] == '200030013000112460']
same_uid_rows[['uid', 'procedure']]

Unnamed: 0,uid,procedure
95903,200030013000112460,Appel d'offres ouvert
101674,200030013000112460,Appel d'offres ouvert
153630,200030013000112460,Appel d'offres ouvert
195523,200030013000112460,Appel d'offres ouvert
199279,200030013000112460,Appel d'offres ouvert
235293,200030013000112460,Marché passé sans publicité ni mise en concurr...
269117,200030013000112460,Appel d'offres ouvert
288202,200030013000112460,Appel d'offres ouvert
295462,200030013000112460,Procédure adaptée
