In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN



from scripts.preprocess_pipeline import create_pipeline
from scripts.data_cleaner import filter_top_cpv_categories
from scripts.synthetic_anomaly_generator import (SyntheticAnomalyGenerator,
                                                OriginalAnomalyRemover)

In [5]:
# Notebook configuration: input location, output directory and CPV settings.
data_path = "../data/data_clean.csv"  # CSV loaded into `X` by the data-loading cell
model_save_path = "../data"  # not referenced in the cells visible here
cpv_column = "codeCPV_2_3"  # column used for category filtering and stratified splits
# NOTE(review): the filtering cell currently passes a literal top_n=60 instead
# of this value -- confirm which of the two is intended.
top_n = 40

In [4]:
# Load the cleaned dataset from the configured path into `X`.
X = pd.read_csv(filepath_or_buffer=data_path, encoding="utf-8")

In [None]:
# Keep only the top CPV categories (the selection rule lives in
# scripts.data_cleaner.filter_top_cpv_categories).
# NOTE(review): the literal 60 bypasses the `top_n` value defined in the
# configuration cell (40) -- confirm which of the two is intended.
X = filter_top_cpv_categories(X, cpv_column=cpv_column, top_n=60)

# Split before any anomaly removal: 20% of the rows are held out as the test
# set, stratified on the CPV category.
X_train, X_test = train_test_split(
    X,
    stratify=X[cpv_column],
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Carve the validation set (20% of the remaining rows) out of the training
# part, again stratified on the CPV category.
X_train, X_val = train_test_split(
    X_train,
    stratify=X_train[cpv_column],
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Remover and anomaly types applied to the training and validation sets in the
# next cell. Currently disabled types: 'procedure_manipulation',
# 'suspicious_modifications', 'excessive_subcontracting',
# 'short_contract_duration'.
anomaly_remover = OriginalAnomalyRemover()
anomaly_types = [
    'single_bid_competitive',
    'price_inflation',
    'price_deflation',
    'high_market_concentration',
    'temporal_clustering',
    'suspicious_buyer_supplier_pairs',
]


Filtered from 60 to 60 CPV categories, keeping 286412 rows out of 286412


In [None]:

# Remove the original anomalies from the training split first, then from the
# validation split, with the same anomaly types and strict thresholding.
# Both names are rebound to their cleaned versions, so re-running this cell
# cleans the already-cleaned splits again.
X_train, X_val = (
    anomaly_remover.clean_dataset(
        split,
        anomaly_types=anomaly_types,
        strict_threshold=True,
    )
    for split in (X_train, X_val)
)


INFO:scripts.synthetic_anomaly_generator:Identifying original anomalies for removal...
INFO:scripts.synthetic_anomaly_generator:Checking for: single_bid_competitive, price_inflation, price_deflation, high_market_concentration, temporal_clustering, suspicious_buyer_supplier_pairs
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: found 8595 anomalies
INFO:scripts.synthetic_anomaly_generator:price_inflation: found 3050 anomalies
INFO:scripts.synthetic_anomaly_generator:price_deflation: found 0 anomalies
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: found 2300 anomalies
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: found 3711 anomalies
INFO:scripts.synthetic_anomaly_generator:suspicious_buyer_supplier_pairs: found 6197 anomalies
INFO:scripts.synthetic_anomaly_generator:Total unique anomalous rows identified: 20161 (11.00%)
INFO:scripts.synthetic_anomaly_generator:Removed 20161 anomalous rows (11.00%)
INFO:scripts.synthetic_anomaly_genera

In [None]:

# Feature groups handed to the preprocessing pipeline.
numerical_columns = ['montant', 'dureeMois', 'offresRecues']
binary_columns = [
    'sousTraitanceDeclaree',
    'origineFrance',
    'marcheInnovant',
    'idAccordCadre',
]
categorical_columns = [
    'procedure',
    'nature',
    'formePrix',
    'ccag',
    'typeGroupementOperateurs',
    'tauxAvance_cat',
    cpv_column,
]
# Identifier columns that are not passed to the pipeline; they are appended
# as-is next to the transformed features.
nodes_columns = ['acheteur_id', 'titulaire_id']

preproc_pipeline = create_pipeline(
    numerical_columns,
    binary_columns,
    categorical_columns,
)

# Fit on the training split only, restore the original row labels on the
# pipeline output, then attach the identifier columns.
X_train_preproc = pd.concat(
    [
        preproc_pipeline.fit_transform(X_train).set_axis(X_train.index, axis=0),
        X_train[nodes_columns],
    ],
    axis=1,
)

# The validation split goes through the already-fitted pipeline (transform only).
X_val_preproc = pd.concat(
    [
        preproc_pipeline.transform(X_val).set_axis(X_val.index, axis=0),
        X_val[nodes_columns],
    ],
    axis=1,
)




In [None]:

# Build the evaluation set by injecting synthetic anomalies into the test split.
generator = SyntheticAnomalyGenerator(random_seed=42)

# Work on a copy with a fresh 0..n-1 index to avoid index mismatch issues.
X_test_copy = X_test.copy().reset_index(drop=True)

# Request 10% anomalies, using the same anomaly types as the removal step.
X_test_anomalies = generator.generate_anomalies(
    X_test_copy,
    anomaly_types=anomaly_types,
    anomaly_percentage=0.10,
)

# Transform with the pipeline fitted on the training split, restore the row
# labels, then attach the identifier columns unchanged.
X_test_preproc = pd.concat(
    [
        preproc_pipeline.transform(X_test_anomalies).set_axis(
            X_test_anomalies.index, axis=0
        ),
        X_test_anomalies[nodes_columns],
    ],
    axis=1,
)


INFO:scripts.synthetic_anomaly_generator:Generating 5728 total synthetic anomaly rows by replacing
INFO:scripts.synthetic_anomaly_generator:Approximately 954 anomalies per type
INFO:scripts.synthetic_anomaly_generator:Generating single_bid_competitive anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 954 single bid competitive anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating price_inflation anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 954 price inflation anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating price_deflation anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 954 price deflation anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating high_market_concentration anomalies...
INFO:scripts.synthetic_anomaly_generator:Generated 954 high market concentration anomaly rows
INFO:scripts.synthetic_anomaly_generator:Generating temporal_clustering anomalies...
INFO:scripts.synthetic_anomaly_generator:Gen

In [15]:
# Sanity check: print index range, column names, non-null counts and dtypes
# of the preprocessed training frame.
X_train_preproc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 163142 entries, 115161 to 179392
Data columns (total 96 columns):
 #   Column                                                                                Non-Null Count   Dtype  
---  ------                                                                                --------------   -----  
 0   offres_recues_pipeline__offresRecues                                                  163142 non-null  float64
 1   other_num_pipeline__montant                                                           163142 non-null  float64
 2   other_num_pipeline__dureeMois                                                         163142 non-null  float64
 3   binary_pipeline__sousTraitanceDeclaree                                                163142 non-null  float64
 4   binary_pipeline__origineFrance                                                        163142 non-null  float64
 5   binary_pipeline__marcheInnovant                                         

In [16]:
# Sanity check: print index range, column names, non-null counts and dtypes
# of the preprocessed test frame (synthetic anomalies included).
X_test_preproc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57283 entries, 0 to 57282
Data columns (total 96 columns):
 #   Column                                                                                Non-Null Count  Dtype  
---  ------                                                                                --------------  -----  
 0   offres_recues_pipeline__offresRecues                                                  57283 non-null  float64
 1   other_num_pipeline__montant                                                           57283 non-null  float64
 2   other_num_pipeline__dureeMois                                                         57283 non-null  float64
 3   binary_pipeline__sousTraitanceDeclaree                                                57283 non-null  float64
 4   binary_pipeline__origineFrance                                                        57283 non-null  float64
 5   binary_pipeline__marcheInnovant                                                  