In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys
import logging
import pickle
import logging

# Data
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf

# Graph
import tensorflow_gnn as tfgnn

# Add the project root to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from scripts.data_cleaner import filter_top_cpv_categories
#from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer
from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer
from scripts.synthetic_anomaly_generator import analyze_original_dataset_anomalies, OriginalAnomalyRemover


%load_ext autoreload
%autoreload 2

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


2025-06-09 16:09:10.762485: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-09 16:09:10.767977: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-09 16:09:10.824359: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-09 16:09:10.824403: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-09 16:09:10.824454: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Configuration
# Both directories are resolved relative to the parent of the current working
# directory (the notebook is expected to run from a sub-folder of the project).
PROJECT_ROOT = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')
MODEL_PATH = os.path.join(PROJECT_ROOT, 'models', 'anomalies')

# Make sure the model output directory exists before anything tries to use it.
os.makedirs(MODEL_PATH, exist_ok=True)

# Initialize components
# GNN hyper-parameters are grouped so they can be tuned in one place.
gnn_params = {
    'hidden_dim': 64,
    'output_dim': 32,
    'num_layers': 3,
}
graph_builder = ProcurementGraphBuilder()
gnn_detector = GNNAnomalyDetector(**gnn_params)
analyzer = AnomalyAnalyzer()

In [3]:
# Load the data.
# `DATA_PATH` and `graph_builder` are both defined in the configuration cell;
# the path is intentionally not redefined here so there is a single source of
# truth for where the data lives.
X = graph_builder.load_data(DATA_PATH)

# Report how often each synthetic anomaly pattern already occurs in the
# loaded (uncleaned) data.
original_anomalies = analyze_original_dataset_anomalies(X)

INFO:scripts.gnn_anomaly_detection:Loading data from /home/ronan/code/RonanB400/Project/decp_ml/data
INFO:scripts.synthetic_anomaly_generator:Analyzing original dataset for existing anomaly patterns...
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: 13438 contracts (4.68%)
INFO:scripts.synthetic_anomaly_generator:price_inflation: 4888 contracts (1.70%)
INFO:scripts.synthetic_anomaly_generator:price_deflation: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:procedure_manipulation: 23140 contracts (8.07%)
INFO:scripts.synthetic_anomaly_generator:suspicious_modifications: 14089 contracts (4.91%)
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: 4898 contracts (1.71%)
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: 9488 contracts (3.31%)
INFO:scripts.synthetic_anomaly_generator:excessive_subcontracting: 42225 contracts (14.72%)
INFO:scripts.synthetic_anomaly_generator:short_contract_duration: 0 contracts (0.00%)
INFO:scripts.syn


ORIGINAL DATASET ANOMALY PATTERN ANALYSIS
Total contracts analyzed: 286,850

Analyzing prevalence of synthetic anomaly patterns in original data:
--------------------------------------------------------------------------------
Anomaly Type                        Count      Percentage   Status         
--------------------------------------------------------------------------------
Excessive Subcontracting            42,225     14.72       % 🔴 Very High    
Procedure Manipulation              23,140     8.07        % 🟡 High         
Suspicious Modifications            14,089     4.91        % 🟠 Medium       
Single Bid Competitive              13,438     4.68        % 🟠 Medium       
Suspicious Buyer Supplier Pairs     12,063     4.21        % 🟠 Medium       
Temporal Clustering                 9,488      3.31        % 🟠 Medium       
High Market Concentration           4,898      1.71        % 🟠 Medium       
Price Inflation                     4,888      1.70        % 🟠 Medium       

In [4]:
# Anomaly patterns whose pre-existing occurrences are removed from the data.
anomaly_types = [
    'single_bid_competitive',
    'price_inflation',
    'price_deflation',
    'high_market_concentration',
    'temporal_clustering',
    'suspicious_buyer_supplier_pairs',
]
# Currently disabled (rows matching these patterns are kept):
#   'procedure_manipulation', 'suspicious_modifications',
#   'excessive_subcontracting', 'short_contract_duration'

anomaly_remover = OriginalAnomalyRemover()

# Clean training set
# NOTE(review): `X` is the dataset loaded earlier; no train/test split is
# visible before this point — confirm "training set" is the intended wording.
logger.info("Cleaning training set...")
X_clean = anomaly_remover.clean_dataset(
    X,
    anomaly_types=anomaly_types,
    strict_threshold=True,
)


INFO:__main__:Cleaning training set...
INFO:scripts.synthetic_anomaly_generator:Identifying original anomalies for removal...
INFO:scripts.synthetic_anomaly_generator:Checking for: single_bid_competitive, price_inflation, price_deflation, high_market_concentration, temporal_clustering, suspicious_buyer_supplier_pairs
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: found 13438 anomalies
INFO:scripts.synthetic_anomaly_generator:price_inflation: found 4888 anomalies
INFO:scripts.synthetic_anomaly_generator:price_deflation: found 0 anomalies
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: found 4898 anomalies
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: found 9087 anomalies
INFO:scripts.synthetic_anomaly_generator:suspicious_buyer_supplier_pairs: found 12063 anomalies
INFO:scripts.synthetic_anomaly_generator:Total unique anomalous rows identified: 36315 (12.66%)
INFO:scripts.synthetic_anomaly_generator:Removed 36315 anomalous rows (12.6

In [5]:
# Re-run the anomaly-pattern prevalence report on the cleaned data to check
# what remains after removal.
# NOTE(review): this rebinds `original_anomalies`, so the result computed on
# the uncleaned `X` is no longer reachable under that name — consider a
# distinct name if both reports are needed later.
original_anomalies = analyze_original_dataset_anomalies(X_clean)

INFO:scripts.synthetic_anomaly_generator:Analyzing original dataset for existing anomaly patterns...
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:price_inflation: 5969 contracts (2.38%)
INFO:scripts.synthetic_anomaly_generator:price_deflation: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:procedure_manipulation: 24380 contracts (9.73%)
INFO:scripts.synthetic_anomaly_generator:suspicious_modifications: 12071 contracts (4.82%)
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: 257 contracts (0.10%)
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: 3292 contracts (1.31%)
INFO:scripts.synthetic_anomaly_generator:excessive_subcontracting: 36926 contracts (14.74%)
INFO:scripts.synthetic_anomaly_generator:short_contract_duration: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:suspicious_buyer_supplier_pairs: 12454 contracts (4.97%)



ORIGINAL DATASET ANOMALY PATTERN ANALYSIS
Total contracts analyzed: 250,535

Analyzing prevalence of synthetic anomaly patterns in original data:
--------------------------------------------------------------------------------
Anomaly Type                        Count      Percentage   Status         
--------------------------------------------------------------------------------
Excessive Subcontracting            36,926     14.74       % 🔴 Very High    
Procedure Manipulation              24,380     9.73        % 🟡 High         
Suspicious Buyer Supplier Pairs     12,454     4.97        % 🟠 Medium       
Suspicious Modifications            12,071     4.82        % 🟠 Medium       
Price Inflation                     5,969      2.38        % 🟠 Medium       
Temporal Clustering                 3,292      1.31        % 🟠 Medium       
High Market Concentration           257        0.10        % 🟢 Low          
Single Bid Competitive              0          0.00        % ✅ Very Low     