In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys
import logging
import pickle
import logging

# Data
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf

# Graph
import tensorflow_gnn as tfgnn

# Add the project root to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from scripts.data_cleaner import filter_top_cpv_categories
#from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer
from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer
from scripts.synthetic_anomaly_generator import analyze_original_dataset_anomalies, OriginalAnomalyRemover


%load_ext autoreload
%autoreload 2

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Configuration
# Both directories are resolved relative to the parent of the current working
# directory, so the notebook is expected to be launched from a sub-folder of
# the project.
PROJECT_ROOT = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')
MODEL_PATH = os.path.join(PROJECT_ROOT, 'models', 'anomalies')

# Make sure the model output directory exists (no error if it already does)
os.makedirs(MODEL_PATH, exist_ok=True)

# Initialize components
graph_builder = ProcurementGraphBuilder()
gnn_detector = GNNAnomalyDetector(
    hidden_dim=64,
    output_dim=32,
    num_layers=3,
)
analyzer = AnomalyAnalyzer()

In [6]:
# Load and preprocess data
# DATA_PATH is defined once in the configuration cell; it is deliberately not
# redefined here so there is a single source of truth for the data directory.
X = graph_builder.load_data(DATA_PATH)

# Baseline: measure how often each anomaly pattern already occurs in the raw data
original_anomalies = analyze_original_dataset_anomalies(X)

INFO:scripts.gnn_anomaly_detection:Loading data from /home/ronan/code/RonanB400/Project/decp_ml/data
INFO:scripts.synthetic_anomaly_generator:Analyzing original dataset for existing anomaly patterns...
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: 13438 contracts (4.68%)
INFO:scripts.synthetic_anomaly_generator:price_inflation: 4888 contracts (1.70%)
INFO:scripts.synthetic_anomaly_generator:price_deflation: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:procedure_manipulation: 23140 contracts (8.07%)
INFO:scripts.synthetic_anomaly_generator:suspicious_modifications: 14089 contracts (4.91%)
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: 4898 contracts (1.71%)
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: 9488 contracts (3.31%)
INFO:scripts.synthetic_anomaly_generator:excessive_subcontracting: 42225 contracts (14.72%)
INFO:scripts.synthetic_anomaly_generator:short_contract_duration: 0 contracts (0.00%)
INFO:scripts.syn


ORIGINAL DATASET ANOMALY PATTERN ANALYSIS
Total contracts analyzed: 286,850

Analyzing prevalence of synthetic anomaly patterns in original data:
--------------------------------------------------------------------------------
Anomaly Type                        Count      Percentage   Status         
--------------------------------------------------------------------------------
Excessive Subcontracting            42,225     14.72       % 🔴 Very High    
Procedure Manipulation              23,140     8.07        % 🟡 High         
Suspicious Modifications            14,089     4.91        % 🟠 Medium       
Single Bid Competitive              13,438     4.68        % 🟠 Medium       
Temporal Clustering                 9,488      3.31        % 🟠 Medium       
Suspicious Buyer Supplier Pairs     7,887      2.75        % 🟠 Medium       
High Market Concentration           4,898      1.71        % 🟠 Medium       
Price Inflation                     4,888      1.70        % 🟠 Medium       

In [7]:
# Anomaly patterns to strip from the raw data. The commented-out entries are
# patterns that are intentionally NOT removed in this run.
anomaly_types = [
    'single_bid_competitive',
    'price_inflation',
    'price_deflation',
    # 'procedure_manipulation',
    # 'suspicious_modifications',
    'high_market_concentration',
    'temporal_clustering',
    # 'excessive_subcontracting',
    # 'short_contract_duration',
    'suspicious_buyer_supplier_pairs',
]

anomaly_remover = OriginalAnomalyRemover()

# Clean training set
logger.info("Cleaning training set...")
X_clean = anomaly_remover.clean_dataset(
    X, anomaly_types=anomaly_types, strict_threshold=True)


INFO:__main__:Cleaning training set...
INFO:scripts.synthetic_anomaly_generator:Identifying original anomalies for removal...
INFO:scripts.synthetic_anomaly_generator:Checking for: single_bid_competitive, price_inflation, price_deflation, high_market_concentration, temporal_clustering, suspicious_buyer_supplier_pairs
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: found 5644 anomalies
INFO:scripts.synthetic_anomaly_generator:price_inflation: found 3536 anomalies
INFO:scripts.synthetic_anomaly_generator:price_deflation: found 0 anomalies
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: found 264 anomalies
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: found 9087 anomalies
INFO:scripts.synthetic_anomaly_generator:suspicious_buyer_supplier_pairs: found 66 anomalies
INFO:scripts.synthetic_anomaly_generator:Total unique anomalous rows identified: 17746 (6.19%)
INFO:scripts.synthetic_anomaly_generator:Removed 17746 anomalous rows (6.19%)
INF

In [8]:
# Re-run the same prevalence analysis on the cleaned frame to see which
# patterns are still present after removal.
# Note: this rebinds `original_anomalies`, replacing the result that was
# computed on the raw data `X` in the loading cell.
original_anomalies = analyze_original_dataset_anomalies(X_clean)

INFO:scripts.synthetic_anomaly_generator:Analyzing original dataset for existing anomaly patterns...
INFO:scripts.synthetic_anomaly_generator:single_bid_competitive: 7404 contracts (2.75%)
INFO:scripts.synthetic_anomaly_generator:price_inflation: 6621 contracts (2.46%)
INFO:scripts.synthetic_anomaly_generator:price_deflation: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:procedure_manipulation: 25208 contracts (9.37%)
INFO:scripts.synthetic_anomaly_generator:suspicious_modifications: 13382 contracts (4.97%)
INFO:scripts.synthetic_anomaly_generator:high_market_concentration: 3628 contracts (1.35%)
INFO:scripts.synthetic_anomaly_generator:temporal_clustering: 4644 contracts (1.73%)
INFO:scripts.synthetic_anomaly_generator:excessive_subcontracting: 39893 contracts (14.82%)
INFO:scripts.synthetic_anomaly_generator:short_contract_duration: 0 contracts (0.00%)
INFO:scripts.synthetic_anomaly_generator:suspicious_buyer_supplier_pairs: 6460 contracts (2.40%)



ORIGINAL DATASET ANOMALY PATTERN ANALYSIS
Total contracts analyzed: 269,104

Analyzing prevalence of synthetic anomaly patterns in original data:
--------------------------------------------------------------------------------
Anomaly Type                        Count      Percentage   Status         
--------------------------------------------------------------------------------
Excessive Subcontracting            39,893     14.82       % 🔴 Very High    
Procedure Manipulation              25,208     9.37        % 🟡 High         
Suspicious Modifications            13,382     4.97        % 🟠 Medium       
Single Bid Competitive              7,404      2.75        % 🟠 Medium       
Price Inflation                     6,621      2.46        % 🟠 Medium       
Suspicious Buyer Supplier Pairs     6,460      2.40        % 🟠 Medium       
Temporal Clustering                 4,644      1.73        % 🟠 Medium       
High Market Concentration           3,628      1.35        % 🟠 Medium       

In [None]:
# Now using all three splits: train, val, test (with synthetic anomalies)
# Feed the CLEANED frame into preprocessing: the previous cells build and
# analyse `X_clean`, but the raw `X` was passed here, so the anomaly removal
# had no effect on anything downstream.
# NOTE(review): confirm that cleaning is meant to apply before the split.
(X_train_preproc, X_val_preproc, X_test_preproc,
    X_train, X_val, X_test) = graph_builder.preprocess_data(X_clean)

# Create graphs for all three splits
X_train_graph = graph_builder.create_graph(X_train_preproc, X_train, type='train')
X_val_graph = graph_builder.create_graph(X_val_preproc, X_val, type='val')
X_test_graph = graph_builder.create_graph(X_test_preproc, X_test, type='test')

In [None]:
# Scale derived node/edge features. The scalers are fitted on the training
# split only and then re-used unchanged for validation and test.
logger.info("Scaling derived node and edge features...")
node_scaler = graph_builder.node_scaler
edge_scaler = graph_builder.edge_scaler

# --- Train: fit the scalers and transform in one step
node_features_train, edge_features_train = (
    X_train_graph['node_features'], X_train_graph['edge_features'])
node_features_train_scaled = node_scaler.fit_transform(node_features_train)
edge_features_train_scaled = edge_scaler.fit_transform(edge_features_train)

# --- Validation: transform only, with the training-fitted scalers
node_features_val, edge_features_val = (
    X_val_graph['node_features'], X_val_graph['edge_features'])
node_features_val_scaled = node_scaler.transform(node_features_val)
edge_features_val_scaled = edge_scaler.transform(edge_features_val)

# --- Test: transform only, with the training-fitted scalers
node_features_test, edge_features_test = (
    X_test_graph['node_features'], X_test_graph['edge_features'])
node_features_test_scaled = node_scaler.transform(node_features_test)
edge_features_test_scaled = edge_scaler.transform(edge_features_test)

# Build one TensorFlow graph per split from the scaled features
X_train_tf_graph = gnn_detector.create_tensorflow_graph(
    X_train_graph,
    node_features_train_scaled,
    edge_features_train_scaled,
)
X_val_tf_graph = gnn_detector.create_tensorflow_graph(
    X_val_graph,
    node_features_val_scaled,
    edge_features_val_scaled,
)
X_test_tf_graph = gnn_detector.create_tensorflow_graph(
    X_test_graph,
    node_features_test_scaled,
    edge_features_test_scaled,
)

# Keep the graph tensors on the detector for later use
gnn_detector.graph_tensor_train = X_train_tf_graph
gnn_detector.graph_tensor_val = X_val_tf_graph
gnn_detector.graph_tensor_test = X_test_tf_graph

# Edge anomalies

In [None]:
# Input widths are read from the training graph's feature matrices
# (second dimension of each).
n_node_features = X_train_graph['node_features'].shape[1]
n_edge_features = X_train_graph['edge_features'].shape[1]

# Build the edge model with regularisation and dropout switched off
gnn_detector.edge_model = gnn_detector.build_edge_model(
    n_node_features,
    n_edge_features,
    l2_regularization=0,
    dropout_rate=0,
)

# Train on the training graph, monitoring the validation graph
edge_history = gnn_detector.train_edge_model(
    X_train_tf_graph,
    validation_graph_tensor=X_val_tf_graph,
    epochs=100,
)

gnn_detector.plot_edge_training_history(edge_history)

In [None]:
# Score the test graph; the detector returns the reconstruction errors together
# with the threshold derived from `threshold_percentile`.
edge_detection = gnn_detector.detect_edge_anomalies(
    X_test_tf_graph, threshold_percentile=10)
edge_reconstruction_error, edge_threshold = edge_detection

# Calculate anomaly masks: flag errors strictly above the threshold
edge_anomalies = edge_reconstruction_error > edge_threshold

In [None]:
# Evaluate the edge detector against the test graph, using the same
# percentile (10) that was passed to the detection step; plots are disabled.
synthetic_analysis = analyzer.analyze_synthetic_anomaly_detection(
    X_test_graph,
    edge_reconstruction_error,
    edge_threshold,
    threshold_percentile=10,
    show_plots=False,
)

# Node anomalies

In [None]:
# Input widths are read from the training graph's feature matrices
# (second dimension of each).
n_node_features = X_train_graph['node_features'].shape[1]
n_edge_features = X_train_graph['edge_features'].shape[1]

# Build the node model with regularisation and dropout switched off
gnn_detector.node_model = gnn_detector.build_node_model(
    n_node_features,
    n_edge_features,
    l2_regularization=0,
    dropout_rate=0,
)

# Train on the training graph, monitoring the validation graph
node_history = gnn_detector.train_node_model(
    X_train_tf_graph,
    validation_graph_tensor=X_val_tf_graph,
    epochs=50,
)

gnn_detector.plot_node_training_history(node_history)

In [None]:

# Run detection with the detector's defaults (no arguments are passed).
# Note: this rebinds `edge_reconstruction_error` and `edge_threshold`,
# overwriting the values produced by the edge-only detection cell.
detection = gnn_detector.detect_anomalies()
(node_reconstruction_error, edge_reconstruction_error,
 node_threshold, edge_threshold) = detection

# Calculate anomaly masks: flag errors strictly above the matching threshold
node_anomalies = node_reconstruction_error > node_threshold
edge_anomalies = edge_reconstruction_error > edge_threshold

In [None]:
 #Create results analysis
graph_data = X_test_graph

analyzer = AnomalyAnalyzer()

node_results_df = analyzer.create_node_results_dataframe(
    graph_data, node_reconstruction_error, node_anomalies)

node_results_df.head()

In [None]:
# Edge-level counterpart of the node results table, built from the same
# `graph_data` and the edge errors / anomaly mask.
edge_results_df = analyzer.create_edge_results_dataframe(
    graph_data,
    edge_reconstruction_error,
    edge_anomalies,
)

edge_results_df.head()

In [None]:
# Display the model output directory. The lowercase `data_path` is not defined
# in the visible cells (only DATA_PATH and MODEL_PATH are), so evaluating it
# raised NameError on a fresh kernel.
MODEL_PATH

In [None]:
# Save under MODEL_PATH, the directory the configuration cell creates for this
# purpose. The previous `data_path` is not defined in the visible cells and
# raised NameError on a fresh kernel.
model_path = os.path.join(MODEL_PATH, 'gnn_anomaly_model.keras')

# NOTE(review): the training cells assign `gnn_detector.edge_model` and
# `gnn_detector.node_model`; confirm that `gnn_detector.model` exists and is
# the model that should be persisted.
# NOTE(review): `tf.saved_model.save` exports a SavedModel *directory* at the
# given path, so the `.keras` suffix does not make this a Keras archive.
tf.saved_model.save(gnn_detector.model, model_path)

In [None]:
# Visualise the training graph with the existing builder. The builder is no
# longer re-instantiated here: rebinding `graph_builder` to a fresh
# ProcurementGraphBuilder discarded the node/edge scalers fitted in the
# scaling cell, so re-running any cell that only calls `.transform` on them
# would then operate on unfitted scalers.
graph_builder.visualize_procurement_graph(X_train_graph)