In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys
import logging
import json

# Data
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf

# Graph
import tensorflow_gnn as tfgnn

# Add the project root to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from scripts.data_cleaner import filter_top_cpv_categories
from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer

from scripts.preprocess_pipeline import create_pipeline_cat


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Builder used by the following cells to load data, split/preprocess it and
# create the graphs.
graph_builder = ProcurementGraphBuilder()

# The 'data' directory sits in the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(parent_dir, 'data')

df = graph_builder.load_data(data_path)

# preprocess_data returns four objects; they are unpacked in the order the
# builder returns them (presumably preprocessed train/test first, then the
# matching unprocessed train/test frames — confirm against the builder).
(
    X_train_preproc,
    X_test_preproc,
    X_train,
    X_test,
) = graph_builder.preprocess_data(df)



INFO:scripts.gnn_anomaly_detection:Loading data from /home/ronan/code/RonanB400/Project/decp_ml/data
INFO:scripts.gnn_anomaly_detection:Preprocessing data...


Filtered from 392 to 60 CPV categories, keeping 250895 rows out of 286850




In [34]:
# Build one graph per split; `type` tells the builder which split it is given.
X_train_graph = graph_builder.create_graph(
    X_train_preproc,
    X_train,
    type='train',
)
X_test_graph = graph_builder.create_graph(
    X_test_preproc,
    X_test,
    type='test',
)

INFO:scripts.gnn_anomaly_detection:Creating graph structure from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Filtered to 200716 valid contracts (removed 0 contracts with missing names)
INFO:scripts.gnn_anomaly_detection:Creating edges and edge features from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Computing acheteur features from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Computing titulaire features from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Creating graph structure from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Filtered to 50179 valid contracts (removed 0 contracts with missing names)
INFO:scripts.gnn_anomaly_detection:Creating edges and edge features from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Computing acheteur features from preprocessed data...
INFO:scripts.gnn_anomaly_detection:Computing titulaire features from preprocessed data...


In [35]:
gnn_detector = GNNAnomalyDetector(hidden_dim=64, output_dim=32, num_layers=3)


def _split_to_tf_graph(detector, graph_data):
    """Convert one split's graph data into a TensorFlow GNN graph.

    The node and edge feature arrays are read from ``graph_data`` and handed
    to ``detector.create_tensorflow_graph`` as-is; this cell applies no
    scaling of its own.

    Args:
        detector: object exposing ``create_tensorflow_graph``.
        graph_data: mapping with 'node_features' and 'edge_features' entries
            (as returned by ``graph_builder.create_graph``).

    Returns:
        Tuple ``(node_features, edge_features, tf_graph)``.
    """
    node_features = graph_data['node_features']
    edge_features = graph_data['edge_features']
    tf_graph = detector.create_tensorflow_graph(
        graph_data, node_features, edge_features
    )
    return node_features, edge_features, tf_graph


# Training split: build the graph tensor and register it on the detector.
node_features_train, edge_features_train, X_train_tf_graph = _split_to_tf_graph(
    gnn_detector, X_train_graph
)
gnn_detector.graph_tensor_train = X_train_tf_graph

# Test split: same steps, stored under the detector's test attribute.
node_features_test, edge_features_test, X_test_tf_graph = _split_to_tf_graph(
    gnn_detector, X_test_graph
)
gnn_detector.graph_tensor_test = X_test_tf_graph


INFO:scripts.gnn_anomaly_detection:Creating TensorFlow GNN graph...
INFO:scripts.gnn_anomaly_detection:Creating TensorFlow GNN graph...


In [36]:
# The model's input widths are the number of columns (axis 1) of the training
# node and edge feature arrays.
n_node_features = X_train_graph['node_features'].shape[1]
n_edge_features = X_train_graph['edge_features'].shape[1]

gnn_detector.model = gnn_detector.build_model(n_node_features, n_edge_features)
gnn_detector.model.summary()

INFO:scripts.gnn_anomaly_detection:Building GNN model with node and edge anomaly detection...


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [()]                         0         []                            
                                                                                                  
 map_features_2 (MapFeature  ()                           12544     ['input_7[0][0]']             
 s)                                                                                               
                                                                                                  
 graph_update_6 (GraphUpdat  ()                           20608     ['map_features_2[0][0]']      
 e)                                                                                               
                                                                                            

In [37]:
# Number of training epochs, named so it is easy to find and tune.
N_EPOCHS = 10

# Train on the training graph tensor; the return value is kept as `history`.
history = gnn_detector.train(X_train_tf_graph, epochs=N_EPOCHS)

INFO:scripts.gnn_anomaly_detection:Training GNN model for 10 epochs...


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:

# NOTE(review): as recorded in this cell's output, detect_anomalies() currently
# raises "GraphUpdate requires a scalar GraphTensor ... but got `rank=1`" from
# the model's predict step. The error message itself points at
# GraphTensor.merge_batch_to_components(); the call that feeds the model lives
# inside detect_anomalies (not shown in this notebook), so the fix presumably
# belongs there — confirm before relying on the results below.
detection_result = gnn_detector.detect_anomalies()
(
    node_reconstruction_error,
    edge_reconstruction_error,
    node_threshold,
    edge_threshold,
) = detection_result

# Anomaly masks: True where the reconstruction error exceeds its threshold.
node_anomalies = node_reconstruction_error > node_threshold
edge_anomalies = edge_reconstruction_error > edge_threshold

INFO:scripts.gnn_anomaly_detection:Detecting node and edge anomalies...


graph_tensor is self.graph_tensor_test


ValueError: in user code:

    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/keras/src/engine/training.py", line 2416, in predict_function  *
        return step_function(self, iterator)
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/keras/src/engine/training.py", line 2401, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/keras/src/engine/training.py", line 2389, in run_step  **
        outputs = model.predict_step(data)
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/keras/src/engine/training.py", line 2357, in predict_step
        return self(x, training=False)
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/tensorflow_gnn/keras/layers/graph_update.py", line 236, in call
        gt.check_scalar_graph_tensor(graph, "GraphUpdate")
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/tensorflow_gnn/graph/graph_tensor.py", line 1747, in check_scalar_graph_tensor
        gp.check_scalar_graph_piece(graph, name=name)
    File "/home/ronan/.pyenv/versions/3.10.6/envs/decp_ml_env/lib/python3.10/site-packages/tensorflow_gnn/graph/graph_piece.py", line 1300, in check_scalar_graph_piece
        raise ValueError(

    ValueError: Exception encountered when calling layer 'graph_update_6' (type GraphUpdate).
    
    GraphUpdate requires a scalar GraphTensor, that is, with `GraphTensor.rank=0`, but got `rank=1`. Use GraphTensor.merge_batch_to_components() to merge all contained graphs into one contiguously indexed graph of the scalar GraphTensor.
    
    Call arguments received by layer 'graph_update_6' (type GraphUpdate):
      • graph=GraphTensor(
      context=Context(features={}, sizes=Tensor("model_2/map_features_2/ones_like:0", shape=(None, 1), dtype=int32), shape=(None,), indices_dtype=tf.int32),
      node_set_names=['entities'],
      edge_set_names=['contracts'])
