In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys
import logging
import pickle

# Data
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf

# Graph
import tensorflow_gnn as tfgnn

# Add the project root to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from scripts.data_cleaner import filter_top_cpv_categories
from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer

from scripts.preprocess_pipeline import create_pipeline_cat


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Build the train and test graph data from the raw files in ../data.
graph_builder = ProcurementGraphBuilder()

# The data directory sits next to the current working directory's parent.
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data')

df = graph_builder.load_data(data_path)

# preprocess_data yields four objects: the preprocessed train/test splits
# followed by the corresponding train/test splits without the "_preproc" step.
preprocessed_splits = graph_builder.preprocess_data(df)
X_train_preproc, X_test_preproc, X_train, X_test = preprocessed_splits

# One graph per split; `type` tells the builder which split it is handling.
X_train_graph = graph_builder.create_graph(
    X_train_preproc, X_train, type='train')
X_test_graph = graph_builder.create_graph(
    X_test_preproc, X_test, type='test')

INFO:scripts.gnn_anomaly_detection:Loading data from /home/ronan/code/RonanB400/Project/decp_ml/data
INFO:scripts.gnn_anomaly_detection:Preprocessing data...


Filtered from 392 to 60 CPV categories, keeping 250895 rows out of 286850




In [3]:
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')


def _load_pickled_graph(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising.
    Only use this on graph files that were produced by this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load the pickled train/test graph data from the data directory.
X_train_graph = _load_pickled_graph(os.path.join(data_path, 'graph_data_train.pkl'))
X_test_graph = _load_pickled_graph(os.path.join(data_path, 'graph_data_test.pkl'))

gnn_detector = GNNAnomalyDetector(hidden_dim=64, output_dim=32, num_layers=3)

# NOTE: the node/edge features are passed on exactly as stored in the pickles;
# this cell applies no scaling of its own.
node_features_train = X_train_graph['node_features']
edge_features_train = X_train_graph['edge_features']

X_train_tf_graph = gnn_detector.create_tensorflow_graph(
    X_train_graph, node_features_train, edge_features_train)
# Attach the training graph tensor to the detector.
gnn_detector.graph_tensor_train = X_train_tf_graph

# Same steps for the test split (again without any scaling in this cell).
node_features_test = X_test_graph['node_features']
edge_features_test = X_test_graph['edge_features']

X_test_tf_graph = gnn_detector.create_tensorflow_graph(
    X_test_graph, node_features_test, edge_features_test)
# Attach the test graph tensor to the detector.
gnn_detector.graph_tensor_test = X_test_tf_graph


INFO:scripts.gnn_anomaly_detection:Creating TensorFlow GNN graph...
INFO:scripts.gnn_anomaly_detection:Creating TensorFlow GNN graph...


In [4]:
# build_model receives shape[1] of the training node and edge feature arrays.
node_feature_dim = X_train_graph['node_features'].shape[1]
edge_feature_dim = X_train_graph['edge_features'].shape[1]

gnn_detector.model = gnn_detector.build_model(node_feature_dim, edge_feature_dim)
gnn_detector.model.summary()

INFO:scripts.gnn_anomaly_detection:Building GNN model with node and edge anomaly detection...










Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [()]                         0         []                            
                                                                                                  
 input.merge_batch_to_compo  ()                           0         ['input_1[0][0]']             
 nents (InstanceMethod)                                                                           
                                                                                                  
 map_features (MapFeatures)  ()                           12544     ['input.merge_batch_to_compone
                                                                    nts[0][0]']                   
                                                                                              

In [27]:
# Rebuild the model with L2 regularisation and dropout both set to 0, train it
# for 100 epochs on the training graph tensor, then plot the returned history.
# NOTE(review): the recorded run of this cell stopped during training with
#   ValueError: `validation_split` is only supported for Tensors or NumPy arrays
# so `history` was never assigned and the plot call did not run. The fix
# presumably belongs in GNNAnomalyDetector.train (not visible here) - confirm.
gnn_detector.model = gnn_detector.build_model(X_train_graph['node_features'].shape[1], X_train_graph['edge_features'].shape[1],
                                              l2_regularization=0, dropout_rate=0)
history = gnn_detector.train(X_train_tf_graph, epochs=100)
gnn_detector.plot_training_history(history)

INFO:scripts.gnn_anomaly_detection:Building GNN model with node and edge anomaly detection...
INFO:scripts.gnn_anomaly_detection:Training GNN model for 100 epochs with 30.0% validation split...


ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found following types in the input: [<class 'tensorflow.python.data.ops.repeat_op._RepeatDataset'>]

In [9]:

# detect_anomalies yields, in order: node errors, edge errors, node threshold,
# edge threshold.
detection_output = gnn_detector.detect_anomalies()
(node_reconstruction_error,
 edge_reconstruction_error,
 node_threshold,
 edge_threshold) = detection_output

# An item is flagged when its reconstruction error is strictly above the
# matching threshold.
node_anomalies = node_reconstruction_error > node_threshold
edge_anomalies = edge_reconstruction_error > edge_threshold

INFO:scripts.gnn_anomaly_detection:Detecting node and edge anomalies...


graph_tensor is self.graph_tensor_test


INFO:scripts.gnn_anomaly_detection:Detected 352 node anomalies (1.0%)
INFO:scripts.gnn_anomaly_detection:Detected 502 edge anomalies (1.0%)


In [12]:
 #Create results analysis
analyzer = AnomalyAnalyzer()

# The results are built from the test graph.
graph_data = X_test_graph

# Combine the graph data with the per-node errors and anomaly mask.
node_results_df = analyzer.create_node_results_dataframe(
    graph_data,
    node_reconstruction_error,
    node_anomalies,
)

node_results_df.head()

Unnamed: 0,entity_name,entity_type,node_reconstruction_error,is_node_anomaly,num_contracts,num_partners,contracts_per_partner,mean_feature_value,std_feature_value
128,26310012500016,Buyer,3672.006836,True,612.0,363.0,1.68595,0.0969,0.233282
86,22590001801244,Buyer,1161.103149,True,362.0,263.0,1.376426,0.119255,0.237588
74,21690123100011,Buyer,1157.516602,True,334.0,297.0,1.124579,0.100884,0.22325
159,20005480700017,Buyer,768.138611,True,299.0,241.0,1.240664,0.119134,0.239127
429,20005226400013,Buyer,484.078827,True,265.0,191.0,1.387435,0.115066,0.264518


In [14]:
# Combine the graph data with the per-edge errors and anomaly mask.
edge_results_df = analyzer.create_edge_results_dataframe(
    graph_data,
    edge_reconstruction_error,
    edge_anomalies,
)

edge_results_df.head()

INFO:scripts.gnn_anomaly_detection:Optional column 'dateNotification' not found in contract_data, using None


Unnamed: 0,contract_id,edge_reconstruction_error,is_edge_anomaly,acheteur_id,titulaire_id,montant,codeCPV_3,procedure,dateNotification,log_amount,cpv_hash,procedure_hash,duration_months
47252,138157,0.20762,True,21950306700015,78924532100029,1070350.0,71300000,Procédure adaptée,,6.697244,1.122369,-2.46646,0.0
45932,68595,0.20762,True,21950306700015,31290872600093,1070350.0,71300000,Procédure adaptée,,6.697244,1.122369,-2.46646,0.0
45687,96609,0.20762,True,21950306700015,43906618400037,1070350.0,71300000,Procédure adaptée,,6.697244,1.122369,-2.46646,0.0
5677,262210,0.197431,True,13002682600011,39841012600039,12490225.0,33600000,Appel d'offres ouvert,,6.807647,2.871962,0.239156,0.0
49361,181225,0.170901,True,20006340200016,48336329700027,3425157.0,71200000,Procédure adaptée,,6.209105,1.950652,0.960029,1.0


In [16]:
# Display the resolved data directory path.
data_path

'/home/ronan/code/RonanB400/Project/decp_ml/data'

In [20]:
# Save the detector's model under the data directory.
# NOTE(review): despite the '.keras' suffix, the recorded output shows assets
# being written to '<model_path>/assets', i.e. tf.saved_model.save created a
# directory rather than a single file. Confirm that whatever loads this model
# expects that layout before renaming the path or changing the save call.
model_path = os.path.join(data_path, 'gnn_anomaly_model.keras')

tf.saved_model.save(gnn_detector.model, model_path)



INFO:tensorflow:Assets written to: /home/ronan/code/RonanB400/Project/decp_ml/data/gnn_anomaly_model.keras/assets


INFO:tensorflow:Assets written to: /home/ronan/code/RonanB400/Project/decp_ml/data/gnn_anomaly_model.keras/assets


In [22]:
# Create a new builder instance (rebinding `graph_builder`) and draw the
# training graph.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no
# figure was produced; presumably the full training graph is too large to plot
# in reasonable time - confirm, and consider plotting a subset.
graph_builder = ProcurementGraphBuilder()
graph_builder.visualize_procurement_graph(X_train_graph)

KeyboardInterrupt: 