In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys
import logging
import pickle

# Data
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf

# Graph
import tensorflow_gnn as tfgnn

# Add the project root to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from scripts.data_cleaner import filter_top_cpv_categories
from scripts.gnn_anomaly_detection import ProcurementGraphBuilder, GNNAnomalyDetector, AnomalyAnalyzer

from scripts.preprocess_pipeline import create_pipeline_cat


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Build the train and test procurement graphs from the raw data.
graph_builder = ProcurementGraphBuilder()

# The data folder is resolved as `<parent of the current working dir>/data`.
data_path = os.path.join(
    os.path.dirname(os.getcwd()),
    'data',
)
df = graph_builder.load_data(data_path)

# preprocess_data hands back four objects; judging by the names they are the
# preprocessed and the untransformed train/test splits -- TODO confirm against
# ProcurementGraphBuilder.preprocess_data.
(
    X_train_preproc,
    X_test_preproc,
    X_train,
    X_test,
) = graph_builder.preprocess_data(df)

# One graph per split; `type` tells the builder which split it is handling.
X_train_graph = graph_builder.create_graph(
    X_train_preproc,
    X_train,
    type='train',
)
X_test_graph = graph_builder.create_graph(
    X_test_preproc,
    X_test,
    type='test',
)

INFO:scripts.gnn_anomaly_detection:Loading data from /home/ronan/code/RonanB400/Project/decp_ml/data
INFO:scripts.gnn_anomaly_detection:Preprocessing data...


Filtered from 392 to 60 CPV categories, keeping 250895 rows out of 286850




In [3]:
# Reload the train/test graph dictionaries from pickle files in the data
# directory, then turn each one into a TensorFlow GNN graph tensor and attach
# it to the detector.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising.
    Only use this on files that were produced by this project, never on
    files coming from an untrusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


X_train_graph = _load_pickle(os.path.join(data_path, 'graph_data_train.pkl'))
X_test_graph = _load_pickle(os.path.join(data_path, 'graph_data_test.pkl'))

gnn_detector = GNNAnomalyDetector(hidden_dim=64, output_dim=32, num_layers=3)

# Train split: the node/edge feature arrays are taken from the graph dict
# as-is. No scaling is applied in this cell; if scaling is required it has to
# be already baked into the pickled features or done inside
# create_tensorflow_graph -- TODO confirm.
node_features_train = X_train_graph['node_features']
edge_features_train = X_train_graph['edge_features']

X_train_tf_graph = gnn_detector.create_tensorflow_graph(
    X_train_graph, node_features_train, edge_features_train)
gnn_detector.graph_tensor_train = X_train_tf_graph

# Test split: same treatment as the train split (features passed unchanged).
node_features_test = X_test_graph['node_features']
edge_features_test = X_test_graph['edge_features']

X_test_tf_graph = gnn_detector.create_tensorflow_graph(
    X_test_graph, node_features_test, edge_features_test)
gnn_detector.graph_tensor_test = X_test_tf_graph


INFO:scripts.gnn_anomaly_detection:Creating TensorFlow GNN graph...
INFO:scripts.gnn_anomaly_detection:Creating TensorFlow GNN graph...


In [4]:
# Size the model from the training graph: the length of axis 1 of the node and
# edge feature arrays is passed as the first and second argument of
# build_model, respectively.
node_feature_dim = X_train_graph['node_features'].shape[1]
edge_feature_dim = X_train_graph['edge_features'].shape[1]

gnn_detector.model = gnn_detector.build_model(node_feature_dim, edge_feature_dim)
gnn_detector.model.summary()

INFO:scripts.gnn_anomaly_detection:Building GNN model with node and edge anomaly detection...










Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [()]                         0         []                            
                                                                                                  
 input.merge_batch_to_compo  ()                           0         ['input_1[0][0]']             
 nents (InstanceMethod)                                                                           
                                                                                                  
 map_features (MapFeatures)  ()                           12544     ['input.merge_batch_to_compone
                                                                    nts[0][0]']                   
                                                                                              

In [7]:
# Train the detector on the training graph tensor for up to 100 epochs and
# keep whatever train() returns in `history` for later inspection.
history = gnn_detector.train(
    X_train_tf_graph,
    epochs=100,
)

INFO:scripts.gnn_anomaly_detection:Training GNN model for 100 epochs...


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [9]:

# detect_anomalies() is called without arguments and its result is unpacked
# into four values: the node and edge reconstruction errors followed by the
# node and edge thresholds.
(
    node_reconstruction_error,
    edge_reconstruction_error,
    node_threshold,
    edge_threshold,
) = gnn_detector.detect_anomalies()

# Anomaly masks: an error strictly greater than its threshold is flagged.
node_anomalies = node_reconstruction_error > node_threshold
edge_anomalies = edge_reconstruction_error > edge_threshold

INFO:scripts.gnn_anomaly_detection:Detecting node and edge anomalies...


graph_tensor is self.graph_tensor_test


INFO:scripts.gnn_anomaly_detection:Detected 352 node anomalies (1.0%)
INFO:scripts.gnn_anomaly_detection:Detected 502 edge anomalies (1.0%)


In [None]:
# Create results analysis
# The analysis below is run on the test graph dictionary.
graph_data = X_test_graph

analyzer = AnomalyAnalyzer()

# Node-level results: built from the graph, the node errors and the node mask.
node_results_df = analyzer.create_node_results_dataframe(
    graph_data,
    node_reconstruction_error,
    node_anomalies,
)

# Edge-level results: built from the graph, the edge errors and the edge mask.
edge_results_df = analyzer.create_edge_results_dataframe(
    graph_data,
    edge_reconstruction_error,
    edge_anomalies,
)