In [15]:
import logging
import os
import sqlite3
import sys
import zlib
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Sklearn preprocessing
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Graph
import networkx as nx
import tensorflow_gnn as tfgnn
from pyvis.network import Network
import webbrowser
from tempfile import NamedTemporaryFile

# Tensorflow
import tensorflow as tf


In [2]:
# Logging setup: show INFO-level (and more severe) records, and create the
# module-level logger that the helper functions in this notebook write to.
LOG_LEVEL = logging.INFO
logging.basicConfig(level=LOG_LEVEL)
logger = logging.getLogger(__name__)

# 1. Load Data

In [3]:
# The CSV is read from a `data/` directory located in the parent of the
# current working directory (i.e. a sibling of the notebook's folder).
parent_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(parent_dir, 'data')
data = pd.read_csv(os.path.join(data_path, 'data_cpv.csv'))

In [4]:
# Preview the first five rows of the raw contracts table.
data.head(n=5)

Unnamed: 0,uid,id,nature,acheteur_id,acheteur_nom,acheteur_siren,titulaire_id,titulaire_typeIdentifiant,titulaire_nom,titulaire_siren,...,origineFrance,lieuExecution_code,lieuExecution_typeCode,idAccordCadre,source_open_data,codeCPV_FR,codeCPV_2,codeCPV_3,codeCPV_4,codeCPV_5
0,210601209000132022_M013,2022_M013,Marché,21060120900013,COMMUNE DE SAINT ETIENNE DE TINEE,210601209.0,38177692100029,SIRET,SERMATECH,381776921.0,...,,6660,Code postal,,data.gouv.fr decp-2024.json,Travaux de construction de bâtiments,45000000,45200000,45210000,45210000
1,217100759000182024RENOCHARP,2024RENOCHARP,Marché,21710075900018,COMMUNE DE CHALMOUX,217100759.0,75203574100038,SIRET,MACON ETANCHEITE,752035741.0,...,0.0,71140,Code postal,,data.gouv.fr decp-2025-01.json,Travaux de charpente et de couverture et trava...,45000000,45200000,45260000,45261000
2,200066231000162022033INFOL00,2022033INFOL00,Marché,20006623100016,CC DES PORTES D'ARIEGE PYRENEES,200066231.0,49459697600014,SIRET,EQUADEX,494596976.0,...,,9100,Code postal,2022033INFOL00,data.gouv.fr decp-2024.json,Services de gestion d'installations pour le dé...,72000000,72500000,72510000,72514000
3,243100518001702024M05,2024M05,Marché,24310051800170,TOULOUSE METROPOLE,243100518.0,59278023300017,SIRET,RIVES & EAUX DU SUD-OUEST,592780233.0,...,0.0,31000,Code postal,,data.gouv.fr decp-2025-04.json,Services d'ingénierie,71000000,71300000,71300000,71300000
4,21590544900017202402401,202402401,Marché,21590544900017,COMMUNE DE SAINT SAULVE,215905449.0,32683156700010,SIRET,ALTOMARE ALTALU,326831567.0,...,,59800,Code postal,,data.gouv.fr decp-2024.json,Serrurerie,44000000,44300000,44310000,44316000


# 2. Preprocess

In [5]:
# Columns of the raw table that are carried into the working frame `X`.
selected_columns = [
    'montant',
    'acheteur_nom',
    'titulaire_nom',
    'procedure',
    'dureeMois',
    'formePrix',
    'attributionAvance',
    'sousTraitanceDeclaree',
    'typeGroupementOperateurs',
    'tauxAvance',
    'codeCPV_3',
]
X = data[selected_columns]


In [6]:
# Keep only contracts whose `codeCPV_3` category occurs more than
# MIN_CONTRACTS_PER_CPV times in the raw data, and that have a known duration.
MIN_CONTRACTS_PER_CPV = 200

cpv_3 = data['codeCPV_3'].value_counts()

# The category codes are the *index* of `value_counts()`. Reading them from the
# index works on every pandas version; `reset_index()['codeCPV_3']` only
# returns the codes on pandas >= 2.0 (on 1.x that column holds the counts).
frequent_codes = cpv_3[cpv_3 > MIN_CONTRACTS_PER_CPV].index
cpv_3_list = pd.Series(frequent_codes, name='codeCPV_3')

X = X[X['codeCPV_3'].isin(cpv_3_list)]

# Drop rows without a duration; `.copy()` detaches X from `data` so later
# column assignments do not trigger SettingWithCopyWarning.
X = X[X['dureeMois'].notna()].copy()

In [None]:
# Preview the first five rows of the filtered working frame.
X.head(n=5)

In [11]:
def _calculate_node_features(entity_data: pd.DataFrame,
                               partner_col: str):
        """Calculate features for a node (buyer or supplier)."""
        n_partners = entity_data[partner_col].nunique()
        contracts_per_partner = len(entity_data) / n_partners \
            if n_partners > 0 else 0
            
        # Calculate contracts per partner distribution
        partner_contract_counts = entity_data[partner_col].value_counts()
        max_contracts_per_partner = partner_contract_counts.max() if not partner_contract_counts.empty else 0
        min_contracts_per_partner = partner_contract_counts.min() if not partner_contract_counts.empty else 0
        std_contracts_per_partner = partner_contract_counts.std() if len(partner_contract_counts) > 1 else 0
            
        features = [
            len(entity_data),  # Number of contracts
            entity_data['montant'].sum(),  # Total amount
            # entity_data['montant'].mean(),  # Average amount
            # entity_data['montant'].std() if len(entity_data) > 1 else 0,
            # n_partners,  # Number of unique partners
            # entity_data['montant'].max(),  # Maximum contract amount
            # entity_data['montant'].min(),  # Minimum contract amount
            # contracts_per_partner,  # Average contracts per partner
            # max_contracts_per_partner,  # Maximum contracts with any single partner
            # min_contracts_per_partner,  # Minimum contracts with any single partner
            # std_contracts_per_partner  # Standard deviation of contracts per partner
        ]
        return features


def create_graph(df: pd.DataFrame):
    """Transform procurement data into a graph structure.

    Every distinct ``acheteur_nom`` becomes a buyer node and every distinct
    ``titulaire_nom`` becomes a supplier node. Buyer ids start at 0 and
    supplier ids continue after the last buyer id. Each row of ``df`` becomes
    one edge from its buyer to its supplier.

    Args:
        df: Contracts table. Requires the columns ``acheteur_nom``,
            ``titulaire_nom`` and ``montant``. ``codeCPV_3``, ``procedure``
            and ``dureeMois`` are optional; when a column is absent the empty
            string (for the two hashed columns) or 0 (for the duration) is
            used for every row.

    Returns:
        dict with the keys:

        * ``nodes``: list of node names, buyers first, then suppliers.
        * ``edges``: int array of shape ``(n_rows, 2)`` holding
          ``[buyer_id, supplier_id]`` per contract.
        * ``node_features``: float32 array, one row per node, produced by
          ``_calculate_node_features``.
        * ``edge_features``: float32 array of shape ``(n_rows, 4)``:
          ``log1p(montant)``, CPV bucket (0-999), procedure bucket (0-99),
          duration in months (missing values replaced by 0).
        * ``node_types``: int array, 0 for buyers and 1 for suppliers.
        * ``buyer_to_id`` / ``supplier_to_id``: name -> node id mappings.
    """
    logger.info("Creating graph structure...")

    # Unique names in order of first appearance define the node ids.
    buyers = df['acheteur_nom'].unique()
    suppliers = df['titulaire_nom'].unique()

    buyer_to_id = {buyer: i for i, buyer in enumerate(buyers)}
    supplier_to_id = {supplier: i + len(buyers)
                      for i, supplier in enumerate(suppliers)}

    all_nodes = list(buyers) + list(suppliers)

    logger.info("Creating edges...")

    # One edge per contract. reshape keeps the (n, 2) layout for empty input.
    edges = np.array(
        [[buyer_to_id[buyer], supplier_to_id[supplier]]
         for buyer, supplier in zip(df['acheteur_nom'], df['titulaire_nom'])],
        dtype=int,
    ).reshape(-1, 2)

    log_amounts = np.log1p(df['montant'].to_numpy(dtype=np.float64))
    cpv_codes = _stable_bucket_column(df, 'codeCPV_3', 1000)
    procedure_codes = _stable_bucket_column(df, 'procedure', 100)
    if 'dureeMois' in df.columns:
        durations = df['dureeMois'].fillna(0).to_numpy(dtype=np.float64)
    else:
        durations = np.zeros(len(df), dtype=np.float64)

    edge_features = np.column_stack(
        [log_amounts, cpv_codes, procedure_codes, durations]
    ).astype(np.float32)

    logger.info("Creating buyer features...")
    buyer_features = _node_features_for(
        df, buyers, 'acheteur_nom', 'titulaire_nom')

    logger.info("Creating supplier features...")
    supplier_features = _node_features_for(
        df, suppliers, 'titulaire_nom', 'acheteur_nom')

    node_features = buyer_features + supplier_features
    node_types = [0] * len(buyers) + [1] * len(suppliers)  # 0 buyer, 1 supplier

    return {
        'nodes': all_nodes,
        'edges': edges,
        'node_features': np.array(node_features, dtype=np.float32),
        'edge_features': edge_features,
        'node_types': np.array(node_types, dtype=int),
        'buyer_to_id': buyer_to_id,
        'supplier_to_id': supplier_to_id
    }


def _stable_bucket_column(df: pd.DataFrame, column: str, modulus: int) -> np.ndarray:
    """Map each value of ``df[column]`` to a reproducible bucket in ``[0, modulus)``.

    CRC32 is used instead of the built-in ``hash()`` because string hashes are
    salted per interpreter process, which would change the features on every
    kernel restart. A missing column is treated as the empty string per row.
    """
    values = df[column] if column in df.columns else [''] * len(df)
    bucket_by_text = {}  # categories repeat a lot, so hash each text once
    codes = []
    for value in values:
        text = str(value)
        code = bucket_by_text.get(text)
        if code is None:
            code = zlib.crc32(text.encode('utf-8')) % modulus
            bucket_by_text[text] = code
        codes.append(code)
    return np.asarray(codes, dtype=np.float64)


def _node_features_for(df: pd.DataFrame, entities, entity_col: str,
                       partner_col: str) -> list:
    """Compute node features for ``entities`` with a single grouped pass over ``df``.

    Results are returned in the order of ``entities``. An entity that has no
    group (for example a missing name, which ``groupby`` drops) gets the
    features of an empty contract table.
    """
    features_by_entity = {
        name: _calculate_node_features(contracts, partner_col)
        for name, contracts in df.groupby(entity_col, sort=False)
    }
    no_contracts = df.iloc[0:0]
    return [
        features_by_entity[entity] if entity in features_by_entity
        else _calculate_node_features(no_contracts, partner_col)
        for entity in entities
    ]

In [13]:
# Build the graph on an independent 50-contract sample of X.
X_50 = X.head(50).copy()

X_50_graph = create_graph(X_50)
X_50_graph

INFO:__main__:Creating graph structure...


Creating edges...
Creating buyer features...
Creating supplier features...


{'nodes': ['COMMUNE DE SAINT ETIENNE DE TINEE',
  'COMMUNE DE CHALMOUX',
  "CC DES PORTES D'ARIEGE PYRENEES",
  'TOULOUSE METROPOLE',
  'COMMUNE DE SAINT SAULVE',
  'COMMUNE DE PIERRELATTE',
  'CA MONTELIMAR-AGGLOMERATION',
  'ORLEANS METROPOLE',
  'COMMUNE DE LA GARNACHE',
  'DEPARTEMENT DE LA CHARENTE',
  "COMMUNAUTE D'AGGLOMERATION LA ROCHELLE",
  'COMMUNE DE BERGERAC',
  'COMMUNE DE ELBEUF',
  'METROPOLE GRENOBLE-ALPES-METROPOLE (METRO)',
  'COMMUNE DE HARNES',
  'COMMUNE DE PONT-SAINT-ESPRIT',
  'COMMUNAUTE URBAINE DU GRAND REIMS',
  "COMMUNAUTE D'AGGLOMERATION DU PUY-EN-VELAY",
  'HABITAT DU GARD',
  'COMMUNE DE BEAULIEU SUR MER',
  'COMMUNE DE SAINT MARTIN LA GARENNE',
  'COMMUNE DE VILLEPARISIS',
  'COMMUNE DE CUSSET',
  'COMMUNE DE CRILLON LE BRAVE',
  'COMMUNE DE TARNOS',
  'COMMUNE DE CHATEAU-RENAULT',
  'ASA ASSAINISSEMENTDE LA BASSE PLAINE',
  'COMMUNE DE SAINT MALO',
  'PARIS HABITAT-OPH',
  "SYNDICAT AUDOIS D'ENERGIES ET DU NUMERIQUE SYADEN",
  'COMMUNE DE BOURRET',
  'C

In [29]:
def visualize_procurement_graph(graph_data, title: str = "Procurement Network"):
    """Create an interactive visualization of the full procurement graph.

    The graph is rendered with pyvis into a temporary HTML file, which is then
    opened in the default web browser. The file is not deleted afterwards.

    Args:
        graph_data: Dictionary containing the graph data from create_graph.
            The keys ``nodes``, ``node_types``, ``node_features``, ``edges``
            and ``edge_features`` are read. Only node features 0 (number of
            contracts) and 1 (total amount) and edge feature 0
            (``log1p`` of the contract amount) are used.
        title: Title for the visualization. NOTE: currently not rendered;
            it is kept so existing callers keep working.

    Returns:
        None.
    """
    # Create a new network
    net = Network(height="900px", width="100%", bgcolor="#ffffff",
                  font_color="black", notebook=False)

    # Add nodes
    node_rows = zip(graph_data['nodes'],
                    graph_data['node_types'],
                    graph_data['node_features'])
    for node_id, (name, node_type, features) in enumerate(node_rows):
        # Node size grows with the number of contracts, capped at 100
        num_contracts = int(features[0])
        node_size = min(50 + num_contracts * 2, 100)

        # Node color goes from blue (0) to red (>= 1M total amount)
        total_amount = float(features[1])
        amount_ratio = total_amount / 1e6
        if not amount_ratio > 0:
            # Negative or NaN totals would yield an invalid rgb() value
            # (or make int() raise), so treat them as the low end.
            amount_ratio = 0.0
        amount_ratio = min(amount_ratio, 1.0)
        color = f"rgb({int(255 * amount_ratio)}, 0, {int(255 * (1 - amount_ratio))})"

        net.add_node(
            int(node_id),  # pyvis needs plain Python ints
            label=str(name),
            title=f"Type: {'Buyer' if node_type == 0 else 'Supplier'}\n"
                  f"Contracts: {num_contracts}\n"
                  f"Total Amount: {total_amount:,.2f}\n",
            color=color,
            size=node_size,
            shape="diamond" if node_type == 0 else "dot"
        )

    # Add edges with weights based on contract amounts
    for edge, edge_features in zip(graph_data['edges'],
                                   graph_data['edge_features']):
        # The feature was stored as log1p(amount); expm1 is its exact inverse
        # (plain exp would report amount + 1).
        contract_amount = float(np.expm1(edge_features[0]))

        # Edge width grows with the amount, capped at 5. Non-finite amounts
        # fall back to the minimum width instead of propagating NaN.
        if np.isfinite(contract_amount):
            edge_width = min(1 + contract_amount / 1e5, 5)
        else:
            edge_width = 1

        net.add_edge(
            int(edge[0]),
            int(edge[1]),
            width=edge_width,
            title=f"Amount: {contract_amount:,.2f}"
        )

    # Configure physics layout
    net.set_options("""
    {
        "physics": {
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "centralGravity": 0.01,
                "springLength": 200,
                "springConstant": 0.08,
                "damping": 0.4,
                "avoidOverlap": 1
            },
            "maxVelocity": 50,
            "solver": "forceAtlas2Based",
            "timestep": 0.35,
            "stabilization": {
                "enabled": true,
                "iterations": 1000,
                "updateInterval": 25
            }
        },
        "interaction": {
            "hover": true,
            "tooltipDelay": 200,
            "hideEdgesOnDrag": true,
            "navigationButtons": true
        }
    }
    """)

    # Reserve a unique .html path, then close the handle before pyvis writes
    # to it: a NamedTemporaryFile cannot be re-opened by name on Windows while
    # it is still open. delete=False keeps the file for the browser.
    with NamedTemporaryFile(delete=False, suffix='.html') as tmp:
        html_path = tmp.name

    net.save_graph(html_path)
    # as_uri() builds a well-formed file:// URL (escaping, Windows drives).
    webbrowser.open(Path(html_path).as_uri())

In [31]:
# Render the 50-contract sample graph and open it in the browser.
visualize_procurement_graph(graph_data=X_50_graph)