In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys

In [2]:
# Resolve the 'data' folder that sits beside the current working directory
# and open the SQLite database stored inside it.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')
db_path = os.path.join(data_path, 'datalab.sqlite')
conn = sqlite3.connect(db_path)
db = conn.cursor()

# Read every row of the "data.gouv.fr.2022.clean" table into a DataFrame.
query = """
SELECT *
FROM "data.gouv.fr.2022.clean"
"""
data_raw = pd.read_sql_query(sql=query, con=conn)

# Print the column / dtype / non-null summary of the loaded frame.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   uid                        299670 non-null  object 
 1   id                         299670 non-null  object 
 2   nature                     299670 non-null  object 
 3   acheteur_id                299670 non-null  object 
 4   acheteur_nom               299477 non-null  object 
 5   acheteur_siren             299500 non-null  object 
 6   titulaire_id               299670 non-null  object 
 7   titulaire_typeIdentifiant  299670 non-null  object 
 8   titulaire_nom              292097 non-null  object 
 9   titulaire_siren            298537 non-null  object 
 10  objet                      299670 non-null  object 
 11  montant                    299670 non-null  float64
 12  codeCPV                    299670 non-null  object 
 13  procedure                  29

In [3]:
# Add the notebook's parent directory to the import path so the project-local
# `scripts` package can be imported. The membership check keeps re-runs of
# this cell from appending the same entry to sys.path over and over.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from scripts.preprocess import codeCPV_description

# Run the project's codeCPV_description helper on the raw data and preview
# the first rows of the result.
data_cpv = codeCPV_description(data_raw)
data_cpv.head()

Unnamed: 0,uid,id,nature,acheteur_id,acheteur_nom,acheteur_siren,titulaire_id,titulaire_typeIdentifiant,titulaire_nom,titulaire_siren,...,sousTraitanceDeclaree,typeGroupementOperateurs,tauxAvance,origineUE,origineFrance,lieuExecution_code,lieuExecution_typeCode,idAccordCadre,source_open_data,codeCPV_FR
0,210601209000132022_M013,2022_M013,Marché,21060120900013,COMMUNE DE SAINT ETIENNE DE TINEE,210601209,38177692100029,SIRET,SERMATECH,381776921,...,,Pas de groupement,,,,6660,Code postal,,data.gouv.fr decp-2024.json,Travaux de construction de bâtiments
1,217100759000182024RENOCHARP,2024RENOCHARP,Marché,21710075900018,COMMUNE DE CHALMOUX,217100759,75203574100038,SIRET,MACON ETANCHEITE,752035741,...,1.0,Pas de groupement,0.0,0.0,0.0,71140,Code postal,,data.gouv.fr decp-2025-01.json,Travaux de charpente et de couverture et trava...
2,200066231000162022033INFOL00,2022033INFOL00,Marché,20006623100016,CC DES PORTES D'ARIEGE PYRENEES,200066231,49459697600014,SIRET,EQUADEX,494596976,...,,Pas de groupement,,,,9100,Code postal,2022033INFOL00,data.gouv.fr decp-2024.json,Services de gestion d'installations pour le dé...
3,243100518001702024M05,2024M05,Marché,24310051800170,TOULOUSE METROPOLE,243100518,59278023300017,SIRET,RIVES & EAUX DU SUD-OUEST,592780233,...,1.0,Solidaire,0.1,0.0,0.0,31000,Code postal,,data.gouv.fr decp-2025-04.json,Services d'ingénierie
4,21590544900017202402401,202402401,Marché,21590544900017,COMMUNE DE SAINT SAULVE,215905449,32683156700010,SIRET,ALTOMARE ALTALU,326831567,...,,Pas de groupement,,,,59800,Code postal,,data.gouv.fr decp-2024.json,Serrurerie


In [4]:
# Write the CPV-annotated frame (index column included) into the data folder.
cpv_csv_path = os.path.join(data_path, 'data_cpv.csv')
data_cpv.to_csv(cpv_csv_path, index=True)

In [6]:
# Preview the CPV code next to its French description.
data_cpv.loc[:, ['codeCPV', 'codeCPV_FR']].head()

Unnamed: 0,codeCPV,codeCPV_FR
0,45210000-2,Travaux de construction de bâtiments
1,45261000-4,Travaux de charpente et de couverture et trava...
2,72514200-3,Services de gestion d'installations pour le dé...
3,71300000,Services d'ingénierie
4,44316500-3,Serrurerie


In [7]:
# Read the CPV reference CSV from the 'docs' folder that sits beside the
# current working directory, then preview it.
cpv_path = os.path.join(
    os.path.dirname(os.getcwd()), 'docs', 'cpv_2008_ver_2013_FR.csv'
)
df_cpv = pd.read_csv(filepath_or_buffer=cpv_path)
df_cpv.head()

Unnamed: 0,CODE,FR
0,03000000-1,"Produits agricoles, de l'élevage, de la pêche,..."
1,03100000-2,Produits agricoles et produits de l'horticulture
2,03110000-5,"Produits agricoles, produits de la culture mar..."
3,03111000-2,Graines
4,03111100-3,Graines de soja


In [8]:
def extract_cpv_hierarchy_level(cpv_code, level=2):
    """
    Extract a higher-level hierarchy code from a CPV code.

    The check digit (everything after the first '-') is dropped, the first
    ``level`` digits are kept and the remainder is filled with zeros so the
    result is always 8 characters long.

    Args:
        cpv_code (str): Original CPV code, with or without the check digit
                       (e.g., '03111900-1' or '71300000'). Codes shorter
                       than 8 digits are right-padded with zeros first.
        level (int): Number of leading digits to keep (default: 2). Options:
                    - 2 (XX000000): division
                    - 3 (XXX00000): group
                    - 4 (XXXX0000): class
                    - 5 (XXXXX000): category

    Returns:
        str or None: Higher-level CPV code without check digit
        (e.g., '03000000'), or None when ``cpv_code`` is missing
        (None / NaN / pd.NA).

    Raises:
        ValueError: If ``level`` is not 2, 3, 4 or 5.
    """
    if level not in (2, 3, 4, 5):
        raise ValueError("Level must be between 2 and 5")

    # Propagate missing values instead of turning 'None' / 'nan' into a
    # bogus code such as 'No000000'. The is_scalar guard keeps pd.isna from
    # returning an array for non-scalar input.
    if pd.api.types.is_scalar(cpv_code) and pd.isna(cpv_code):
        return None

    # Normalise to text, drop the check digit, and make sure there are at
    # least 8 characters to slice from.
    digits = str(cpv_code).strip().split('-')[0].ljust(8, '0')

    # Keep the leading digits and zero-fill the rest,
    # e.g. level 3: 03111900 -> 03100000.
    prefix_len = int(level)
    return digits[:prefix_len] + '0' * (8 - prefix_len)


def add_cpv_hierarchy_column(df, cpv_column='codeCPV', level=2, 
                             new_column_name=None):
    """
    Return a copy of ``df`` with one extra column of higher-level CPV codes.

    Each value of ``cpv_column`` is passed through
    ``extract_cpv_hierarchy_level`` with the requested ``level``; the input
    DataFrame itself is left untouched.

    Args:
        df (pd.DataFrame): Frame holding the CPV codes.
        cpv_column (str): Column to read the CPV codes from
                         (default: 'codeCPV').
        level (int): Hierarchy level forwarded to
                    ``extract_cpv_hierarchy_level`` (default: 2).
        new_column_name (str): Name of the column to create. When None,
                              ``f'codeCPV_{level}'`` is used.

    Returns:
        pd.DataFrame: Copy of ``df`` including the new hierarchy column.
    """
    # Fall back to the generated name only when the caller passed nothing.
    target_column = (
        f'codeCPV_{level}' if new_column_name is None else new_column_name
    )

    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()
    result[target_column] = result[cpv_column].apply(
        extract_cpv_hierarchy_level, level=level
    )
    return result

In [9]:
# Quick sanity check of the level-2 extraction on one sample code.
test_code = "03111900-1"
result = extract_cpv_hierarchy_level(cpv_code=test_code, level=2)
result

'03000000'

In [None]:
# The CPV reference table keeps its codes in the 'CODE' column (its columns
# are CODE and FR), so the helper's default cpv_column='codeCPV' would raise
# a KeyError here. Name the source column explicitly.
df_cpv_new = add_cpv_hierarchy_column(df_cpv, cpv_column='CODE', level=2)