In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sqlite3
import sys

In [2]:
# Project locations, resolved from the notebook's working directory: the
# data folder lives in the parent of the current directory.
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data')
db_path = os.path.join(data_path, 'datalab.sqlite')


In [25]:
# Load data_cpv.csv from the data folder and print its schema summary.
# NOTE(review): the recorded execution count of this cell (25) and its
# 36-column output do not match the 33-column output of the later
# `data_cpv.info()` cells - presumably this cell was re-run after the CSV
# had been rewritten; confirm with Restart Kernel -> Run All.
data_cpv_file = os.path.join(data_path, 'data_cpv.csv')
data_cpv = pd.read_csv(data_cpv_file)
data_cpv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 36 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   uid                        299670 non-null  object 
 1   id                         299670 non-null  object 
 2   nature                     299670 non-null  object 
 3   acheteur_id                299670 non-null  int64  
 4   acheteur_nom               299477 non-null  object 
 5   acheteur_siren             299500 non-null  float64
 6   titulaire_id               299670 non-null  object 
 7   titulaire_typeIdentifiant  299670 non-null  object 
 8   titulaire_nom              292097 non-null  object 
 9   titulaire_siren            298537 non-null  float64
 10  objet                      299670 non-null  object 
 11  montant                    299670 non-null  float64
 12  codeCPV                    299670 non-null  object 
 13  procedure                  29

In [5]:
# Preview the first rows of the CPV code column next to its French label column.
data_cpv[['codeCPV', 'codeCPV_FR']].head()

Unnamed: 0,codeCPV,codeCPV_FR
0,45210000-2,Travaux de construction de bâtiments
1,45261000-4,Travaux de charpente et de couverture et trava...
2,72514200-3,Services de gestion d'installations pour le dé...
3,71300000,Services d'ingénierie
4,44316500-3,Serrurerie


In [24]:
# Read the CPV reference table (columns CODE / FR) stored in the project's
# docs folder, one level above the working directory.
docs_dir = os.path.join(os.path.dirname(os.getcwd()), 'docs')
cpv_path = os.path.join(docs_dir, 'cpv_2008_ver_2013_FR.csv')
df_cpv = pd.read_csv(cpv_path)
df_cpv.head()

Unnamed: 0,CODE,FR
0,03000000-1,"Produits agricoles, de l'élevage, de la pêche,..."
1,03100000-2,Produits agricoles et produits de l'horticulture
2,03110000-5,"Produits agricoles, produits de la culture mar..."
3,03111000-2,Graines
4,03111100-3,Graines de soja


In [7]:
def extract_cpv_hierarchy_level(cpv_code, level=2):
    """
    Truncate a CPV code to a higher level of the CPV hierarchy.

    The first ``level`` digits of the code are kept and the remaining
    positions of the 8-digit code are filled with zeros. Anything after a
    dash (e.g. the '-1' in '03111900-1') is discarded.

    Args:
        cpv_code: Original CPV code, with or without the part after the
                  dash (e.g., '03111900-1' or '03111900'). Non-string
                  values are converted with ``str``; codes shorter than
                  8 characters are right-padded with zeros.
        level (int): Number of leading digits to keep. Options:
                    - 2 (XX000000): First 2 digits + 6 zeros (default)
                    - 3 (XXX00000): First 3 digits + 5 zeros
                    - 4 (XXXX0000): First 4 digits + 4 zeros
                    - 5 (XXXXX000): First 5 digits + 3 zeros

    Returns:
        str or None: Higher-level CPV code (e.g., '03000000'), or None
        when ``cpv_code`` is a missing value (None, NaN, pd.NA).

    Raises:
        ValueError: If ``level`` is not 2, 3, 4 or 5.
    """
    # Validate the level first so a bad level is reported even for
    # missing codes.
    if level not in (2, 3, 4, 5):
        raise ValueError("Level must be between 2 and 5")
    kept_digits = int(level)

    # Missing values must not be stringified: str(float('nan')) would
    # otherwise produce a bogus code such as 'na000000'. NaN is the only
    # float that compares unequal to itself.
    if cpv_code is None or cpv_code is pd.NA or cpv_code != cpv_code:
        return None

    # Strip surrounding whitespace and keep only the part before the dash
    numeric_part = str(cpv_code).strip().split('-')[0]

    # Ensure we have at least 8 characters, pad with zeros if needed
    numeric_part = numeric_part.ljust(8, '0')

    # Keep the leading digits and zero-fill up to 8 characters,
    # e.g. level 3: 03111900 -> 03100000, level 5: 03111900 -> 03111000
    return numeric_part[:kept_digits] + '0' * (8 - kept_digits)


def add_cpv_hierarchy_column(df, cpv_column='codeCPV', level=2,
                             new_column_name=None):
    """
    Return a copy of ``df`` extended with a truncated CPV code column.

    Args:
        df (pd.DataFrame): Frame holding the CPV codes; it is copied, not
                          modified.
        cpv_column (str): Column of ``df`` that stores the CPV codes
                         (default: 'codeCPV')
        level (int): Hierarchy level forwarded to
                    ``extract_cpv_hierarchy_level`` (default: 2)
        new_column_name (str): Name of the added column. When None,
                              ``f'codeCPV_{level}'`` is used.

    Returns:
        pd.DataFrame: Copy of ``df`` with the hierarchy column added
    """
    # Resolve the destination column name up front
    target_column = (
        f'codeCPV_{level}' if new_column_name is None else new_column_name
    )

    # Work on a copy so the caller's frame stays untouched
    enriched = df.copy()

    # Series.apply forwards extra keyword arguments to the applied function
    enriched[target_column] = enriched[cpv_column].apply(
        extract_cpv_hierarchy_level, level=level
    )
    return enriched

In [8]:
# Smoke test of extract_cpv_hierarchy_level on a single code: keeping the
# first 2 digits of '03111900-1' is expected to display '03000000'.
test_code = "03111900-1"
result = extract_cpv_hierarchy_level(test_code, level=2)
result

'03000000'

In [9]:
# Print the schema summary of the loaded frame before hierarchy columns are added.
data_cpv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 299670 non-null  int64  
 1   uid                        299670 non-null  object 
 2   id                         299670 non-null  object 
 3   nature                     299670 non-null  object 
 4   acheteur_id                299670 non-null  int64  
 5   acheteur_nom               299477 non-null  object 
 6   acheteur_siren             299500 non-null  float64
 7   titulaire_id               299670 non-null  object 
 8   titulaire_typeIdentifiant  299670 non-null  object 
 9   titulaire_nom              292097 non-null  object 
 10  titulaire_siren            298537 non-null  float64
 11  objet                      299670 non-null  object 
 12  montant                    299670 non-null  float64
 13  codeCPV                    29

In [10]:
# Add one truncated-code column per hierarchy level (codeCPV_2 ... codeCPV_5).
# add_cpv_hierarchy_column returns a fresh copy on every call, so data_cpv is
# never modified and no extra up-front copy of the frame is needed.
data_cpv_new = data_cpv
for level in (2, 3, 4, 5):
    data_cpv_new = add_cpv_hierarchy_column(data_cpv_new, level=level)

data_cpv_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 299670 non-null  int64  
 1   uid                        299670 non-null  object 
 2   id                         299670 non-null  object 
 3   nature                     299670 non-null  object 
 4   acheteur_id                299670 non-null  int64  
 5   acheteur_nom               299477 non-null  object 
 6   acheteur_siren             299500 non-null  float64
 7   titulaire_id               299670 non-null  object 
 8   titulaire_typeIdentifiant  299670 non-null  object 
 9   titulaire_nom              292097 non-null  object 
 10  titulaire_siren            298537 non-null  float64
 11  objet                      299670 non-null  object 
 12  montant                    299670 non-null  float64
 13  codeCPV                    29

In [None]:
# Number of distinct 2-digit-level codes, shown as the shape of the unique-values array.
data_cpv_new['codeCPV_2'].unique().shape

(73,)

In [19]:
# Frequency table of the 2-digit-level codes, then the shape of the subset
# of codes that occur more than 10 times.
cpv_2 = data_cpv_new['codeCPV_2'].value_counts().to_frame()
cpv_2.loc[cpv_2['count'] > 10].shape

(58, 1)

In [26]:
# Display the frequency table of 2-digit-level codes (long output is truncated by pandas).
cpv_2

Unnamed: 0_level_0,count
codeCPV_2,Unnamed: 1_level_1
45000000,122409
71000000,42597
79000000,12606
90000000,10593
33000000,8304
...,...
13000000,1
04000000,1
54000000,1
46000000,1


In [14]:
# Distinct-code counts for the 3-, 4- and 5-digit hierarchy columns,
# printed one shape per line.
for digits in (3, 4, 5):
    print(data_cpv_new[f'codeCPV_{digits}'].unique().shape)


(393,)
(1226,)
(2611,)


In [20]:
# Add the notebook's parent directory to sys.path so that the project-local
# `scripts.preprocess` module can be imported. The membership check keeps
# sys.path from growing by one duplicate entry each time the cell is re-run.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# NOTE: this import rebinds extract_cpv_hierarchy_level, replacing the
# version defined earlier in this notebook.
from scripts.preprocess import extract_cpv_hierarchy_level, codeCPV_group


In [21]:
# Same single-code smoke test as earlier in the notebook; at this point
# extract_cpv_hierarchy_level refers to the version imported from
# scripts.preprocess. Expected display: '03000000'.
test_code = "03111900-1"
result = extract_cpv_hierarchy_level(test_code, level=2)
result

'03000000'

In [22]:
# Print the schema summary of data_cpv again, before calling codeCPV_group on it.
data_cpv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 299670 non-null  int64  
 1   uid                        299670 non-null  object 
 2   id                         299670 non-null  object 
 3   nature                     299670 non-null  object 
 4   acheteur_id                299670 non-null  int64  
 5   acheteur_nom               299477 non-null  object 
 6   acheteur_siren             299500 non-null  float64
 7   titulaire_id               299670 non-null  object 
 8   titulaire_typeIdentifiant  299670 non-null  object 
 9   titulaire_nom              292097 non-null  object 
 10  titulaire_siren            298537 non-null  float64
 11  objet                      299670 non-null  object 
 12  montant                    299670 non-null  float64
 13  codeCPV                    29

In [23]:
# Build the hierarchy columns for levels 2-5 with the project helper
# imported from scripts.preprocess, then print the resulting schema.
# NOTE(review): the recorded output of this cell reports a save to
# data/data_cpv.csv, which has the same name as the file this notebook
# loads as input - confirm that codeCPV_group is meant to overwrite it,
# since that would make a re-run start from an already-augmented CSV.
data_cpv_new = codeCPV_group(data_cpv, levels=[2, 3, 4, 5])
data_cpv_new.info()

Data saved successfully to: /home/ronan/code/RonanB400/Project/decp_ml/data/data_cpv.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 299670 non-null  int64  
 1   uid                        299670 non-null  object 
 2   id                         299670 non-null  object 
 3   nature                     299670 non-null  object 
 4   acheteur_id                299670 non-null  int64  
 5   acheteur_nom               299477 non-null  object 
 6   acheteur_siren             299500 non-null  float64
 7   titulaire_id               299670 non-null  object 
 8   titulaire_typeIdentifiant  299670 non-null  object 
 9   titulaire_nom              292097 non-null  object 
 10  titulaire_siren            298537 non-null  float64
 11  objet                      299670 non-null  object 
 1