In [1]:
import pandas as pd
import numpy as np

In [2]:

df = pd.read_csv('../data/data_cpv.csv', encoding='utf-8')

In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299670 entries, 0 to 299669
Data columns (total 36 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   uid                        299670 non-null  object 
 1   id                         299670 non-null  object 
 2   nature                     299670 non-null  object 
 3   acheteur_id                299670 non-null  int64  
 4   acheteur_nom               299477 non-null  object 
 5   acheteur_siren             299500 non-null  float64
 6   titulaire_id               299670 non-null  object 
 7   titulaire_typeIdentifiant  299670 non-null  object 
 8   titulaire_nom              292097 non-null  object 
 9   titulaire_siren            298537 non-null  float64
 10  objet                      299670 non-null  object 
 11  montant                    299670 non-null  float64
 12  codeCPV                    299670 non-null  object 
 13  procedure                  29

In [None]:
# Create a DataFrame with unique CPV_2 codes and their French descriptions
cpv_descriptions_df = df[['codeCPV_2', 'codeCPV_FR']].drop_duplicates()

# Check if there are multiple descriptions for the same codeCPV_2
duplicates = cpv_descriptions_df['codeCPV_2'].duplicated(keep=False)
if any(duplicates):
    print(f"Found {sum(duplicates)} CPV codes with multiple descriptions")
    print(cpv_descriptions_df[duplicates].sort_values('codeCPV_2'))

    # In case of duplicates, keep the first description
    cpv_descriptions_df = cpv_descriptions_df.drop_duplicates(subset='codeCPV_2', keep='first')

# Sort by CPV code
cpv_descriptions_df = cpv_descriptions_df.sort_values('codeCPV_2')

# Reset index for clean DataFrame
cpv_descriptions_df = cpv_descriptions_df.reset_index(drop=True)

# Display the result
print(f"Created DataFrame with {len(cpv_descriptions_df)} unique CPV codes")
cpv_descriptions_df.head(10)


Found 4490 CPV codes with multiple descriptions
        codeCPV_2                            codeCPV_FR
299569    3000000                        Haricots verts
165188    3000000                    Graines d'arachide
2190      3000000                        Légumes-fruits
58611     3000000                    Produits à graines
2129      3000000                 Produits de pépinière
...           ...                                   ...
138637   98000000                Services de mieux-être
5330     98000000               Services de déplacement
5320     98000000      Services de nettoyage de textile
2430     98000000                   Services de chenils
5858     98000000  Services de gestion de blanchisserie

[4490 rows x 2 columns]
Created DataFrame with 73 unique CPV codes


Unnamed: 0,codeCPV_2,codeCPV_FR
0,0,
1,1000000,
2,2000000,
3,3000000,Produits agricoles et produits de l'horticulture
4,4000000,
5,5000000,
6,7000000,
7,9000000,Huiles lubrifiantes et agents lubrifiants
8,10000000,
9,11000000,


In [16]:
cpv_descriptions_df


Unnamed: 0,codeCPV_2,codeCPV_FR
0,0,
1,1000000,
2,2000000,
3,3000000,Produits agricoles et produits de l'horticulture
4,4000000,
...,...,...
68,92000000,Services de musées
69,93000000,
70,95000000,
71,98000000,Autres services


In [17]:
# Remove rows where codeCPV_FR is NaN
cpv_descriptions_df = cpv_descriptions_df.dropna(subset=['codeCPV_FR'])

# Verify NaN values are removed
print(f"DataFrame now has {len(cpv_descriptions_df)} rows after removing NaN values")
cpv_descriptions_df

DataFrame now has 45 rows after removing NaN values


Unnamed: 0,codeCPV_2,codeCPV_FR
3,3000000,Produits agricoles et produits de l'horticulture
7,9000000,Huiles lubrifiantes et agents lubrifiants
11,14000000,"Macadam, tarmacadam et sable bitumineux"
12,15000000,Produits alimentaires divers
13,16000000,Pièces pour machines agricoles
15,18000000,"Vêtements professionnels, vêtements de travail..."
16,19000000,Sacs et sachets à ordures en polyéthylène
19,22000000,Livres de bibliothèque
21,24000000,Chaux hydratée
26,30000000,Matériel et fournitures informatiques


In [15]:
cpv_descriptions_df

Unnamed: 0,codeCPV_2,codeCPV_FR
0,0,
1,1000000,
2,2000000,
3,3000000,Produits agricoles et produits de l'horticulture
4,4000000,
...,...,...
68,92000000,Services de musées
69,93000000,
70,95000000,
71,98000000,Autres services


In [19]:
# Save the cleaned CPV descriptions to a CSV file
output_path = '../../decp_prod/models/cpv_descriptions.csv'
cpv_descriptions_df.to_csv(output_path, index=False)
print(f"Saved {len(cpv_descriptions_df)} CPV descriptions to {output_path}")

Saved 45 CPV descriptions to ../../decp_prod/models/cpv_descriptions.csv
