In [1]:
import os 
import gzip 
import glob 
import xmltodict 
import polars as pl 
import pandas as pd 
import pyarrow as pa 

pd.set_option('display.max_columns',100)

colonnes_pd_budget = ['Id_Fichier',
    'Nomenclature',
    'Exer' ,
    'TypOpBudg' , #des 2 et des 1
    'Operation',
    'Nature',
    'ContNat',
    'LibCpte',
    'Fonction',
    'ContFon',
    'ArtSpe', 
    'CodRD', 
    'MtBudgPrec', 
    'MtRARPrec', 
    'MtPropNouv', 
    'MtPrev', 
    'OpBudg',
    'CredOuv', 
    'MtReal', 
    'MtRAR3112', 
    'ContOp',
    'OpeCpteTiers',
    'MtSup',
    'APVote',
    'Brut',
    'BudgetHorsRAR',
    'Comp',
    'ICNE',
    'ICNEPrec',
    'MtOpeCumul',
    'MtOpeInfo',
    'Net',
    'ProdChaRat',
    'RARPrec',
    'CaracSup',
    'TypOpe',
    'Section',
    'ChapSpe',
    'ProgAutoLib',
    'ProgAutoNum',
    'VirCredNum',
    'CodeRegion']

colonnes_doc_budgetaire = [
 'Id_Fichier',
 'Nomenclature',
 'Exer',
 'IdColl',
 'Siren',
 'CodColl',
 'LibelleColl',
 'DteStr',
 '@date',
 'DteDec',
 'DteDecEx',
 'NumDec',
 'IdPost',
 'LibellePoste',
 'LibelleEtabPal',
 'IdEtabPal',
 'LibelleEtab',
 'IdEtab',
 'NatDec',
 'NatVote', 	
 'OpeEquip', 
 'CodInseeColl',	
 'VoteFormelChap', 	
 'TypProv', 	
 'BudgPrec',
 'RefProv',	
 'ReprRes', 	
 'NatFonc', 	
 'PresentationSimplifiee', 	
 'DepFoncN2',	
 'RecFoncN2' ,	
 'DepInvN2' ,	
 'RecInvN2' ,	
 'CodTypBud',
 'CodBud',
 'ProjetBudget',  	
 'Affect',
 'SpecifBudget',
 'FinJur',	 	
 '@md5',	
 '@sha1']



In [None]:
def _isolement_id(fichier) : 
 '''Extrait l'id du nom du fichier pour la liste comprehension de securité

 ATTENTION, le premier split / va changer si on l'applique sur du minio '''
 val_id_fichier_source = fichier.split("/")[-1].split('.')[0]
 if '-' in val_id_fichier_source : 
  val_id_fichier = val_id_fichier_source.split('-')[1]
 else : 
  val_id_fichier= val_id_fichier_source
 return val_id_fichier

def parse_fichier(chemin) : 
 '''Ouvre et parse le fichier gzip'''
 with gzip.open(chemin, 'rb') as fichier_ouvert : 
  fichier_xml_gzip = fichier_ouvert.read()
  fichier_xml = fichier_xml_gzip.decode('latin-1')
  fichier_dict = xmltodict.parse(fichier_xml)
 return fichier_dict

def extraction_annexe(chemin_annexe, dict_metadonnees) : 
 liste_annexe = []
 for row in chemin_annexe : 
  liste_par_ligne = {}
  for a, b in row.items() : 
   liste_par_ligne.update({a : b.get('@V')})
   liste_par_ligne.update(dict_metadonnees)
  liste_annexe.append(liste_par_ligne)
 return liste_annexe 

def extraction_donnees(chemin) : 
 dict_annexe = {}
 for a, b in chemin.items() : 
   dict_annexe.update({a : b.get('@V')})
 return dict_annexe 

def extraction_lignes_budget_liste(chemin, dict_id) :
 liste_budget = []
 for lignes in chemin : 
  dict_ligne = {}
  dict_ligne.update(dict_id)
  for a, b in lignes.items() :
     if a not in ['MtSup', 'CaracSup'] : 
       dict_ligne.update({a : b.get('@V')}) 
 
     elif a == 'MtSup' : 
       dict_ligne.update({a : b})
       type_m = lignes.get('MtSup')
 
       if isinstance(type_m, dict) : 
        dict_ligne.update({type_m.get('@Code') : type_m.get('@V')})
 
       elif isinstance(type_m, list) : 
          for j in b : 
           dict_ligne.update({j.get('@Code') : j.get('@V')})
 
     elif a == 'CaracSup' :   
       dict_ligne.update({a : b})
       type_c = lignes.get('CaracSup')
 
       if isinstance(type_c, dict) :
        dict_ligne.update({type_c.get('@Code') : type_c.get('@V')})
 
       elif isinstance(type_c, list) : 
          for j in b : 
           dict_ligne.update({j.get('@Code') : j.get('@V')})

  liste_budget.append(dict_ligne)
 return liste_budget

def extraction_budget(fichier_parse, dict_id) : 
 ''' Extrait toutes les données budgetaires, y compris carac et mtsup '''
 lignes_budget = fichier_parse['DocumentBudgetaire']['Budget']['LigneBudget'] 

 if isinstance(lignes_budget, dict) : 
  donnees_budget_prep = extraction_donnees(lignes_budget)
  donnees_budget_prep.update(dict_id)
  donnees_budget = [donnees_budget_prep]

 elif isinstance(lignes_budget, list) : 
  donnees_budget = extraction_lignes_budget_liste(lignes_budget, dict_id)

 df_budget = pd.DataFrame(donnees_budget)
 df_colonnes = pd.DataFrame(columns=colonnes_pd_budget)
 df_budget_sans_schema = pd.concat([df_colonnes, df_budget])
 df_budget_propre = nettoyage_budget(df_budget_sans_schema)
 return df_budget_propre 



In [122]:
def extraction_document_budgetaire(fichier_parse, dictionnaire_id) : 
  ''' Extrait les métadonnées du fichier pour la table document_budgetaire '''
  blocbudget = extraction_donnees(fichier_parse['DocumentBudgetaire']['Budget']['BlocBudget'])
  entetedocbudg = extraction_donnees(fichier_parse['DocumentBudgetaire']['EnTeteDocBudgetaire'])
  entetebudget = extraction_donnees(fichier_parse['DocumentBudgetaire']['Budget']['EnTeteBudget'])
  scellement = fichier_parse['DocumentBudgetaire']['Scellement']

  liste_fichier = [{**blocbudget, **entetedocbudg, **entetebudget, **scellement, **dictionnaire_id}]
  df_doc_budgetaire_prep_1 = pd.DataFrame(liste_fichier)
  df_colonnes_budg = pd.DataFrame(columns= colonnes_doc_budgetaire)
  df_doc_budgetaire_prep_2 = pd.concat([df_colonnes_budg, df_doc_budgetaire_prep_1])
  df_doc_budgetaire = nettoyage_doc_budg(df_doc_budgetaire_prep_2)
  return df_doc_budgetaire

In [3]:
def extraction_annexe_concours(fichier_parse, dictionnaire_id) : 
 ''' Extrait les données de l'annexe concours pour la table correspondante 
 !! Necessite un try si Data_concours n'est pas dans le fichier '''
 concours = fichier_parse['DocumentBudgetaire']['Budget']['Annexes']['DATA_CONCOURS']['CONCOURS']
 liste_donnees_concours = extraction_annexe(concours, dictionnaire_id)
 df_concours = pd.DataFrame(liste_donnees_concours)
 return df_concours


In [125]:
def nettoyage_budget(df) : 
 df['Id_Fichier'] = df['Id_Fichier'].astype('Int32')
 df['Nomenclature'] = df['Nomenclature'].astype(str)
 df['Exer'] = df['Exer'].astype('Int16')
 df['TypOpBudg'] = df['TypOpBudg'].astype('Int32')
 #df['Operation'] = df['Operation'].astype('Int16') #12VEM488 , aire de jeux etc
 df['ArtSpe'] = df['ArtSpe'].replace(
    {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
 df['MtBudgPrec'] = df['MtBudgPrec'].astype(float)
 df['MtRARPrec'] = df['MtRARPrec'].astype(float)
 df['MtPropNouv'] = df['MtPropNouv'].astype(float)
 df['MtPrev'] = df['MtPrev'].astype(float)
 df['OpBudg'] = df['OpBudg'].replace(
    {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
 df['CredOuv'] = df['CredOuv'].astype(float)
 df['MtReal'] = df['MtReal'].astype(float)
 df['MtRAR3112'] = df['MtRAR3112'].astype(float)
 #df['ContOp'] = df['ContOp'].replace('', None).astype('Int16') #contient des '16 01'
 df['APVote'] = df['APVote'].astype(float)
 df['Brut'] = df['Brut'].astype(float)
 df['BudgetHorsRAR'] = df['BudgetHorsRAR'].astype(float)
 df['ICNE'] = df['ICNE'].astype(float)
 df['ICNEPrec'] = df['ICNEPrec'].astype(float)
 df['MtOpeCumul'] = df['MtOpeCumul'].astype(float)
 df['MtOpeInfo'] = df['MtOpeInfo'].astype(float)
 df['Net'] = df['Net'].astype(float)
 df['ProdChaRat'] = df['ProdChaRat'].astype(float)
 df['TypOpe'] = df['TypOpe'].astype('Int32')
 df['CodeRegion'] = df['CodeRegion'].astype('Int16')
 return df 

def nettoyage_doc_budg(df) : 
 df = df.drop(columns=['NatCEPL', 'Departement'])
 df = df.rename(columns={'IdColl' : 'Siret', '@date' : 'date_precise',
                    '@md5' : 'md5', '@sha1' : 'sha1'}) 
 
 df['Id_Fichier'] = df['Id_Fichier'].astype('Int32')
 df['Exer'] = df['Exer'].astype('Int16')
 df['Siren'] = df['Siret'].str.slice(0,9)
 df['Siret'] = df['Siret'].astype('Int64')
 df['Siren'] = df['Siren'].astype('Int64')
 df['DteStr'] = pd.to_datetime(df['DteStr'])
 df['DteDec'] = pd.to_datetime(df['DteDec'], errors= 'coerce')
 df['date_precise'] = pd.to_datetime(df['date_precise'], format='ISO8601', utc= True)
 df['DteDecEx'] = pd.to_datetime(df['DteDecEx'])
 df['IdEtabPal'] = df['IdEtabPal'].astype('Int64')
 df['IdEtab'] = df['IdEtab'].astype('Int64')
 df['OpeEquip'] = df['OpeEquip'].replace(
     {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
 df['VoteFormelChap'] = df['VoteFormelChap'].replace(
     {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
 df['TypProv'] = df['TypProv'].astype('Int16')
 df['BudgPrec'] = df['BudgPrec'].astype('Int16')
 df['ReprRes'] = df['ReprRes'].astype('Int16')
 df['NatFonc'] = df['NatFonc'].astype('Int16')
 df['PresentationSimplifiee'] = df['PresentationSimplifiee'].replace(
     {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
 df['DepFoncN2'] = df['DepFoncN2'].astype(float)
 df['RecFoncN2'] = df['RecFoncN2'].astype(float)
 df['DepInvN2'] = df['DepInvN2'].astype(float)
 df['RecInvN2'] = df['RecInvN2'].astype(float)
 df['ProjetBudget'] = df['ProjetBudget'].replace(
     {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
 df['SpecifBudget'] = df['SpecifBudget'].astype('Int64')
 df['FinJur'] = df['FinJur'].astype('Int64')
 return df  

 

--------------------------------------------------------------

Optionnel, explo : 

In [123]:
def multi_document(chemin_des_xml) :
 ''' Changement à faire : 
 Faire extraire l'Id_Fichier'''
 chemin_xml_entree_glob = glob.glob(os.path.join(chemin_des_xml, "*.gz"))
 #connection à la table
 liste_df = []

 for fichier in chemin_xml_entree_glob : 
  id_fichier = _isolement_id(fichier)
  #print(id_fichier)
  #Necessite verif dans tables
  if id_fichier is None : 
   print('vide')
   pass 
  else : 
   try : 
    #print('etape 2')
    fichier_parse = parse_fichier(fichier)
    dict_metadonnees = {'Id_Fichier' : id_fichier}
    #print(dict_metadonnees)
    df_doc = extraction_document_budgetaire(fichier_parse, dict_metadonnees)
    liste_df.append(df_doc)
    #insertion dans table
   except Exception as e : 
     print(id_fichier, 'erreur')
     print(e)
 
 df_mega = pd.concat(liste_df)
 return df_mega 

In [103]:
df_doc1 = multi_document(chemin_20)

775775 erreur
no element found: line 1, column 0


In [62]:
df_doc1['FinJur'].value_counts()

FinJur
910811363         2
9308127722        1
05                1
21590017600011    1
380790386         1
Name: count, dtype: int64

In [113]:
#df_doc1 = df_doc1.drop(columns=['NatCEPL', 'Departement'])
df_doc1 = df_doc1.rename(columns={'IdColl' : 'Siret'})

In [120]:
df_doc1['Id_Fichier'] = df_doc1['Id_Fichier'].astype('Int32')
df_doc1['Exer'] = df_doc1['Exer'].astype('Int16')
#df_doc1['Siren'] = df_doc1['Siret'].str.slice(0,9)
#df_doc1['Siret'] = df_doc1['Siret'].astype('Int64')
#df_doc1['Siren'] = df_doc1['Siren'].astype('Int64')
df_doc1['DteStr'] = pd.to_datetime(df_doc1['DteStr'])
df_doc1['DteDec'] = pd.to_datetime(df_doc1['DteDec'], errors= 'coerce')
df_doc1['@date'] = pd.to_datetime(df_doc1['@date'], format='ISO8601', utc= True)
df_doc1['DteDecEx'] = pd.to_datetime(df_doc1['DteDecEx'])
df_doc1['IdEtabPal'] = df_doc1['IdEtabPal'].astype('Int64')
df_doc1['IdEtab'] = df_doc1['IdEtab'].astype('Int64')
df_doc1['OpeEquip'] = df_doc1['OpeEquip'].replace(
    {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
df_doc1['VoteFormelChap'] = df_doc1['VoteFormelChap'].replace(
    {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
df_doc1['TypProv'] = df_doc1['TypProv'].astype('Int16')
df_doc1['BudgPrec'] = df_doc1['BudgPrec'].astype('Int16')
df_doc1['ReprRes'] = df_doc1['ReprRes'].astype('Int16')
df_doc1['NatFonc'] = df_doc1['NatFonc'].astype('Int16')
df_doc1['PresentationSimplifiee'] = df_doc1['PresentationSimplifiee'].replace(
    {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
df_doc1['DepFoncN2'] = df_doc1['DepFoncN2'].astype(float)
df_doc1['RecFoncN2'] = df_doc1['RecFoncN2'].astype(float)
df_doc1['DepInvN2'] = df_doc1['DepInvN2'].astype(float)
df_doc1['RecInvN2'] = df_doc1['RecInvN2'].astype(float)
df_doc1['ProjetBudget'] = df_doc1['ProjetBudget'].replace(
    {'0' : False, '1' : True, 'false' : False, "true" : True}).astype(bool)
df_doc1['SpecifBudget'] = df_doc1['SpecifBudget'].astype('Int64')
df_doc1['FinJur'] = df_doc1['FinJur'].astype('Int64')


df_doc1.head()

Unnamed: 0,Id_Fichier,Nomenclature,Exer,Siret,Siren,CodColl,LibelleColl,DteStr,@date,DteDec,DteDecEx,NumDec,IdPost,LibellePoste,LibelleEtabPal,IdEtabPal,LibelleEtab,IdEtab,NatDec,NatVote,OpeEquip,CodInseeColl,VoteFormelChap,TypProv,BudgPrec,RefProv,ReprRes,NatFonc,PresentationSimplifiee,DepFoncN2,RecFoncN2,DepInvN2,RecInvN2,CodTypBud,CodBud,ProjetBudget,Affect,SpecifBudget,FinJur,@md5,@sha1
0,671182,M4-M49_D,2020,25540368500027,255403685,434,SDAA54 M49,2021-03-25,2021-03-25 10:17:22.986000+00:00,2021-03-23,2021-03-25,,54011,Trésorerie de Maxéville,syndicat départemental d'assaissinissement aut...,25540368500019.0,SDAA54 M49,25540368500027,9,FcIc,False,,False,1,2,,3,1,False,0.0,0.0,0.0,0.0,A,0,True,,,,B7DFD2AD2AF730F68F6E7D65E6C77AB4,86DE552F22F3445EB85F0445A42E312B8D65F8B2
0,815288,M14-M14_COM_SUP3500,2021,20007067000191,200070670,50,ZA BUDAN PLEINE-FOUGERES,2021-01-01,2022-01-31 09:57:22.066000+00:00,2021-01-01,2021-01-01,,35045,CENTRE DES FINANCES PUBLIQUES,CC PAYS DOL BAIE MONT SAINT-MICHEL,20007067000019.0,ZA BUDAN PLEINE-FOUGERES,20007067000191,9,FcIc,True,,False,1,1,,3,1,True,0.0,0.0,0.0,0.0,A,16,False,,,,B0C9D982A0A2D619AB6131C0719D3FC9,7E2C610430F21DEF6C2DE0DFA5DFC3D3F4E84A6B
0,618507,M4-M49_D,2019,21350002800096,213500028,413,ASSAINISSEMENT AMANLIS,2019-01-01,2020-08-13 15:35:46+00:00,2019-01-01,2019-01-01,,35034,CENTRE DES FINANCES PUBLIQUES DE RETIERS,COMMUNE D AMANLIS,21350002800013.0,ASSAINISSEMENT AMANLIS,21350002800096,9,FcIc,False,,False,1,2,,3,1,False,0.0,0.0,0.0,0.0,A,0,False,,,,4203FB8AC4864957537E079846456939,22ABF0D1B2B3981AF017D1DD53A231E304EAC8F3
0,653656,M14-M14_COM_500_3500,2020,21560146900095,215601469,301,LOTISSEMENT ESPACE AMEDEE,2020-01-01,2021-02-22 16:15:27+00:00,2020-01-01,2020-01-01,,56038,CENTRE DES FINANCES PUBLIQUE,,,LOTISSEMENT ESPACE AMEDEE,21560146900095,9,FcIc,False,,False,1,2,,3,1,False,0.0,0.0,0.0,0.0,P,0,False,,,,8DA2060B9C49D192BF1C95C85FFBA0C8,11BD944F301E635062355A5E88CE094505D73566
0,687838,M14-M14_COM_500_3500,2020,21430062600014,214300626,206,CHASPUZAC,2021-03-23,2021-03-25 12:42:20.421000+00:00,NaT,NaT,,43030,TRESORERIE LE PUY-ST-JEAN,,,COMMUNE DE CHASPUZAC,21430062600014,9,FcIc,True,43062.0,False,1,2,,3,1,True,,,,,P,0,False,,,,65A88195E80AB330DBC2BCA5C2932E9E,8B05CDA876A142A4264551467AA121892C254B4F


In [124]:
df_doc1.dtypes

Id_Fichier                              Int32
Nomenclature                           object
Exer                                    Int16
Siret                                   Int64
Siren                                   Int64
CodColl                                object
LibelleColl                            object
DteStr                         datetime64[ns]
@date                     datetime64[ns, UTC]
DteDec                         datetime64[ns]
DteDecEx                       datetime64[ns]
NumDec                                 object
IdPost                                 object
LibellePoste                           object
LibelleEtabPal                         object
IdEtabPal                               Int64
LibelleEtab                            object
IdEtab                                  Int64
NatDec                                 object
NatVote                                object
OpeEquip                                 bool
CodInseeColl                      

In [119]:
df8['@date'] = pd.to_datetime(df8['@date'], format='ISO8601', utc= True)
df8.head()

Unnamed: 0,Id_Fichier,Nomenclature,Exer,Siret,Siren,CodColl,LibelleColl,DteStr,@date,DteDec,DteDecEx,NumDec,IdPost,LibellePoste,LibelleEtabPal,IdEtabPal,LibelleEtab,IdEtab,NatDec,NatVote,OpeEquip,CodInseeColl,VoteFormelChap,TypProv,BudgPrec,RefProv,ReprRes,NatFonc,PresentationSimplifiee,DepFoncN2,RecFoncN2,DepInvN2,RecInvN2,CodTypBud,CodBud,ProjetBudget,Affect,SpecifBudget,FinJur,@md5,@sha1
0,671182,M4-M49_D,2020,25540368500027,255403685,434,SDAA54 M49,2021-03-25,2021-03-25 10:17:22.986000+00:00,2021-03-23,2021-03-25,,54011,Trésorerie de Maxéville,syndicat départemental d'assaissinissement aut...,25540368500019.0,SDAA54 M49,25540368500027,9,FcIc,False,,False,1,2,,3,1,False,0.0,0.0,0.0,0.0,A,0,True,,,,B7DFD2AD2AF730F68F6E7D65E6C77AB4,86DE552F22F3445EB85F0445A42E312B8D65F8B2
0,815288,M14-M14_COM_SUP3500,2021,20007067000191,200070670,50,ZA BUDAN PLEINE-FOUGERES,2021-01-01,2022-01-31 09:57:22.066000+00:00,2021-01-01,2021-01-01,,35045,CENTRE DES FINANCES PUBLIQUES,CC PAYS DOL BAIE MONT SAINT-MICHEL,20007067000019.0,ZA BUDAN PLEINE-FOUGERES,20007067000191,9,FcIc,True,,False,1,1,,3,1,True,0.0,0.0,0.0,0.0,A,16,False,,,,B0C9D982A0A2D619AB6131C0719D3FC9,7E2C610430F21DEF6C2DE0DFA5DFC3D3F4E84A6B
0,618507,M4-M49_D,2019,21350002800096,213500028,413,ASSAINISSEMENT AMANLIS,2019-01-01,2020-08-13 15:35:46+00:00,2019-01-01,2019-01-01,,35034,CENTRE DES FINANCES PUBLIQUES DE RETIERS,COMMUNE D AMANLIS,21350002800013.0,ASSAINISSEMENT AMANLIS,21350002800096,9,FcIc,False,,False,1,2,,3,1,False,0.0,0.0,0.0,0.0,A,0,False,,,,4203FB8AC4864957537E079846456939,22ABF0D1B2B3981AF017D1DD53A231E304EAC8F3
0,653656,M14-M14_COM_500_3500,2020,21560146900095,215601469,301,LOTISSEMENT ESPACE AMEDEE,2020-01-01,2021-02-22 16:15:27+00:00,2020-01-01,2020-01-01,,56038,CENTRE DES FINANCES PUBLIQUE,,,LOTISSEMENT ESPACE AMEDEE,21560146900095,9,FcIc,False,,False,1,2,,3,1,False,0.0,0.0,0.0,0.0,P,0,False,,,,8DA2060B9C49D192BF1C95C85FFBA0C8,11BD944F301E635062355A5E88CE094505D73566
0,687838,M14-M14_COM_500_3500,2020,21430062600014,214300626,206,CHASPUZAC,2021-03-23,2021-03-25 12:42:20.421000+00:00,NaT,NaT,,43030,TRESORERIE LE PUY-ST-JEAN,,,COMMUNE DE CHASPUZAC,21430062600014,9,FcIc,True,43062.0,False,1,2,,3,1,True,,,,,P,0,False,,,,65A88195E80AB330DBC2BCA5C2932E9E,8B05CDA876A142A4264551467AA121892C254B4F


In [100]:
df_doc1['FinJur'].value_counts()

FinJur
910811363         2
9308127722        1
05                1
21590017600011    1
380790386         1
Name: count, dtype: int64

DteDec : demande beaucoup de nettoyage mannuel, avec des années en 1300 ou 1020, mal pris pour la date 

NatCEPL, Departement : champ libres, poubelle 

Affect : 029026 ou PORNIC

In [126]:
chemin_20 = '../../fichiers20/todo_xml_20/'

def multi_budget(chemin_des_xml) :
 ''' Changement à faire : 
 - Enlever la liste, ne plus faire de concat
 - Faire une fonction de select dans la table 
 - Faire une fonction d'insertion dans la table
 - Voir les 7 erreurs '''  
 chemin_xml_entree_glob = glob.glob(os.path.join(chemin_des_xml, "*.gz"))
 #connection à la table
 liste_df = []

 for fichier in chemin_xml_entree_glob : 
  id_fichier = _isolement_id(fichier)
  #print(id_fichier)
  #Necessite verif dans tables
  if id_fichier is None : 
   print('vide')
   pass 
  else : 
   try : 
    #print('etape 2')
    fichier_parse = parse_fichier(fichier)
    chemin_exer = fichier_parse['DocumentBudgetaire']['Budget']['BlocBudget']['Exer']
    chemin_nomenclature = fichier_parse['DocumentBudgetaire']['Budget']['EnTeteBudget']['Nomenclature']
    dict_metadonnees = {'Id_Fichier' : id_fichier, 
                        'Nomenclature' : chemin_nomenclature.get('@V'),
                        'Exer' : chemin_exer.get('@V')}
    #print(dict_metadonnees)
    df_budget = extraction_budget(fichier_parse, dict_metadonnees)
    liste_df.append(df_budget)
    #insertion dans table
   except Exception as e : 
     print(id_fichier, 'erreur')
     print(e)
 
 df_mega = pd.concat(liste_df)
 return df_mega 

In [6]:
der = multi_budget(chemin_20)

775775 erreur
no element found: line 1, column 0


In [127]:
der.head()

Unnamed: 0,Id_Fichier,Nomenclature,Exer,TypOpBudg,Operation,Nature,ContNat,LibCpte,Fonction,ContFon,ArtSpe,CodRD,MtBudgPrec,MtRARPrec,MtPropNouv,MtPrev,OpBudg,CredOuv,MtReal,MtRAR3112,ContOp,OpeCpteTiers,MtSup,ApVote,Brut,BudgetHorsRAR,Comp,ICNE,ICNEPrec,MtOpeCumul,MtOpeInfo,Net,ProdChaRat,RARPrec,CaracSup,TypOpe,ChapSpe,ProgAutoLib,ProgAutoNum,VirCredNum,CodeRegion,Section,APVote,RARprec
0,671182,M4-M49_D,2020,2.0,,23,,,,,False,D,,,,,True,9800.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,
1,671182,M4-M49_D,2020,,,2051,,,,,False,D,,,,,False,2010.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,
2,671182,M4-M49_D,2020,,,21562,,,,,False,D,,,,,False,6465.32,,0.0,,,,,,,,,,,,,,,,,,,,,,,,
3,671182,M4-M49_D,2020,,,2188,,,,,False,D,,,,,False,5324.68,,0.0,,,,,,,,,,,,,,,,,,,,,,,,
4,671182,M4-M49_D,2020,,,60226,,,,,False,D,,,,,False,5000.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,


In [128]:
der['Comp'].value_counts()

Comp
0    22
Name: count, dtype: int64

In [10]:
der[(~der['RARPrec'].isna()) | (~der['RARprec'].isna()) ][['Id_Fichier','RARPrec', 'RARPrec']]

Unnamed: 0,Id_Fichier,RARPrec,RARPrec.1
0,760357,0.00,0.00
1,760357,0.00,0.00
2,760357,0.00,0.00
3,760357,0.00,0.00
4,760357,0.00,0.00
...,...,...,...
90,793145,14619.00,14619.00
91,793145,1600.00,1600.00
92,793145,49675.00,49675.00
93,793145,3417.00,3417.00


In [23]:
der['CodeRegion'].value_counts()

CodeRegion
1    1427
2     733
4     663
3     525
5     140
Name: count, dtype: int64

In [8]:
der[~der['APVote'].isna()]

Unnamed: 0,Id_Fichier,Nomenclature,Exer,TypOpBudg,Operation,Nature,ContNat,LibCpte,Fonction,ContFon,ArtSpe,CodRD,MtBudgPrec,MtRARPrec,MtPropNouv,MtPrev,OpBudg,CredOuv,MtReal,MtRAR3112,ContOp,OpeCpteTiers,MtSup,ApVote,Brut,BudgetHorsRAR,Comp,ICNE,ICNEPrec,MtOpeCumul,MtOpeInfo,Net,ProdChaRat,RARPrec,CaracSup,TypOpe,ChapSpe,ProgAutoLib,ProgAutoNum,VirCredNum,CodeRegion,Section,APVote,RARprec
35,763528,M61-M61,2021,,2017,21578,,PLAN IMMO 1 - 2017_2021,,,,D,3102.65,,1412.99,1412.99,False,901.70,901.70,,2017,,"[{'@Code': 'APVote', '@V': '1412.99'}, {'@Code...",,,3102.65,,,,,,,,,"[{'@Code': 'ProgAutoNum', '@V': '2017_2021'}, ...",1,,PLAN IMMO1 - 2017_2021,2017_2021,,,,1412.99,
41,763528,M61-M61,2021,,2017,2184,,PLAN IMMO 1 - 2017_2021,,,,D,74307.15,,-23700.89,-23700.89,False,60390.71,22586.98,37803.73,2017,,"[{'@Code': 'APVote', '@V': '-23700.89'}, {'@Co...",,,36503.42,,,,,,,,37803.73,"[{'@Code': 'ProgAutoNum', '@V': '2017_2021'}, ...",1,,PLAN IMMO1 - 2017_2021,2017_2021,,,,-23700.89,
46,763528,M61-M61,2021,,2017,231312,,PLAN IMMO 1 - 2017_2021,,,,D,6399974.92,,-1279646.32,-1279646.32,False,3088239.24,2699346.31,86799.82,2017,,"[{'@Code': 'APVote', '@V': '-1279646.32'}, {'@...",,,6313175.10,,,,,,,,86799.82,"[{'@Code': 'ProgAutoNum', '@V': '2017_2021'}, ...",1,,PLAN IMMO1 - 2017_2021,2017_2021,,,,-1279646.32,
0,801927,M57-M57,2022,,201814,2315,,LIAISON 9 ECLUSES / ACROPOLE,518,,False,D,0.00,0.0,6000000.00,6000000.00,False,0.00,0.00,0.00,201814,,"[{'@Code': 'MtOpeCumul', '@V': '3729058.92'}, ...",,,,,,,3729058.92,,,,0,"[{'@Code': 'TypOpe', '@V': '1'}, {'@Code': 'Pr...",1,,LIAISON 9 ECLUSES / ACROPOLE,AP1801,,,,11623082.15,
86,801927,M57-M57,2022,,202001,21838,,REFONTE SYSTEME INFORMATION,020,,False,D,0.00,0.0,500000.00,500000.00,False,0.00,0.00,0.00,202001,,"[{'@Code': 'MtOpeCumul', '@V': '158063.71'}, {...",,,,,,,158063.71,,,,0,"[{'@Code': 'TypOpe', '@V': '1'}, {'@Code': 'Pr...",1,,REFONTE SYSTEME INFORMATION,AP2001,,,,296000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,787082,M57-M57,2021,,48,21314,,Espace culturel et associatif Gallieni,311,,False,D,365000.00,0.0,0.00,0.00,False,0.00,0.00,0.00,21,,"[{'@Code': 'APVote', '@V': '365000.00'}, {'@Co...",,,365000.00,,,,,,,,,"{'@Code': 'ProgAutoNum', '@V': '000041'}",,,,000041,,,,365000.00,
58,787082,M57-M57,2021,,0020,2128,,Aménagement du Parc des Coteaux - Cypressat,511,,False,D,150000.00,0.0,0.00,0.00,False,0.00,0.00,0.00,21,,"[{'@Code': 'APVote', '@V': '166000.00'}, {'@Co...",,,150000.00,,,,,,,,,"{'@Code': 'ProgAutoNum', '@V': '000040'}",,,,000040,,,,166000.00,
62,787082,M57-M57,2021,,44,2313,,PPMS,20,,False,R,0.00,0.0,0.00,0.00,False,0.00,0.00,0.00,23,,"{'@Code': 'APVote', '@V': '121717.11'}",,,,,,,,,,,,"{'@Code': 'ProgAutoNum', '@V': '000034'}",,,,000034,,,,121717.11,
40,783586,M61-M61,2021,,2021001,231312,,CONSTRUCTION DE 5 CPI TYPES,,,,D,750000.00,,-400000.00,-400000.00,False,,,,2021001,,"[{'@Code': 'APVote', '@V': '-400000.00'}, {'@C...",,,750000.00,,,,,,,,,"[{'@Code': 'ProgAutoNum', '@V': '43'}, {'@Code...",1,,CONSTRUCTION DE 5 CPI TYPES,43,,,,-400000.00,


In [11]:
f719853 = '../../fichiers20/todo_xml_20/760357.xml.gz'
parse_fichier(f719853)

{'DocumentBudgetaire': {'@xsi:schemaLocation': 'http://www.minefi.gouv.fr/cp/demat/docbudgetaire Actes_budgetaires___Schema_Annexes_Bull_V15\\DocumentBudgetaire.xsd',
  '@xmlns': 'http://www.minefi.gouv.fr/cp/demat/docbudgetaire',
  '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
  'VersionSchema': {'@V': '97'},
  'BlocEditeur': {'CodeEditeur': {'@V': 'CGI'}},
  'VersionOutil': [{'@outil': 'DSC_COMMON', '@version': '27'},
   {'@outil': 'REMAT', '@version': '3_2021'}],
  'Scellement': {'@md5': 'd5a4cda10294a7e64ae7424322937871',
   '@sha1': '8fed6ac32c38a767ba8a5c7e7ddfc27f9f50e05e',
   '@date': '2021-05-26T09:02:18'},
  'EnTeteDocBudgetaire': {'DteStr': {'@V': '2021-05-26'},
   'LibellePoste': {'@V': 'La Paierie Départementale'},
   'IdPost': {'@V': '041090'},
   'LibelleColl': {'@V': 'Val de Loire Numérique'},
   'IdColl': {'@V': '20004605000023'},
   'NatCEPL': {'@V': 'Etablissements Publics'}},
  'Budget': {'EnTeteBudget': {'LibelleEtab': {'@V': 'Val de Loire Très Haut Dé