In [1]:
# Download the V113 budget-document schema archive into the working directory.
# The version in the URL must match VERSION_SCHEMA used by the extraction cell.
!wget http://odm-budgetaire.org/composants/schemas/schema_doc_budg_V113.zip


--2024-02-19 22:01:55--  http://odm-budgetaire.org/composants/schemas/schema_doc_budg_V113.zip
Connecting to 192.168.112.24:8888... connected.
Proxy request sent, awaiting response... 200 OK
Length: 101783 (99K) [application/zip]
Saving to: ‘schema_doc_budg_V113.zip’


2024-02-19 22:01:55 (378 MB/s) - ‘schema_doc_budg_V113.zip’ saved [101783/101783]



In [2]:
import zipfile
import os 

# Schema release to process; it determines both the archive name and the
# directory the archive is extracted into.
VERSION_SCHEMA = "V113"

path_to_zip = f"./schema_doc_budg_{VERSION_SCHEMA}.zip"
schema_directory = f'./schema_doc_budg/{VERSION_SCHEMA}'

with zipfile.ZipFile(path_to_zip, 'r') as zip_ref:
    # exist_ok=True keeps the cell re-runnable without swallowing
    # FileExistsError by hand.
    os.makedirs(schema_directory, exist_ok=True)
    zip_ref.extractall(schema_directory)

# The archive is no longer needed once its content has been extracted.
os.remove(path_to_zip)


# Début du traitement

In [3]:
import xmltodict
import pandas as pd

# Schema version interpolated into every XSD path built by the functions below
# (same value as the one assigned in the extraction cell).
VERSION_SCHEMA = "V113"

In [4]:
def create_dict_from_xml(chemin_fichier: str) -> dict:
    """
    Parse an XML file into nested plain ``dict`` objects.

    Args:
        chemin_fichier: Path of the XML file; it is read as UTF-8 text.

    Returns:
        The value returned by ``xmltodict.parse`` when called with ``dict``
        as ``dict_constructor``.
    """
    with open(chemin_fichier, encoding='utf8') as xml_file:
        xml_content = xml_file.read()
    return xmltodict.parse(xml_content, dict_constructor=dict)

#### Parsing des différentes annexes existantes pour générer un dictionnaire des données

Types de données présents dans les documents budgétaires (budgets et annexes)

In [5]:
def _ensure_list(node) -> list:
    """Normalize a parsed XML value (missing, single node, or list of nodes) to a list."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    # xmltodict returns a bare node instead of a one-item list for a single child.
    return [node]


def _dict_complex_type(annexe_type : dict, isTypeBudget=False) -> pd.DataFrame:
    """
    Build a table of the enumerated values declared by each complex type of a schema.

    Args:
        annexe_type: Parsed XSD document; its ``xs:schema`` / ``xs:complexType``
            entries are read. The argument is left unmodified.
        isTypeBudget: When True, the complex type at position 3 is skipped.
            According to the original note this is a patch for the
            TNomenclature documentation, which is a dictionary without being
            a complex type.

    Returns:
        DataFrame with one row per retained complex type and two columns:
        ``type`` (the ``@name`` of the complex type) and ``enum`` (dict mapping
        every enumeration ``@value`` to its ``xs:documentation``, or to the
        value itself when it has none). ``enum`` is an empty dict when
        ``xs:attribute`` is missing, is a list of attributes, or carries no
        inline enumeration.
    """
    all_complex_type = _ensure_list(annexe_type['xs:schema'].get('xs:complexType'))
    if isTypeBudget:
        # Filter into a new list rather than pop(3): the caller's parsed
        # schema must not be altered by this function.
        all_complex_type = [
            complex_type
            for position, complex_type in enumerate(all_complex_type)
            if position != 3
        ]

    list_records = []
    for complex_type in all_complex_type:
        enum_documentation = {}
        attribute = complex_type.get('xs:attribute')
        # Only a single attribute (parsed as a dict) is inspected.
        if isinstance(attribute, dict):
            simple_type = attribute.get('xs:simpleType') or {}
            restriction = simple_type.get('xs:restriction') or {}
            for enumeration in _ensure_list(restriction.get('xs:enumeration')):
                value = enumeration['@value']
                annotation = enumeration.get('xs:annotation') or {}
                # Undocumented values are described by the value itself.
                enum_documentation[value] = annotation.get('xs:documentation', value)
        list_records.append({
            "type" : complex_type['@name'],
            "enum" : enum_documentation,
        })

    # Explicit columns keep the frame mergeable even when no type was found.
    return pd.DataFrame(list_records, columns=["type", "enum"])
    
def generate_complex_type_df(chemin : str) -> pd.DataFrame:
    """
    Load an XSD file and return the DataFrame of the complex types it declares.

    Covers the types present in the budget documents (annexes and budget);
    per the original note, similar work remains to be done for the CFU.

    Args:
        chemin: "/"-separated path of the XSD file to parse.

    Returns:
        The DataFrame produced by ``_dict_complex_type``; the budget-specific
        mode is enabled only when the file name is ``CommunBudget.xsd``.
    """
    parsed_schema = create_dict_from_xml(chemin)
    file_name = chemin.rsplit("/", 1)[-1]
    # Every file except CommunBudget.xsd goes through the default handling.
    if file_name != "CommunBudget.xsd":
        return _dict_complex_type(parsed_schema)
    return _dict_complex_type(parsed_schema, isTypeBudget=True)

#### Parsing des différentes annexes existantes pour générer un dictionnaire des données

In [6]:
def _read_documentation(element: dict) -> tuple:
    """Return the (libelle, description) pair documented on one schema element."""
    documentation = (element.get('xs:annotation') or {}).get('xs:documentation')
    if isinstance(documentation, list):
        # Several documentation nodes: only the first one is used.
        documentation = documentation[0] if documentation else None
    if isinstance(documentation, str):
        # Plain-text documentation serves as both label and description.
        return documentation, documentation
    if isinstance(documentation, dict):
        # Either key may be absent; a missing one yields None instead of a KeyError.
        return documentation.get('z:libelle'), documentation.get('z:description')
    return None, None


def _init_annexe_data_dictionnary(class_annexe : dict) -> pd.DataFrame:
    """
    Turn one parsed complex type into a table describing its fields.

    Args:
        class_annexe: Parsed ``xs:complexType`` node. Its ``@name`` and the
            ``xs:element`` children of its ``xs:sequence`` are read.

    Returns:
        DataFrame with one row per element and the columns ``nom_annexe``
        (the ``@name`` of the complex type without its first character),
        ``nom_champ`` (element ``@name``), ``type`` (element ``@type``, None
        when absent), ``libelle`` and ``description``. HTML markup found in
        ``description`` is flattened to plain text.
    """
    elements = class_annexe['xs:sequence']['xs:element']
    if isinstance(elements, dict):
        # A sequence holding a single element is parsed as a bare dict.
        elements = [elements]
    # Drops the first character of the type name
    # (presumably a one-letter prefix - TODO confirm against the schema).
    nom_annexe = class_annexe["@name"][1:]

    list_records = []
    for element in elements:
        libelle, description = _read_documentation(element)
        list_records.append({
            "nom_annexe" : nom_annexe,
            "nom_champ" : element["@name"],
            "type" : element.get("@type"),
            "libelle" : libelle,
            "description" : description,
        })

    df = pd.DataFrame(
        list_records,
        columns=["nom_annexe", "nom_champ", "type", "libelle", "description"],
    )

    # Ordered clean-up steps; the order matters because leading tags are
    # removed before the remaining list markup is converted to punctuation.
    cleanup_steps = (
        (r'^<[^<>]*>', ''),      # leading tag
        (r'^\s*<ul>', ''),       # list opening at the very start
        (r'^\s*<li>', ''),       # first list item at the very start
        (r'<ul>', ' : '),        # inner list opening
        (r'<li>', ' - '),        # inner list items
        (r'<[^<>]*>', ' '),      # any remaining tag
        (r'\s\s+', ' '),         # collapse runs of whitespace
    )
    description_column = df["description"]
    for pattern, replacement in cleanup_steps:
        description_column = description_column.str.replace(pattern, replacement, regex=True)
    df["description"] = description_column

    return df


def generate_annexe_data_document(chemin_annexe: str, complex_type_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the data dictionary of one annexe schema file.

    Args:
        chemin_annexe: "/"-separated path of the annexe XSD file.
        complex_type_df: Complex-type table that is left-merged onto the
            field table (on the columns both frames share).

    Returns:
        The field table of the selected complex type, enriched with the
        matching rows of ``complex_type_df``.
    """
    # Class_Signatures.xsd keeps the class to document at position 0;
    # every other file keeps it at position 1.
    is_signatures = chemin_annexe.rsplit("/", 1)[-1] == "Class_Signatures.xsd"
    class_position = 0 if is_signatures else 1
    complex_types = create_dict_from_xml(chemin_annexe)['xs:schema']['xs:complexType']
    annexe_fields = _init_annexe_data_dictionnary(complex_types[class_position])
    return annexe_fields.merge(complex_type_df, how='left')

In [7]:
def create_annexe_data_dictionnary() -> pd.DataFrame:
    """
    Build the data dictionary of every annexe listed in ``Class_Annexes.xsd``.

    The schema files are read from
    ``./schema_doc_budg/{VERSION_SCHEMA}/SchemaDocBudg``. The first
    ``xs:include`` entry (the budget) is skipped because it is handled
    separately.

    Returns:
        Concatenation of the per-annexe tables, last included annexe first
        (the order historically produced by this function). An empty
        DataFrame is returned when no annexe is listed.
    """
    schema_dir = f"./schema_doc_budg/{VERSION_SCHEMA}/SchemaDocBudg"

    # Complex types shared by the annexes.
    annexe_complexe_types = generate_complex_type_df(f"{schema_dir}/CommunAnnexe.xsd")

    # XSD file names of all annexes declared in the schema.
    includes = create_dict_from_xml(f"{schema_dir}/Class_Annexes.xsd")["xs:schema"]['xs:include']
    if isinstance(includes, dict):
        # A single include is parsed as a bare dict rather than a list.
        includes = [includes]
    # Slice instead of pop(0) so the parsed document is left untouched.
    annexe_includes = includes[1:]

    annexe_frames = [
        generate_annexe_data_document(
            f"{schema_dir}/{annexe['@schemaLocation']}", annexe_complexe_types
        )
        for annexe in annexe_includes
    ]
    if not annexe_frames:
        return pd.DataFrame()

    # One concat instead of growing a frame inside the loop; reversed to keep
    # the historical row order (each new annexe used to be prepended).
    return pd.concat(annexe_frames[::-1])

def create_budget_data_dictionnary() -> pd.DataFrame:
    """
    Build the data dictionary of the budget classes.

    The schema files are read from
    ``./schema_doc_budg/{VERSION_SCHEMA}/SchemaDocBudg``. Three classes are
    processed: ``Class_Budget.xsd``, ``Class_LigneBudget.xsd`` and
    ``Class_DocumentBudgetaire.xsd``. Each field table is left-merged with
    the complex types of ``CommunBudget.xsd``.

    Returns:
        Concatenation of the three tables, in the order listed above.
    """
    schema_dir = f"./schema_doc_budg/{VERSION_SCHEMA}/SchemaDocBudg"
    complex_type_df = generate_complex_type_df(f"{schema_dir}/CommunBudget.xsd")

    # (file name, position of the class inside xs:complexType). None means the
    # file's xs:complexType entry is used as is, without indexing.
    budget_classes = (
        ("Class_Budget.xsd", 1),
        ("Class_LigneBudget.xsd", None),
        ("Class_DocumentBudgetaire.xsd", 1),
    )

    budget_frames = []
    for file_name, class_position in budget_classes:
        complex_types = create_dict_from_xml(f"{schema_dir}/{file_name}")['xs:schema']['xs:complexType']
        if class_position is None:
            class_to_generate = complex_types
        else:
            class_to_generate = complex_types[class_position]
        class_fields = _init_annexe_data_dictionnary(class_to_generate)
        budget_frames.append(class_fields.merge(complex_type_df, how='left'))

    return pd.concat(budget_frames)
        

In [14]:
def main() -> pd.DataFrame:
    """
    Assemble the complete data dictionary.

    Returns:
        The annexe table followed by the budget table, concatenated into a
        single DataFrame.
    """
    frames = [create_annexe_data_dictionnary(), create_budget_data_dictionnary()]
    return pd.concat(frames)

#generate_annexe_data_document("./schema_doc_budg/V113/SchemaDocBudg/Class_Fonds_Europeens.xsd", annexe_complexe_types)

In [26]:
# Build the full data dictionary (annexes followed by budget classes).
df_final = main()

# Write it to dictionnaire_donnees.csv, leaving out the DataFrame index.
df_final.to_csv("dictionnaire_donnees.csv", index=False)