In [18]:
import sys
import os
import re
import yaml
import json
import pandas as pd
import numpy as np
from pyparsing import *

# Sobre

Notebook experimental sobre extrair dados do powerbi via formato pbip e tmdl

# Referências

* [TMDL model no powerbi](https://powerbi.microsoft.com/en-us/blog/tmdl-in-power-bi-desktop-developer-mode-preview/#:~:text=Before%20giving%20it%20a%20try,semantic%20model%20using%20TMDL%20format%E2%80%9D.)

# Extrair dados do TMLD

In [20]:
def parse_tmdl(content):
    data = {}
    context = [data]  # Stack to hold the current context in the nested structure
    last_indent_level = 0

    # Pattern to capture the indentation level and content of each line
    line_re = re.compile(r"^(\s*)(\S.*)")

    for line in content.split('\n'):
        if not line.strip():
            continue  # Ignore empty lines

        indent, line_content = line_re.match(line).groups()
        indent_level = len(indent) // 4  # Assuming each indentation level is a tab or 4 spaces

        # Navigate up the context stack if the current indent level is less than the last
        if indent_level <= last_indent_level:
            context = context[:indent_level+1]

        if ':' in line_content:
            key, value = map(str.strip, line_content.split(':', 1))
            context[-1][key] = value
        else:
            key = line_content.strip()
            new_dict = {}
            context[-1][key] = new_dict
            context.append(new_dict)  # Push new dictionary onto the context stack for nested blocks

        last_indent_level = indent_level  # Update the last indent level

    return data

In [21]:
with open('pbip/PNP_Publicada_dev.SemanticModel/definition/model.tmdl', 
          'r', encoding='utf-8') as file:
    file_content = file.read()

parsed_data = parse_tmdl(file_content)
print(parsed_data)


{'model Model': {}, 'culture': 'pt-BR', 'defaultPowerBIDataSourceVersion': 'powerBI_V3', 'discourageImplicitMeasures': {}, 'sourceQueryCulture': 'pt-BR', 'dataAccessOptions': {}, 'fastCombine': {}, 'legacyRedirects': {}, 'returnErrorValuesAsNull': {}, 'queryGroup Parâmetros': {}, 'annotation PBI_QueryGroupOrder = 0': {}, "queryGroup 'Modelo PNP\\Dimensões'": {}, "queryGroup 'Modelo PNP\\Fatos'": {}, 'annotation PBI_QueryGroupOrder = 1': {}, "queryGroup 'Modelo Orçamentário'": {}, 'annotation PBI_QueryGroupOrder = 3': {}, "queryGroup 'Modelo Orçamentário\\Orçamento - Siop'": {}, '/// Grupo de Consultas do Modelo Orçamentário DSBR': {}, "queryGroup 'Modelo Orçamentário\\Orçamento TG'": {}, '/// Modelo de Dados Acadêmicos do Rede Revalide/PNP': {}, "queryGroup 'Modelo PNP'": {}, 'queryGroup Instagram': {}, 'annotation PBI_QueryGroupOrder = 4': {}, "queryGroup 'Modelo Gestão de Pessoas'": {}, 'annotation PBI_QueryGroupOrder = 2': {}, "queryGroup 'Modelo Gestão de Pessoas\\Fatos'": {}, "que

In [22]:
def tmdl_to_yaml(tmdl_content):
    parsed_data = parse_tmdl(tmdl_content)
    return yaml.dump(parsed_data, default_flow_style=False, sort_keys=False)

In [23]:
f = tmdl_to_yaml(file_content)

In [None]:

with open('parsed_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(parsed_data, json_file, ensure_ascii=False, indent=4)

In [None]:
def parse_tmdl(content):
    lines = content.split('\n')
    groups = []
    current_group = []
    
    for line in lines:
        # Check if the line is not indented (starts at the beginning of the line)

        if line.strip() and not line.startswith('\t') and not line.startswith('    '):
            if current_group:
                groups.append('\n'.join(current_group))
                current_group = []
            # Append the unindented line to start a new group
            current_group.append(line)
        elif line.strip():
            current_group.append(line)
    
    # Add the last group if any
    if current_group:
        groups.append('\n'.join(current_group))
    
    return groups

with open('pbip/PNP_Publicada_dev.SemanticModel/definition/model.tmdl', 
# with open('pbip/PNP_Publicada_dev.SemanticModel/definition/tables/dimCurso.tmdl', 
          'r', encoding='utf-8') as file:
    file_content = file.read()

parsed_data = parse_tmdl(file_content)

In [36]:
parsed_data

['/// @description A tabela dimCurso possui informações sobre os cursos oferecidos por uma instituição de ensino.',
 '/// @author dsbr',
 '/// @version 1',
 '/// @date 03/11/2022\t',
 'table dimCurso\n\tlineageTag: e6e8bb82-8093-4629-9276-4f9d480108e8\n\t/// @description Corresponde à matrícula ponderada pelos fatores de: - Fator de Equiparação de carga horária; - Fator de Esforço de curso.\n\t/// @author dsbr\n\t/// @version 1\n\t/// @date 03/11/2022\n\tmeasure \'Matrícula Equivalente\' = ```\n\t\t\t/*@description Corresponde à matrícula ponderada pelos fatores de: - Fator de Equiparação de carga horária; - Fator de Esforço de curso.\n\t\t\t@author dsbr\n\t\t\t@version 1\n\t\t\t@date 03/11/2022*/\n\t\t\tCALCULATE\n\t\t\t([Matrícula Equivalente Apresentada Geral], dimFinanciamento[nomeFonteFinanciamentoCorrigido] IN {"Sem Programa Associado", "MedioTec"})\n\t\t\t```\n\t\tformatString: 0.000\n\t\tlineageTag: d67dd037-af45-4c8c-b957-a21e69aedb2b\n\t\tchangedProperty = FormatString\n\t///