In [30]:
import json
import logging
import re
import os

from bs4 import BeautifulSoup
from markdown import markdown

# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)

# Reference link kept as a bare string literal rather than a `#` comment; as the
# last expression in the cell, its value is echoed as the cell's output.
"""
https://github.com/slidoapp/dbt-superset-lineage/blob/main/dbt_superset_lineage/push_descriptions.py
"""

'\nhttps://github.com/slidoapp/dbt-superset-lineage/blob/main/dbt_superset_lineage/push_descriptions.py\n'

In [31]:
# Path to the dbt manifest that is parsed below. Set the DBT_MANIFEST_FILE
# environment variable to point at another project; the fallback is the
# original local path so existing runs behave exactly as before.
dbt_manifest_file = os.environ.get(
    "DBT_MANIFEST_FILE",
    "/Users/martin/Dropbox/Development/keboola_dbt_beer_demo_datafold/target/manifest.json",
)
# Database whose tables are kept when reading the manifest. Override with the
# DBT_DB_NAME environment variable; defaults to the original value.
dbt_db_name = os.environ.get("DBT_DB_NAME", 'KEBOOLA_11273')

In [32]:
def get_tables_from_dbt(dbt_manifest, dbt_db_name):
    """Collect the column metadata of every table listed in a dbt manifest.

    Both the ``nodes`` and ``sources`` sections of the manifest are scanned.
    Entries are keyed by ``"<schema>.<name>"``; when ``dbt_db_name`` is not
    ``None`` only entries whose ``database`` equals it are kept.

    Args:
        dbt_manifest: Parsed manifest dict with ``nodes`` and ``sources``
            mappings whose values carry ``name``, ``schema``, ``database``
            and ``columns``.
        dbt_db_name: Database to restrict the result to, or ``None`` to
            accept entries from every database.

    Returns:
        dict: ``{"<schema>.<name>": {"columns": <columns>}}``.

    Raises:
        AssertionError: If two kept entries share the same schema and name,
            or if no entry was kept at all.
        KeyError: If a section or a required entry field is missing.
    """
    collected = {}
    accept_any_db = dbt_db_name is None

    for section in ('nodes', 'sources'):
        for entry in dbt_manifest[section].values():
            # Read every field up front so malformed entries fail loudly even
            # when they belong to a database that would be filtered out.
            name, schema, database = entry['name'], entry['schema'], entry['database']
            short_key = schema + '.' + name
            columns = entry['columns']

            if not (accept_any_db or database == dbt_db_name):
                continue

            # The short key must be unique, otherwise matching is ambiguous.
            assert short_key not in collected, (
                f"Table {short_key} is a duplicate name (schema + table) "
                "across databases. "
                "This would result in incorrect matching between Superset and dbt. "
                "To fix this, remove duplicates or add the ``dbt_db_name`` argument."
            )
            collected[short_key] = {'columns': columns}

    assert collected, "Manifest is empty!"
    return collected


def convert_markdown_to_plain_text(md_string):
    """Convert a markdown string to a single line of plain text.

    The markdown is rendered to HTML, code snippets are dropped, the
    remaining text nodes are extracted with BeautifulSoup and every run of
    whitespace is collapsed to one space. Approach based on:
    https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe

    Args:
        md_string: Markdown source text. ``None`` or an empty string
            yields ``''``.

    Returns:
        str: The single-line plain-text rendering of ``md_string``.
    """
    # Nothing to render; this also avoids handing None to markdown().
    if not md_string:
        return ''

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(md_string)

    # remove code snippets; DOTALL is required because rendered code blocks
    # contain newlines, which '.' does not match by default
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

    # extract text (find_all/string are the current names for findAll/text)
    soup = BeautifulSoup(html, 'html.parser')
    text = ''.join(soup.find_all(string=True))

    # make one line
    single_line = re.sub(r'\s+', ' ', text)

    # make fixes: plain literal substitutions, no regex needed
    single_line = single_line.replace('→', '->')
    single_line = single_line.replace('<null>', '"null"')

    return single_line

In [33]:
# Read the dbt manifest. It is a JSON document, so decode it as UTF-8
# explicitly instead of relying on the platform's default text encoding.
with open(dbt_manifest_file, encoding="utf-8") as f:
    dbt_manifest = json.load(f)

# Map "<schema>.<table>" -> {'columns': ...} for the configured database,
# then display the result as the cell output.
names = get_tables_from_dbt(dbt_manifest, dbt_db_name)
names


{'WORKSPACE_28691590.beers_with_breweries': {'columns': {'brewery_id': {'name': 'brewery_id',
    'description': 'The unique identifier for the brewery',
    'meta': {'primary-key': True,
     'dimension': {'sql': '${TABLE}.brewery_id'},
     'meta': None,
     'metrics': {'num_unique_breweries': {'type': 'count_distinct'}}},
    'data_type': None,
    'quote': None,
    'tags': []},
   'beer_id': {'name': 'beer_id',
    'description': 'The unique identifier for the beer',
    'meta': {'primary-key': True, 'dimension': {'sql': '${TABLE}.beer_id'}},
    'data_type': None,
    'quote': None,
    'tags': []},
   'beer_name': {'name': 'beer_name',
    'description': 'The name of the beer',
    'meta': {'primary-key': True},
    'data_type': None,
    'quote': None,
    'tags': []},
   'beer_style': {'name': 'beer_style',
    'description': 'Style of the beer (IPA, Porter, etc)',
    'meta': {'dimension': {'sql': '${TABLE}.beer_style'},
     'metrics': {'num_unique_beer_styles': {'type': 'c