In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Schemas for the three metadata tables. Every table has a required integer
# "id"; the foreign-key style columns (source_id, entity_id) are required too,
# while all descriptive string columns are nullable.

# Schema for the source table
source_schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("type", StringType(), nullable=True),
    StructField("description", StringType(), nullable=True),
])

# Schema for the entity table; source_id refers to a source row
entity_schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("source_id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("processing_type", StringType(), nullable=True),
    StructField("description", StringType(), nullable=True),
])

# Schema for the entity parameters table; entity_id refers to an entity row
entity_parameters_schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("entity_id", IntegerType(), nullable=False),
    StructField("parameter_name", StringType(), nullable=True),
    StructField("parameter_value", StringType(), nullable=True),
    StructField("description", StringType(), nullable=True),
])


# Helper function to build a name -> id lookup from a DataFrame
def get_id_dict(df):
    """Return a dict mapping each row's "name" to its "id".

    Args:
        df: DataFrame exposing "id" and "name" columns. All rows are
            collected to the driver.

    Returns:
        Dict keyed by name. If a name occurs more than once, the id of the
        last collected row wins.
    """
    name_to_id = {}
    for record in df.select("id", "name").collect():
        name_to_id[record['name']] = record['id']
    return name_to_id


# Source definitions as (name, type, description) tuples
source_types = [
    ("FakerAPI", "api", "API source for Faker data generation"),
    ("sql_f1_data", "sql", "f1 data mart"),
]

# Prefix each definition with a 1-based id to form the table rows
source_data = [
    (source_id, *source_fields)
    for source_id, source_fields in enumerate(source_types, start=1)
]
# Build the source DataFrame and save it as lh_metadata.source in overwrite mode
df_source = spark.createDataFrame(source_data, schema=source_schema)
(
    df_source.write
    .mode("overwrite")
    .saveAsTable("lh_metadata.source")
)

# Name -> id lookup for the sources, built from the DataFrame just written
source_id_dict = get_id_dict(df_source)


# Function to turn entity definitions into table rows with enumerated IDs
def create_entity_data(entities, source_id_dict):
    """Build the entity rows from the entity definitions.

    Args:
        entities: Iterable of dicts with at least the keys 'source', 'name'
            and 'processing_type'.
        source_id_dict: Mapping from source name to source id.

    Returns:
        List of (id, source_id, name, processing_type, description) tuples,
        with ids numbered from 1 in input order and the description rendered
        as "<name> from <source>".

    Raises:
        KeyError: If a definition lacks a required key or names a source that
            is not in source_id_dict.
    """
    return [
        (
            position,
            source_id_dict[spec['source']],
            spec['name'],
            spec['processing_type'],
            f"{spec['name']} from {spec['source']}",
        )
        for position, spec in enumerate(entities, start=1)
    ]


# Entity names grouped by source and processing type
names_merge = ['persons', 'products', 'users']
names_rebuild = ['addresses', 'companies', 'books']
names_merge_sql = ['fact_constructor_statistics', 'fact_driver_statistics']
names_rebuild_sql = ['dim_constructors', 'dim_drivers', 'dim_races', 'dim_status']


def _entity_specs(source, processing_type, sink_file_format, names, with_api_query=False):
    """Return one entity definition dict per name for the given source settings."""
    specs = []
    for name in names:
        spec = {
            'source': source,
            'name': name,
            'processing_type': processing_type,
            'sink_file_folder': 'landing_zone',
            'sink_file_format': sink_file_format,
        }
        if with_api_query:
            # Only the API entities carry a query string
            spec['api_query'] = f'{name}?_quantity=1000&_locale=en_EN'
        specs.append(spec)
    return specs


# Full entities list: FakerAPI merge, FakerAPI rebuild, SQL merge, SQL rebuild
entities = (
    _entity_specs('FakerAPI', 'merge', 'json', names_merge, with_api_query=True)
    + _entity_specs('FakerAPI', 'rebuild', 'json', names_rebuild, with_api_query=True)
    + _entity_specs('sql_f1_data', 'merge', 'parquet', names_merge_sql)
    + _entity_specs('sql_f1_data', 'rebuild', 'parquet', names_rebuild_sql)
)

# Turn the entity definitions into rows and save them as lh_metadata.entity
# in overwrite mode
entity_data = create_entity_data(entities, source_id_dict)
df_entity = spark.createDataFrame(entity_data, schema=entity_schema)
(
    df_entity.write
    .mode("overwrite")
    .saveAsTable("lh_metadata.entity")
)


# Helper to derive the key_columns parameter value for a non-API entity
def _key_columns_value(entity_name):
    """Return the key_columns value '["<singular>_key"]' for an entity name.

    The name is lowercased and reduced to a singular base: "dim_status" is
    kept as is, other "dim_*" names lose one trailing "s", and "fact_*" names
    lose "_statistics". Any other name is used unchanged.
    NOTE(review): presumably this mirrors how the key columns are named in
    the source data mart - confirm against the actual tables.
    """
    name = entity_name.lower()
    if name == "dim_status":
        # Ends in "s" but is already singular, so it must not be trimmed
        singular_name = name
    elif name.startswith("dim_"):
        singular_name = name[:-1] if name.endswith("s") else name
    elif name.startswith("fact_"):
        singular_name = name.replace("_statistics", "")
    else:
        singular_name = name
    return f'["{singular_name}_key"]'


# Function to create entity parameters data dynamically with enumerated IDs
def create_entity_parameters_data(entities, df_entity):
    """Build the rows for the entity parameters table.

    Every entity contributes five parameter rows. Row ids are numbered
    consecutively from 1 across all entities, in input order.

    Args:
        entities: Iterable of entity dicts. Each needs 'name', 'source',
            'sink_file_format' and 'sink_file_folder'; entities whose source
            is 'FakerAPI' also need 'api_query'.
        df_entity: DataFrame with "id" and "name" columns, used to resolve
            each entity name to its id.

    Returns:
        List of (id, entity_id, parameter_name, parameter_value, description)
        tuples; description is always ''. entity_id is None when the entity
        name is not present in df_entity.

    Raises:
        KeyError: If an entity dict lacks a required key.
    """
    # Resolve all ids with a single collect rather than one filter/first job
    # per entity. setdefault keeps the first collected row for a repeated name.
    entity_ids = {}
    for row in df_entity.select("id", "name").collect():
        entity_ids.setdefault(row['name'], row['id'])

    entity_parameters_data = []
    for entity in entities:
        entity_id = entity_ids.get(entity['name'])
        is_api_entity = entity['source'] == 'FakerAPI'

        # Parameters shared by every entity, regardless of source
        parameters = [
            ('sink_file_format', entity['sink_file_format']),
            ('sink_file_folder', entity['sink_file_folder']),
        ]
        if is_api_entity:
            parameters += [
                ('api_query', entity['api_query']),
                ('column_to_explode', 'data'),
                ('key_columns', '["id"]'),
            ]
        else:
            parameters += [
                ('key_columns', _key_columns_value(entity['name'])),
                ('source_schema', 't_dm'),
                ('target_schema', 'dbo'),
            ]

        for parameter_name, parameter_value in parameters:
            param_id = len(entity_parameters_data) + 1
            entity_parameters_data.append(
                (param_id, entity_id, parameter_name, parameter_value, ''))
    return entity_parameters_data


# Build the parameter rows for all entities and save them as
# lh_metadata.entity_parameters in overwrite mode
entity_parameters_data = create_entity_parameters_data(entities, df_entity)
df_entity_parameters = spark.createDataFrame(
    entity_parameters_data,
    entity_parameters_schema,
)
(
    df_entity_parameters.write
    .mode("overwrite")
    .saveAsTable("lh_metadata.entity_parameters")
)

In [None]:
# Session settings applied before displaying the metadata DataFrames
for conf_key, conf_value in (
    ("spark.sql.debug.maxToStringFields", 1000),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.sql.repl.eagerEval.enabled", "true"),
    ("spark.sql.execution.arrow.pyspark.fallback.enabled", "true"),
):
    spark.conf.set(conf_key, conf_value)

# Show up to 1000 rows of each DataFrame without truncating cell values
for frame in (df_entity_parameters, df_source, df_entity):
    frame.show(truncate=False, n=1000)
