In [None]:
import requests
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType

# Module-level Spark session; OpenAPIMetadataGenerator.create_dataframe() uses
# it to build the metadata DataFrame. The builder previously chained
# .appName() twice ("TestSparkApp" then 'MDESparkApp'); only the last call
# takes effect, so the dead first call has been dropped.
spark = SparkSession.builder.appName('MDESparkApp').getOrCreate()

class OpenAPIMetadataGenerator:
    """
    Generate a metadata file (CSV) from an Open API (Swagger) document.

    The Swagger document is downloaded while the instance is constructed.
    Call ``create_metadata()`` to build the rows and then ``write_metadata()``
    to write them out through the module-level ``spark`` session.

    Attributes:
    -----------
        dest_path: Output path to write metadata file.
        swagger_url: URL of the source API endpoint.
        swagger_json: Parsed Swagger document fetched from ``swagger_url``.
        metadata_headers: Column names of the metadata file, in output order.
        metadata: List of row dicts (one per table column), filled by
            ``create_metadata()``.
        definitions: Mapping ``table name -> {column name -> property schema}``,
            filled by ``create_definitions()``.
    """

    # Seconds to wait for the Swagger endpoint before giving up; without a
    # timeout ``requests.get`` can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    def __init__(self, dest_path, swagger_url="https://api.edgraph.dev/edfi/v5.3/saas/core/metadata/data/v3/123/2022/resources/swagger.json"):
        """
        Store the configuration and download the Swagger document.

        Parameters:
        -----------
            dest_path: Output path to write metadata file.
            swagger_url: URL of the Swagger JSON document to read.
        """
        self.swagger_url = swagger_url
        self.dest_path = dest_path
        self.swagger_json = self.get_swagger_content()
        self.metadata_headers = ['table_name', 'column_name', 'type', 'format', 'maxLength', 'x-Ed-Fi-isIdentity', 'items', '$ref']
        self.metadata = []
        self.definitions = {}

    def get_swagger_content(self):
        """
        Download ``swagger_url`` and return the parsed JSON document.

        Raises:
        -------
            requests.exceptions.RequestException: on connection failure,
                timeout, or a non-success HTTP status.
            json.JSONDecodeError: if the response body is not valid JSON.
        """
        response = requests.get(self.swagger_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return json.loads(response.text)

    def create_dataframe(self, schema):
        """Return a Spark DataFrame of ``self.metadata`` using ``schema``."""
        return spark.createDataFrame(self.metadata, schema)

    def create_definitions(self):
        """
        Fill ``self.definitions`` from the ``definitions`` section of the
        Swagger document.

        The table name is the part of the definition name after the last
        underscore (``edFi_student`` -> ``student``). Each property schema is
        copied without its ``description`` key, so the downloaded document is
        left untouched. A definition without ``properties`` yields an empty
        table schema.
        """
        for entity, entity_schema in self.swagger_json['definitions'].items():
            table_name = entity.split('_')[-1]
            # Not every definition declares 'properties'.
            properties = entity_schema.get('properties', {})
            self.definitions[table_name] = {
                prop: {
                    key: value
                    for key, value in field_info.items()
                    if key != 'description'
                }
                for prop, field_info in properties.items()
            }

    def create_metadata(self):
        """
        Rebuild ``self.metadata`` with one row dict per table column.

        Every row has exactly the keys in ``self.metadata_headers``; keys that
        a property does not define are set to ``None``. The list is rebuilt
        from scratch on each call, so calling this method repeatedly does not
        duplicate rows.
        """
        self.create_definitions()

        rows = []
        for entity, table_schema in self.definitions.items():
            for column_name, field_info in table_schema.items():
                column_schema = {
                    header: field_info.get(header)
                    for header in self.metadata_headers
                }

                # Anything other than an explicit True is reported as False.
                column_schema['x-Ed-Fi-isIdentity'] = column_schema['x-Ed-Fi-isIdentity'] is True

                # Arrays of primitives have 'items' without a '$ref'; only
                # rewrite the reference when there is one, and do it on a copy
                # so the dict shared with self.definitions is not modified.
                # NOTE(review): unlike the top-level '$ref' below, the
                # 'tpdm_' prefix is not stripped here - confirm whether that
                # is intentional.
                items = column_schema['items']
                if isinstance(items, dict) and '$ref' in items:
                    column_schema['items'] = {
                        **items,
                        '$ref': items['$ref'].replace('edFi_', ''),
                    }

                if column_schema['$ref'] is not None:
                    column_schema['$ref'] = column_schema['$ref'].replace('edFi_', '').replace('tpdm_', '')

                column_schema['table_name'] = entity
                column_schema['column_name'] = column_name
                rows.append(column_schema)

        self.metadata = rows

    def write_metadata(self):
        """
        Write ``self.metadata`` as CSV (with a header row) to ``dest_path``.

        All columns are declared as nullable strings. The DataFrame is
        coalesced to a single partition before writing, and any existing
        output at ``dest_path`` is overwritten. ``create_metadata()`` must be
        called first; this method does not build the rows itself.
        """
        schema = StructType([StructField(field, StringType(), True) for field in self.metadata_headers])
        df = self.create_dataframe(schema)
        df.coalesce(1).write.csv(f"{self.dest_path}", header=True, mode='overwrite')
