**Ed-Fi Module Resource Generation**

The Ed-Fi Module uses a number of resources, mainly data flows and the pipelines that invoke them. To keep these assets maintainable, we have developed this utility to generate the required resources.

    Pre-requisites: Customized Ed-Fi Metadata file (CSV)
    Please go through the Ed-Fi Schema in the Schemas section of OEA to understand the Ed-Fi Metadata file and how to customize it to your requirements.

After generating the customized Ed-Fi Metadata file, instantiate the EdFiResourceGenerator class and call its create_dataflows() method. This writes JSON files into two folders, dataflow and pipeline, under the output path. The files in both folders need to be committed and pushed to the corresponding folders in the repository that is connected to your Synapse workspace.

You should then be able to see the generated pipelines and data flows in your Synapse workspace. Publish them.

In [10]:
# Built-in data flow script template, used by EdFiResourceGenerator.create_resources
# when no templates_path is supplied. create_resources splits this text on newlines,
# strips tab characters, overwrites line index 1 (the `entity` parameter) and splices
# generated lines in before index 7 (the empty `output(` body) and before index 19
# (the empty `mapColumn(` body). Line positions are therefore significant: adding or
# removing lines above those points requires updating the indices used there.
script_lines = """parameters{
	entity as string ('students'),
	directory as string ('Latest'),
	schoolYear as string ('2017'),
	districtId as string ('255901')
}
source(output(
	),
	allowSchemaDrift: true,
	validateSchema: false,
	inferDriftedColumnTypes: true,
	ignoreNoFilesFound: false,
	format: 'json',
	fileSystem: 'stage1',
	folderPath: ("Landing/Transactional/{$directory}/v5.3/DistrictId={$districtId}/SchoolYear={$schoolYear}/{$entity}/Delta/"),
	documentForm: 'documentPerLine',
	mode: 'read') ~> SourceJSON
SelectColumns alterRow(upsertIf(true())) ~> AlterConditions
DerivedColumn select(mapColumn(
	),
	skipDuplicateMapInputs: true,
	skipDuplicateMapOutputs: true) ~> SelectColumns
SourceJSON derive(LastModifiedDate = currentUTC()) ~> DerivedColumn
AlterConditions sink(allowSchemaDrift: true,
	validateSchema: false,
	format: 'delta',
	fileSystem: 'stage2',
	folderPath: ("Standardized/Transactional/{$directory}/v5.3/General/DistrictId={$districtId}/SchoolYear={$schoolYear}/{$entity}"),
	mergeSchema: false,
	autoCompact: false,
	optimizedWrite: true,
	vacuum: 0,
	deletable:false,
	insertable:false,
	updateable:false,
	upsertable:true,
	keys:['id'],
	umask: 0022,
	preCommands: [],
	postCommands: [],
	skipDuplicateMapInputs: true,
	skipDuplicateMapOutputs: true,
	saveOrder: 1) ~> SinkDELTA"""

In [11]:
# Built-in skeleton of a mapping data flow definition, used by
# EdFiResourceGenerator.create_resources when no templates_path is supplied.
# For every entity, create_resources replaces the top-level "name" and fills in
# properties.typeProperties.scriptLines; the remaining values are emitted as-is.
dataflow_string = """{
	"name": "Base_Flow",
	"properties": {
		"folder": {
			"name": "EdFi/Generated"
		},
		"type": "MappingDataFlow",
		"typeProperties": {
			"sources": [
				{
					"linkedService": {
						"referenceName": "LS_datalake",
						"type": "LinkedServiceReference"
					},
					"name": "SourceJSON"
				}
			],
			"sinks": [
				{
					"linkedService": {
						"referenceName": "LS_datalake",
						"type": "LinkedServiceReference"
					},
					"name": "SinkDELTA"
				}
			],
			"transformations": [
				{
					"name": "AlterConditions"
				},
				{
					"name": "SelectColumns"
				},
				{
					"name": "DerivedColumn"
				}
			],
			"scriptLines": []
		}
	}
}"""

In [12]:
# Built-in pipeline template with two ExecuteDataFlow activities: index 0 runs the
# per-entity upsert data flow, index 1 runs the "Process deletes" data flow after
# the first one succeeds. For every entity, EdFiResourceGenerator.create_resources
# overwrites the pipeline name, the data flow reference names, the `entity` and
# `SourceTable` parameters, both checkpoint keys and the folder name, so the
# 'academicWeeks' / 'schools' values below are placeholders only.
pipeline_string = """{
    "name": "Ingest_academicWeeks",
    "properties": {
        "activities": [
            {
                "name": "Process Upserts to stage2",
                "type": "ExecuteDataFlow",
                "dependsOn": [],
                "policy": {
                    "timeout": "1.00:00:00",
                    "retry": 0,
                    "retryIntervalInSeconds": 30,
                    "secureOutput": false,
                    "secureInput": false
                },
                "userProperties": [],
                "typeProperties": {
                    "dataflow": {
                        "referenceName": "Upsert_academicWeeks",
                        "type": "DataFlowReference",
                        "parameters": {
                            "entity": "'academicWeeks'",
                            "directory": {
                                "value": "'@{pipeline().parameters.Directory}'",
                                "type": "Expression"
                            },
                            "schoolYear": {
                                "value": "'@{pipeline().parameters.SchoolYear}'",
                                "type": "Expression"
                            },
                            "districtId": {
                                "value": "'@{pipeline().parameters.DistrictId}'",
                                "type": "Expression"
                            }
                        }
                    },
                    "integrationRuntime": {
                        "referenceName": "IR-DataFlows",
                        "type": "IntegrationRuntimeReference"
                    },
                    "traceLevel": "Fine",
                    "continuationSettings": {
                        "customizedCheckpointKey": "c741d7b2-35a0-409f-8457-4a02ef4ab47c"
                    }
                }
            },
            {
                "name": "Process Deletes to stage2",
                "type": "ExecuteDataFlow",
                "dependsOn": [
                    {
                        "activity": "Process Upserts to stage2",
                        "dependencyConditions": [
                            "Succeeded"
                        ]
                    }
                ],
                "policy": {
                    "timeout": "1.00:00:00",
                    "retry": 0,
                    "retryIntervalInSeconds": 30,
                    "secureOutput": false,
                    "secureInput": false
                },
                "userProperties": [],
                "typeProperties": {
                    "dataflow": {
                        "referenceName": "Process deletes",
                        "type": "DataFlowReference",
                        "parameters": {
                            "SourceTable": "'schools'",
                            "directory": {
                                "value": "'@{pipeline().parameters.Directory}'",
                                "type": "Expression"
                            },
                            "schoolYear": {
                                "value": "'@{pipeline().parameters.SchoolYear}'",
                                "type": "Expression"
                            },
                            "districtId": {
                                "value": "'@{pipeline().parameters.DistrictId}'",
                                "type": "Expression"
                            },
                            "entity": "'schools'"
                        }
                    },
                    "integrationRuntime": {
                        "referenceName": "IR-DataFlows",
                        "type": "IntegrationRuntimeReference"
                    },
                    "traceLevel": "Fine",
                    "continuationSettings": {
                        "customizedCheckpointKey": "7377e987-1348-493f-a975-dff763bb3941"
                    }
                }
            }
        ],
        "parameters": {
            "Directory": {
                "type": "string",
                "defaultValue": "EdFi"
            },
            "SchoolYear": {
                "type": "string",
                "defaultValue": "2017"
            },
            "DistrictId": {
                "type": "string",
                "defaultValue": "255901"
            }
        },
        "folder": {
            "name": "EdFi/Single District Per Instance/Generated"
        },
        "annotations": []
    }
}"""

In [13]:
from asyncio.windows_utils import pipe
from pyspark.sql.types import *
import requests
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
import json
import uuid

# Module-level Spark session; EdFiResourceGenerator.__init__ uses it to read the
# metadata CSV. The session requests the Delta Lake package and sets the
# broadcast-join threshold to "-1".
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .appName('MDESparkApp')
    .getOrCreate()
)
class EdFiResourceGenerator():
    """
    Generates data flow and pipeline JSON definitions for Ed-Fi entities.

    The generator reads a metadata CSV (one row per table column), derives a
    data flow schema and a column mapping for every table, and then writes one
    data flow and one pipeline definition per top-level swagger endpoint.

    The class relies on module-level names defined alongside it: the ``spark``
    session and the ``script_lines`` / ``dataflow_string`` / ``pipeline_string``
    templates.

    Attributes:
    -----------
    metadata_path: path to the metadata file.
    out_path: Output path where we write all the dataflow JSON files.
    swagger_url: URL to the Open API Swagger endpoint.
    templates_path: Path to the Dataflow templates.
    metadata_values: rows of the metadata CSV, collected to the driver.
    tables: distinct table names found in the metadata.
    processed_tables: tables whose schema has already been generated.
    yet_to_process_tables: tables still waiting for schema generation.
    schemas: table name -> list of data flow schema lines.
    columns: table name -> list of column mapping lines.
    dependency_dict: table name -> names of the tables it references.
    primitive_tables: tables without object or array columns.
    """

    # Indices into the data flow script template lines at which generated
    # content is spliced in: the schema goes before line 7 (inside
    # `source(output(`) and the column mapping before line 19 (inside
    # `select(mapColumn(`).
    _SCHEMA_INSERT_INDEX = 7
    _COLUMNS_INSERT_INDEX = 19

    def __init__(self, metadata_path, out_path, swagger_url, templates_path=None):
        self.metadata_path = metadata_path
        self.out_path = out_path
        self.templates_path = templates_path
        self.swagger_url = swagger_url
        # The whole metadata file is collected; every later step iterates it in memory.
        self.metadata_values = spark.read.csv(self.metadata_path, header=True).collect()
        self.tables = list({x.table_name for x in self.metadata_values})
        self.processed_tables = []
        self.yet_to_process_tables = list(self.tables)
        self.schemas = {}
        self.columns = {}
        self.dependency_dict = {}
        self.primitive_tables = []

    def get_reference(self, row):
        """
        Return ``(referenced_table, flattened_fields)`` for a metadata row.

        For an ``array`` row the reference is the last ``/`` segment of
        ``row.items`` with its final character removed. For a row with a
        ``$ref`` value, an optional ``::field1:field2`` suffix lists the fields
        to flatten and the reference is the last ``/`` segment of the remainder.
        Rows that are neither return ``(None, None)``.
        """
        flattened_fields = []
        if row.type == 'array':
            # NOTE(review): the trailing character dropped here is presumably a
            # closing delimiter of the serialized `items` value - confirm
            # against the metadata file.
            reference = row.items.split('/')[-1][:-1]
        elif row['$ref'] is not None:
            ref_parts = row['$ref'].split('::')
            reference = ref_parts[0]
            if len(ref_parts) > 1:
                flattened_fields = ref_parts[1].split(':')
        else:
            return None, None
        reference = reference.split('/')[-1]
        return reference, flattened_fields

    def get_data_type(self, dtype, format):
        """
        Map a metadata type/format pair to the type name used in the schema.

        ``string`` becomes ``date`` or ``timestamp`` for the ``date`` and
        ``date-time`` formats, ``number`` becomes ``float``; any other type is
        returned unchanged.
        """
        if dtype == 'string':
            if format == 'date':
                return 'date'
            if format == 'date-time':
                return 'timestamp'
            return 'string'
        if dtype == 'number':
            return 'float'
        return dtype

    def get_schema(self, dtype, schema_list):
        """
        Collapse the schema lines of a referenced table into one nested type.

        The lines are joined with commas and wrapped in parentheses; ``[]`` is
        appended when ``dtype`` is ``'array'``. The result always ends with a
        comma so it can be used directly as a schema line. An empty
        ``schema_list`` yields ``'(),'`` (or ``'()[],'``).
        """
        # Each line carries at most one separator comma of its own; drop it so
        # that join() is the only source of separators.
        parts = [line[:-1] if line.endswith(',') else line for line in schema_list]
        schema = '(' + ','.join(parts) + ')'
        if dtype == 'array':
            schema += '[]'
        return schema + ','

    def create_primitive_schemas(self):
        """
        Build schemas and column lists for tables that have only scalar columns.

        Such tables are recorded in ``primitive_tables`` and moved from
        ``yet_to_process_tables`` to ``processed_tables``.
        """
        print("Creating Primitive Schemas")
        for table in self.tables:
            table_values = [x for x in self.metadata_values if x.table_name == table]
            is_primitive = all(
                x['$ref'] is None and x.type != 'array' for x in table_values
            )
            if not is_primitive:
                continue
            # Every entry ends with a comma except the last one.
            table_schema = [
                f'{row.column_name} as {self.get_data_type(row.type, row.format)},'
                for row in table_values
            ]
            table_schema[-1] = table_schema[-1][:-1]
            self.schemas[table] = table_schema
            # NOTE(review): unlike create_complex_schemas, no 'LastModifiedDate'
            # entry is appended to the column list here - confirm this is intended.
            col_list = [row.column_name + ',' for row in table_values]
            col_list[-1] = col_list[-1][:-1]
            self.columns[table] = col_list
            self.primitive_tables.append(table)
            self.processed_tables.append(table)
            self.yet_to_process_tables.remove(table)
        print("Completed creating primitive schemas")

    def create_dependency_dict(self):
        """
        Record, per table, the distinct tables referenced by its array and
        ``$ref`` columns (in first-seen order).
        """
        for row in self.metadata_values:
            if row.type != 'array' and row['$ref'] is None:
                continue
            reference, _ = self.get_reference(row)
            references = self.dependency_dict.setdefault(row.table_name, [])
            if reference not in references:
                references.append(reference)

    def create_complex_schemas(self):
        """
        Build schemas and column lists for all tables that reference other tables.

        A table is processed once every table it depends on has been processed,
        so the pending list is swept repeatedly until it is empty.

        Raises:
            ValueError: if a full sweep processes nothing, i.e. the remaining
                tables depend on each other, on themselves, or on a table that
                is missing from the metadata.
        """
        while self.yet_to_process_tables:
            made_progress = False
            # Iterate over a snapshot: entries are removed from the live list below.
            for entity in list(self.yet_to_process_tables):
                dependencies = self.dependency_dict[entity]
                if any(dep not in self.processed_tables for dep in dependencies):
                    continue
                table_rows = [x for x in self.metadata_values if x.table_name == entity]
                spark_schema = []
                entity_columns = []
                for row in table_rows:
                    reference, flatten_fields = self.get_reference(row)
                    if row.type == 'array':
                        # Handle Array Objects
                        column_schema = self.get_schema('array', self.schemas[reference])
                    elif row['$ref'] is not None:
                        # Handle Normal Objects
                        column_schema = self.get_schema('object', self.schemas[reference])
                    else:
                        # Primitive Data type
                        column_schema = self.get_data_type(row.type, row.format) + ','
                    if flatten_fields:
                        # Flattened references are mapped field by field.
                        entity_columns += [
                            f"{x} = {row.column_name}.{x}," for x in flatten_fields
                        ]
                    else:
                        entity_columns.append(row.column_name + ',')
                    spark_schema.append(f"{row.column_name} as {column_schema}")
                # 'LastModifiedDate' closes the list, so every preceding entry
                # keeps its trailing comma.
                entity_columns.append('LastModifiedDate')
                spark_schema[-1] = spark_schema[-1][:-1]
                self.columns[entity] = entity_columns
                self.schemas[entity] = spark_schema
                self.processed_tables.append(entity)
                self.yet_to_process_tables.remove(entity)
                made_progress = True
            if not made_progress:
                raise ValueError(
                    "Cannot resolve schema dependencies for tables: "
                    + ", ".join(sorted(self.yet_to_process_tables))
                )

    def create_resources(self):
        """
        Write one data flow and one pipeline JSON file per swagger endpoint.

        Only endpoints whose path contains exactly two ``/`` are considered.
        Files are written to ``<out_path>/dataflow`` and ``<out_path>/pipeline``;
        both folders are created when missing.

        Raises:
            requests.HTTPError: if the swagger endpoint returns an error status.
        """
        # Imported locally because the module does not import `os` at file level.
        import os

        if self.templates_path is not None:
            with open(f'{self.templates_path}/Scriptlines.txt') as template_file:
                lines = [x.replace('\t', '').replace('\n', '') for x in template_file]
            with open(f"{self.templates_path}/Dataflow.json") as template_file:
                dataflow_json = json.load(template_file)
        else:
            lines = [x.replace('\t', '') for x in script_lines.split('\n')]
            dataflow_json = json.loads(dataflow_string)
        # No pipeline template is read from templates_path, so the built-in one
        # is used in both cases (previously the templates branch left
        # pipeline_json undefined and failed with NameError).
        pipeline_json = json.loads(pipeline_string)

        response = requests.get(self.swagger_url)
        # Fail with the HTTP error instead of a confusing JSON parse error.
        response.raise_for_status()
        swagger_json = json.loads(response.text)

        dataflow_dir = f"{self.out_path}/dataflow"
        pipeline_dir = f"{self.out_path}/pipeline"
        os.makedirs(dataflow_dir, exist_ok=True)
        os.makedirs(pipeline_dir, exist_ok=True)

        schema_at = self._SCHEMA_INSERT_INDEX
        columns_at = self._COLUMNS_INSERT_INDEX
        upsert_activity, delete_activity = pipeline_json['properties']['activities'][:2]

        for endpoint, endpoint_spec in swagger_json['paths'].items():
            if endpoint.count('/') != 2:
                continue
            response_schema = endpoint_spec['get']['responses']['200']['schema']
            # The table name is whatever follows the last '_' of the definition reference.
            reference = response_schema['items']['$ref'].split('_')[-1]
            entity = endpoint.split('/')[-1]

            # Line 1 of the script template is the `entity` parameter declaration.
            lines[1] = f"entity as string ('{entity}'),"
            dataflow_json['properties']['typeProperties']['scriptLines'] = (
                lines[:schema_at]
                + self.schemas[reference]
                + lines[schema_at:columns_at]
                + self.columns[reference]
                + lines[columns_at:]
            )
            dataflow_json['name'] = f'Upsert_{entity}'

            pipeline_json['name'] = f'Ingest_{entity}'
            upsert_props = upsert_activity['typeProperties']
            upsert_props['dataflow']['referenceName'] = f"Upsert_{entity}"
            upsert_props['dataflow']['parameters']['entity'] = f"'{entity}'"
            upsert_props['continuationSettings']['customizedCheckpointKey'] = str(uuid.uuid4())

            delete_props = delete_activity['typeProperties']
            delete_props['dataflow']['referenceName'] = "Process deletes"
            delete_props['dataflow']['parameters']['entity'] = f"'{entity}'"
            delete_props['dataflow']['parameters']['SourceTable'] = f"'{entity}'"
            delete_props['continuationSettings']['customizedCheckpointKey'] = str(uuid.uuid4())

            pipeline_json['properties']['folder']['name'] = 'EdFi/Single District Per Instance/Generated'

            with open(f"{dataflow_dir}/Upsert_{entity}.json", 'w') as out_file:
                out_file.write(json.dumps(dataflow_json))

            # NOTE(review): the pipeline is named Ingest_<entity> but the file is
            # written as Upsert_<entity>.json - confirm whether the file name
            # should match the pipeline name.
            with open(f"{pipeline_dir}/Upsert_{entity}.json", 'w') as out_file:
                out_file.write(json.dumps(pipeline_json))

    def create_dataflows(self):
        """
        Run the full generation: primitive schemas, dependency map, complex
        schemas, and finally the data flow and pipeline files.
        """
        self.create_primitive_schemas()
        self.create_dependency_dict()
        self.create_complex_schemas()
        self.create_resources()