In [None]:

# General parameters
# Name of the OEA workspace; passed to oea.set_workspace() and to EdFiLandClient further down.
workspace = "dev"

# EdFi specific parameters
# The names below are all passed to EdFiLandClient(...) in the cell that runs the landing job,
# so each of them must be defined before that cell executes.
# NOTE(review): they are commented out here, presumably because they are injected as notebook
# parameters by the calling pipeline - confirm. Uncomment them to run the notebook interactively.
# kvName = "kv-oea-insmod1"
# authUrl = "https://api.edgraph.dev/edfi/v5.2/saas/5eb775fb-4eff-4889-9eae-3919b7a2d321/oauth/token"
# dataManagementUrl = "https://api.edgraph.dev/edfi/v5.2/saas/data/v3/5eb775fb-4eff-4889-9eae-3919b7a2d321/2011"
# changeQueriesUrl = "https://api.edgraph.dev/edfi/v5.2/saas/changequeries/v1/5eb775fb-4eff-4889-9eae-3919b7a2d321/2011"
# dependenciesUrl = "https://api.edgraph.dev/edfi/v5.2/saas/metadata/data/v3/5eb775fb-4eff-4889-9eae-3919b7a2d321/2011/dependencies"
# apiVersion = "5.2"
# batchLimit = 100
# moduleName = "EdFi-3"
# minChangeVer = None
# maxChangeVer = None
# schoolYear = None
# districtId = None


StatementMeta(spark3p2med, 178, 43, Finished, Available)

In [None]:
%run /OEA_py

StatementMeta(, 178, -1, Finished, Available)

2023-05-22 20:22:34,058 - OEA - INFO - Now using workspace: dev
2023-05-22 20:22:34,059 - OEA - INFO - OEA initialized.


In [None]:
# Point the OEA helper at the workspace chosen in the parameters cell.
# NOTE(review): `oea` is not defined in this notebook; presumably it is created by `%run /OEA_py` - confirm.
oea.set_workspace(workspace)

StatementMeta(spark3p2med, 178, 45, Finished, Available)

2023-05-22 20:22:34,724 - OEA - INFO - Now using workspace: dev


In [None]:
import requests
import json
import uuid
from requests.auth import HTTPBasicAuth
from datetime import datetime
import logging

# Module-level logger used by EdFiLandClient for its progress and error messages.
logger = logging.getLogger('EdFiLandClient')

class EdFiLandClient:
    """Lands Ed-Fi API resources, and their deletes, as JSON files under ``stage1``.

    For every resource listed by the dependencies endpoint the client downloads the
    records whose change version lies between a minimum and a maximum, writes them as
    JSON-lines files, writes the matching deletes, and records the maximum change
    version that was processed in a ``changeFile.json`` next to the data.

    The class relies on names that the notebook provides at run time: ``oea``
    (secret lookup and path-to-URL conversion), ``mssparkutils`` (file system access),
    ``requests`` / ``HTTPBasicAuth`` and the module-level ``logger``.
    """

    # Token endpoint and Basic authorization header used when no client credentials are available.
    # NOTE(review): this credential is embedded in source; presumably it is the key of the
    # Ed-Fi hosted test data set - confirm, and consider moving it to configuration.
    # TODO: need to update this if we want it to work with other edfi provided test data set versions
    TEST_AUTH_URL = "https://api.ed-fi.org/v5.2/api/oauth/token"
    TEST_AUTH_HEADER = "Basic UnZjb2hLejl6SEk0OkUxaUVGdXNhTmY4MXh6Q3h3SGZib2xrQw=="

    # A token is refreshed once it is within this many seconds of its expiry.
    TOKEN_REFRESH_MARGIN_SECONDS = 300

    def __init__(self, workspace, kvName, moduleName, authUrl, dataManagementUrl, changeQueriesUrl, dependenciesUrl, apiVersion, batchLimit, minChangeVer="", maxChangeVer="", schoolYear=None, districtId=None):
        """Store the endpoints and options and resolve the API credentials.

        Args:
            workspace: Workspace name; ``"dev"`` allows falling back to the test data set.
            kvName: Key vault name, assigned to ``oea.kvName``. When empty in the ``dev``
                workspace no secret lookup is attempted.
            moduleName: Module name used in the landing folder path.
            authUrl: OAuth token endpoint used with the client credentials.
            dataManagementUrl: Base URL of the resource API.
            changeQueriesUrl: Base URL of the change queries API.
            dependenciesUrl: URL returning the list of resources to land.
            apiVersion: API version used in the landing folder path.
            batchLimit: Number of records requested per page.
            minChangeVer: Lower change version bound; ``None`` or ``""`` means "not set".
            maxChangeVer: Upper change version bound; ``None`` or ``""`` means "not set".
            schoolYear: Value for the ``SchoolYear=`` folder; ``None`` becomes ``All``.
            districtId: Value for the ``DistrictId=`` folder; ``None`` becomes ``All``.

        Raises:
            Exception: Whatever the secret lookup raised, when the workspace is not ``dev``.
        """
        self.workspace = workspace
        self.keyvault_linked_service = 'LS_KeyVault'
        oea.kvName = kvName

        # Apply one log format to every handler of the root logger and make INFO visible.
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        logging.getLogger().setLevel(logging.INFO)
        logger.info(f"minChangeVersion={minChangeVer} and maxChangeVersion={maxChangeVer}")

        # Empty credentials make authenticate() use the test data set endpoint.
        self.clientId = ""
        self.clientSecret = ""
        if not kvName and workspace == "dev":
            logger.info("defaulting to test data")
        else:
            try:
                self.clientId = oea._get_secret("edfi-clientid")
                self.clientSecret = oea._get_secret("edfi-clientsecret")
            except Exception as e:
                # Only the dev workspace may continue without credentials.
                logger.info(f"failed to retrieve clientId and clientSecret from keyvault with exception: {str(e)}")
                if workspace != "dev":
                    raise
                logger.info("defaulting to test data")
                # The first lookup may have succeeded before the second one failed.
                self.clientId = ""
                self.clientSecret = ""

        self.authUrl = authUrl
        self.dataManagementUrl = dataManagementUrl
        self.changeQueriesUrl = changeQueriesUrl
        self.dependenciesUrl = dependenciesUrl
        self.runDate = datetime.utcnow().strftime('%Y-%m-%d')
        self.authTime = None
        self.expiresIn = None
        self.accessToken = None
        districtPath = districtId if districtId is not None else "All"
        schoolYearPath = schoolYear if schoolYear is not None else "All"
        self.transactionalFolder = f"Transactional/{moduleName}/{apiVersion}/DistrictId={districtPath}/SchoolYear={schoolYearPath}"
        self.batchLimit = batchLimit
        self.minChangeVer = minChangeVer
        self.maxChangeVer = maxChangeVer

    @staticmethod
    def _isUnset(value):
        """Return True when a change version option was not supplied (``None`` or ``""``)."""
        return value is None or value == ""

    @staticmethod
    def _toJsonLines(records):
        """Serialise an iterable of JSON-compatible items, one per line, each ending in a newline."""
        return "".join(json.dumps(record) + "\n" for record in records)

    @staticmethod
    def _changeVersionWindows(minChangeVersion, maxChangeVersion, partitions):
        """Split the inclusive range [minChangeVersion, maxChangeVersion] into windows.

        At most ``partitions`` contiguous, non-overlapping inclusive ``(low, high)`` tuples
        are returned. A single window covering the whole range is returned when
        ``partitions`` is 1 or less, or when the range holds at most one change version.
        """
        span = maxChangeVersion - minChangeVersion + 1
        if partitions <= 1 or span <= 1:
            return [(minChangeVersion, maxChangeVersion)]
        partitions = min(partitions, span)
        range_size = -(-span // partitions)  # ceiling division
        windows = []
        low = minChangeVersion
        while low <= maxChangeVersion:
            high = min(maxChangeVersion, low + range_size - 1)
            windows.append((low, high))
            low = high + 1
        return windows

    def _authHeaders(self):
        """Return the bearer authorization header, refreshing the token when needed."""
        return {"Authorization": f"Bearer {self.getAccessToken()}"}

    def _landResponse(self, path, response, errorMessage):
        """Write a successful response as a JSON-lines file under ``path``.

        Returns True when the file was written; logs ``errorMessage`` and returns False
        when the response has an error status (400 or above).
        """
        if response.status_code >= 400:
            logger.error(errorMessage)
            return False
        filepath = f"{path}/delta_batch_data/rundate={self.runDate}/data{uuid.uuid4()}.json"
        mssparkutils.fs.put(oea.to_url(filepath), self._toJsonLines(json.loads(response.text)))
        return True

    #Method to get the access token for the test data set
    def authenticateWithAuthorization(self):
        """Request a token from the test data set endpoint; returns the HTTP response."""
        result = requests.post(self.TEST_AUTH_URL, {"grant_type": "client_credentials"}, headers={"Authorization": self.TEST_AUTH_HEADER})
        return result

    #Method to get the access token for a production system with basic auth
    def authenticateWithBasic(self):
        """Request a token from ``authUrl`` using the client id and secret; returns the HTTP response."""
        authHeader = HTTPBasicAuth(self.clientId, self.clientSecret)
        result = requests.post(self.authUrl, {"grant_type": "client_credentials"}, auth=authHeader)
        return result

    #This method orchestrates the authentication
    def authenticate(self):
        """Obtain a new access token and remember when it was issued and how long it lasts.

        Raises:
            requests.HTTPError: When the token endpoint answers with an error status.
        """
        self.authTime = datetime.now()
        if not self.clientId or not self.clientSecret:
            response = self.authenticateWithAuthorization()
            # The token response itself is deliberately not logged: it contains the access token.
            logger.info("authenticated against the test data set")
        else:
            response = self.authenticateWithBasic()
        # Fail with the HTTP error instead of a KeyError on the missing fields below.
        response.raise_for_status()
        result = response.json()
        self.expiresIn = result["expires_in"]
        self.accessToken = result["access_token"]

    #This method manages the access token, refreshing it when required
    def getAccessToken(self):
        """Return a valid access token, authenticating first when none exists or it is about to expire."""
        if self.accessToken is None or (datetime.now() - self.authTime).total_seconds() > self.expiresIn - self.TOKEN_REFRESH_MARGIN_SECONDS:
            self.authenticate()
        return self.accessToken

    def getChangeQueryVersion(self):
        """Return the parsed JSON of the ``availableChangeVersions`` endpoint."""
        # Uses the instance's URL rather than a notebook global of the same name.
        response = requests.get(self.changeQueriesUrl + "/availableChangeVersions", headers=self._authHeaders())
        return response.json()

    def getEntities(self):
        """Return the parsed JSON of the dependencies endpoint (requested without authorization)."""
        return requests.get(self.dependenciesUrl).json()

    def getDeletes(self, resource, minChangeVersion, maxChangeVersion):
        """Request the deletes of ``resource`` within the change version range; returns the HTTP response."""
        url = f"{self.dataManagementUrl}{resource}/deletes?MinChangeVersion={minChangeVersion}&MaxChangeVersion={maxChangeVersion}"
        result = requests.get(url, headers=self._authHeaders())
        return result

    def writeToDeletesFile(self, resource, deletes):
        """Write the body of a deletes response to the resource's ``delete_batch_data`` folder."""
        path = f"stage1/{self.transactionalFolder}{resource}/delete_batch_data/rundate={self.runDate}/data.json"
        mssparkutils.fs.put(oea.to_url(path), deletes.text)

    def landEntities(self):
        """Land the data and the deletes of every resource listed by the dependencies endpoint.

        When ``minChangeVer`` / ``maxChangeVer`` are not set (``None`` or ``""``) the bounds
        come from the change queries endpoint, and the lower bound of each resource is taken
        from its ``changeFile.json`` when that file exists.
        """
        entities = self.getEntities()
        changeVersion = self.getChangeQueryVersion()
        minUnset = self._isUnset(self.minChangeVer)
        maxUnset = self._isUnset(self.maxChangeVer)
        minChangeVersion = changeVersion['OldestChangeVersion'] if minUnset else int(self.minChangeVer)
        maxChangeVersion = changeVersion['NewestChangeVersion'] if maxUnset else int(self.maxChangeVer)
        for entity in entities:
            resource = entity['resource']
            resourceMinChangeVersion = self.getChangeVersion(resource, minChangeVersion) if minUnset else minChangeVersion

            self.landEntity(resource, resourceMinChangeVersion, maxChangeVersion)
            deletes = self.getDeletes(resource, resourceMinChangeVersion, maxChangeVersion)
            if deletes.status_code >= 400:
                # Do not persist an error payload as if it were a list of deletes.
                logger.error(f"There was an error retrieving deletes for {resource}")
            elif len(deletes.json()):
                self.writeToDeletesFile(resource, deletes)

    def getChangeVersion(self, resource, default):
        """Return the change version stored in the resource's ``changeFile.json``, or ``default`` when the file is absent."""
        path = f"stage1/{self.transactionalFolder}{resource}/changeFile.json"
        if mssparkutils.fs.exists(oea.to_url(path)):
            return json.loads(mssparkutils.fs.head(oea.to_url(path)))['changeVersion']
        return default

    def landEntity(self, resource, minChangeVersion, maxChangeVersion):
        """Download one resource for the change version range and write it as JSON-lines files.

        The range is split into change version windows and every window is read page by
        page (``limit`` / ``offset``) so large resources are never fetched in one request.
        When the record count is unavailable (no usable ``Total-Count`` header, non-integer
        bounds or a batch limit below 1) the body of the first response is landed as is.
        Afterwards ``changeFile.json`` is overwritten with ``maxChangeVersion``.
        """
        logger.info(f"initiating {resource}")
        path = f"stage1/{self.transactionalFolder}{resource}"
        url = f"{self.dataManagementUrl}{resource}?MinChangeVersion={minChangeVersion}&MaxChangeVersion={maxChangeVersion}&totalCount=true"
        total_count_response = requests.get(url, headers=self._authHeaders())
        try:
            total_count = int(total_count_response.headers["Total-Count"])
            lowest = int(minChangeVersion)
            highest = int(maxChangeVersion)
            batch_limit = int(self.batchLimit)
            if batch_limit < 1:
                raise ValueError(f"batchLimit must be at least 1, got {self.batchLimit!r}")
        except (KeyError, TypeError, ValueError):
            # Paging is not possible: land whatever the un-paged request returned.
            self._landResponse(path, total_count_response, f"There was an error retrieving data for {resource}")
        else:
            #Keyset pagination implementation: https://techdocs.ed-fi.org/display/ODSAPIS3V61/Improve+Paging+Performance+on+Large+API+Resources
            windows = self._changeVersionWindows(lowest, highest, -(-total_count // batch_limit))
            single_window = len(windows) == 1
            for partitionMinChangeVersion, partitionMaxChangeVersion in windows:
                partitionUrl = f"{self.dataManagementUrl}{resource}?MinChangeVersion={partitionMinChangeVersion}&MaxChangeVersion={partitionMaxChangeVersion}"
                if single_window:
                    # The window is the whole range, whose count is already known.
                    partition_count = total_count
                else:
                    partition_count_response = requests.get(f"{partitionUrl}&totalCount=true", headers=self._authHeaders())
                    try:
                        partition_count = int(partition_count_response.headers["Total-Count"])
                    except (KeyError, ValueError):
                        logger.error(f"There was an error retrieving batch data for {resource}")
                        continue

                batches = -(-partition_count // batch_limit)  # ceiling division
                if single_window:
                    # Always issue one request so an empty resource still gets a (blank) data file.
                    batches = max(1, batches)
                for j in range(batches):
                    batchUrl = f"{partitionUrl}&limit={batch_limit}&offset={j * batch_limit}"
                    data = requests.get(batchUrl, headers=self._authHeaders())
                    self._landResponse(path, data, f"There was an error retrieving batch data for {resource}")

        # NOTE(review): the change version is advanced even when a request above failed, so the
        # failed range is not retried on the next incremental run - confirm this is intended.
        changeFilepath = f"{path}/changeFile.json"
        changeData = {"changeVersion": maxChangeVersion}
        mssparkutils.fs.put(oea.to_url(changeFilepath), json.dumps(changeData), True)
        logger.info(f"completed {resource}")



StatementMeta(spark3p2med, 178, 46, Finished, Available)

In [None]:
# Run the landing job. A failure is kept in `exception` instead of being raised here;
# the following cell re-raises it.
exception = None
try:
    edfiLandClient = EdFiLandClient(
        workspace=workspace,
        kvName=kvName,
        moduleName=moduleName,
        authUrl=authUrl,
        dataManagementUrl=dataManagementUrl,
        changeQueriesUrl=changeQueriesUrl,
        dependenciesUrl=dependenciesUrl,
        apiVersion=apiVersion,
        batchLimit=batchLimit,
        minChangeVer=minChangeVer,
        maxChangeVer=maxChangeVer,
        schoolYear=schoolYear,
        districtId=districtId,
    )
    edfiLandClient.landEntities()
except Exception as landingError:
    exception = landingError


StatementMeta(spark3p2med, 178, 47, Finished, Cancelled)

2023-05-22 20:22:35,382 - EdFiLandClient - INFO - minChangeVersion=None and maxChangeVersion=None
2023-05-22 20:22:35,382 - EdFiLandClient - INFO - defaulting to test data
 
2023-05-22 20:22:35,894 - EdFiLandClient - INFO - {'access_token': '2b7762dd12a147a78bc6b0c5a32d8162', 'expires_in': 1800, 'token_type': 'bearer'}
{'access_token': '2b7762dd12a147a78bc6b0c5a32d8162', 'expires_in': 1800, 'token_type': 'bearer'}
2023-05-22 20:22:38,066 - EdFiLandClient - INFO - initiating /ed-fi/absenceEventCategoryDescriptors
2023-05-22 20:22:39,580 - root - INFO - completed /ed-fi/absenceEventCategoryDescriptors
2023-05-22 20:22:39,795 - EdFiLandClient - INFO - initiating /ed-fi/academicHonorCategoryDescriptors
2023-05-22 20:22:40,817 - root - INFO - completed /ed-fi/academicHonorCategoryDescriptors
2023-05-22 20:22:41,027 - EdFiLandClient - INFO - initiating /ed-fi/academicSubjectDescriptors
2023-05-22 20:22:41,855 - root - INFO - completed /ed-fi/academicSubjectDescriptors
2023-05-22 20:22:42,073

In [None]:
# Re-raise the failure captured while landing, if there was one.
if exception is not None:
    raise exception

StatementMeta(, , , Cancelled, )