In [1]:
%run oeai_py

StatementMeta(, , -1, Finished, Available)

In [None]:
# Create the shared OEAI helper instance that the later cells call (get_secret, load_audit_log, log_error, ...).
# NOTE(review): no platform argument is passed here, so whatever default OEAI() applies is used —
# confirm whether "Synapse" or "Fabric" needs to be set explicitly.
# NOTE(review): this cell shows no execution count while later cells depend on `oeai`;
# make sure it runs before them on a fresh kernel.
oeai = OEAI()

In [2]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_KV_NAME"
keyvault_linked_service = "INSERT_LS_NAME"

# Synapse OEA environment paths and Bromcom credentials, all read from the key vault.
bronze_path = oeai.get_secret(spark, "bromcom-bronze", keyvault_linked_service, keyvault)
school_ids_secret = oeai.get_secret(spark, "bromcom-ids", keyvault_linked_service, keyvault)
# The secret is a comma-separated list. Strip whitespace and drop empty entries so that
# values such as "101, 102," do not produce ids like " 102" or "" (ids are used verbatim
# in API URLs and in bronze folder names).
school_ids = [raw_id.strip() for raw_id in school_ids_secret.split(",") if raw_id.strip()]
appid = oeai.get_secret(spark, "bromcom-appid", keyvault_linked_service, keyvault)
token = oeai.get_secret(spark, "bromcom-appsecret", keyvault_linked_service, keyvault)

# Set up date parameters: a rolling 365-day window ending today, formatted as YYYY-MM-DD.
today = datetime.today()
last_year = today - timedelta(days=365)
DateFrom = last_year.strftime('%Y-%m-%d')
DateTo = today.strftime('%Y-%m-%d')

StatementMeta(spark3p3sm, 44, 3, Finished, Available)

In [3]:
# initialise the audit log
# Load the existing audit log from the bronze location. load_bronze() appends one entry per
# API call to this object and the bronze-process cell saves it back to the same path.
audit_log = oeai.load_audit_log(spark, bronze_path + "audit_log.json")
# NOTE(review): audit_logs is not referenced anywhere in the visible cells — confirm it is still needed.
audit_logs = []
# Destination passed to oeai.log_error(). Built by plain concatenation, so this assumes
# bronze_path already ends with a path separator — TODO confirm.
error_log_path = bronze_path + "error_log.txt"

StatementMeta(spark3p3sm, 44, 4, Finished, Available)

In [4]:
def get_school_data(endpoint: str, appid: str, token: str, school_id: str, query: str) -> list:
    """
    Retrieves all pages of data for one school from a Bromcom API endpoint.

    The function issues HTTP GET requests starting from the endpoint URL and keeps
    following the `meta.pagination.next` link in each response until there is none.
    If a request returns a status code other than 200, the failure is logged via
    `oeai.log_error` and fetching stops; whatever was collected so far is returned.

    Parameters:
    endpoint (str): The API endpoint to make the request to. Example: 'Students'.
    appid (str): The application ID used for API authentication.
    token (str): The application secret used for API authentication.
    school_id (str): The unique identifier of the school for which data is being fetched.
    query (str): Additional query parameters appended verbatim to the URL. Example: '&entityFilter=year=2023'.

    Returns:
    list: The accumulated contents of each response's "data" field. A list-valued
          "data" is extended into the result; a dict-valued "data" is appended as a
          single item. The list may be empty or partial if a request failed.

    Raises:
    Nothing for non-200 responses (they are logged instead). Exceptions raised by
    `requests.get` or `response.json()` propagate to the caller.

    Example usage:
    data = get_school_data('Students', 'app123', 'secret456', 'school789', '')
    """

    url = f"https://api.bromcomcloud.com/{endpoint}?applicationId={appid}&applicationSecret={token}&schoolId={school_id}{query}"
    all_data = []
    next_url = url
    page = 0

    while next_url:
        page += 1
        response = requests.get(next_url)
        if response.status_code != 200:
            # No exception is active at this point, so traceback.format_exc() would only
            # produce "NoneType: None". Log the actual status instead. The URL is left out
            # on purpose because it carries the application secret.
            error_message = (
                f"Error: HTTP {response.status_code} from endpoint '{endpoint}' "
                f"for school {school_id} (page {page})"
            )
            oeai.log_error(spark, error_message, error_log_path)
            break

        response_data = response.json()
        data_from_response = response_data.get("data")
        if data_from_response is None:
            # Missing or JSON-null "data" means the page has no records.
            data_from_response = []

        if isinstance(data_from_response, dict):
            all_data.append(data_from_response)
        else:
            all_data.extend(data_from_response)

        # Tolerate "meta"/"pagination" being absent or JSON null: either ends the loop.
        pagination = (response_data.get("meta") or {}).get("pagination") or {}
        next_url = pagination.get("next")

    return all_data


StatementMeta(spark3p3sm, 44, 5, Finished, Available)

In [5]:
def load_bronze(spark, endpoint: str, appid: str, token: str, school_id: str, query: str, limit=None, audit_log_file="audit_log.json"):
    """
    Fetches one endpoint for one school and writes the result to the Bronze layer as JSON.

    The data is retrieved with `get_school_data`, flattened with `oeai.flatten_json`,
    converted to a PySpark DataFrame, tagged with `school_id` and `unique_key` columns
    and written (overwrite mode) to `<bronze_path>/<school_id>/<endpoint>.json`.
    When no data is returned, `oeai.save_empty_json` is called for that path instead.
    An entry describing the run is appended to the module-level `audit_log`.

    Args:
        spark (SparkSession): The active SparkSession for DataFrame operations.
        endpoint (str): The API endpoint to retrieve data from.
        appid (str): The application ID for API authentication.
        token (str): The token used for API authentication.
        school_id (str): The unique identifier of the school for which data is being fetched.
        query (str): Additional query parameters to be appended to the API call.
        limit (int, optional): Currently unused. Defaults to None.
        audit_log_file (str, optional): Currently unused. Defaults to "audit_log.json".

    Returns:
        DataFrame: An empty pandas DataFrame. The fetched data is only written to
        storage, it is not returned.

    Note:
        Failures while converting or writing the data are logged through
        `oeai.log_error` and do not raise; the audit entry is still appended and
        `records_returned` reflects what the API returned, not what was written.

    Example usage:
        df = load_bronze(spark, 'Students', 'app123', 'token456', 'school789', '')
    """
    global audit_log
    df = pd.DataFrame()
    data_list = []

    # Start the clock BEFORE the API call so the audited duration covers the fetch
    # as well as the write.
    start_time = datetime.now()

    r = get_school_data(endpoint, appid, token, school_id, query)
    # Ensure the data is always a list
    if isinstance(r, dict) and 'data' in r:
        data_list = [r['data']]
    elif isinstance(r, list):
        data_list = r

    # Per-school output folder; created if it does not exist yet.
    school_folder = os.path.join(bronze_path, school_id)
    os.makedirs(school_folder, exist_ok=True)
    output_path = school_folder + "/" + endpoint + ".json"

    if not data_list:
        oeai.save_empty_json(spark, output_path)
    else:
        try:
            flattened_data_list = [oeai.flatten_json(item) for item in data_list]

            # List of flat dicts -> pandas DataFrame -> PySpark DataFrame
            pandas_df = pd.DataFrame(flattened_data_list)
            r_df = spark.createDataFrame(pandas_df)

            # Add school_id and unique_key to the DataFrame
            r_df = r_df.withColumn("school_id", lit(school_id))
            if "studentID" in r_df.columns:
                r_df = r_df.withColumn("unique_key", concat(lit(school_id), r_df["studentID"].cast("string")))
            else:
                r_df = r_df.withColumn("unique_key", lit(school_id).cast("string"))

            # Save the DataFrame to a JSON file
            r_df.write.mode("overwrite").json(output_path)

        except Exception:
            # Best effort: record the full traceback and carry on with the next job.
            error_message = f"Error: {traceback.format_exc()}"
            oeai.log_error(spark, error_message, error_log_path)

    # Update the audit log
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
    audit_data = {
        "school_id": school_id,
        "endpoint": endpoint,
        "query": query,
        "start_time": start_time.strftime('%Y-%m-%d %H:%M:%S'),
        "end_time": end_time.strftime('%Y-%m-%d %H:%M:%S'),
        "duration": str(duration),
        "records_returned": str(len(data_list)),
    }
    audit_log.append(audit_data)

    return df

StatementMeta(spark3p3sm, 44, 6, Finished, Available)

In [6]:
'''
  BRONZE PROCESS
'''
# Bronze stage of the pipeline: for every configured school, pull the raw data for
# each endpoint listed below and persist it through load_bronze().

# Default (empty) filter used by the endpoints that need no extra query parameters.
query = ""

# Midnight of the current day as 'YYYY-MM-DDT00:00:00', available for date filters.
# NOTE(review): not referenced by any job in this cell; the date filters below are
# hard-coded literals — confirm they are still the intended window.
datetoday = datetime.now().strftime('%Y-%m-%dT00:00:00')

for school_id in school_ids:
    # Build the job list for this school. Every job is a 5-tuple
    # (endpoint, appid, token, school_id, query string) matching load_bronze's
    # positional arguments after `spark`.
    daily_jobs = [
        (endpoint, appid, token, school_id, endpoint_query)
        for endpoint, endpoint_query in (
            ("Schools", "&entityFilter=schoolID=1"),
            ("Students", query),
            ("StudentFlatView", query),
            ("AttendanceSessions", "&entityFilter=year=2023"),
            ("CalendarModels", query),
            ("Attendances", "&entityFilter=calendarStartDate>'2023-09-01T00:00:00' AND calendarStartDate<'2024-01-13T00:00:00' AND calendarName='PM'"),
        )
    ]

    # Run the jobs in order; load_bronze fetches, stores and audits each one.
    for job in daily_jobs:
        load_bronze(spark, *job)

    # Persist the audit log after each school so completed work is recorded even if
    # a later school fails.
    oeai.save_audit_log(spark, audit_log, bronze_path + "audit_log.json")

StatementMeta(spark3p3sm, 44, 7, Finished, Available)