In [None]:
%run oeai_py

In [None]:
# Create the OEAI helper instance that the rest of this notebook calls
# (get_secret, load_audit_log, log_error, flatten_json, ...).
# NOTE(review): no platform argument is passed here — presumably OEAI() falls
# back to a default or detects "Synapse"/"Fabric" itself; confirm in oeai_py.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"  # fully qualified for Fabric
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"  # linked service name for Synapse

# Settings read from Key Vault through oeai.get_secret
bronze_path = oeai.get_secret(spark, "bromcom-bronze", keyvault_linked_service, keyvault)
school_ids_secret = oeai.get_secret(spark, "bromcom-ids", keyvault_linked_service, keyvault)
# The secret is a comma-separated list of school ids. Trim whitespace and drop
# empty entries so that "1, 2," gives ["1", "2"] rather than ["1", " 2", ""];
# each id is later used verbatim in request URLs and folder names.
school_ids = [part.strip() for part in school_ids_secret.split(",") if part.strip()]
appid = oeai.get_secret(spark, "bromcom-appid", keyvault_linked_service, keyvault)
token = oeai.get_secret(spark, "bromcom-appsecret", keyvault_linked_service, keyvault)

# Date window: the 365 days up to today, as 'YYYY-MM-DD' strings
today = datetime.today()
last_year = today - timedelta(days=365)
DateFrom = last_year.strftime('%Y-%m-%d')
DateTo = today.strftime('%Y-%m-%d')

In [None]:
# File that oeai.log_error is given whenever a load step reports a problem
error_log_path = bronze_path + "error_log.txt"

# audit_log is whatever oeai.load_audit_log returns for the audit file under
# bronze_path; load_bronze appends one entry to it per call.
audit_log = oeai.load_audit_log(spark, bronze_path + "audit_log.json")
# Separate, initially empty list; not referenced by the cells visible here.
audit_logs = []

In [None]:
def get_school_data(endpoint: str, appid: str, token: str, school_id: str, query: str) -> list:
    """
    Fetch every page of records for one school from a Bromcom API endpoint.

    Args:
        endpoint (str): Path appended to https://api.bromcomcloud.com/ (e.g. "Students").
        appid (str): Sent as the applicationId query parameter.
        token (str): Sent as the applicationSecret query parameter.
        school_id (str): Sent as the schoolId query parameter.
        query (str): Extra, already formatted query-string text appended verbatim
            (e.g. "&entityFilter=year=2023"), or "" for none.

    Returns:
        list: The "data" payloads of all pages fetched, in order. A page whose
        "data" is a dict contributes one item; a page whose "data" is missing
        or null contributes nothing. If a page answers with a non-200 status,
        the failure is logged via oeai.log_error and the records collected so
        far are returned.
    """
    url = f"https://api.bromcomcloud.com/{endpoint}?applicationId={appid}&applicationSecret={token}&schoolId={school_id}{query}"
    all_data = []
    next_url = url

    while next_url:
        response = requests.get(next_url)
        if response.status_code != 200:
            # No exception is being handled here, so traceback.format_exc()
            # would only produce "NoneType: None". Log the HTTP status instead.
            # The URL is deliberately left out: it embeds the application secret.
            error_message = (
                f"Error: HTTP {response.status_code} from endpoint '{endpoint}' "
                f"for school {school_id}"
            )
            oeai.log_error(spark, error_message, error_log_path)
            break

        response_data = response.json()

        # "data" may be a list of records, a single record (dict), or null/absent
        data_from_response = response_data.get("data")
        if data_from_response is None:
            data_from_response = []

        if isinstance(data_from_response, dict):
            all_data.append(data_from_response)
        else:
            all_data.extend(data_from_response)

        # Follow the pagination link; tolerate "meta"/"pagination" being null
        pagination = (response_data.get("meta") or {}).get("pagination") or {}
        next_url = pagination.get("next")

    return all_data

In [None]:
def load_bronze(spark, endpoint: str, appid: str, token: str, school_id: str, query: str, use_date_chunk: bool, limit=None, audit_log_file="audit_log.json"):
    """
    Fetch one endpoint for one school, write the records to the bronze folder and audit the run.

    Records are obtained from get_school_data(), either with a single call or,
    when use_date_chunk is truthy, with one call per weekly chunk between
    2023-08-01 00:00:00 and the time this function started. Records are
    flattened with oeai.flatten_json, given "school_id" and "unique_key"
    columns, and written (mode "overwrite") as JSON to
    <bronze_path>/<school_id>/<endpoint>.json. When no records were fetched,
    oeai.save_empty_json is called for that path instead. Errors raised while
    building or writing the DataFrame are logged through oeai.log_error and do
    not propagate. One entry per call is appended to the module-level audit_log.

    Args:
        spark (SparkSession): Active SparkSession for DataFrame operations.
        endpoint (str): API endpoint to retrieve data from; also the output file stem.
        appid (str): Application id passed on to get_school_data.
        token (str): Application secret passed on to get_school_data.
        school_id (str): Identifier of the school; also the output sub-folder name.
        query (str): Additional query-string text for the API call ("" for none).
        use_date_chunk (bool): When truthy, fetch in weekly chunks, rewriting the
            query for each chunk with oeai.update_query_with_chunks_bromcom.
        limit (int, optional): Not used by this function. Defaults to None.
        audit_log_file (str, optional): Not used by this function.
            Defaults to "audit_log.json".

    Returns:
        pandas.DataFrame: Always an empty DataFrame. The loaded data is written
        to the bronze folder rather than returned; the return value is kept so
        existing callers continue to work.
    """
    global audit_log
    df = pd.DataFrame()  # placeholder return value; never populated
    data_list = []

    # Start the clock before any request is made so the audited duration
    # covers the API calls as well as the write to bronze.
    start_time = datetime.now()

    if use_date_chunk:
        # Fixed lower bound for chunked loads
        chunk_start = datetime.strptime("2023-08-01 00:00:00", "%Y-%m-%d %H:%M:%S")
        for start_date, end_date in oeai.generate_date_chunks(chunk_start, start_time, chunk_size=timedelta(weeks=1)):
            chunk_query = oeai.update_query_with_chunks_bromcom(query, start_date, end_date)
            r = get_school_data(endpoint, appid, token, school_id, chunk_query)
            if r:
                if isinstance(r, dict) and 'data' in r:
                    data_list.append(r['data'])
                elif isinstance(r, list):
                    data_list.extend(r)
            else:
                # No exception is active here, so a traceback would only read
                # "NoneType: None"; record which chunk came back empty instead.
                error_message = (
                    f"Empty response, not adding to data_list: endpoint '{endpoint}', "
                    f"school {school_id}, chunk {start_date} to {end_date}"
                )
                oeai.log_error(spark, error_message, error_log_path)
    else:
        r = get_school_data(endpoint, appid, token, school_id, query)
        # Ensure the data is always a list
        if isinstance(r, dict) and 'data' in r:
            data_list = [r['data']]
        elif isinstance(r, list):
            data_list = r

    # Per-school output folder, created on first use
    school_folder = os.path.join(bronze_path, school_id)
    if not os.path.exists(school_folder):
        os.makedirs(school_folder)
    output_path = school_folder + "/" + endpoint + ".json"

    if not data_list:
        oeai.save_empty_json(spark, output_path)
    else:
        try:
            # Flatten nested records, then go pandas -> PySpark
            flattened_data_list = [oeai.flatten_json(item) for item in data_list]
            pandas_df = pd.DataFrame(flattened_data_list)
            r_df = spark.createDataFrame(pandas_df)

            # Tag every row with its school; unique_key is school_id + studentID
            # when that column exists, otherwise just school_id.
            r_df = r_df.withColumn("school_id", lit(school_id))
            if "studentID" in r_df.columns:
                r_df = r_df.withColumn("unique_key", concat(lit(school_id), r_df["studentID"].cast("string")))
            else:
                r_df = r_df.withColumn("unique_key", lit(school_id).cast("string"))

            r_df.write.mode("overwrite").json(output_path)
        except Exception:
            # Best effort: log the failure and carry on so the remaining
            # endpoints and schools are still processed.
            error_message = f"Error: {traceback.format_exc()}"
            oeai.log_error(spark, error_message, error_log_path)

    # Update the audit log
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
    audit_data = {
        "school_id": school_id,
        "endpoint": endpoint,
        "query": query,
        "start_time": start_time.strftime('%Y-%m-%d %H:%M:%S'),
        "end_time": end_time.strftime('%Y-%m-%d %H:%M:%S'),
        "duration": str(duration),
        "records_returned": str(len(data_list)),
    }
    audit_log.append(audit_data)

    return df

In [None]:
'''
  BRONZE PROCESS
'''
# Record cap for test runs (None for live); not passed to load_bronze in this cell
Limit = None
query = ""
# Today's date with a fixed midnight time part, 'YYYY-MM-DDT00:00:00'
datetoday = datetime.now().strftime('%Y-%m-%dT00:00:00')

# One spec per daily endpoint: (endpoint, extra query string, use_date_chunk)
job_specs = [
    ("Schools", "&entityFilter=schoolID=1", False),
    ("Students", query, False),
    ("StudentFlatView", query, False),
    ("AttendanceSessions", "&entityFilter=year=2023", False),
    ("CalendarModels", query, False),
    # Disabled in the original notebook:
    # ("Attendances", query, True),
    # ("Attendances", "&entityFilter=calendarStartDate>'2023-08-01T00:00:00'")  -- no use_date_chunk flag was given
]

for school_id in school_ids:
    # Expand the specs into the full positional argument tuples for this school
    daily_jobs = [
        (endpoint, appid, token, school_id, extra_query, chunked)
        for endpoint, extra_query, chunked in job_specs
    ]

    # Run every daily job for this school
    for job in daily_jobs:
        load_bronze(spark, *job)

    # Persist the audit log after each school has been processed
    oeai.save_audit_log(spark, audit_log, bronze_path + "audit_log.json")
