In [None]:
%run oeai_py

In [None]:
# Create the shared OEAI helper instance; later cells use it for secrets, the
# audit log, error logging and JSON flattening.
# NOTE(review): no platform ("Synapse" or "Fabric") is passed here - presumably
# OEAI() detects or defaults it; confirm against oeai_py.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"  # fully qualified for Fabric
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"  # not required for Fabric

# Synapse OEA environment paths
# NOTE: bronze_path is joined to file names with plain '+' elsewhere in this
# notebook, so the secret value presumably ends with a path separator.
bronze_path = oeai.get_secret(spark, "cpoms-bronze", keyvault_linked_service, keyvault)

# "cpoms-ids" holds a comma-separated list of "<name>-<urn>" entries.
school_ids_secret = oeai.get_secret(spark, "cpoms-ids", keyvault_linked_service, keyvault)

# Strip whitespace and drop empty entries so "a-1, b-2," does not yield a
# secret name such as "cpoms- b" or an empty school id.
school_ids = [entry.strip() for entry in school_ids_secret.split(",") if entry.strip()]

In [None]:
# Load the existing audit log from the bronze folder; load_bronze appends one
# entry per API pull and the final cell saves it back to the same file.
audit_log = oeai.load_audit_log(spark, bronze_path + "audit_log.json")
# NOTE(review): audit_logs is not referenced anywhere else in the visible notebook.
audit_logs = []
# File that load_bronze passes to oeai.log_error.
error_log_path = bronze_path + "error_log.txt"

In [None]:
def update_query_with_chunks_cpoms(original_query, start_date, end_date):
    """
    Return the query string with its 'updated_at' window set to one date chunk.

    Any existing 'filters[updated_at.gt]' / 'filters[updated_at.lt]' parameters
    are removed and a fresh pair built from the given dates is appended.

    Args:
        original_query (str | None): The original query string; None or '' means
            "no other parameters".
        start_date (datetime): Lower bound, sent as 'filters[updated_at.gt]'.
        end_date (datetime): Upper bound, sent as 'filters[updated_at.lt]'.

    Returns:
        str: The updated query string. A query that did not start with '?' is
        given a leading '?' (or '&' when a '?' appears later in it).
    """
    query = original_query or ""

    # The brackets and the dot must be escaped: unescaped, "[updated_at.gt]" is a
    # regex character class and the literal parameter name is never matched.
    query = re.sub(r'filters\[updated_at\.(?:gt|lt)\]=[^&]*', '', query)

    # Removing a parameter can leave '&&', '?&' or a trailing '&' behind.
    query = re.sub(r'&{2,}', '&', query).replace('?&', '?').rstrip('&')

    # Ensure the query starts correctly with '?' or '&' based on existing content
    if not query:
        query = '?'
    elif not query.startswith('?'):
        query_prefix = '&' if '?' in query else '?'
        query = query_prefix + query.lstrip('&')

    # The dates are formatted as given; the trailing 'Z' is a literal and no
    # timezone conversion takes place.
    formatted_start_date = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
    formatted_end_date = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
    window = (
        f"filters[updated_at.gt]={formatted_start_date}"
        f"&filters[updated_at.lt]={formatted_end_date}"
    )

    # No separator is needed directly after the leading '?'.
    separator = '' if query.endswith('?') else '&'
    return f"{query}{separator}{window}"

In [None]:
def get_school_data(token: str, school_id: str, endpoint: str, query: str) -> list:
    """
    Fetch every page of one endpoint for one school from the CPOMS API.

    Pages are requested one at a time (page=1, 2, ...) until a page holds fewer
    records than a full page, the payload is a single object, or a non-200
    response is received. On a non-200 response a small debug CSV is written to
    bronze_path + "debug_info.csv" and the records collected so far are returned.

    Relies on the notebook-level names `spark`, `bronze_path`, `requests` and `Row`.

    Args:
        token (str): The API token for the school's CPOMS instance.
        school_id (str): Sub-domain of the school's CPOMS instance.
        endpoint (str): The endpoint to get data from.
        query (str): Additional query parameters; leading '?' / '&' are ignored.

    Returns:
        list: The records taken from the 'data' element of each page.
    """
    # A page with fewer records than this is treated as the last one.
    # NOTE(review): 30 is taken from the original code - confirm it matches the
    # API's page size.
    page_size = 30

    # Normalise the query string so it always starts with exactly one '?'
    query = f"?{(query or '').lstrip('&?')}"
    url = f"https://{school_id}.cpoms.net/api/v1/{endpoint}" + query

    # set the request headers
    headers = {
        'Authorization': 'Token ' + token,
        'Content-Type': 'application/json'
    }

    all_data = []
    page = 1

    while True:
        print(url + "page " + str(page))
        response = requests.get(url, headers=headers, params={"page": page})

        # Check if the request was successful
        if response.status_code != 200:
            # The token is a secret, so it is deliberately not persisted in the
            # debug file; the row is kept so the file layout stays the same.
            data = [
                Row(Description="Error fetching data from URL", Value=url),
                Row(Description="Response Text", Value=response.text),
                Row(Description="School ID", Value=school_id),
                Row(Description="Token", Value="<redacted>")
            ]
            df = spark.createDataFrame(data)
            filename = bronze_path + "debug_info.csv"

            # Writing the debug file is best effort; a failure must not mask
            # the original request error.
            try:
                df.write.csv(filename, header=True, mode="overwrite")
                print("Written the data to", filename)
            except Exception as e:
                print(f"Error occurred: {e}")

            print(f"Error fetching data from {url}. Stopping.")
            break

        response_data = response.json()
        page += 1

        # 'data' may be a single object or a list of records
        data_from_response = response_data.get("data") or []
        if isinstance(data_from_response, dict):
            # A single object cannot be paginated, so stop after storing it.
            all_data.append(data_from_response)
            break

        all_data.extend(data_from_response)

        if len(data_from_response) < page_size:
            break

    return all_data

In [None]:
def load_bronze(spark, endpoint: str, school_id: str, urn: str, token: str, limit=None, query=None, use_date_chunk=False, audit_log_file="audit_log.json", override_date=None):
    """
    Pull one CPOMS endpoint for one school and write it to the bronze folder as JSON.

    The pull starts at `override_date` when given, otherwise at the time stored
    in the notebook-level `LastUpdated` dict for this school and endpoint. With
    `use_date_chunk` set and a start more than 7 days back, the range is fetched
    in 7-day chunks. The run time is recorded in `LastUpdated`, which is then
    written to bronze_path + 'last_run', and an entry is appended to the
    notebook-level `audit_log`.

    Args:
        spark: The active Spark session.
        endpoint (str): CPOMS endpoint name; also used as the output file name.
        school_id (str): School identifier used for the API host and as the
            key into `LastUpdated`.
        urn (str): School URN; used for the output folder and the 'school_id'
            and 'unique_key' columns.
        token (str): API token for the school.
        limit: Not used by this function.
        query (str | None): Extra query parameters; None is treated as ''.
        use_date_chunk (bool): Fetch long ranges in 7-day chunks.
        audit_log_file (str): Not used by this function.
        override_date (str | None): Start time as 'YYYY-MM-DD HH:MM:SS'; takes
            precedence over the stored last-run time.

    Returns:
        pandas.DataFrame: Always an empty DataFrame; the data itself is written
        to storage.
    """
    global audit_log
    df = pd.DataFrame()  # placeholder return value; never populated
    data_list = []

    # The query is extended with string operations below, so None becomes ''.
    query = query or ""

    # Captured before any request: used for the audit duration, as the end of
    # the fetch window and as the new last-run stamp.
    start_time = datetime.now()
    now = start_time
    fallback_time = now - timedelta(weeks=2)

    if override_date:
        # The stored value is not needed here; the call is kept because the
        # original made it (presumably it creates the LastUpdated entry).
        oeai.safe_get_or_create(LastUpdated, override_date, school_id, endpoint)
        last_updated_time = datetime.strptime(override_date, "%Y-%m-%d %H:%M:%S")
    else:
        last_updated_str = oeai.safe_get_or_create(LastUpdated, "2023-09-01 00:00:00", school_id, endpoint)
        if isinstance(last_updated_str, str):
            last_updated_time = datetime.strptime(last_updated_str, "%Y-%m-%d %H:%M:%S")
        elif isinstance(last_updated_str, datetime):
            last_updated_time = last_updated_str
        else:
            # None or an unexpected type: fall back to the last two weeks
            last_updated_time = fallback_time

    # If the last update is more than 7 days ago, fetch in 7-day chunks
    if use_date_chunk and (now - last_updated_time).days > 7:
        for start_date, end_date in oeai.generate_date_chunks(last_updated_time, now, chunk_size=timedelta(days=7)):
            chunk_query = update_query_with_chunks_cpoms(query, start_date, end_date)
            r = get_school_data(token, school_id, endpoint, chunk_query)

            if r:
                if isinstance(r, dict) and 'data' in r:
                    data_list.append(r['data'])
                elif isinstance(r, list):
                    data_list.extend(r)
            else:
                # No exception is active here, so the chunk is identified in
                # the message instead of an (empty) traceback.
                error_message = (
                    f"Empty response, not adding to data_list: {endpoint} for {school_id} "
                    f"between {start_date} and {end_date}"
                )
                oeai.log_error(spark, error_message, error_log_path)
    else:
        formatted_date = last_updated_time.strftime('%Y-%m-%dT%H:%M:%SZ')
        query += "&filters[updated_at.gt]=" + formatted_date

        r = get_school_data(token, school_id, endpoint, query)

        # Ensure the data is always a list
        if isinstance(r, dict) and 'data' in r:
            data_list = [r['data']]
        elif isinstance(r, list):
            data_list = r

    # Construct the directory path and create it if it doesn't exist
    # NOTE(review): os works on the driver's local filesystem; if bronze_path
    # is a remote URI this may not create the intended folder - confirm.
    school_folder = os.path.join(bronze_path, urn)
    os.makedirs(school_folder, exist_ok=True)

    # Stamp the run with the time taken BEFORE the requests were made, so that
    # records updated while the fetch was running are picked up next time.
    # NOTE(review): the stamp is written even when the fetch returned nothing
    # because of an API error; get_school_data does not report failures.
    LastUpdated.setdefault(school_id, {})[endpoint] = now.strftime("%Y-%m-%d %H:%M:%S")
    last_updated_df = spark.createDataFrame([LastUpdated])
    last_updated_df.repartition(1).write.mode("overwrite").json(bronze_path + 'last_run')

    output_path = school_folder + "/" + endpoint + ".json"

    if not data_list:
        oeai.save_empty_json(spark, output_path)
    else:
        try:
            # NOTE(review): the jobs in this notebook pass endpoint names
            # without a '.json' suffix (e.g. 'categories'), so this branch looks
            # unreachable - confirm whether 'categories' was intended.
            nested = ['categories.json']
            if endpoint in nested:
                flattened_data_list = oeai.flatten_nested_json(json.dumps(data_list))
            else:
                flattened_data_list = [oeai.flatten_json(item) for item in data_list]

            # list of dicts -> Pandas DataFrame -> PySpark DataFrame
            pandas_df = pd.DataFrame(flattened_data_list)
            r_df = spark.createDataFrame(pandas_df)

            # Add school_id and unique_key to the DataFrame
            r_df = r_df.withColumn("school_id", lit(urn))
            if "student_data_id" in r_df.columns:
                r_df = r_df.withColumn("unique_key", concat(lit(urn), r_df["student_data_id"].cast("string"), r_df["id"].cast("string")))
            else:
                r_df = r_df.withColumn("unique_key", concat(lit(urn), r_df["id"].cast("string")))

            # Save the DataFrame to a JSON file
            r_df.write.mode("overwrite").json(output_path)

        # if the key doesn't exist, skip it
        except KeyError as e:
            print(f"KeyError for key: {e}")

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    # Update the audit log (all values are stored as strings)
    audit_data = {
        "school_id": school_id,
        "endpoint": endpoint,
        "query": query,
        "start_time": start_time.strftime('%Y-%m-%d %H:%M:%S'),
        "end_time": end_time.strftime('%Y-%m-%d %H:%M:%S'),
        "duration": str(duration),
        "records_returned": str(len(data_list)),
    }
    audit_log.append(audit_data)

    return df

In [None]:
# introduce a limit for testing or leave as None for Live
Limit = None

# Start date passed to every load_bronze call instead of the stored last-run time.
# NOTE(review): this looks like a back-fill setting; set it to None to let
# load_bronze use the stored last-run time instead.
OverrideDate = "2024-01-01 00:00:00"

# Resolve the API token of every configured school ("<name>-<urn>" entries)
schools_list = []
for school_id in school_ids:
    try:
        name, urn = school_id.strip().split('-', 1)
    except ValueError:
        # A malformed entry must not stop the remaining schools from loading.
        print(f"Skipping malformed school id '{school_id}': expected '<name>-<urn>'")
        continue

    secret_name = f"cpoms-{name}"
    try:
        token = oeai.get_secret(spark, secret_name, keyvault_linked_service, keyvault)
        schools_list.append({"school_id": name, "urn": urn, "token": token})
    except Exception as e:
        print(f"Error retrieving secret for {school_id}: {e}")

# Define the path
json_file_path = bronze_path + 'last_run'

# Reset LastUpdated
LastUpdated = {}

# Read the last-run state; a missing or unreadable file means "start empty"
try:
    df = spark.read.json(json_file_path)
    rows = df.collect()
    if rows:
        LastUpdated = rows[0].asDict()
except Exception as e:
    print(f"Could not read last run state from {json_file_path}: {e}")
    LastUpdated = {}

# Convert 'LastUpdated' Row objects to dictionaries
for key, value in LastUpdated.items():
    if isinstance(value, Row):
        LastUpdated[key] = oeai.row_to_dict(value)

# (endpoint, limit, query, use_date_chunk) - identical for every school
daily_jobs = [
    ("students", Limit, "?", False),
    ("categories", Limit, "?", False),
    ("incidents", Limit, "?", True),
    ("actions", Limit, "?", True),
    ("links", Limit, "?", False),
]

for school in schools_list:
    school_id = school["school_id"]
    token = school["token"]
    urn = school["urn"]

    # call load bronze for each of the daily jobs
    for job_endpoint, job_limit, job_query, job_use_date_chunk in daily_jobs:
        load_bronze(spark, job_endpoint, school_id, urn, token, job_limit, job_query, job_use_date_chunk, override_date=OverrideDate)

    # Save the audit log (after every school)
    oeai.save_audit_log(spark, audit_log, bronze_path + "audit_log.json")