In [None]:
%run oeai_py

In [None]:
# Create the OEAI helper instance used throughout this notebook and select the
# platform it targets: "Synapse" or "Fabric".
# NOTE(review): OEAI is not defined in this notebook; presumably it is provided by
# the `%run oeai_py` cell above -- confirm.
oeai = OEAI("Fabric")

In [None]:
# --- Key Vault settings: CHANGE THESE VALUES FOR YOUR ENVIRONMENT ---
# On Fabric, `keyvault` must be the full vault URL, e.g. "https://key_vault_name.vault.azure.net/".
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"
# Name of the Key Vault linked service. Not required for Fabric.
keyvault_linked_service = "INSERT_YOUR_LINKED_SERVICE_NAME_HERE"


def _read_secret(secret_name):
    """Fetch one secret via the OEAI helper using the Key Vault settings above."""
    return oeai.get_secret(spark, secret_name, keyvault_linked_service, keyvault)


# Bronze storage path plus the tenant / client id / client secret that are later
# used to request a Microsoft Graph token; all four are stored as Key Vault secrets.
bronze_path = _read_secret("msgraph-rp-bronze")
msgraph_tenantid = _read_secret("msgraph-tenantid")
msgraph_clientid = _read_secret("msgraph-clientid")
msgraph_secret = _read_secret("msgraph-secret")

In [None]:
from pyspark.sql import functions as F
# Read today's and yesterday's dates from Spark in ONE query. current_date() is
# evaluated once per query, so the two values are always exactly one day apart
# (two separate queries could straddle midnight and disagree).
_run_dates = spark.sql(
    "SELECT current_date() AS today, date_sub(current_date(), 1) AS yesterday"
).collect()[0]
today = _run_dates["today"]
yesterday = _run_dates["yesterday"]

# Output today's and yesterday's dates
print("Today's Date:", today)
print("Yesterday's Date:", yesterday)

In [None]:
from datetime import datetime, timedelta
from pyspark.sql import Row

In [None]:
# Initialise the audit log state for this run.
# Paths below are built by plain string concatenation, so bronze_path is assumed
# to already end with a path separator -- TODO confirm against the stored secret.
# Load the existing audit log through the OEAI helper.
audit_log = oeai.load_audit_log(spark, bronze_path + "audit_log.json")
# Starts empty; presumably collects audit entries produced during this run
# (it is not referenced by the cells visible here -- verify).
audit_logs = []
# Location of the error log text file under the bronze path.
error_log_path = bronze_path + "error_log.txt"

In [None]:
# OAuth2 v2.0 token endpoint for the configured tenant
token_url = f"https://login.windows.net/{msgraph_tenantid}/oauth2/v2.0/token"

# Form fields for a client-credentials token request scoped to Microsoft Graph
token_body = dict(
    grant_type='client_credentials',
    client_id=msgraph_clientid,
    client_secret=msgraph_secret,
    scope='https://graph.microsoft.com/.default',
)

In [None]:
# Function to call the API for a token
def get_bearer_token(token_url, body, timeout=30):
    """Request an OAuth2 access token and return it.

    Args:
        token_url: Token endpoint URL to POST to.
        body: Form fields sent as the request body (e.g. grant type, client id,
            client secret, scope).
        timeout: Seconds to wait for the token endpoint before giving up.
            Without a timeout `requests` can block indefinitely.

    Returns:
        The value of ``access_token`` from the JSON response, or ``None`` if the
        request failed, returned an error status, or the response did not
        contain an access token. Failures are printed, not raised.
    """
    try:
        response = requests.post(token_url, data=body, timeout=timeout)
        # Raises for 4xx/5xx responses so they are reported by the handler below
        response.raise_for_status()
        access_token = response.json()['access_token']
    except Exception as e:
        # Deliberately best-effort: callers check for None instead of catching
        print(f"Error obtaining token: {e}")
        return None

    # Only report success once a token has actually been extracted
    print("Token request successful.")
    return access_token

In [None]:
import time
import requests

def _retry_delay_seconds(response, attempt):
    """Return how long to wait before retrying a throttled request.

    Uses the numeric ``Retry-After`` response header when present; otherwise
    falls back to exponential backoff (2, 4, 8, ... seconds).
    """
    try:
        delay = float(response.headers.get('Retry-After'))
    except (TypeError, ValueError):
        delay = float(2 ** attempt)
    return max(delay, 0.0)


def call_api_with_token(url, params, token, timeout=60, max_retries=3):
    """GET a paged API resource and return every record from every page.

    Follows ``@odata.nextLink`` until no further link is returned, collecting
    the ``value`` list of each page.

    Args:
        url: URL of the first page.
        params: Query parameters for the first request. They are cleared for
            follow-up pages because the nextLink already carries them.
        token: Bearer token placed in the Authorization header.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_retries: How many times a single page is retried when the service
            answers 429 or 503 (throttled / temporarily unavailable).

    Returns:
        ``{'value': [...]}`` holding all collected records, or ``None`` if any
        request raised or returned a non-200 status (after retries). Records
        collected from earlier pages are discarded in that case.
    """
    headers = {
        'Authorization': f'Bearer {token}',
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }
    all_data = []  # Store all records from all pages
    retries = 0  # Retries used for the page currently being requested
    while url:  # Loop as long as there's a URL to call
        try:
            print(f"Making API call to: {url}")
            print(f"With params: {params}")
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            print(f"Response Status: {response.status_code} - {response.reason}")

            # Throttling is transient: wait as instructed and retry the same page
            if response.status_code in (429, 503) and retries < max_retries:
                retries += 1
                delay = _retry_delay_seconds(response, retries)
                print(f"Throttled; retry {retries}/{max_retries} in {delay} seconds.")
                time.sleep(delay)
                continue

            if response.status_code != 200:
                print(f"Non-success response received: {response.text}")
                return None
            retries = 0  # Page succeeded; the next page gets a fresh retry budget

            data = response.json()
            if 'value' in data:
                all_data.extend(data['value'])  # Append the current page of records
                print(f"Records received: {len(data['value'])}")
            else:
                print("No 'value' key in response.")
                break

            # Handle pagination
            url = data.get('@odata.nextLink')  # Update the URL for the next call
            if url:
                print(f"Pagination link found: {url}")
                params = {}  # Clear params because nextLink includes required parameters
            else:
                print("No pagination link found.")
        except Exception as e:
            print(f"Error in API call: {e}")
            return None
    print("API call successful. Total records received: ", len(all_data))
    return {'value': all_data}



In [None]:
def fetch_assignments_for_class(class_id, gt_date, access_token, spark, max_records=None):
    """Fetch the assignments of one education class via fetch_graph_data.

    Builds the Graph v1.0 assignments URL for `class_id` and delegates to
    fetch_graph_data with no extra query parameters; `gt_date`, `access_token`,
    `spark` and `max_records` are passed through unchanged. Returns whatever
    fetch_graph_data returns.
    """
    no_query_params = {}
    endpoint = "https://graph.microsoft.com/v1.0/education/classes/{}/assignments".format(class_id)
    return fetch_graph_data(gt_date, endpoint, access_token, spark, no_query_params, max_records)

In [None]:
def fetch_graph_data(gt_date, api_url, access_token, spark, params, max_records=None):
    """Fetch records from an API endpoint and return them as an all-string DataFrame.

    Args:
        gt_date: Accepted for interface compatibility; not used by this function.
        api_url: Endpoint URL to fetch.
        access_token: Bearer token passed on to call_api_with_token.
        spark: SparkSession used to build the DataFrame.
        params: Query parameters for the first request.
        max_records: Optional cap; at most this many records are kept.

    Returns:
        A DataFrame with one nullable string column per key seen in any record
        (columns in first-seen order), or ``None`` when no records were
        retrieved. Non-null values are converted with ``str()``, so nested
        dicts/lists are stored as their Python text representation.
    """
    items = []

    while api_url and (max_records is None or len(items) < max_records):
        response = call_api_with_token(api_url, params, access_token)

        # A failed call leaves api_url unchanged, so looping again would repeat
        # the same failing request forever; stop and keep what we have.
        if not response or 'value' not in response:
            print(f"Stopping fetch for {api_url}: API call failed or returned no 'value'.")
            break

        items.extend(response['value'])

        if max_records is not None and len(items) >= max_records:
            items = items[:max_records]
            break

        api_url = response.get('@odata.nextLink', None)
        params = {}  # Clear params as the nextLink URL will have them

    if not items:
        return None

    # Imported here so the function does not depend on names leaked in by another cell
    from pyspark.sql.types import StringType, StructField, StructType

    # Records may omit keys or order them differently, so the schema is the
    # union of all keys and each row is laid out explicitly in schema order.
    field_names = list(dict.fromkeys(key for item in items for key in item))
    schema = StructType([StructField(name, StringType(), True) for name in field_names])

    def _to_string_tuple(item):
        return tuple(
            str(item[name]) if item.get(name) is not None else None
            for name in field_names
        )

    row_rdd = spark.sparkContext.parallelize(items).map(_to_string_tuple)
    return spark.createDataFrame(row_rdd, schema=schema)

In [None]:
# Function to call the API for each day within the date range and aggregate results
def fetch_rp_data_for_date_range(start_date, end_date, api_url, access_token, spark):
    """Fetch records day by day for [start_date, end_date] and combine them.

    One API call is made per day. Each call is restricted to that day only
    (``submissionDateTime ge <day> and submissionDateTime lt <next day>``), so
    consecutive days do not return overlapping records and the combined result
    contains no duplicates caused by the iteration itself.

    Args:
        start_date: First day to fetch (date or datetime).
        end_date: Last day to fetch, inclusive.
        api_url: Endpoint URL to query.
        access_token: Bearer token passed on to call_api_with_token.
        spark: SparkSession used to build the DataFrames.

    Returns:
        A DataFrame with the records of all days that returned data, or
        ``None`` if no day returned any record. Days whose API call fails are
        skipped.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    one_day = timedelta(days=1)
    current_date = start_date
    aggregated_df = None

    while current_date <= end_date:
        next_date = current_date + one_day
        window_start = current_date.strftime(timestamp_format)
        window_end = next_date.strftime(timestamp_format)
        # Half-open window [day, next day): without the upper bound every call
        # would return everything submitted after `current_date`.
        params = {
            "$filter": (
                f"submissionDateTime ge {window_start} "
                f"and submissionDateTime lt {window_end}"
            )
        }

        # Make the API call
        daily_response = call_api_with_token(api_url, params, access_token)

        # Process the response and create a DataFrame
        daily_items = daily_response.get('value') if daily_response else None
        if daily_items:
            daily_df = spark.createDataFrame([Row(**item) for item in daily_items])
            # Explicit None check: a DataFrame's truthiness carries no meaning
            if aggregated_df is None:
                aggregated_df = daily_df
            else:
                aggregated_df = aggregated_df.union(daily_df)

        current_date = next_date

    return aggregated_df


In [None]:
def _write_single_json(df, path):
    """Write `df` to `path` as JSON in a single partition, replacing existing output."""
    df.coalesce(1).write.mode('overwrite').json(path)


def execute_api_calls(api_endpoints_with_params, gt_date, token_url, token_body, spark, bronze_path, max_records):
    """Fetch every configured endpoint and save each result as JSON under bronze_path.

    For the endpoint named "classes", the assignments of every returned class
    are fetched as well and saved together under ``<bronze_path>/assignments/``.

    Args:
        api_endpoints_with_params: Mapping of endpoint name to ``(url, params)``.
        gt_date: Passed through to the fetch helpers.
        token_url: Token endpoint used to obtain the bearer token.
        token_body: Form fields for the token request.
        spark: SparkSession used to build DataFrames.
        bronze_path: Output root; a trailing "/" is tolerated.
        max_records: Cap on records kept per fetch (applied per endpoint and
            per class).

    Returns:
        None. If no access token can be obtained, a message is printed and no
        API calls are made.
    """
    access_token = get_bearer_token(token_url, token_body)
    if not access_token:
        # Every request would be rejected without a token, so stop early
        print("No access token obtained; skipping all API calls.")
        return

    # Other cells append file names directly to bronze_path, which implies it
    # may end with "/"; strip it so joined paths do not contain "//".
    base_path = bronze_path.rstrip("/")
    all_assignments_df = None  # Accumulates assignments across all classes

    for endpoint_name, (api_url, params) in api_endpoints_with_params.items():
        print(f"Fetching data for: {endpoint_name}")
        endpoint_df = fetch_graph_data(gt_date, api_url, access_token, spark, params, max_records)
        if endpoint_df is None:
            continue

        endpoint_path = f"{base_path}/{endpoint_name}/"
        _write_single_json(endpoint_df, endpoint_path)

        if endpoint_name != "classes":
            print(f"Data for {endpoint_name} saved to: {endpoint_path}")
            continue

        print(f"Class data saved to: {endpoint_path}")

        # Only the id column is needed on the driver to fetch assignments
        class_ids = [row["id"] for row in endpoint_df.select("id").collect()]

        for class_id in class_ids:
            print(f"Fetching assignments for class ID: {class_id}")
            assignments_df = fetch_assignments_for_class(class_id, gt_date, access_token, spark, max_records)
            if assignments_df is None:
                continue

            if all_assignments_df is None:
                all_assignments_df = assignments_df
            else:
                # Match columns by name: per-class frames can differ in column
                # order or content, and a positional union would misalign them.
                all_assignments_df = all_assignments_df.unionByName(
                    assignments_df, allowMissingColumns=True
                )

    # Save the aggregated assignments DataFrame to a single JSON file
    if all_assignments_df is not None:
        assignments_path = f"{base_path}/assignments/"
        _write_single_json(all_assignments_df, assignments_path)
        print(f"All assignments data saved to: {assignments_path}")


In [None]:
# Earliest creation date of interest; also handed to execute_api_calls below
gt_date = datetime.strptime("2018-09-01", "%Y-%m-%d").date()
# Maximum number of records kept per fetch
max_records = 15000

# gt_date rendered as a timestamp string for use inside OData $filter expressions
date_filter = gt_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

# Endpoint name -> (URL, query parameters).
#
# Disabled entries, kept for reference:
#   "assignments": ("https://graph.microsoft.com/v1.0/education/assignments", {"$filter": f"createdDateTime ge {date_filter}"}),
#       Response Status: 400 - Bad Request
#       Non-success response received: {"error":{"code":"BadRequest","message":"Resource not found for the segment 'assignments'.","innerError":{"date":"2024-04-08T14:39:55","request-id":"450714eb-9239-41fa-af6f-fcb7d44d0571","client-request-id":"450714eb-9239-41fa-af6f-fcb7d44d0571"}}}
#   "reflect": ("https://graph.microsoft.com/beta/education/reports/reflectCheckInResponses", {}),
api_endpoints_with_params = dict(
    schools=("https://graph.microsoft.com/v1.0/education/schools", dict()),
    users=("https://graph.microsoft.com/v1.0/users", {"$filter": f"createdDateTime ge {date_filter}"}),
    classes=("https://graph.microsoft.com/v1.0/education/classes", dict()),
)

# Note: Assignment has a lastModifiedDateTime

# Fetch all configured endpoints and write the results under bronze_path
execute_api_calls(
    api_endpoints_with_params,
    gt_date,
    token_url,
    token_body,
    spark,
    bronze_path,
    max_records,
)

In [None]:
# Obtain a fresh bearer token for the Reading Progress report calls
access_token = get_bearer_token(token_url, token_body)

# Define the date range: from the fixed start date up to and including yesterday
# (`yesterday` was read from Spark in an earlier cell)
start_date = datetime.strptime("2024-04-01", "%Y-%m-%d").date()
end_date = yesterday

# API endpoint
api_url = "https://graph.microsoft.com/beta/education/reports/readingassignmentsubmissions"

# Fetch data for the date range. get_bearer_token returns None on failure; in
# that case skip the per-day API calls, which would all be rejected anyway.
df = None
if access_token:
    df = fetch_rp_data_for_date_range(start_date, end_date, api_url, access_token, spark)
else:
    print("Skipping Reading Progress fetch: no access token was obtained.")

# Save the DataFrame to a JSON file (explicit None check rather than relying
# on the truthiness of a DataFrame object)
if df is not None:
    path = f"{bronze_path}/readingprogress/"
    df.coalesce(1).write.mode("overwrite").json(path)