In [None]:
import requests
import json
import base64
from datetime import datetime
import os
import time

In [None]:
# Fetch the OAuth client id and client secret from Azure Key Vault.
# NOTE(review): both lookups show the same placeholder secret name here;
# confirm that each call points at its own secret in the real notebook.
client_id = mssparkutils.credentials.getSecret(
    'AZURE_KEY_VAULT_URL',
    'SECRET_ID',
)
client_secret = mssparkutils.credentials.getSecret(
    'AZURE_KEY_VAULT_URL',
    'SECRET_ID',
)

# Build the "<id>:<secret>" pair and Base64-encode it (bytes in, text out).
identifier = "{}:{}".format(client_id, client_secret)
identifier_bytes = identifier.encode()
credentials = base64.b64encode(identifier_bytes).decode()

### Get Data from Multiple API Endpoints and Store Using Best Practice Folder Hierarchy

In [None]:
# Endpoint catalogue: one dictionary per data category, mapping an endpoint
# name to its URL. The data-request cell uses each key as a subfolder name
# and as part of the output file name, so keys must stay stable.

# Personal information endpoints
personal_information = {
    "people": "URL",
    "disabilities": "URL",
    "maritalstatus": "URL",
    "dependants": "URL",
    "havassessment": "URL",
}

# Contact details endpoints
contact_details = {
    "emergencycontacts": "URL",
    "communications": "URL",
    "addresses": "URL",
}

# Bank details endpoints
bank_details = {
    "bankdetails": "URL",
}

# Skills and qualifications endpoints
skills_and_qualifications = {
    "professionalqualifications": "URL",
    "education": "URL",
    "externalworkhistory": "URL",
    "languageskills": "URL",
    "personalskills": "URL",
    "licences": "URL",
}

# Employment endpoints
# NOTE(review): "calendarassigments", "empoymentstatus" and
# "empoymentcontracts" look misspelled; they end up in folder and file names,
# so confirm with downstream consumers before renaming them.
employment = {
    "calendarassigments": "URL",
    "employments": "URL",
    "empoymentstatus": "URL",
    "empoymentcontracts": "URL",
    "smcr": "URL",
    "nationalcontractsalarylevels": "URL",
    "nationalcontractqualifications": "URL",
}

# Deployment endpoints
deployment = {
    "persongrades": "URL",
    "locationdeployments": "URL",
    "jobdeployments": "URL",
    "deploymentsorgunitorpositions": "URL",
    "costcentredeployments": "URL",
    "reportsto": "URL",
    "approvesbytype": "URL",
}

In [None]:
# All endpoint dictionaries gathered into one list, in category order.
dicts = [
    personal_information,
    contact_details,
    bank_details,
    skills_and_qualifications,
    employment,
    deployment,
]

In [None]:
# Category name -> endpoint dictionary. The category name is used as the
# top-level folder when responses are written to the lakehouse.
dict_mapping = dict(
    personal_information=personal_information,
    contact_details=contact_details,
    bank_details=bank_details,
    skills_and_qualifications=skills_and_qualifications,
    employment=employment,
    deployment=deployment,
)

In [None]:
# Initialize the token request (get_access_token is defined in the Functions notebook).
# start_time holds the wall-clock time (seconds since the epoch) taken just
# before the token is requested; the data-request cell consults it when
# deciding whether to request a new token.
start_time = time.time()
access_token = get_access_token()

### Data Request

In [None]:
# Run a series of GET requests through all the defined endpoints and store
# each JSON response as
#   /lakehouse/Files/API Endpoints/<category>/<YYYYMMDD>/<endpoint>/<YYYYMMDD>_APINAME_<endpoint>.json

# Age (in seconds) after which the bearer token is renewed before a request.
TOKEN_REFRESH_SECONDS = 260

# Give up on a request when the server stays silent this long, so a stalled
# connection cannot hang the notebook. NOTE(review): tune to the API's latency.
REQUEST_TIMEOUT_SECONDS = 120

# Stamp the whole run with one date so a run that crosses midnight does not
# spread its files over two date folders.
current_date = datetime.now().strftime("%Y%m%d")

# start_time was taken right before the initial token request, so it marks
# when the current token was issued. Track the issue time separately so that
# start_time itself is left untouched.
token_issued_at = start_time

for name, dictionary in dict_mapping.items():
    # Category folder for today's pull.
    output_dir = os.path.join("/lakehouse/Files/API Endpoints", name, current_date)
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print("An error occurred:", e)
        continue

    for key, value in dictionary.items():
        try:
            # Renew the token based on its age (elapsed seconds), not on the
            # raw epoch timestamp, and restart the clock after each renewal.
            if time.time() - token_issued_at > TOKEN_REFRESH_SECONDS:
                access_token = get_access_token()
                token_issued_at = time.time()

            headers = {
                "Authorization": "Bearer " + str(access_token),
                "Host": "SERVER"
            }

            response = requests.get(value, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
            print(value)

            # Only a 200 response with a non-empty body is stored.
            if response.status_code != 200 or not response.content:
                print(f"Error: Empty or invalid response from the server (HTTP Status Code: {response.status_code})")
                continue

            json_file = response.json()

            # One subfolder per endpoint, named after its key.
            inner_dir = os.path.join(output_dir, key)
            os.makedirs(inner_dir, exist_ok=True)

            # Write the JSON data to the output file.
            output_file = os.path.join(inner_dir, f"{current_date}_APINAME_{key}.json")
            with open(output_file, "w") as f:
                json.dump(json_file, f)

            # Report where the data went and how large the file is.
            file_size = os.path.getsize(output_file)
            print("JSON data has been exported to:", output_file)
            print("File size:", file_size, "bytes")

        except requests.RequestException as e:
            print(f"Error: Request failed - {e}")
        except Exception as e:
            # Any other failure (undecodable JSON body, file-system error,
            # token renewal error) skips this endpoint only; the remaining
            # endpoints of the category are still attempted.
            print("An error occurred:", e)

### Check the last pull and retry endpoints where files are missing

In [None]:
# Verify the most recent pull: for every category folder, count the endpoint
# subfolders in its latest date folder and compare that count with the
# expected number listed in the reference CSV.

# List of directories
folders = ["personal_information", "contact_details", "bank_details", "identification", "skills_and_qualifications", "employment", "deployment"]
data_list = []

# The reference file does not change between folders, so read it once, before
# the loop. This also guarantees check_file exists for the join below even
# when no folder yields a result.
check_file = spark.read.csv("Files/Reference csv/subfolder_check.csv", header=True) # subfolder_check.csv contains expected number of subfolders per endpoint

for folder in folders:
    parent_path = f"/lakehouse/Files/API Endpoints/{folder}/"
    latest_folder = find_latest_date_folder(parent_path)

    if not latest_folder:
        print(f"No subfolders found in {parent_path}")
        continue

    parent_folder_path = os.path.join(parent_path, latest_folder)

    # Count only directories; each one corresponds to a pulled endpoint.
    items = os.listdir(parent_folder_path)
    subfolders = [item for item in items if os.path.isdir(os.path.join(parent_folder_path, item))]
    subfolder_count = len(subfolders)

    # The join key is the category folder name itself; use it directly rather
    # than slicing a fixed number of characters off the full path.
    modified_path = folder

    data_list.append({
        "ModifiedPath": modified_path,
        "LastSubfolderCount": subfolder_count,
        "LatestFolder": latest_folder
    })

if not data_list:
    # createDataFrame cannot infer a schema from an empty list; fail with a
    # message that names the actual problem instead.
    raise RuntimeError("No dated folders found under /lakehouse/Files/API Endpoints/ - nothing to check")

df = spark.createDataFrame(data_list)

check = df.join(check_file, df.ModifiedPath == check_file.endpoint, "inner").drop("nr_of_endpoints", "ModifiedPath")

# Compare LastSubfolderCount with expected_nr_subfolders. Columns are taken
# from the DataFrame itself, so no separate `col` import is required.
matched_count = check.filter(check["LastSubfolderCount"] == check["expected_nr_subfolders"]).count()
missing_folders = check.filter(check["LastSubfolderCount"] != check["expected_nr_subfolders"])

# Decide from the mismatching rows themselves rather than from a hard-coded
# matched-row threshold, which goes stale whenever the folder list or the
# reference file changes.
if missing_folders.count() > 0:
    print("Subfolders from endpoint is missing:")
    missing_folders.show(truncate=False)
else:
    missing_folders = None