In [3]:
import boto3 
from automation.glue_wrapper import GlueWrapper

## Load source data into Landing if needed

In [99]:
# Copy the three full-extract CSV files (claim, policyholder, provider) from the
# local temp directory into their landing prefixes under
# s3://project-lf/etl/landing/claim_db/<table>/full/, keeping the file names.
# Local source files:
# /home/glue_user/project_lf/ETL-TDD/temp/202305211851-claim-full.csv
# /home/glue_user/project_lf/ETL-TDD/temp/202305221132-policyholder-full.csv
# /home/glue_user/project_lf/ETL-TDD/temp/202305221136-provider-full.csv
# NOTE(review): the local paths are absolute and tied to the glue_user home
# directory; they need adjusting when the notebook runs elsewhere.

!aws s3 cp /home/glue_user/project_lf/ETL-TDD/temp/202305211851-claim-full.csv s3://project-lf/etl/landing/claim_db/claim/full/202305211851-claim-full.csv

!aws s3 cp /home/glue_user/project_lf/ETL-TDD/temp/202305221132-policyholder-full.csv s3://project-lf/etl/landing/claim_db/policyholder/full/202305221132-policyholder-full.csv

!aws s3 cp /home/glue_user/project_lf/ETL-TDD/temp/202305221136-provider-full.csv s3://project-lf/etl/landing/claim_db/provider/full/202305221136-provider-full.csv

upload: temp/202305211851-claim-full.csv to s3://project-lf/etl/landing/claim_db/claim/full/202305211851-claim-full.csv
upload: temp/202305221132-policyholder-full.csv to s3://project-lf/etl/landing/claim_db/policyholder/full/202305221132-policyholder-full.csv
upload: temp/202305221136-provider-full.csv to s3://project-lf/etl/landing/claim_db/provider/full/202305221136-provider-full.csv


## Reset every staging tier except for Landing

In [94]:
# Empty the raw, access and optimised tiers (landing is left untouched).
# `aws s3 rm` does not expand UNIX-style wildcards in its path argument, so a
# trailing `*` would only target an object literally named "*". Deleting
# everything below a prefix requires --recursive on the prefix itself.
!aws s3 rm s3://project-lf/etl/raw/ --recursive
!aws s3 rm s3://project-lf/etl/access/ --recursive
!aws s3 rm s3://project-lf/etl/optimised/ --recursive


delete: s3://project-lf/etl/access/claim_db/policyholder/full/202306020029/part-00000-79e063c3-76d4-4096-8d1d-0ab6299992b6-c000.snappy.parquet
delete: s3://project-lf/etl/access/claim_db/provider/full/202306020038/part-00000-63ca972b-5cf3-4c3d-853d-f6b132b95c52-c000.snappy.parquet
delete: s3://project-lf/etl/access/claim_db/claim/full/202306020029/part-00000-eb404331-4bca-4933-acdc-34a99d27e8bd-c000.snappy.parquet
delete: s3://project-lf/etl/access/claim_db/policyholder/full/202306020021/part-00000-40827f9f-963d-438e-9288-f09a783c755a-c000.snappy.parquet
delete: s3://project-lf/etl/access/claim_db/policyholder/full/202306020038/part-00000-52fda0c6-49a8-42e1-8e9b-27212710e988-c000.snappy.parquet
delete: s3://project-lf/etl/access/claim_db/provider/full/202306020021/part-00000-2c0b4719-cb90-4648-b7ad-be6a399309e4-c000.snappy.parquet
delete: s3://project-lf/etl/access/claim_db/claim/full/202306020021/part-00000-b8df7cc2-9071-45e2-95dc-3e856a430801-c000.snappy.parquet
delete: s3://project-

## Delete every `stage_*` Glue job

In [87]:
# Delete every Glue ETL job whose name starts with 'stage_'.
glue = boto3.client('glue', region_name='us-east-1')

# get_jobs returns its results one page at a time, so a single call can miss
# jobs; the paginator walks every page. Names are collected before anything is
# deleted so the listing is not modified while it is still being read.
stage_job_names = [
    job['Name']
    for page in glue.get_paginator('get_jobs').paginate()
    for job in page['Jobs']
    if job['Name'].startswith('stage_')
]

for stage_job_name in stage_job_names:
    glue.delete_job(JobName=stage_job_name)

## Refresh every .py job file in `code/jobs/` and `code/dependencies` 

In [None]:
# Remove the previously uploaded archive. It lives under code/dependencies/
# (the key the upload below writes to), not directly under code/.
!aws s3 rm s3://project-lf/code/dependencies/libraries.zip

# Create a new zip file. The old local archive is removed first because zip
# updates an existing archive in place and would keep entries for files that
# no longer exist on disk.
!rm -f libraries.zip
!zip -r libraries.zip *

# Copy libraries.zip to s3://project-lf/code/dependencies/libraries.zip
!aws s3 cp libraries.zip s3://project-lf/code/dependencies/libraries.zip

# Delete everything under s3://project-lf/code/ except the archive just uploaded.
# --exclude patterns are matched against the key relative to code/, so the
# pattern has to include the dependencies/ folder; a bare "libraries.zip"
# would not match and the fresh archive would be deleted again.
!aws s3 rm s3://project-lf/code/ --recursive --exclude "dependencies/libraries.zip"

In [98]:
# Copy the local libraries.zip to s3://project-lf/code/dependencies/libraries.zip
!aws s3 cp libraries.zip s3://project-lf/code/dependencies/libraries.zip


upload: ./libraries.zip to s3://project-lf/code/dependencies/libraries.zip


## Upload .py job files to S3

In [84]:
import boto3

# Destination: every job script is stored under this prefix of the bucket
bucket = 'project-lf'
prefix = 'code/jobs/'

# Client used for the uploads
s3_client = boto3.client('s3')

# Local job scripts to publish
files = [
    '/home/glue_user/project_lf/ETL-TDD/stage_claim_into_access.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_claim_into_optimised.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_claim_into_raw.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_date_into_optimised.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_location_into_optimised.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_policyholder_into_access.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_policyholder_into_optimised.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_policyholder_into_raw.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_procedure_into_optimised.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_provider_into_access.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_provider_into_optimised.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_provider_into_raw.py',
    '/home/glue_user/project_lf/ETL-TDD/stage_source_into_landing.py'
]

# Send each script to <prefix><file name>; the file name is whatever follows
# the last '/' of the local path
for file_path in files:
    file_name = file_path.rsplit('/', 1)[-1]
    s3_client.upload_file(file_path, bucket, prefix + file_name)

print(f'Uploaded {len(files)} files to s3://{bucket}/{prefix}')

Uploaded 13 files to s3://project-lf/code/jobs/


## Initialise the Glue client and job role

In [77]:
# Shared handles for the cells that create one Glue job per script stored in
# s3://project-lf/code/jobs/
import boto3

# Role name that is passed to Glue when a job is created
glue_role = 'data-quality-lf'

# Glue API client for the us-east-1 region
glue_client = boto3.client('glue', region_name='us-east-1')


## Get a list of all script objects in `project-lf/code/jobs/`

In [86]:
# Collect the S3 objects stored under s3://project-lf/code/jobs/
bucket, prefix = 'project-lf', 'code/jobs/'
s3_resource = boto3.resource('s3')
objects = s3_resource.Bucket(bucket).objects.filter(Prefix=prefix)

# Show every object summary found under the prefix as the cell output
list(objects)

[s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_claim_into_access.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_claim_into_optimised.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_claim_into_raw.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_date_into_optimised.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_location_into_optimised.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_policyholder_into_access.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_policyholder_into_optimised.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_policyholder_into_raw.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_procedure_into_optimised.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_provider_into_access.py'),
 s3.ObjectSummary(bucket_name='project-lf', key='code/jobs/stage_provid

## Create a job for every script object

In [88]:
# Create one Glue job per script object found under code/jobs/.
glue_client = boto3.client('glue', region_name='us-east-1')

# Settings shared by every job. They do not depend on the script, so they are
# defined once instead of on every loop iteration.
glue_role = 'data-quality-lf'
extra_py_files = 's3://project-lf/code/dependencies/libraries.zip'
glue_version = '4.0'
additional_python_modules = "holidays"

# Names of the jobs that exist after this cell has run (newly created or
# already present).
job_names = []
for s3_object in objects:
    print(s3_object.key)

    # A prefix listing can contain keys that are not job scripts (for example
    # an empty "folder" placeholder object); such keys would produce an empty
    # or meaningless job name, so only .py objects become jobs.
    if not s3_object.key.endswith('.py'):
        print("Skipping non-script object", s3_object.key)
        continue

    script_location = f's3://project-lf/{s3_object.key}'
    # Job name = script file name up to its first '.'
    job_name = script_location.split('/')[-1].split('.')[0]

    try:
        # Create the new Glue job
        response = glue_client.create_job(
            Name=job_name,
            Role=glue_role,
            Command={'Name': 'glueetl', 'ScriptLocation': script_location},
            GlueVersion=glue_version,
            DefaultArguments={
                '--extra-py-files': extra_py_files,
                '--additional-python-modules': additional_python_modules
            },
            WorkerType='G.1X',
            NumberOfWorkers=2
        )
        print(response)
    except glue_client.exceptions.AlreadyExistsException as e:
        # The job is already defined, so it is still recorded as available.
        print("Skipping job creation for", job_name, e)
    except Exception as e:
        # Any other failure means the job does not exist; report it and do not
        # record the name, but keep going with the remaining scripts.
        print("Failed to create job", job_name, e)
        continue

    job_names.append(job_name)

code/jobs/stage_claim_into_access.py
{'Name': 'stage_claim_into_access', 'ResponseMetadata': {'RequestId': 'a8d51c8f-04f6-4376-952d-615c1a7af271', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 01 Jun 2023 14:55:34 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '34', 'connection': 'keep-alive', 'x-amzn-requestid': 'a8d51c8f-04f6-4376-952d-615c1a7af271'}, 'RetryAttempts': 0}}
code/jobs/stage_claim_into_optimised.py
{'Name': 'stage_claim_into_optimised', 'ResponseMetadata': {'RequestId': 'b3365302-8774-4f81-a2b7-86a8f77b9afe', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 01 Jun 2023 14:55:35 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '37', 'connection': 'keep-alive', 'x-amzn-requestid': 'b3365302-8774-4f81-a2b7-86a8f77b9afe'}, 'RetryAttempts': 0}}
code/jobs/stage_claim_into_raw.py
{'Name': 'stage_claim_into_raw', 'ResponseMetadata': {'RequestId': '4705c06e-779c-4e6f-bad6-2573f07ce73d', 'HTTPStatusCode': 200, 'HTTPHeaders': {

## Define execution order of jobs

In [89]:
# Execution plan for the Glue jobs: each inner list is one wave of job names,
# and the waves are listed in the order in which they are to be run.
job_waves = [
    # Wave 1
    [
        'stage_claim_into_raw',
        'stage_provider_into_raw',
        'stage_policyholder_into_raw',
        'stage_date_into_optimised',
    ],
    # Wave 2
    [
        'stage_claim_into_access',
        'stage_provider_into_access',
        'stage_policyholder_into_access',
    ],
    # Wave 3
    [
        'stage_location_into_optimised',
        'stage_procedure_into_optimised',
    ],
    # Wave 4
    [
        'stage_policyholder_into_optimised',
        'stage_provider_into_optimised',
    ],
    # Wave 5
    [
        'stage_claim_into_optimised',
    ],
]

## Define a function to run jobs and wait until they have completed

In [90]:
import boto3
import time
import colorama
from colorama import Fore, Style
colorama.init()

# Create an AWS Glue client
glue_client = boto3.client('glue', region_name='us-east-1')

# Function to run a list of jobs and wait for them to finish
def run_jobs(jobs, poll_interval=10):
    """Start every Glue job in `jobs` and block until all of the runs have ended.

    Each job is started with `start_job_run`; the runs are then polled with
    `get_job_run` until every one has reached a state Glue will not change
    again. Each job's final state is printed once, colour-coded, and a
    separator line is printed after every polling pass.

    Args:
        jobs: Iterable of Glue job names. If a name appears more than once,
            only the run started last for that name is tracked.
        poll_interval: Seconds to sleep between polling passes.

    Returns:
        dict mapping each job name to its final JobRunState string, or to
        None when Glue reported that the run could not be found.

    Raises:
        Whatever `start_job_run` / `get_job_run` raise, except
        EntityNotFoundException while polling, which is reported and recorded.
    """
    # Every state in which a run has ended. Runs that end in TIMEOUT, ERROR or
    # EXPIRED never become FAILED/STOPPED, so leaving them out would make the
    # polling loop wait forever.
    finished_states = {'SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT', 'ERROR', 'EXPIRED'}
    # Ended states that are reported as errors, together with the run's message.
    error_states = {'FAILED', 'TIMEOUT', 'ERROR'}

    # Start each job and store its job run ID
    job_run_ids = {}
    for job_name in jobs:
        response = glue_client.start_job_run(JobName=job_name)
        job_run_id = response['JobRunId']
        job_run_ids[job_name] = job_run_id
        print(job_name, job_run_id)

    print("-"*80)

    final_states = {}
    # Runs that have not ended yet; a job leaves this dict as soon as its
    # outcome has been reported, so each outcome is printed exactly once.
    pending = dict(job_run_ids)

    while True:
        # Iterate over a snapshot because entries are removed inside the loop
        for job_name, job_run_id in list(pending.items()):
            try:
                response = glue_client.get_job_run(JobName=job_name, RunId=job_run_id)
            except glue_client.exceptions.EntityNotFoundException:
                # The run (or its job) no longer exists; stop waiting for it
                # rather than blocking the rest of the wave.
                print(Fore.YELLOW + f'\t{job_name} run {job_run_id} was not found.' + Style.RESET_ALL)
                final_states[job_name] = None
                del pending[job_name]
                continue

            status = response['JobRun']['JobRunState']
            if status not in finished_states:
                continue

            if status == 'SUCCEEDED':
                print(Fore.GREEN + '\t' + job_name + ' has ' + status + '.' + Style.RESET_ALL)
            elif status in error_states:
                error_message = response['JobRun'].get('ErrorMessage', 'No error message available')
                print(Fore.RED + f'\t{job_name} has {status}. Error message: {error_message}' + Style.RESET_ALL)
            else:
                print(Fore.YELLOW + f'\t{job_name} has {status}.' + Style.RESET_ALL)

            final_states[job_name] = status
            del pending[job_name]

        print("-"*80)
        # If all jobs have finished, exit the loop
        if not pending:
            break
        # Otherwise, wait and check again
        time.sleep(poll_interval)

    return final_states

## Execute jobs in order

In [100]:
# Run the waves one after another; each call to run_jobs handles one wave
banner = "*"*80
for run_number, wave in enumerate(job_waves, start=1):
    print(banner)
    print(f"Starting on Run {run_number}: {wave}")
    print(banner)
    run_jobs(wave)
print('All jobs finished')

********************************************************************************
Starting on Run 1: ['stage_claim_into_raw', 'stage_provider_into_raw', 'stage_policyholder_into_raw', 'stage_date_into_optimised']
********************************************************************************
stage_claim_into_raw jr_f013cdcd5ba3156e87729f26f19dc25d5ebcffcf263ab4858c02f2b56e4d5776
stage_provider_into_raw jr_06ab19d466f1cb9a05dd8fe770393be13bebf127b9434129619e81282a2d5166
stage_policyholder_into_raw jr_fdc34733a113a4eaba4bb9a66582be5bf7208a1807792cd3f7bcc72d1b389858
stage_date_into_optimised jr_0a425aa1c16043e5e8f600a973c86f7a4d4890574cd22b8b1a073d1428696990
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
------------

In [46]:
# Use the AWS CLI to create the landing "folders".
# Each command puts an object with no body at
# s3://project-lf/etl/landing/claim_db/<table>/full/ for claim, provider and policyholder.
!aws s3api put-object --bucket project-lf --key etl/landing/claim_db/claim/full/
!aws s3api put-object --bucket project-lf --key etl/landing/claim_db/provider/full/
!aws s3api put-object --bucket project-lf --key etl/landing/claim_db/policyholder/full/

# cp source data files into landing folders


{
    "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"", 
    "ServerSideEncryption": "AES256"
}
{
    "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"", 
    "ServerSideEncryption": "AES256"
}
{
    "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"", 
    "ServerSideEncryption": "AES256"
}
