## Delete all staged files

In [2]:

import boto3
# Empty the ETL staging areas so the next run starts from a clean state.
s3 = boto3.resource('s3')
bucket = s3.Bucket('test-project-wm')

# Key prefixes of the three staging areas (raw, access, optimised).
# NOTE(review): the prefixes carry no trailing slash, so 'etl/raw' would also
# match sibling keys such as 'etl/raw_old/...' — kept as in the original; confirm.
for stage_prefix in ('etl/raw', 'etl/access', 'etl/optimised'):
    # The collection-level delete() batches keys into DeleteObjects requests
    # instead of issuing one DeleteObject call per key.
    for response in bucket.objects.filter(Prefix=stage_prefix).delete():
        # Batch deletes report per-key failures in the response body rather
        # than raising, so surface them explicitly.
        errors = response.get('Errors', [])
        if errors:
            raise RuntimeError(
                f'Failed to delete {len(errors)} object(s) under {stage_prefix!r}: {errors}'
            )

## Upload latest version of .zip dependencies to `code/dependencies` folder

In [5]:
# Remove the dependency archive currently stored in S3, then copy the local
# libraries.zip to the same key under code/dependencies/.
!aws s3 rm s3://test-project-wm/code/dependencies/libraries.zip
!aws s3 cp /home/glue_user/project_lf/ETL-TDD/libraries.zip s3://test-project-wm/code/dependencies/libraries.zip

upload: ../libraries.zip to s3://test-project-wm/code/dependencies/libraries.zip


## Upload latest version of .py batch files to `code/batches`

In [8]:
# Recursively delete every object under code/batches/ in the project bucket.
!aws s3 rm --recursive s3://test-project-wm/code/batches/

delete: s3://test-project-wm/code/batches/batch1.py
delete: s3://test-project-wm/code/batches/batch2.py
delete: s3://test-project-wm/code/batches/batch5.py
delete: s3://test-project-wm/code/batches/batch4.py
delete: s3://test-project-wm/code/batches/batch3.py


In [9]:
import boto3

# Destination: bucket name and the key prefix the scripts are stored under
bucket = 'test-project-wm'
prefix = 'code/batches/'

# Local batch scripts to publish, built from their shared directory
local_dir = '/home/glue_user/project_lf/ETL-TDD'
files = [
    f'{local_dir}/{script}'
    for script in ('batch1.py', 'batch2.py', 'batch3.py', 'batch4.py', 'batch5.py')
]

# Low-level S3 client used for the uploads
s3_client = boto3.client('s3', region_name='ap-southeast-1')

for file_path in files:
    # The object key is the prefix followed by the bare file name
    file_name = file_path.rsplit('/', 1)[-1]
    key = prefix + file_name
    s3_client.upload_file(file_path, bucket, key)

print(f'Uploaded {len(files)} files to s3://{bucket}/{prefix}')

Uploaded 5 files to s3://test-project-wm/code/batches/


## Create an AWS Glue job for every script object under `code/batches/`

In [11]:
# Where the batch scripts live in S3
bucket = 'test-project-wm'
prefix = 'code/batches/'

# Collection of the objects stored under that prefix; iterated when the
# Glue jobs are created.
s3_resource = boto3.resource('s3')
scripts_bucket = s3_resource.Bucket(bucket)
objects = scripts_bucket.objects.filter(Prefix=prefix)

In [12]:
# Create one AWS Glue ETL job per batch script found in `objects`, and collect
# the names of the jobs that exist afterwards in `job_names`.
# NOTE(review): the upload cell builds its S3 client with
# region_name='ap-southeast-1' while Glue uses 'ap-southeast-2' — confirm
# the mismatch is intentional.
glue_client = boto3.client('glue', region_name='ap-southeast-2')

# Settings shared by every job (loop-invariant, so defined once).
glue_role = 'data-quality-lf'
extra_py_files = 's3://test-project-wm/code/dependencies/libraries.zip'
glue_version = '4.0'
additional_python_modules = "holidays"

job_names = []
for s3_object in objects:
    print(s3_object.key)

    # Only Python scripts become jobs; any other key under the prefix (for
    # example a zero-byte "folder" placeholder) would yield an empty or
    # meaningless job name.
    if not s3_object.key.endswith('.py'):
        continue

    # The job is named after the script file, minus directory and extension.
    script_location = f's3://{s3_object.bucket_name}/{s3_object.key}'
    job_name = script_location.split('/')[-1].split('.')[0]

    try:
        # Create the new Glue job
        response = glue_client.create_job(
            Name=job_name,
            Role=glue_role,
            Command={'Name': 'glueetl', 'ScriptLocation': script_location},
            GlueVersion=glue_version,
            DefaultArguments={
                '--extra-py-files': extra_py_files,
                '--additional-python-modules': additional_python_modules
            },
            WorkerType='G.1X',
            NumberOfWorkers=2
        )
        print(response)
    except (
        glue_client.exceptions.AlreadyExistsException,
        glue_client.exceptions.IdempotentParameterMismatchException,
    ) as e:
        # The job is already defined: nothing to create, but it still exists,
        # so it is recorded below.
        print("Skipping job creation for", job_name, e)
    except Exception as e:
        # Best-effort: report the failure and move on to the next script, but
        # do not record a job that was never created.
        print("Failed to create job", job_name, e)
        continue

    job_names.append(job_name)

code/batches/batch1.py
{'Name': 'batch1', 'ResponseMetadata': {'RequestId': '71913a57-4cd4-4160-9cdb-b88ccf782cb0', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 07 Jun 2023 12:12:09 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '17', 'connection': 'keep-alive', 'x-amzn-requestid': '71913a57-4cd4-4160-9cdb-b88ccf782cb0'}, 'RetryAttempts': 0}}
code/batches/batch2.py
{'Name': 'batch2', 'ResponseMetadata': {'RequestId': '510432f9-b5a1-42e9-a15b-6c5aa9d31ac1', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 07 Jun 2023 12:12:09 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '17', 'connection': 'keep-alive', 'x-amzn-requestid': '510432f9-b5a1-42e9-a15b-6c5aa9d31ac1'}, 'RetryAttempts': 0}}
code/batches/batch3.py
{'Name': 'batch3', 'ResponseMetadata': {'RequestId': 'a369f4ba-8ae2-4281-a8c1-3392a3ed424f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 07 Jun 2023 12:12:09 GMT', 'content-type': 'application/x-amz-json-1.1', 'conte