## Dependencies

In [1]:
# Import libraries.
import boto3
import json
import time
import sys
import numpy

# Import glue dependencies.
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, DataFrame

# Add the project root so that this notebook can be executed anywhere in the 
# folder structure.
sys.path.append("/home/glue_user/project_lf/ETL-TDD")

# Import batch job's custom dependencies.
from databrew import DataBrew
from glue import GlueWrapper
from etl.paths.components import Bucket

# Import jobs.
import stage_claim_into_raw
import stage_policyholder_into_raw
import stage_provider_into_raw
import stage_claim_into_access
import stage_policyholder_into_access
import stage_provider_into_access
import stage_location_into_optimised
import stage_procedure_into_optimised
import stage_policyholder_into_optimised
import stage_provider_into_optimised
import stage_date_into_optimised
import stage_claim_into_optimised

## Initialise

In [2]:
# Initialise spark session with minimal logging.
sc = SparkContext()
sc.setLogLevel("ERROR")
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Wrap low-level AWS clients in a high-level object oriented API that uses
# S3 paths to create and coordinate AWS services.
glue = GlueWrapper('us-east-1')
databrew = DataBrew('us-east-1')

# Define the bucket to use (TEST or PROD).
env = Bucket.TEST

# Define the sequential job batches.
# Earlier batches must be completed before later batches.
batch1 = [stage_claim_into_raw, stage_policyholder_into_raw, stage_provider_into_raw]
batch2 = [stage_claim_into_access, stage_policyholder_into_access, stage_provider_into_access]
batch3 = [stage_location_into_optimised, stage_procedure_into_optimised]
batch4 = [stage_policyholder_into_optimised, stage_provider_into_optimised, stage_date_into_optimised]
batch5 = [stage_claim_into_optimised]

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.


## Delete objects in S3

In [3]:
# delete everything in the raw, access and optimised tiers
import boto3
s3 = boto3.resource('s3')
bucket = s3.Bucket('test-lf-wm')

# ********************* RESET RAW *********************
for obj in bucket.objects.filter(Prefix='etl/raw'):
    s3.Object(bucket.name,obj.key).delete()

# **********************RESET ACCESS ******************
for obj in bucket.objects.filter(Prefix='etl/access'):
    s3.Object(bucket.name,obj.key).delete()

# ******************** RESET OPTIMISED ****************
for obj in bucket.objects.filter(Prefix='etl/optimised'):
    s3.Object(bucket.name,obj.key).delete()
    


## Batch 1

In [4]:
outputs = []

for job in batch1:
    df, path = job.run(spark, env)
    outputs.append( (df, path) )

                                                                                

In [5]:
# Get a list of all job paths and a list of all job dataframes.
dfs = numpy.array(outputs).flatten()[::2]
paths = numpy.array(outputs).flatten()[1::2]

In [11]:
paths

array(['s3://test-lf-wm/etl/access/claim_db/claim/full/202306052244/',
       's3://test-lf-wm/etl/access/claim_db/policyholder/full/202306052244/',
       's3://test-lf-wm/etl/access/claim_db/provider/full/202306052244/'],
      dtype=object)

In [6]:
example_path = paths[-1]

# Crawl each job path as an S3 target.
glue.delete_crawler(example_path)
glue.create_crawler(paths)
glue.start_crawler(example_path)

############## Wait for crawler to finish #####################

glue.wait_for_crawler(example_path)

############## After crawlers finish. #####################

for path in paths:
    databrew.create_dataset(path)
    databrew.create_profile_job(path)
    databrew.start_job_run(path)

############## Wait for job to finish #####################

databrew.wait_for_job(example_path)

for path in paths:
    print(f"You can view the data profile for {path} here:\n\t{databrew.get_profile_link(path)}")

Couldn't delete crawler:
                Crawler entry with name test-raw does not exist


Created crawler test-raw.
Waiting for crawler to finish.............
Waiting for job to finish.......................
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/claim/full/202306052239/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-claim&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/policyholder/full/202306052239/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-policyholder&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/provider/full/202306052240/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-provider&tab=profile-overview


## Batch 2

In [7]:
outputs = []

for job in batch2:
    df, path = job.run(spark, env)
    outputs.append( (df, path) )

s3.Bucket(name='test-lf-wm') etl/raw/claim_db/claim/full


                                                                                

s3.Bucket(name='test-lf-wm') etl/raw/claim_db/policyholder/full


                                                                                

s3.Bucket(name='test-lf-wm') etl/raw/claim_db/provider/full


                                                                                

In [8]:

# Get a list of all job paths and a list of all job dataframes.
dfs = numpy.array(outputs).flatten()[::2]
paths = numpy.array(outputs).flatten()[1::2]

In [9]:
example_path = paths[-1]

# Crawl each job path as an S3 target.
glue.delete_crawler(example_path)
glue.create_crawler(paths)
glue.start_crawler(example_path)
glue.wait_for_crawler(example_path)

############## After crawlers finish. #####################
for path in paths:
     
    response = databrew.create_dataset(path)
    response = databrew.create_profile_job(path)        
    response = databrew.start_job_run(path)

############## Wait for job to finish #####################
databrew.wait_for_job(example_path)

# Print links.
for path in paths:
    print(f"You can view the data profile for {path} here:\n\t{databrew.get_profile_link(path)}")
    # databrew.show_data_profile_link(path)

Couldn't delete crawler:
                Crawler entry with name test-access does not exist


Created crawler test-access.
Waiting for crawler to finish..........


Couldn't create dataset:
                AWS Glue table access-claim was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-access-claim wasn't found
Couldn't start profile job run:
	The job test-access-claim wasn't found.
Couldn't create dataset:
                AWS Glue table access-policyholder was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-access-policyholder wasn't found
Couldn't start profile job run:
	The job test-access-policyholder wasn't found.


Waiting for job to finish......................
You can view the data profile for s3://test-lf-wm/etl/access/claim_db/claim/full/202306052244/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-access-claim&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/access/claim_db/policyholder/full/202306052244/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-access-policyholder&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/access/claim_db/provider/full/202306052244/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-access-provider&tab=profile-overview


In [10]:
############## Use data profile. #####################

# databrew.get_dq_results(paths[0])

ResourceNotFoundException: An error occurred (ResourceNotFoundException) when calling the ListJobRuns operation: The job test-access-claim wasn't found.

## Batch 3

In [None]:
outputs = []

for job in batch3:
    df, path = job.run(spark, env)
    outputs.append( (df, path) )

                                                                                

In [None]:
# Get a list of all job paths and a list of all job dataframes.
dfs = numpy.array(outputs).flatten()[::2]
paths = numpy.array(outputs).flatten()[1::2]

In [None]:
example_path = paths[-1]

# Crawl each job path as an S3 target.
glue.delete_crawler(example_path)
glue.create_crawler(paths)
glue.start_crawler(example_path)

############## Wait for crawler to finish #####################

glue.wait_for_crawler(example_path)

############## After crawlers finish. #####################

for path in paths:
    databrew.create_dataset(path)
    databrew.create_profile_job(path)
    databrew.start_job_run(path)

############## Wait for job to finish #####################

databrew.wait_for_job(example_path)

for path in paths:
    print(f"You can view the data profile for {path} here:\n\t{databrew.get_profile_link(path)}")
    # databrew.show_data_profile_link(path)


Created crawler test-raw.
Waiting for crawler to finish.........


Couldn't create dataset:
                AWS Glue table raw_claim was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-claim wasn't found
Couldn't start profile job run:
	The job test-raw-claim wasn't found.
Couldn't create dataset:
                AWS Glue table raw_policyholder was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-policyholder wasn't found
Couldn't start profile job run:
	The job test-raw-policyholder wasn't found.
Couldn't create dataset:
                AWS Glue table raw_provider was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-provider wasn't found
Couldn't start profile job run:
	The job test-raw-provider wasn't found.
Couldn't get job state:
	The job test-raw-provider wasn't found.


You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/claim/full/202306052137/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-claim&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/policyholder/full/202306052138/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-policyholder&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/provider/full/202306052138/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-provider&tab=profile-overview


## Batch 4

In [None]:
outputs = []

for job in batch4:
    df, path = job.run(spark, env)
    outputs.append( (df, path) )

                                                                                

In [None]:
# Get a list of all job paths and a list of all job dataframes.
dfs = numpy.array(outputs).flatten()[::2]
paths = numpy.array(outputs).flatten()[1::2]

In [None]:
example_path = paths[-1]

# Crawl each job path as an S3 target.
glue.delete_crawler(example_path)
glue.create_crawler(paths)
glue.start_crawler(example_path)

############## Wait for crawler to finish #####################

glue.wait_for_crawler(example_path)

############## After crawlers finish. #####################

for path in paths:
    databrew.create_dataset(path)
    databrew.create_profile_job(path)
    databrew.start_job_run(path)

############## Wait for job to finish #####################

databrew.wait_for_job(example_path)

for path in paths:
    print(f"You can view the data profile for {path} here:\n\t{databrew.get_profile_link(path)}")
    # databrew.show_data_profile_link(path)


Created crawler test-raw.
Waiting for crawler to finish.........


Couldn't create dataset:
                AWS Glue table raw_claim was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-claim wasn't found
Couldn't start profile job run:
	The job test-raw-claim wasn't found.
Couldn't create dataset:
                AWS Glue table raw_policyholder was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-policyholder wasn't found
Couldn't start profile job run:
	The job test-raw-policyholder wasn't found.
Couldn't create dataset:
                AWS Glue table raw_provider was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-provider wasn't found
Couldn't start profile job run:
	The job test-raw-provider wasn't found.
Couldn't get job state:
	The job test-raw-provider wasn't found.


You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/claim/full/202306052137/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-claim&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/policyholder/full/202306052138/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-policyholder&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/provider/full/202306052138/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-provider&tab=profile-overview


## Batch 5

In [None]:
outputs = []

for job in batch5:
    df, path = job.run(spark, env)
    outputs.append( (df, path) )

                                                                                

In [None]:
# Get a list of all job paths and a list of all job dataframes.
dfs = numpy.array(outputs).flatten()[::2]
paths = numpy.array(outputs).flatten()[1::2]

In [None]:
example_path = paths[-1]

# Crawl each job path as an S3 target.
glue.delete_crawler(example_path)
glue.create_crawler(paths)
glue.start_crawler(example_path)

############## Wait for crawler to finish #####################

glue.wait_for_crawler(example_path)

############## After crawlers finish. #####################

for path in paths:
    databrew.create_dataset(path)
    databrew.create_profile_job(path)
    databrew.start_job_run(path)

############## Wait for job to finish #####################

databrew.wait_for_job(example_path)

for path in paths:
    print(f"You can view the data profile for {path} here:\n\t{databrew.get_profile_link(path)}")
    # databrew.show_data_profile_link(path)


Created crawler test-raw.
Waiting for crawler to finish.........


Couldn't create dataset:
                AWS Glue table raw_claim was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-claim wasn't found
Couldn't start profile job run:
	The job test-raw-claim wasn't found.
Couldn't create dataset:
                AWS Glue table raw_policyholder was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-policyholder wasn't found
Couldn't start profile job run:
	The job test-raw-policyholder wasn't found.
Couldn't create dataset:
                AWS Glue table raw_provider was not found in database test-release-3 in catalogId 618572314333.
Couldn't create data profile job:
                Dataset test-raw-provider wasn't found
Couldn't start profile job run:
	The job test-raw-provider wasn't found.
Couldn't get job state:
	The job test-raw-provider wasn't found.


You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/claim/full/202306052137/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-claim&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/policyholder/full/202306052138/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-policyholder&tab=profile-overview
You can view the data profile for s3://test-lf-wm/etl/raw/claim_db/provider/full/202306052138/ here:
	https://us-east-1.console.aws.amazon.com/databrew/home?region=us-east-1#dataset-details?dataset=test-raw-provider&tab=profile-overview


## Load Redshift