### 1. Load AWS Configs

In [1]:
import configparser
# Load AWS credentials and the output location from the project config file.
config = configparser.ConfigParser()
# Open the file in a context manager so the handle is closed once parsing
# is done instead of lingering for the lifetime of the kernel.
with open('configs/global.cfg') as config_file:
    config.read_file(config_file)

KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = config.get('AWS','AWS_SECRET_ACCESS_KEY')

# Location prefix; concatenated with 'etl.py' when the Spark step is submitted.
OUTPUT_DATA = config.get('AWS','OUTPUT_DATA_FOLDER')

In [2]:
import os
# Export the credentials and the default region as process environment variables.
os.environ.update({
    "AWS_ACCESS_KEY_ID": KEY,
    "AWS_SECRET_ACCESS_KEY": SECRET,
    "AWS_DEFAULT_REGION": "us-west-2",
})

### 2. Set AWS clients

In [3]:
import boto3
import time

In [4]:
# EMR client for us-west-2, authenticated with the keys read from the config file.
emr = boto3.client(
    service_name='emr',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [5]:
# EC2 client for us-west-2, authenticated with the keys read from the config file.
ec2 = boto3.client(
    service_name='ec2',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [6]:
# S3 client for us-west-2, authenticated with the keys read from the config file.
s3 = boto3.client(
    service_name='s3',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

2.0.1 Upload flights and tweets URL lists to S3

In [11]:
# Keep the names of the buckets returned by list_buckets that contain 'udacity'.
bucket_names = (bucket['Name'] for bucket in s3.list_buckets()['Buckets'])
s3_bucket = [name for name in bucket_names if 'udacity' in name]

In [None]:
# Upload the local URL lists of raw data to the first matching bucket;
# each object key is the bare file name.
for local_path, object_key in (
    ('input/data/flights', 'flights'),
    ('input/data/tweets', 'tweets'),
):
    s3.upload_file(local_path, s3_bucket[0], object_key)

2.1 Create EMR instance

In [7]:
# Settings for the EMR cluster requested in the next cell.
nr_slave_nodes = 2                      # number of CORE nodes
instance_type = "m5.xlarge"             # used for both master and core nodes
ec2_credentials_name = "emr_udacity"    # passed as Ec2KeyName

# The cluster name carries a UTC timestamp with seconds resolution; the
# polling cell later looks the cluster up by this name.
launch_stamp = time.strftime("%Y%m%d%H%M%S", time.gmtime())
emr_name = "udacity-capstone-" + launch_stamp


In [8]:
# Request the EMR cluster: one master node plus `nr_slave_nodes` core nodes,
# all on-demand instances of `instance_type`, with Spark installed.
master_group = {
    'Name': "Master nodes",
    'Market': 'ON_DEMAND',
    'InstanceRole': 'MASTER',
    'InstanceType': instance_type,
    'InstanceCount': 1,
}
core_group = {
    'Name': "Slave nodes",
    'Market': 'ON_DEMAND',
    'InstanceRole': 'CORE',
    'InstanceType': instance_type,
    'InstanceCount': nr_slave_nodes,
}

# The call is the last expression so the notebook displays the response
# (which includes the JobFlowId of the new cluster).
emr.run_job_flow(
    Name=emr_name,
    ReleaseLabel='emr-6.2.0',
    Applications=[{'Name': 'Spark'}],
    Instances={
        'InstanceGroups': [master_group, core_group],
        'Ec2KeyName': ec2_credentials_name,
        # steps are added in later cells, so the cluster must not shut down when idle
        'KeepJobFlowAliveWhenNoSteps': True,
    },
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
)

{'JobFlowId': 'j-2CYW4R6S18ARG',
 'ClusterArn': 'arn:aws:elasticmapreduce:us-west-2:794266340481:cluster/j-2CYW4R6S18ARG',
 'ResponseMetadata': {'RequestId': '9aa34525-c7c3-427a-9410-ef569ca65247',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9aa34525-c7c3-427a-9410-ef569ca65247',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '118',
   'date': 'Wed, 10 Mar 2021 09:35:33 GMT'},
  'RetryAttempts': 0}}

In [10]:
# Wait until the cluster named `emr_name` reaches the WAITING state and store
# its id in `emr_jobid` for the step-submission cells.
#
# Raises an Exception if the cluster is reported in a TERMINATED* state, or if
# it has not reached WAITING after `max_tries` checks.
max_tries = 5
retry_delay_seconds = 120

# initial delay before the first status check
time.sleep( 60 * 5 )

is_emr_started, nr_tries = False, 0

emr_jobid = ""

while not is_emr_started and nr_tries < max_tries:
    print( "Fetching running EMR cluster: {} try".format( nr_tries + 1 ) )

    emr_list = emr.list_clusters()

    for cluster in emr_list['Clusters']:
        if cluster['Name'] != emr_name:
            # Unrelated cluster: skip it without sleeping, otherwise every
            # other cluster in the listing would add a 2 minute delay.
            continue

        cluster_state = cluster['Status']['State']
        if cluster_state == 'WAITING':
            emr_jobid = cluster['Id']
            is_emr_started = True
            print( "EMR Cluster found: {} waiting for job".format(emr_name) )
            break # waiting cluster found
        if 'TERMINATED' in cluster_state:
            raise Exception('EMR cluster \"{}\" status was {}. '.format(cluster['Name'], cluster_state) )

    nr_tries += 1

    # Sleep once per attempt (also when the listing was empty), and only if
    # another attempt will actually follow.
    if not is_emr_started and nr_tries < max_tries:
        print( "Retrying in {} mins.".format( retry_delay_seconds // 60 ) )
        time.sleep(retry_delay_seconds)

# Fail loudly instead of letting later cells submit steps with an empty id.
if not is_emr_started:
    raise Exception('EMR cluster \"{}\" did not reach WAITING state after {} tries.'.format(emr_name, nr_tries) )

Fetching running EMR cluster: 1 try
EMR Cluster found: udacity-capstone-20210310093530 waiting for job


In [None]:
# Add a step to the WAITING cluster that runs mount_s3fs.sh through
# script-runner. Per the step name it mounts s3fs and downloads the raw data
# files (wget of the uploaded URL lists).
# NOTE(review): the script itself is not visible here - confirm its behaviour.
response = emr.add_job_flow_steps(
    # Target the cluster found by the polling cell; a literal job-flow id
    # copied from an earlier run would point at a stale cluster on re-run.
    JobFlowId = emr_jobid,
    Steps=[
        {
            'Name': 'Mount s3fs and download files',
            # on failure, cancel pending steps but keep the cluster waiting
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 's3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    's3://udacity-awss/mount_s3fs.sh',
                ]
            }
        },
    ]
)
response

In [None]:
# Submit the Spark job for the raw data as a step on the waiting cluster
# ( see notebook 1.1 )
spark_submit_args = [
    'spark-submit',
    '--deploy-mode', 'cluster',
    '--master', 'yarn',
    OUTPUT_DATA + 'etl.py',
]

spark_step = {
    'Name': 'Run Spark for downloaded data',
    'ActionOnFailure': 'CANCEL_AND_WAIT',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': spark_submit_args,
    },
}

response = emr.add_job_flow_steps(JobFlowId=emr_jobid, Steps=[spark_step])
response

In [36]:
# Collect the ids of every cluster that list_clusters reports as RUNNING or
# WAITING; all of them are terminated in the following cell.
standby_clusters = emr.list_clusters(ClusterStates=['RUNNING', 'WAITING'])

clusters = [entry["Id"] for entry in standby_clusters["Clusters"]]

print( "Nr clusters: {}".format( len(clusters) ))

Nr clusters: 1


In [37]:
# Terminate the collected clusters; report when there is nothing to terminate.
if not clusters:
    print( "No clusters found." )
else:
    emr.terminate_job_flows(JobFlowIds=clusters)
    print( "Terminating Clusters" )

Terminating Clusters
