### 1. Load AWS Configs

In [1]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('configs/global.cfg'))

KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = config.get('AWS','AWS_SECRET_ACCESS_KEY')

OUTPUT_DATA = config.get('AWS','OUTPUT_DATA_FOLDER')

In [2]:
import os
os.environ["AWS_ACCESS_KEY_ID"]= KEY
os.environ["AWS_SECRET_ACCESS_KEY"]= SECRET
os.environ["AWS_DEFAULT_REGION"]="us-west-2"

#### 2. Set AWS clients

In [3]:
import boto3
import time

In [4]:
emr = boto3.client( 'emr', 
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET, 
                   region_name='us-west-2')

In [5]:
ec2 = boto3.client('ec2', 
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET, 
                   region_name='us-west-2')

In [6]:
s3 = boto3.client( 's3', 
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET, 
                   region_name='us-west-2')

2.1 Create EMR instance

In [7]:
ec2_credentials_name = 'emr_udacity'
instance_type = 'm5.xlarge'
nr_slave_nodes = 2
emr_name = 'udacity-capstone-' + time.strftime('%Y%m%d%H%M%S',time.gmtime())


In [8]:
# create EMR instance
emr.run_job_flow(
    Name=emr_name,
    ReleaseLabel='emr-6.2.0',
    Applications=[
        {
            'Name': 'Spark'
        },
    ],
    Instances={
        'InstanceGroups': [
            {
                'Name': "Master nodes",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': instance_type,
                'InstanceCount': 1,
            },
            {
                'Name': "Slave nodes",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'CORE',
                'InstanceType': instance_type,
                'InstanceCount': nr_slave_nodes
            }
        ],
        'Ec2KeyName': ec2_credentials_name,
        'KeepJobFlowAliveWhenNoSteps': True
    },
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole'  
)

{'JobFlowId': 'j-2SANV0ZUH2VDH',
 'ClusterArn': 'arn:aws:elasticmapreduce:us-west-2:794266340481:cluster/j-2SANV0ZUH2VDH',
 'ResponseMetadata': {'RequestId': 'd07e11ca-7c89-43cb-8c2f-6fa463cb3c4b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd07e11ca-7c89-43cb-8c2f-6fa463cb3c4b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '118',
   'date': 'Tue, 09 Mar 2021 21:50:46 GMT'},
  'RetryAttempts': 0}}

In [11]:
# wait for emr created

time.sleep( 60 * 5 )

is_emr_started, nr_tries = False, 0

while not is_emr_started and nr_tries < 5:
    print( "Fetching running EMR cluster: {} try".format( nr_tries + 1 ) )
    
    emr_list = emr.list_clusters(ClusterStates=['RUNNING', 'WAITING'] )
    
    for cluster in emr_list['Clusters']:
        if cluster['Name'] == emr_name and cluster['Status']['State'] == 'WAITING': 
            is_emr_started = True
            print( "EMR Cluster found: {} waiting for job".format(emr_name) )
            break # waiting cluster found
        elif cluster['Name'] == emr_name and 'TERMINATED' in cluster['Status']['State'] :
            raise Exception('EMR cluster \"{}\" status was {}. '.format(cluster['Name'], cluster['Status']['State']) )        
        else :
            print("Retrying in 2 mins.")
            time.sleep(120)
            
    nr_tries+=1

Fetching running EMR cluster: 1 try
EMR Cluster found: udacity-capstone-20210309215043 waiting for job


In [None]:
# add step to WAITING cluster
response = emr.add_job_flow_steps(
    JobFlowId = emr_name,
    Steps=[
        {
            'Name': 'Spark from boto3',
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    'spark-submit',
                     '--deploy-mode',
                     'cluster',
                     '--master',
                     'yarn',
                     OUTPUT_DATA + 'etl.py'
                ]
            }
        },
    ]
)
response

In [95]:
# kill all clusters
standby_clusters = emr.list_clusters( ClusterStates=['RUNNING', 'WAITING'] )

clusters = []
for cluster in standby_clusters["Clusters"] : 
    clusters.append( cluster["Id"] )
    
print( "Nr clusters: {}".format( len(clusters) ))

Nr clusters: 1


In [96]:
if len(clusters) > 0 :
    emr.terminate_job_flows(
        JobFlowIds = clusters
    )
    print( "Terminating Clusters" )
else: 
    print( "No clusters found." )

Terminating Clusters


In [114]:
import zenodo_get

destination_file , zenodo_id= 'tweets', '4568860'
output_dir = 'TODO'
zenodo_get.zenodo_get( ['-w {}'.format(destination_file) , zenodo_id] )

In [111]:
zenodo_get.zenodo_get( ['-h'] )

Usage: ipykernel_launcher.py [options] RECORD_OR_DOI

Options:
  --version             show program's version number and exit
  -h, --help            show this help message and exit
  -c, --cite            print citation information
  -r RECORD, --record=RECORD
                        Zenodo record ID
  -d DOI, --doi=DOI     Zenodo DOI
  -m, --md5             Create md5sums.txt for verification.
  -w WGET, --wget=WGET  Create URL list for download managers. (Files will not
                        be downloaded.)
  -e, --continue-on-error
                        Continue with next file if error happens.
  -k, --keep            Keep files with invalid checksum. (Default: delete
                        them.)
  -n, --do-not-continue
                        Do not continue previous download attempt. (Default:
                        continue.)
  -R RETRY, --retry=RETRY
                        Retry on error N more times.
  -p PAUSE, --pause=PAUSE
                        Wait N second befor

SystemExit: 0

In [19]:
s3_bucket = [entry['Name'] for entry in s3.list_buckets()['Buckets'] if 'udacity' in entry['Name']  ]

In [21]:
s3.upload_file( 'input/data/flights', s3_bucket[0], 'flights' )