In [1]:
import configparser
import os

import boto3
import pandas as pd

In [None]:
# Parse emr.cfg from the current working directory. The file is opened
# explicitly so a missing file raises instead of yielding an empty config,
# and the `with` block closes the handle once parsing is done.
config = configparser.ConfigParser()
with open('emr.cfg') as cfg_file:
    config.read_file(cfg_file)

In [None]:
# AWS credentials are taken from the environment rather than from emr.cfg.
# Both values are None when the corresponding variable is not set.
key = os.getenv('AWS_ACCESS_KEY_ID')
secret = os.getenv('AWS_SECRET_ACCESS_KEY')

# Cluster settings from the [CLUSTER] section of emr.cfg.
node_type = config.get('CLUSTER', 'NODE_TYPE')
log_uri = config.get('CLUSTER', 'LOG_URI')
emr_name = config.get('CLUSTER', 'NAME')
key_pair = config.get('CLUSTER', 'KEY_PAIR')
subnet = config.get('CLUSTER', 'SUBNET')

# Bucket setting from the [S3] section; later cells append object names to it
# directly, so it presumably ends with a '/' -- confirm against emr.cfg.
capstone_bucket = config.get('S3', 'S3_BUCKET')

In [None]:
# The S3 resource and the EMR client are created with the same region and
# the same credential pair, so the shared keyword arguments live in one place.
aws_connection = {
    'region_name': "us-west-2",
    'aws_access_key_id': key,
    'aws_secret_access_key': secret,
}

s3 = boto3.resource('s3', **aws_connection)
emr = boto3.client('emr', **aws_connection)

In [None]:
# Two on-demand instance groups that share the configured instance type:
# one MASTER instance and three CORE instances.
instance_groups = [
    {
        'Name': group_name,
        'Market': 'ON_DEMAND',
        'InstanceRole': group_role,
        'InstanceType': node_type,
        'InstanceCount': group_size,
    }
    for group_name, group_role, group_size in (
        ('master_node', 'MASTER', 1),
        ('slave_node', 'CORE', 3),
    )
]

# Instance-level settings: key pair and subnet come from emr.cfg.
cluster_instances = {
    'InstanceGroups': instance_groups,
    'Ec2KeyName': key_pair,
    'KeepJobFlowAliveWhenNoSteps': True,
    'TerminationProtected': False,
    'Ec2SubnetId': subnet,
}

# Request the cluster; the response is kept because later cells read
# emr_cluster['JobFlowId'] from it.
emr_cluster = emr.run_job_flow(
    Name=emr_name,
    LogUri=log_uri,
    ReleaseLabel='emr-5.29.0',
    Applications=[{'Name': 'Spark'}],
    Instances=cluster_instances,
    # Disabled bootstrap action, kept for reference:
    #     BootstrapActions=[
    #         {'Name': 'install_python_modules',
    #             'ScriptBootstrapAction': {
    #                 'Path': capstone_bucket + 'install_python_modules.sh'}
    #         }],
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
)

# print('Cluster created with the step...', emr_cluster['JobFlowId'])

In [None]:
def prettyEmrProps(props):
    """Summarise selected EMR cluster properties as a key/value DataFrame.

    Intended for the dictionary returned by ``emr.describe_cluster``. The
    wanted fields are picked up both at the top level of ``props`` and
    inside its ``'Cluster'`` entry:

    * ``Id``, ``Name`` and ``Status`` (only inside ``'Cluster'``),
    * ``LogUri`` and ``MasterPublicDnsName``,
    * ``Ec2KeyName`` and ``Ec2SubnetId`` from ``Ec2InstanceAttributes``.

    Fields found inside ``'Cluster'`` are labelled with a ``Cluster.``
    prefix, and EC2 attributes with ``Ec2InstanceAttributes.<attribute>``.
    Keys that are absent are skipped; everything else is ignored.

    Side effect: sets the pandas option ``display.max_colwidth`` to ``None``
    so long values are not truncated when the frame is displayed.

    Args:
        props: mapping of property names to values.

    Returns:
        pandas.DataFrame with the columns ``key`` and ``value``, one row per
        selected property, in the order the properties appear in ``props``.
    """
    # None disables truncation; the legacy -1 sentinel is no longer accepted
    # by current pandas releases.
    pd.set_option('display.max_colwidth', None)

    scalar_keys = ('LogUri', 'MasterPublicDnsName')
    cluster_subkeys = ('Id', 'Name', 'Status')
    ec2_subkeys = ('Ec2KeyName', 'Ec2SubnetId')

    def _rows(mapping, prefix, wanted):
        """Yield (label, value) pairs for the wanted entries of one mapping."""
        for name, value in mapping.items():
            if name == 'Ec2InstanceAttributes':
                # Only the key pair and the subnet are of interest here.
                for attr, attr_value in value.items():
                    if attr in ec2_subkeys:
                        yield (prefix + name + '.' + attr, attr_value)
            elif name in wanted:
                yield (prefix + name, value)

    rows = []
    for key, value in props.items():
        if key == 'Cluster':
            # Filter on the sub-key (not on its value) and also surface the
            # fields that the describe response nests under 'Cluster'.
            rows.extend(_rows(value, 'Cluster.', cluster_subkeys + scalar_keys))
        else:
            rows.extend(_rows({key: value}, '', scalar_keys))
    return pd.DataFrame(data=rows, columns=["key", "value"])

In [None]:
# Re-run this cell until props['Cluster']['Status'] shows the cluster is ready
# (presumably State == 'WAITING' for an idle cluster -- confirm in the EMR docs).
# The boto3 EMR client method is describe_cluster (singular).
props = emr.describe_cluster(ClusterId=emr_cluster['JobFlowId'])

In [None]:
# Show the 'Status' entry of the 'Cluster' section of the describe response.
props['Cluster']['Status']

In [None]:
# MasterPublicDnsName sits inside the 'Cluster' section of the describe
# response, next to 'Status'; indexing props directly raises KeyError.
# NOTE(review): the field is presumably absent until the master node has been
# provisioned -- confirm, and re-run the describe cell first if this fails.
props['Cluster']['MasterPublicDnsName']

In [None]:
# Display the selected cluster properties as a key/value table.
prettyEmrProps(props)

In [None]:
# Step 1: run `aws s3 cp` through command-runner.jar to copy etl.py from the
# capstone bucket onto the cluster.
# NOTE(review): '--recursive' on a single object and the '/' destination look
# suspicious, and the Spark step below refers to a relative 'etl.py' -- confirm
# that the copied file ends up where spark-submit looks for it.
copy_args = ['aws', 's3', 'cp', capstone_bucket + 'etl.py', '/', '--recursive']
copy_script_step = {
    'Name': 'copy_pipeline_to_emr',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': copy_args,
    },
}

# Step 2: submit etl.py to Spark with YARN as the master.
spark_args = ['spark-submit', '--master', 'yarn', 'etl.py']
spark_step = {
    'Name': 'run-capstone-pipeline',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': spark_args,
    },
}

# Queue both steps on the cluster created earlier, copy first. The list must
# reference copy_script_step; the previous name `copy_step` was never defined.
actions = emr.add_job_flow_steps(
    JobFlowId=emr_cluster['JobFlowId'],
    Steps=[copy_script_step, spark_step],
)
print('Added steps: {}'.format(actions))

In [None]:
# to terminate the cluster
# boto3 client methods accept keyword arguments only, and this call expects
# a list of job flow ids under JobFlowIds rather than a bare id string.
response = emr.terminate_job_flows(JobFlowIds=[emr_cluster['JobFlowId']])