Load Data to Redshift


In [5]:
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config

config = Config(
   retries = {
      'max_attempts': 10,
      'mode': 'adaptive'
   }
)


iam = boto3.client('iam', config=config)
sts = boto3.client('sts')
redshift = boto3.client('redshift')
sm = boto3.client('sagemaker')
ec2 = boto3.client('ec2')

In [6]:
#sts policy

assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "redshift.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
} 

In [7]:
iam_redshift_role_name = 'team3-Redshift-role'

In [8]:
try:
    iam_role_redshift = iam.create_role(
        RoleName=iam_redshift_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='Team3 Redshift Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Role already exists")
    else:
        print("Unexpected error: %s" % e)

Role already exists


In [9]:
role = iam.get_role(RoleName='team3-Redshift-role')
iam_role_redshift_arn = role['Role']['Arn']
print(iam_role_redshift_arn)

arn:aws:iam::211125778552:role/team3-Redshift-role


In [10]:
account_id = sts.get_caller_identity()['Account']
print(account_id)

211125778552


In [11]:
policy_redshift_S3_Role = f'arn:aws:iam::aws:policy/AmazonS3FullAccess'
print(policy_redshift_S3_Role)



arn:aws:iam::aws:policy/AmazonS3FullAccess


In [12]:
policy_redshift_Role = f'arn:aws:iam::aws:policy/AmazonRedshiftFullAccess'
print(policy_redshift_Role)

#arn:aws:iam::aws:policy/AmazonRedshiftFullAccess#

arn:aws:iam::aws:policy/AmazonRedshiftFullAccess


In [13]:
# Attach AmazonRedshiftFullAccess policy
try:
    response = iam.attach_role_policy(
        PolicyArn=policy_redshift_Role,
        RoleName=iam_redshift_role_name
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is ok.")
    else:
        print("Unexpected error: %s" % e)

In [14]:
# Attach policy_redshift_S3_Role policy
try:
    response = iam.attach_role_policy(
        PolicyArn=policy_redshift_S3_Role,
        RoleName=iam_redshift_role_name
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is ok.")
    else:
        print("Unexpected error: %s" % e)

Set Up Trust Relationships

In [15]:
my_redshift_to_sagemaker_assumerole = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "redshift.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    },
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "sagemaker.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}  

In [16]:
try:
    response = iam.update_assume_role_policy(
        PolicyDocument=json.dumps(my_redshift_to_sagemaker_assumerole),
        RoleName=iam_redshift_role_name
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is ok.")
    else:
        print("Unexpected error: %s" % e)

## Set. Up Secrets

In [17]:
secretsmanager = boto3.client('secretsmanager')

try:
    response = secretsmanager.create_secret(
        Name='team3_redshift_login',
        Description='Team3 Redshift Login',
        SecretString='[{"username":"team3"},{"password":"Courtdata3"}]',
        Tags=[
            {
                'Key': 'name',
                'Value': 'team3_redshift_login'
            },
        ]
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'ResourceExistsException':
        print("Secret already exists. This is ok.")
    else:
        print("Unexpected error: %s" % e)

Secret already exists. This is ok.


In [18]:
import json

secret = secretsmanager.get_secret_value(SecretId='team3_redshift_login')
cred = json.loads(secret['SecretString'])

master_user_name = cred[0]['username']
master_user_pw = cred[1]['password']

print(master_user_pw)

courtdata


## Redshift Parameters

In [27]:
try:
    domain_id = sm.list_domains()['Domains'][0]['DomainId']
    describe_domain_response = sm.describe_domain(DomainId=domain_id)
    print(describe_domain_response)
    vpc_id = describe_domain_response['VpcId']
    security_groups = ec2.describe_security_groups()['SecurityGroups']
    for security_group in security_groups:
        if vpc_id == security_group['VpcId']:
            security_group_id = security_group['GroupId']
    print(security_group_id)    
except:
    pass

In [25]:
try:
    notebook_instance_name = sm.list_notebook_instances()['NotebookInstances'][0]['NotebookInstanceName']
    notebook_instance = sm.describe_notebook_instance(NotebookInstanceName=notebook_instance_name)
    security_group_id = notebook_instance['SecurityGroups'][0]
    print(security_group_id)    
except:
    pass

In [21]:
# Redshift configuration parameters
redshift_cluster_identifier = 'team3'
database_name = 'court-data'
cluster_type = 'multi-node'

# Note that only some Instance Types support Redshift Query Editor 
# (https://docs.aws.amazon.com/redshift/latest/mgmt/query-editor.html)
node_type = 'dc2.large'
number_nodes = '2' 

In [25]:
response = redshift.create_cluster(
        DBName=database_name,
        ClusterIdentifier=redshift_cluster_identifier,
        ClusterType=cluster_type,
        NodeType=node_type,
        NumberOfNodes=int(number_nodes),       
        MasterUsername=master_user_name,
        MasterUserPassword=master_user_pw,
        IamRoles=[iam_role_redshift_arn],
        VpcSecurityGroupIds=[security_group_id],
        Port=5439,
        PubliclyAccessible=False
)

print(response)

{'Cluster': {'ClusterIdentifier': 'team3', 'NodeType': 'dc2.large', 'ClusterStatus': 'creating', 'ClusterAvailabilityStatus': 'Modifying', 'MasterUsername': 'team3', 'DBName': 'court-data', 'AutomatedSnapshotRetentionPeriod': 1, 'ManualSnapshotRetentionPeriod': -1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-026d17753b6e52719', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-0a19e74ac58edb30f', 'PreferredMaintenanceWindow': 'thu:06:00-thu:06:30', 'PendingModifiedValues': {'MasterUserPassword': '****'}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNodes': 2, 'PubliclyAccessible': False, 'Encrypted': False, 'Tags': [], 'EnhancedVpcRouting': False, 'IamRoles': [{'IamRoleArn': 'arn:aws:iam::211125778552:role/team3-Redshift-role', 'ApplyStatus': 'adding'}], 'MaintenanceTrackName': 'current', 'DeferredMaint