# Create Redshift Database

## 1. Setup AWS Manual Way
Initially, set up your IAM roles, security groups, users, etc. manually before doing it in a programmatic way.


##### Create Amazon IAM role
+ [Create an IAM role](https://console.aws.amazon.com/iam/home#/home)
+ Ensure role has administrator access to redshift, ec2, s3 and other areas.
                     
##### Create Amazon Security Group
+ [Create an Amazon Security Group](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:)
+ Amazon Redshift needs port 5439 open (port range = 5439)

##### Launch a Redshift Cluster
+ [Launch Redshift Cluster](https://console.aws.amazon.com/redshift/)

##### Create IAM User
+ [Create IAM User](https://console.aws.amazon.com/iam/)
+ Ensure user has programmatic access
+ Attach policies for redshift, s3 and any other necessary policies

##### Create an S3 Bucket
+ [Create an S3 Bucket](https://s3.console.aws.amazon.com/s3/home?region=us-west-2#)

##### Create PostgreSQL RDS
+ [Create a PostgreSQL RDS](https://us-west-2.console.aws.amazon.com/rds/home?region=us-west-2)

## 2. Setup AWS Programmatic Way

After you have set up the initial AWS structure the manual way, you can create the Identity and Access Management (IAM) role and the Redshift cluster using Python.

In [None]:
#standard libraries
import os

In [None]:
# Step up to the parent directory, where the config files are expected
os.chdir(os.pardir)
# Last expression of the cell: the notebook echoes the new working directory
os.getcwd()

In [5]:
# Load configuration file
from configparser import ConfigParser

config = ConfigParser()
# read_file (unlike config.read) raises if the file is missing; the context
# manager makes sure the file handle is closed once parsing is done.
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# AWS access key pair read from the [AWS] section
KEY = config.get('AWS', 'key')
SECRET = config.get('AWS', 'secret')

In [6]:
# Create IAM Client

from boto3 import client

# Credentials come from the loaded configuration; the region is fixed to us-west-2.
iam = client(
    'iam',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [7]:
# Create the IAM Role that Redshift assumes to call AWS services
# (the S3 read-only policy is attached to it in a separate step).
# NOTE(review): the original note says high (Administrator-like) access is
# important here -- presumably for the credentials making this call; confirm.

from json import dumps
from botocore.exceptions import ClientError

DB_ROLE_NAME = config.get("CLUSTER", "DB_ROLE_NAME")

# Trust policy: allows the Redshift service to assume this role.
assume_role_policy = {
    'Statement': [{'Action': 'sts:AssumeRole',
                   'Effect': 'Allow',
                   'Principal': {'Service': 'redshift.amazonaws.com'}}],
    'Version': '2012-10-17',
}

try:
    print("Creating new IAM Role")
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DB_ROLE_NAME,
        Description="Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=dumps(assume_role_policy),
    )
except ClientError as e:
    # Re-running the notebook against an existing role is expected, so that
    # case is only reported. Any other failure (bad credentials, missing
    # permissions, ...) is re-raised instead of being hidden, because the
    # following steps need the role to exist.
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        raise
    print(e)


Creating new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name sparkify_redshift already exists.


In [8]:
# Attach Necessary Policies to Role
# Grants the role the AWS-managed AmazonS3ReadOnlyAccess policy

print('Attaching Policy')
attach_response = iam.attach_role_policy(
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
    RoleName=DB_ROLE_NAME,
)
# Last expression of the cell: the notebook echoes the HTTP status code
attach_response['ResponseMetadata']['HTTPStatusCode']

Attaching Policy


200

In [9]:
# Look up the role's Amazon Resource Name (ARN)
# roleArn is passed along when the Redshift cluster is created

role_description = iam.get_role(RoleName=DB_ROLE_NAME)
roleArn = role_description['Role']['Arn']
print(roleArn)

arn:aws:iam::341887061345:role/sparkify_redshift


In [None]:
# Create Redshift client
from boto3 import client

# Same credentials as the other AWS clients; region fixed to us-west-2.
redshift = client(
    'redshift',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name="us-west-2",
)

In [None]:
# Read the settings needed to create the database from the configuration file


def _cluster_setting(option):
    """Return *option* from the [CLUSTER] section of the loaded configuration."""
    return config.get("CLUSTER", option)


# Hardware
DB_CLUSTER_TYPE = _cluster_setting("DB_CLUSTER_TYPE")
DB_NUM_NODES = _cluster_setting("DB_NUM_NODES")
DB_NODE_TYPE = _cluster_setting("DB_NODE_TYPE")

# Identifiers, credentials and port (all kept as strings, as read from the file)
DB_HOST = _cluster_setting("HOST")
DB_NAME = _cluster_setting("DB_NAME")
DB_USER = _cluster_setting("DB_USER")
DB_PASSWORD = _cluster_setting("DB_PASSWORD")
DB_PORT = _cluster_setting("DB_PORT")

In [None]:
# Create Redshift Database

from botocore.exceptions import ClientError

try:
    response = redshift.create_cluster(
        # Parameters for hardware
        ClusterType=DB_CLUSTER_TYPE,
        NodeType=DB_NODE_TYPE,
        NumberOfNodes=int(DB_NUM_NODES),

        # Parameters for identifiers & credentials
        DBName=DB_NAME,
        ClusterIdentifier=DB_HOST,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,

        # Listen on the configured port: the same DB_PORT is used to open the
        # security group and to build the connection string, so the cluster
        # must not silently fall back to the service default.
        Port=int(DB_PORT),

        # Parameter for role s3 access
        IamRoles=[roleArn]
    )
except ClientError as e:
    # A cluster left over from an earlier run is fine and is only reported.
    # Any other failure is re-raised so it is not mistaken for success by
    # the steps that describe and connect to the cluster.
    if e.response['Error']['Code'] != 'ClusterAlreadyExists':
        raise
    print(e)

In [None]:
# Check on progress of creation and Redshift database type
import pandas as pd

def prettyRedshiftProps(props):
    """Return the main cluster properties as a two-column DataFrame.

    Args:
        props: mapping of cluster property names to values.

    Returns:
        A DataFrame with columns "Key" and "Value" holding only the
        properties listed in ``keysToShow``, in the order they appear in
        ``props``.
    """
    # None disables column-width truncation. The former spelling (-1) has been
    # deprecated since pandas 1.0 and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    keysToShow = ("ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername",
                  "DBName", "Endpoint", "NumberOfNodes", 'VpcId')
    rows = [(key, value) for key, value in props.items() if key in keysToShow]
    return pd.DataFrame(data=rows, columns=["Key", "Value"])

# Fetch the description of the cluster named DB_HOST and show its main properties
cluster_listing = redshift.describe_clusters(ClusterIdentifier=DB_HOST)
myClusterProps = cluster_listing['Clusters'][0]
prettyRedshiftProps(myClusterProps)

In [None]:
# Check on the progress of the database being created every minute

import time

starttime = time.time()
while True:
    # Re-read the description on every pass and compare its ClusterStatus
    # field (comparing the whole description dict to 'creating' is never
    # true, so the loop would not wait at all). Keeping the latest
    # description in myClusterProps means it reflects the cluster as it is
    # once the loop ends -- presumably fields such as the endpoint are only
    # filled in after creation; confirm.
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DB_HOST)['Clusters'][0]
    if myClusterProps['ClusterStatus'] != 'creating':
        break
    print("Creating Redshift")
    # Sleep until the next whole minute measured from starttime
    time.sleep(60.0 - ((time.time() - starttime) % 60.0))

In [None]:
# Pull the DB endpoint address and the attached role ARN out of the cluster
# description and print them (e.g. to copy into the configuration file)
DB_ENDPOINT = myClusterProps['Endpoint']['Address']
first_attached_role = myClusterProps['IamRoles'][0]
DB_ROLE_ARN = first_attached_role['IamRoleArn']

for label, value in (("DWH_ENDPOINT :: ", DB_ENDPOINT),
                     ("DWH_ROLE_ARN :: ", DB_ROLE_ARN)):
    print(label, value)

In [None]:
# Create EC2 Resource
from boto3 import resource

# Same credentials as the other AWS clients; region fixed to us-west-2.
ec2 = resource(
    'ec2',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [None]:
# Open incoming TCP Port to Access Endpoint if Not Done Already
# Best effort: any failure (e.g. the rule already exists) is printed, not raised.

try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    # Takes the first security group listed for the cluster's VPC.
    # NOTE(review): presumably this is the VPC's default group -- confirm.
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)

    # Single-port rule on DB_PORT, open to every IPv4 address
    ingress_rule = dict(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DB_PORT),
        ToPort=int(DB_PORT),
    )
    defaultSg.authorize_ingress(**ingress_rule)
except Exception as e:
    print(e)

In [None]:
# basic method not using psycopg2
%load_ext sql

conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DB_ENDPOINT, DB_PORT,DB_NAME)
print(conn_string)
%sql $conn_string