In [3]:

import pandas as pd
import boto3
import json
import psycopg2
import configparser
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Loading Cluster Params

In [4]:
config = configparser.ConfigParser()
config.read_file(open('clusterparams.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":[DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })




Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Creating Redshift, S3 and IAM clients

In [78]:
iam = boto3.client("iam",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

iam_resource = boto3.resource("iam",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

redshift = boto3.client("redshift",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

s3 = boto3.resource("s3",
                    region_name="us-west-2",
                    aws_access_key_id=config.get('AWS','KEY'),
                    aws_secret_access_key=config.get('AWS','SECRET')
                    )

ec2 = boto3.resource("ec2",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

## Creating an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [111]:
# Create the IAM role
try:
    print('Creating a new IAM Role')
    dwhRole = iam.create_role(
           Path='/',
            RoleName="dwhRole",
            Description='Allow Redshift clusters to call AWS services on your behalf.',
            AssumeRolePolicyDocument=json.dumps(
                {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
               'Version': '2012-10-17'})
    )
except Exception as e:
    print(e)

Creating a new IAM Role


In [112]:
# Attach Policy
print('1.2 Attaching Policy')

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [113]:
# Get the IAM role ARN
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)["Role"]['Arn']

1.3 Get the IAM role ARN


# CREATING REDSHIFT CLUSTER

In [114]:
try:
    response = redshift.create_cluster(        
        # hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),
        
        # identifiers & credentials
            DBName=DWH_DB,
            ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
            MasterUsername=DWH_DB_USER,
            MasterUserPassword=DWH_DB_PASSWORD,
        
        # parameter for role (to allow s3 access)
         IamRoles=[roleArn]
       
    )
except Exception as e:
    print(e)

In [7]:
# See cluster status

def prettyRedshiftProps(props):
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)


Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,{'Address': 'dwhcluster.c2hvwovgwksn.us-west-2...
6,VpcId,vpc-6fa1a417
7,NumberOfNodes,4


In [11]:
redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]['Endpoint']['Address']

'dwhcluster.c2hvwovgwksn.us-west-2.redshift.amazonaws.com'

In [124]:
def get_VpcId(props):
    keysToShow = ['VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
vpcid = get_VpcId(myClusterProps)['Value'][0]
vpcid

'vpc-6fa1a417'

In [None]:
cluster_address = myClusterProps['Endpoint']['Address']
print('Cluster Address:', cluster_address)

In [None]:
IamRoleArn = myClusterProps['IamRoles'][0]['IamRoleArn']
print('IamRoleArn:', IamRoleArn)

## Opening an incoming TCP port to access the cluster ednpoint

In [55]:
ec2.Vpc(id=myClusterProps['VpcId'])
list(vpc.security_groups.all())[0]

ec2.SecurityGroup(id='sg-d4539de2')

In [147]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
 
    defaultSg.authorize_ingress(
        GroupName= defaultSg.group_name, 
        CidrIp='0.0.0.0/0',  
        IpProtocol='TCP',  
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-090660dff8f84df61')
ec2.SecurityGroup(id='sg-090660dff8f84df61')


In [157]:
ec2 = boto3.client("ec2",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

ec2.describe_security_groups(Filters= [{"Name": "group-name", "Values": "haha"}])

ParamValidationError: Parameter validation failed:
Invalid type for parameter Filters[0].Values, value: haha, type: <class 'str'>, valid types: <class 'list'>, <class 'tuple'>

In [129]:
list(vpc.security_groups.all())[0]

ec2.SecurityGroup(id='sg-d4539de2')

In [177]:
newsecname = 'secbamid1'

ec2.describe_security_groups(Filters= [{"Name": "group-name", "Values":['SG_Name']}])

{'SecurityGroups': [{'Description': 'Redshift security group',
   'GroupName': 'SG_Name',
   'IpPermissions': [{'FromPort': 5439,
     'IpProtocol': 'tcp',
     'IpRanges': [{'CidrIp': '0.0.0.0/0'}],
     'Ipv6Ranges': [],
     'PrefixListIds': [],
     'ToPort': 5439,
     'UserIdGroupPairs': []}],
   'OwnerId': '129357147351',
   'GroupId': 'sg-090660dff8f84df61',
   'IpPermissionsEgress': [{'IpProtocol': '-1',
     'IpRanges': [{'CidrIp': '0.0.0.0/0'}],
     'Ipv6Ranges': [],
     'PrefixListIds': [],
     'UserIdGroupPairs': []}],
   'VpcId': 'vpc-6fa1a417'}],
 'ResponseMetadata': {'RequestId': '9e640499-f2a5-46c7-871c-441bf79ece9b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9e640499-f2a5-46c7-871c-441bf79ece9b',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '1519',
   'date': 'Wed, 04 Aug 2021 17:14:24 GMT',
   'server': 'Amazon

In [209]:
import boto3
from botocore.exceptions import ClientError

ec2 = boto3.client("ec2",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )


def get_VpcId(props):
    keysToShow = ['VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
vpcid = get_VpcId(myClusterProps)['Value'][0]



def Create_Security_Group(Groupid):
    ec2 = boto3.client("ec2",
                        region_name="us-west-2",
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                        )

    try: 
        response = ec2.describe_security_groups(Filters= [{"Name": "group-name", "Values": [Groupid]}])
        
    except ClientError as e:
        print(f"Error: {e}")
        
    if len(response['SecurityGroups']) > 0:
        print('Security Group already exists: ' + response['SecurityGroups'][0]['GroupId'])
        return response['SecurityGroups'][0]['GroupId']
    
    else:
        response = None
        
    
    if response is None:
        # Assuming the security goroup doesn't exist, go ahead and create it
        try:
            ###### Make sure security group name is in config
            response = ec2.create_security_group(GroupName= Groupid,
                                                 Description='Redshift security group',
                                                 VpcId=get_VpcId(myClusterProps)['Value'][0])
           
            security_group_id = response['GroupId']
            print("Security Group created")
            print(security_group_id, get_VpcId(myClusterProps)['Value'][0])
        
            ec2.authorize_security_group_ingress(
            GroupId=str(security_group_id),
            CidrIp='0.0.0.0/0',  
            IpProtocol='TCP',  
            FromPort=int(DWH_PORT),
            ToPort=int(DWH_PORT)
        )
            return security_group_id
        except Exception as e:
            print(e)            

            
            
Create_Security_Group('sg_badconnect2')

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster dwhcluster not found.

In [173]:
ec2.describe_security_groups(Filters= [{"Name": "group-name", "Values": ['HAHAHA']}])

{'SecurityGroups': [],
 'ResponseMetadata': {'RequestId': 'd346cfa6-c50c-42fe-856d-06d5323d0fd9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd346cfa6-c50c-42fe-856d-06d5323d0fd9',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '243',
   'date': 'Wed, 04 Aug 2021 17:12:30 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

In [None]:
def create_cluster_security_group():
  
    try:
        response = ec2.describe_security_groups(GroupIds=['sg-d4539de2'])
    except ClientError as e:
        print(e)

  if len(response['SecurityGroups']) > 0:
    print('Security Group already exists: ' + response['SecurityGroups'][0]['GroupId'])
    return response['SecurityGroups'][0]['GroupId']
  else:
    response = None

  if response is None:
    vpc_id = config.get('SECURITY', 'VPC_ID')
    if vpc_id == "":
      response = ec2_client.describe_vpcs()
      vpc_id = response.get('Vpcs', [{}])[0].get('VpcId', '')

    try:
        response = ec2_client.create_security_group(GroupName=config.get('SECURITY', 'SG_Name'),Description='Redshift security group',VpcId=vpc_id)
        security_group_id = response['GroupId']
        print('Security Group Created %s in vpc %s.' % (security_group_id, vpc_id))

        ec2_client.authorize_security_group_ingress(
            GroupId=security_group_id,
            IpPermissions=[
                {'IpProtocol': 'tcp',
                 'FromPort': 80,
                 'ToPort': 80,
                 'IpRanges': [{'CidrIp': '0.0.0.0/0'}]},
                {'IpProtocol': 'tcp',
                 'FromPort': 5439,
                 'ToPort': 5439,
                 'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}
            ])
        return security_group_id
    except ClientError as e:
        print(e)

# Loading DB Params

In [85]:
configETL = configparser.ConfigParser()
configETL.read_file(open('func.cfg'))
LOG_DATA = configETL.get("S3","log_data")
LOGPATH = configETL.get("S3","log_jsonpath")
SONG_DATA = configETL.get("S3","song_data")
IAMROLE = configETL.get("AWS", "rolearn")

## CONNECTING TO CLUSTER DB

In [90]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*configETL['CLUSTER'].values()))
cur = conn.cursor()
cur

<cursor object at 0x000001E359C8A3C0; closed: 0>

## CONNECTING to S3 "udacitiy-dend" Bucket and Preview list of files

In [None]:
song_data = [filename.key for filename in s3.Bucket("udacity-dend").objects.filter(Prefix='song_data')]
song_data[:5]

In [None]:
log_data = [filename.key for filename in s3.Bucket("udacity-dend").objects.filter(Prefix='log_data')]
log_data[:5]

## DROP TABLES IF EXISTS

In [87]:
drop_staging_events = "DROP TABLE IF EXISTS staging_events"
drop_staging_songs = "DROP TABLE IF EXISTS staging_songs"
drop_fact_songplay = "DROP TABLE IF EXISTS fact_songplay"
drop_dim_users = "DROP TABLE IF EXISTS dim_users"
drop_dim_songs = "DROP TABLE IF EXISTS dim_songs"
drop_dim_artists = "DROP TABLE IF EXISTS dim_artists"
drop_dim_time = "DROP TABLE IF EXISTS dim_time"

tables_to_drop = [drop_staging_events,drop_staging_songs,drop_fact_songplay, 
                  drop_dim_users, drop_dim_songs,drop_dim_artists,drop_dim_time]

for table in tables_to_drop:
    cur.execute(table)
    conn.commit()
    print(table)

DROP TABLE IF EXISTS staging_events
DROP TABLE IF EXISTS staging_songs
DROP TABLE IF EXISTS fact_songplay
DROP TABLE IF EXISTS dim_users
DROP TABLE IF EXISTS dim_songs
DROP TABLE IF EXISTS dim_artists
DROP TABLE IF EXISTS dim_time


## DESIGNING STAGING, FACT & DIMENSION TABLES

In [88]:
# STAGING tables are used to stage before modeling into Star Schema

create_staging_events = ("""CREATE TABLE IF NOT EXISTS staging_events(
artist VARCHAR,
auth VARCHAR,
firstName VARCHAR,
gender VARCHAR,
itemInSession INTEGER,
lastName VARCHAR,
length FLOAT,
level VARCHAR,
location VARCHAR,
method VARCHAR,
page VARCHAR,
registration BIGINT,
sessionId INTEGER,
song VARCHAR,
status INTEGER,
ts TIMESTAMP,
userAgent VARCHAR,
userId INTEGER
)
""")


create_staging_songs = ("""CREATE TABLE IF NOT EXISTS staging_songs(
num_songs VARCHAR,
artist_id VARCHAR, 
artist_latitude FLOAT, 
artist_longitude FLOAT, 
artist_location VARCHAR, 
artist_name VARCHAR, 
song_id VARCHAR, 
title VARCHAR, 
duration FLOAT,
year INT
)
""")

create_fact_songplay = ("""CREATE TABLE IF NOT EXISTS fact_songplay
(
songplay_id INTEGER IDENTITY(0,1) PRIMARY KEY sortkey,
start_time TIMESTAMP,
user_id INTEGER, 
level VARCHAR, 
song_id VARCHAR,
artist_id VARCHAR,
session_id INTEGER,
location VARCHAR,
user_agent VARCHAR)
""")

create_dim_users = ("""CREATE TABLE IF NOT EXISTS dim_users
(
user_id INTEGER PRIMARY KEY distkey,
first_name VARCHAR,
last_name VARCHAR,
gender VARCHAR,
level VARCHAR)
""")

create_dim_songs = ("""CREATE TABLE IF NOT EXISTS dim_songs
(
song_id VARCHAR PRIMARY KEY,
title VARCHAR, 
artist_id VARCHAR distkey,
year INTEGER, 
duration FLOAT)
""")

create_dim_artists = ("""CREATE TABLE IF NOT EXISTS dim_artists
(
artist_id VARCHAR PRIMARY KEY distkey,
name VARCHAR, 
location VARCHAR, 
lattitude FLOAT, 
longitude FLOAT)
""")

create_dim_time = ("""CREATE TABLE IF NOT EXISTS dim_time
(
start_time TIMESTAMP PRIMARY KEY sortkey distkey, 
hour INTEGER, 
day INTEGER, 
week INTEGER, 
month INTEGER, 
year INTEGER, 
weekday INTEGER)
""")

## CREATING STAGING, FACT & DIMENSION TABLES

In [89]:
tables_to_create =[create_staging_events, create_staging_songs, create_fact_songplay, create_dim_users, create_dim_songs,
                   create_dim_artists, create_dim_time]

for table in tables_to_create:
    cur.execute(table)
    print("Table created")
    conn.commit()

Table created
Table created
Table created
Table created
Table created
Table created
Table created


### COPY staging_events

In [20]:
copy_staging_events = ("""
COPY staging_events FROM {}
CREDENTIALS 'aws_iam_role={}'
COMPUPDATE OFF region 'us-west-2'
TIMEFORMAT as 'epochmillisecs'
TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
FORMAT AS JSON {};
""").format(LOG_DATA, IAMROLE, LOGPATH)

cur.execute(copy_staging_events)
print("staging events copied")
conn.commit()

staging events copied


In [21]:
# PREVIEW staging_events
##  artist, auth, firstName, gender, itemInSession, lastName, length, level, location, method, page, registration, sessionId, song, status, ts, userAgent, userId

query = cur.execute("""SELECT * FROM staging_events""")
for i in range(2):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(None, 'Logged In', 'Adler', 'M', 0, 'Barrera', None, 'free', 'New York-Newark-Jersey City, NY-NJ-PA', 'GET', 'Home', 1540835983796, 248, None, 200, datetime.datetime(2018, 11, 6, 2, 12, 44, 796000), '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"', 100)

('Gustavo Cerati', 'Logged In', 'Adler', 'M', 1, 'Barrera', 249.44281, 'free', 'New York-Newark-Jersey City, NY-NJ-PA', 'PUT', 'NextSong', 1540835983796, 248, 'Uno Entre 1000', 200, datetime.datetime(2018, 11, 6, 2, 13, 3, 796000), '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"', 100)



## COPY staging_songs

In [22]:
copy_staging_songs = ("""
COPY staging_songs FROM {}
CREDENTIALS 'aws_iam_role={}'
COMPUPDATE OFF region 'us-west-2'
FORMAT AS JSON 'auto'
TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
""").format(SONG_DATA, IAMROLE)

cur.execute(copy_staging_songs)
print("staging songs copied")
conn.commit()

staging songs copied


In [23]:
# PREVIEW staging_songs
## num_songs, artist_id,  artist_latitude ,  artist_longitude ,  artist_location,  artist_name,  song_id,  title,  duration, year

query = cur.execute("""SELECT * FROM staging_songs""")
for i in range(2):
        row = cur.fetchone()
        if row == None:
            break
        print(row)

('1', 'ARZ5H0P1187B98A1DD', 33.76672, -118.1924, 'Long Beach, CA', 'Snoop Dogg', 'SOAPERH12A58A787DC', 'The One And Only (Edited)', 230.42567, 0)
('1', 'ARNQAVF11F4C844C04', None, None, None, 'Despina Vandi', 'SOVXMTN12A8C135A18', 'OUTE ENA EFHARISTO', 303.09832, 0)


## INSERT INTO FACT_songplay

In [24]:
# PREVIEW QUERY B4 INSERTING
query = cur.execute("""SELECT DISTINCT e.ts,
                e.userId as user_id,
                e.level as level,
                s.song_id as song_id,
                s.artist_id as artist_id,
                e.sessionId as session_id,
                e.location as location,
                e.userAgent as user_agent
FROM staging_events e
JOIN staging_songs s ON e.song = s.title AND e.artist = s.artist_name
WHERE e.page='NextSong'
""")

for i in range(2):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(datetime.datetime(2018, 11, 30, 4, 57, 3, 796000), 49, 'paid', 'SOEMXXF12A6D4F932C', 'ARI4S0E1187B9B06C0', 1079, 'San Francisco-Oakland-Hayward, CA', 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0')

(datetime.datetime(2018, 11, 20, 1, 24, 48, 796000), 25, 'paid', 'SOULTKQ12AB018A183', 'ARKQQZA12086C116FC', 594, 'Marinette, WI-MI', '"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"')



In [25]:
cur.execute("""INSERT INTO fact_songplay (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent)
                                    SELECT DISTINCT e.ts,
                                                    e.userId as user_id,
                                                    e.level as level,
                                                    s.song_id as song_id,
                                                    s.artist_id as artist_id,
                                                    e.sessionId as session_id,
                                                    e.location as location,
                                                    e.userAgent as user_agent
                                    FROM staging_events e
                                    JOIN staging_songs s ON e.song = s.title AND e.artist = s.artist_name
                                    WHERE e.page='NextSong'
                                  """)

# Preview newly created fact_songplay table
query = cur.execute("""SELECT * FROM fact_songplay""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(0, datetime.datetime(2018, 11, 27, 18, 22, 58, 796000), 36, 'paid', 'SODFRAX12A8C13274B', 'ARP29T31187B98DD5F', 957, 'Janesville-Beloit, WI', '"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"')

(15, datetime.datetime(2018, 11, 13, 20, 20, 44, 796000), 29, 'paid', 'SOKUAEP12A8C13BE19', 'ARLLWJQ1187B9B06A7', 556, 'Atlanta-Sandy Springs-Roswell, GA', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"')

(22, datetime.datetime(2018, 11, 24, 15, 47, 42, 796000), 49, 'paid', 'SOCGOZK12A8151BD5D', 'ARM0P6Z1187FB4D466', 849, 'San Francisco-Oakland-Hayward, CA', 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0')



## INSERT INTO dim_users 

In [26]:
# PREVIEW QUERY B4 INSERTING
query = cur.execute("""SELECT DISTINCT userId as user_id,
                firstName as first_name,
                lastName as last_name,
                gender as gender,
                level as level
FROM staging_events
where userId IS NOT NULL;
""")

for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(100, 'Adler', 'Barrera', 'M', 'free')

(61, 'Samuel', 'Gonzalez', 'M', 'free')

(55, 'Martin', 'Johnson', 'M', 'free')



In [27]:
cur.execute("""INSERT INTO dim_users(user_id, first_name, last_name, gender, level)
                        SELECT DISTINCT userId as user_id,
                                        firstName as first_name,
                                        lastName as last_name,
                                        gender as gender,
                                        level as level
                        FROM staging_events
                        where userId IS NOT NULL;

""")

# Preview newly created dim_users table 
query = cur.execute("""SELECT * FROM dim_users""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(26, 'Ryan', 'Smith', 'M', 'free')

(12, 'Austin', 'Rosales', 'M', 'free')

(17, 'Makinley', 'Jones', 'F', 'free')



## INSERT INTO dim_songs

In [28]:
# PREVIEW query b4 inserting
query = cur.execute("""SELECT DISTINCT song_id, title, artist_id, year, duration
                    FROM staging_songs
                    WHERE song_id IS NOT NULL
                    """)
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)

('SODZURJ12A6D4F938C', "In God's Name (Album Version)", 'ARHK7VE1187B994E06', 2003, 308.45342)
('SOSGQQS12A8C137AF5', 'Mile High', 'AROYNG01187FB56BA9', 1987, 239.85587)
('SOMPEQV12AF72A616A', 'Nixon', 'AREMPER1187B9AEB42', 2005, 189.962)


In [29]:
cur.execute("""INSERT INTO dim_songs (song_id, title, artist_id, year, duration)
                SELECT DISTINCT song_id, title, artist_id, year, duration
                FROM staging_songs
                WHERE song_id IS NOT NULL
                """)


# Preview newly created dim_users table 
query = cur.execute("""SELECT * FROM dim_songs""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")


('SOUIZBU12A8C1458F8', 'By The Way', 'AR70CDT1187FB5796F', 1970, 181.57669)

('SOVDXIY12A8C142D40', 'Your Mantra', 'ARYVVLF1241B9CC03B', 0, 159.84281)

('SOAUGJA12AB01869AF', 'The Ambush', 'ARV2X851187FB41A78', 2004, 136.98567)



## INSERTING INTO dim_artist

In [30]:
# PREVIEW query b4 inserting
query = cur.execute("""SELECT DISTINCT artist_id, e.artist as name, s.artist_location, s.artist_latitude, s.artist_longitude
                        FROM staging_events e
                        JOIN staging_songs s ON e.artist = s.artist_name
                        WHERE e.artist IS NOT NULL
                    """)
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)

('ARN0GFV1187FB508CC', 'Wiz Khalifa', None, None, None)
('AR9GUZF1187FB4D1BC', '10_000 Maniacs', 'Jamestown, NY', None, None)
('ARR6LWJ1187FB44C8B', 'R.E.M.', 'Athens, GA', None, None)


In [31]:
cur.execute("""INSERT INTO dim_artists(artist_id, name, location, lattitude, longitude)
                   SELECT DISTINCT artist_id, e.artist as name, s.artist_location, s.artist_latitude, s.artist_longitude
                   FROM staging_events e
                   JOIN staging_songs s ON e.artist = s.artist_name
                   WHERE e.artist IS NOT NULL
            """)

# Preview newly created dim_artist table 
query = cur.execute("""SELECT * FROM dim_artists""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

('ARQT8QM1187FB3E3CB', 'The Bats', 'Christchurch, New Zealand', -43.53131, 172.6373)

('ARVWNTI1269FB34C59', 'SOJA', None, None, None)

('ARDGES11187B9B7DDF', 'Roxy Music', 'Newcastle', None, None)



## INSERTING INTO dim_time

In [32]:
# Preview query to insert
cur.execute("""SELECT DISTINCT ts, 
               extract(h from ts) AS hour, 
               extract(d from ts) AS day, 
               extract(w from ts) AS week, 
               extract(mon from ts) AS month, 
               extract(year from ts) AS year, 
               extract(dow from ts) AS weekday
               FROM staging_events WHERE ts IS NOT NULL
""")
cur.fetchone()

(datetime.datetime(2018, 11, 6, 2, 12, 44, 796000), 2, 6, 45, 11, 2018, 2)

In [33]:
cur.execute("""INSERT INTO dim_time(start_time, hour, day, week, month, year, weekday)
               SELECT DISTINCT ts, 
               extract(h from ts) AS hour, 
               extract(d from ts) AS day, 
               extract(w from ts) AS week, 
               extract(mon from ts) AS month, 
               extract(year from ts) AS year, 
               extract(dow from ts) AS weekday
               FROM staging_events WHERE ts IS NOT NULL
            """)

# Preview newly created dim_time table 
query = cur.execute("""SELECT * FROM dim_time""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(datetime.datetime(2018, 11, 1, 21, 1, 46, 796000), 21, 1, 44, 11, 2018, 4)

(datetime.datetime(2018, 11, 1, 21, 5, 52, 796000), 21, 1, 44, 11, 2018, 4)

(datetime.datetime(2018, 11, 2, 9, 1, 21, 796000), 9, 2, 44, 11, 2018, 5)



In [99]:
conn.commit()

In [104]:
# Join each dim table to the fact table and preview it

cur.execute("""SELECT * FROM fact_songplay fs
JOIN dim_users on fs.user_id = dim_users.user_id
JOIN dim_artists on fs.artist_id = dim_artists.artist_id
JOIN dim_songs on fs.song_id = dim_songs.song_id
JOIN dim_time on fs.start_time = dim_time.start_time
LIMIT 1
""")
df = pd.DataFrame(cur.fetchone()).T



df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,219,2018-11-15 10:44:29.796,80,paid,SOSDZFY12A8C143718,AR748W61187B9B6AB8,611,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",80,...,AR748W61187B9B6AB8,1999,226.11546,2018-11-15 10:44:29.796,10,15,46,11,2018,4


# Delete your cluster and resources after no longer needed

In [35]:
# Delete Cluster
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.c2hvwovgwksn.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2021, 8, 11, 16, 19, 24, 705000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-d4539de2',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-6fa1a417',
  'AvailabilityZone': 'us-west-2c',
  'PreferredMaintenanceWindow': 'sat:07:00-sat:07:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible

In [36]:
# Check deletion status
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)


Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,{'Address': 'dwhcluster.c2hvwovgwksn.us-west-2...
6,VpcId,vpc-6fa1a417
7,NumberOfNodes,4


In [84]:
# Detach role policy & DELETE 
iam.detach_role_policy(RoleName="dwhRole", PolicyArn='arn:aws:iam::aws:policy/ReadOnlyAccess')
iam.delete_role(RoleName="dwhRole")

NoSuchEntityException: An error occurred (NoSuchEntity) when calling the DetachRolePolicy operation: The role with name dwhRole cannot be found.

In [79]:
 iam.detach_role_policy(RoleName="dwhRole", PolicyArn=Policy_Arn)

ParamValidationError: Parameter validation failed:
Missing required parameter in input: "PolicyArn"

In [92]:
input()

h


'h'