In [43]:
import pandas as pd
import boto3
import json
import psycopg2
import configparser

## Loading Cluster Params

In [44]:
config = configparser.ConfigParser()
config.read_file(open('clusterparams.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":[DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Creating Redshift, S3 and IAM clients

In [45]:
iam = boto3.client("iam",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

redshift = boto3.client("redshift",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

s3 = boto3.resource("s3",
                    region_name="us-west-2",
                    aws_access_key_id=config.get('AWS','KEY'),
                    aws_secret_access_key=config.get('AWS','SECRET')
                    )

ec2 = boto3.resource("ec2",
                    region_name="us-west-2",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

## Creating an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [46]:
# Create the IAM role
try:
    print('Creating a new IAM Role')
    dwhRole = iam.create_role(
           Path='/',
            RoleName=DWH_IAM_ROLE_NAME,
            Description='Allow Redshift clusters to call AWS services on your behalf.',
            AssumeRolePolicyDocument=json.dumps(
                {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
               'Version': '2012-10-17'})
    )
except Exception as e:
    print(e)

Creating a new IAM Role


In [47]:
# Attach Policy
print('1.2 Attaching Policy')

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                      PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [50]:
# Get the IAM role ARN
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)["Role"]['Arn']

1.3 Get the IAM role ARN


# CREATING REDSHIFT CLUSTER

In [51]:
try:
    response = redshift.create_cluster(        
        # hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),
        
        # identifiers & credentials
            DBName=DWH_DB,
            ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
            MasterUsername=DWH_DB_USER,
            MasterUserPassword=DWH_DB_PASSWORD,
        
        # parameter for role (to allow s3 access)
         IamRoles=[roleArn]
       
    )
except Exception as e:
    print(e)

In [58]:
# See cluster status

def prettyRedshiftProps(props):
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)


Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.c2hvwovgwksn.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-6fa1a417
7,NumberOfNodes,4


In [59]:
cluster_address = myClusterProps['Endpoint']['Address']
print('Cluster Address:', cluster_address)

Cluster Address: dwhcluster.c2hvwovgwksn.us-west-2.redshift.amazonaws.com


In [60]:
IamRoleArn = myClusterProps['IamRoles'][0]['IamRoleArn']
print('IamRoleArn:', IamRoleArn)

IamRoleArn: arn:aws:iam::129357147351:role/dwhRole


## Opening an incoming TCP port to access the cluster ednpoint

In [None]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName= defaultSg.group_name, 
        CidrIp='0.0.0.0/0',  
        IpProtocol='TCP',  
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

# Loading DB Params

In [61]:
configETL = configparser.ConfigParser()
configETL.read_file(open('dwh.cfg'))
LOG_DATA = configETL.get("S3","LOG_DATA")
LOGPATH = configETL.get("S3","LOG_JSONPATH")
SONG_DATA = configETL.get("S3","SONG_DATA")
IAMROLE = configETL.get("IAM_ROLE", "ARN")

## CONNECTING TO CLUSTER DB

In [62]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*configETL['CLUSTER'].values()))
cur = conn.cursor()
cur

<cursor object at 0x000001CDA5BA43C0; closed: 0>

## CONNECTING to S3 "udacitiy-dend" Bucket and Preview list of files

In [16]:
song_data = [filename.key for filename in s3.Bucket("udacity-dend").objects.filter(Prefix='song_data')]
song_data[:5]

['song_data/',
 'song_data/A/A/A/TRAAAAK128F9318786.json',
 'song_data/A/A/A/TRAAAAV128F421A322.json',
 'song_data/A/A/A/TRAAABD128F429CF47.json',
 'song_data/A/A/A/TRAAACN128F9355673.json']

In [47]:
log_data = [filename.key for filename in s3.Bucket("udacity-dend").objects.filter(Prefix='log_data')]
log_data[:5]

['log_data/',
 'log_data/2018/11/2018-11-01-events.json',
 'log_data/2018/11/2018-11-02-events.json',
 'log_data/2018/11/2018-11-03-events.json',
 'log_data/2018/11/2018-11-04-events.json']

## DROP TABLES IF EXISTS

In [105]:
drop_staging_events = "DROP TABLE IF EXISTS staging_events"
drop_staging_songs = "DROP TABLE IF EXISTS staging_songs"
drop_fact_songplay = "DROP TABLE IF EXISTS fact_songplay"
drop_dim_users = "DROP TABLE IF EXISTS dim_users"
drop_dim_songs = "DROP TABLE IF EXISTS dim_songs"
drop_dim_artists = "DROP TABLE IF EXISTS dim_artists"
drop_dim_time = "DROP TABLE IF EXISTS dim_time"

tables_to_drop = [drop_staging_events,drop_staging_songs,drop_fact_songplay, 
                  drop_dim_users, drop_dim_songs,drop_dim_artists,drop_dim_time]

for table in tables_to_drop:
    cur.execute(table)
    conn.commit()
    print(table)

DROP TABLE IF EXISTS staging_events
DROP TABLE IF EXISTS staging_songs
DROP TABLE IF EXISTS fact_songplay
DROP TABLE IF EXISTS dim_users
DROP TABLE IF EXISTS dim_songs
DROP TABLE IF EXISTS dim_artists
DROP TABLE IF EXISTS dim_time


## DESIGNING STAGING, FACT & DIMENSION TABLES

In [106]:
# STAGING tables are used to stage before modeling into Star Schema

create_staging_events = ("""CREATE TABLE IF NOT EXISTS staging_events(
artist VARCHAR,
auth VARCHAR,
firstName VARCHAR,
gender VARCHAR,
itemInSession INTEGER,
lastName VARCHAR,
length FLOAT,
level VARCHAR,
location VARCHAR,
method VARCHAR,
page VARCHAR,
registration BIGINT,
sessionId INTEGER,
song VARCHAR,
status INTEGER,
ts TIMESTAMP,
userAgent VARCHAR,
userId INTEGER
)
""")


create_staging_songs = ("""CREATE TABLE IF NOT EXISTS staging_songs(
num_songs VARCHAR,
artist_id VARCHAR, 
artist_latitude FLOAT, 
artist_longitude FLOAT, 
artist_location VARCHAR, 
artist_name VARCHAR, 
song_id VARCHAR, 
title VARCHAR, 
duration FLOAT,
year INT
)
""")

create_fact_songplay = ("""CREATE TABLE IF NOT EXISTS fact_songplay
(
songplay_id INTEGER IDENTITY(0,1) PRIMARY KEY sortkey,
start_time TIMESTAMP,
user_id INTEGER, 
level VARCHAR, 
song_id VARCHAR,
artist_id VARCHAR,
session_id INTEGER,
location VARCHAR,
user_agent VARCHAR)
""")

create_dim_users = ("""CREATE TABLE IF NOT EXISTS dim_users
(
user_id INTEGER PRIMARY KEY distkey,
first_name VARCHAR,
last_name VARCHAR,
gender VARCHAR,
level VARCHAR)
""")

create_dim_songs = ("""CREATE TABLE IF NOT EXISTS dim_songs
(
song_id VARCHAR PRIMARY KEY,
title VARCHAR, 
artist_id VARCHAR distkey,
year INTEGER, 
duration FLOAT)
""")

create_dim_artists = ("""CREATE TABLE IF NOT EXISTS dim_artists
(
artist_id VARCHAR PRIMARY KEY distkey,
name VARCHAR, 
location VARCHAR, 
lattitude FLOAT, 
longitude FLOAT)
""")

create_dim_time = ("""CREATE TABLE IF NOT EXISTS dim_time
(
start_time TIMESTAMP PRIMARY KEY sortkey distkey, 
hour INTEGER, 
day INTEGER, 
week INTEGER, 
month INTEGER, 
year INTEGER, 
weekday INTEGER)
""")

## CREATING STAGING, FACT & DIMENSION TABLES

In [107]:
tables_to_create =[create_staging_events, create_staging_songs, create_fact_songplay, create_dim_users, create_dim_songs,
                   create_dim_artists, create_dim_time]

for table in tables_to_create:
    cur.execute(table)
    print("Table created")
    conn.commit()

Table created
Table created
Table created
Table created
Table created
Table created
Table created


### COPY staging_events

In [108]:
copy_staging_events = ("""
COPY staging_events FROM {}
CREDENTIALS 'aws_iam_role={}'
COMPUPDATE OFF region 'us-west-2'
TIMEFORMAT as 'epochmillisecs'
TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
FORMAT AS JSON {};
""").format(LOG_DATA, IAMROLE, LOGPATH)

cur.execute(copy_staging_events)
print("staging events copied")
conn.commit()

staging events copied


In [109]:
# PREVIEW staging_events
##  artist, auth, firstName, gender, itemInSession, lastName, length, level, location, method, page, registration, sessionId, song, status, ts, userAgent, userId

query = cur.execute("""SELECT * FROM staging_events""")
for i in range(2):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

('A Fine Frenzy', 'Logged In', 'Anabelle', 'F', 0, 'Simpson', 267.91138, 'free', 'Philadelphia-Camden-Wilmington, PA-NJ-DE-MD', 'PUT', 'NextSong', 1541044398796, 256, 'Almost Lover (Album Version)', 200, datetime.datetime(2018, 11, 5, 0, 33, 12, 796000), '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', 69)

('Nirvana', 'Logged In', 'Aleena', 'F', 0, 'Kirby', 214.77832, 'paid', 'Waterloo-Cedar Falls, IA', 'PUT', 'NextSong', 1541022995796, 237, 'Serve The Servants', 200, datetime.datetime(2018, 11, 5, 1, 27, 22, 796000), 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0', 44)



## COPY staging_songs

In [110]:
copy_staging_songs = ("""
COPY staging_songs FROM {}
CREDENTIALS 'aws_iam_role={}'
COMPUPDATE OFF region 'us-west-2'
FORMAT AS JSON 'auto'
TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
""").format(SONG_DATA, IAMROLE)

cur.execute(copy_staging_songs)
print("staging songs copied")
conn.commit()

staging songs copied


In [111]:
# PREVIEW staging_songs
## num_songs, artist_id,  artist_latitude ,  artist_longitude ,  artist_location,  artist_name,  song_id,  title,  duration, year

query = cur.execute("""SELECT * FROM staging_songs""")
for i in range(2):
        row = cur.fetchone()
        if row == None:
            break
        print(row)

('1', 'ARXQBR11187B98A2CC', None, None, 'Liverpool, England', 'Frankie Goes To Hollywood', 'SOBRKGM12A8C139EF6', 'Welcome to the Pleasuredome', 821.05424, 1985)
('1', 'AR62OVB1187FB48D09', None, None, None, 'H.O.S.H.', 'SOTUITZ12AB0181A80', 'Ben Johnson', 431.96036, 0)


## INSERT INTO FACT_songplay

In [112]:
# PREVIEW QUERY B4 INSERTING
query = cur.execute("""SELECT DISTINCT e.ts,
                e.userId as user_id,
                e.level as level,
                s.song_id as song_id,
                s.artist_id as artist_id,
                e.sessionId as session_id,
                e.location as location,
                e.userAgent as user_agent
FROM staging_events e
JOIN staging_songs s ON e.song = s.title AND e.artist = s.artist_name
WHERE e.page='NextSong'
""")

for i in range(2):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(datetime.datetime(2018, 11, 15, 21, 11, 36, 796000), 44, 'paid', 'SOCNCGL127D9786D66', 'AREHK7O1187B9ADDD7', 619, 'Waterloo-Cedar Falls, IA', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0')

(datetime.datetime(2018, 11, 7, 7, 58, 28, 796000), 100, 'free', 'SODTPBM12A8C1339D7', 'AR4OH581187B9B7157', 301, 'New York-Newark-Jersey City, NY-NJ-PA', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"')



In [113]:
cur.execute("""INSERT INTO fact_songplay (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent)
                                    SELECT DISTINCT e.ts,
                                                    e.userId as user_id,
                                                    e.level as level,
                                                    s.song_id as song_id,
                                                    s.artist_id as artist_id,
                                                    e.sessionId as session_id,
                                                    e.location as location,
                                                    e.userAgent as user_agent
                                    FROM staging_events e
                                    JOIN staging_songs s ON e.song = s.title AND e.artist = s.artist_name
                                    WHERE e.page='NextSong'
                                  """)

# Preview newly created fact_songplay table
query = cur.execute("""SELECT * FROM fact_songplay""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(0, datetime.datetime(2018, 11, 20, 17, 46, 38, 796000), 49, 'paid', 'SOCHRXB12A8AE48069', 'ARTDQRC1187FB4EFD4', 758, 'San Francisco-Oakland-Hayward, CA', 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0')

(15, datetime.datetime(2018, 11, 6, 16, 38, 15, 796000), 2, 'free', 'SOSMXVH12A58A7CA6C', 'AR6PJ8R1187FB5AD70', 126, 'Plymouth, IN', '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"')

(22, datetime.datetime(2018, 11, 18, 20, 48, 21, 796000), 29, 'paid', 'SOWGZFG12A8151AF41', 'ARC8CQZ1187B98DECA', 589, 'Atlanta-Sandy Springs-Roswell, GA', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"')



## INSERT INTO dim_users 

In [114]:
# PREVIEW QUERY B4 INSERTING
query = cur.execute("""SELECT DISTINCT userId as user_id,
                firstName as first_name,
                lastName as last_name,
                gender as gender,
                level as level
FROM staging_events
where userId IS NOT NULL;
""")

for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(69, 'Anabelle', 'Simpson', 'F', 'free')

(44, 'Aleena', 'Kirby', 'F', 'paid')

(52, 'Theodore', 'Smith', 'M', 'free')



In [115]:
cur.execute("""INSERT INTO dim_users(user_id, first_name, last_name, gender, level)
                        SELECT DISTINCT userId as user_id,
                                        firstName as first_name,
                                        lastName as last_name,
                                        gender as gender,
                                        level as level
                        FROM staging_events
                        where userId IS NOT NULL;

""")

# Preview newly created dim_users table 
query = cur.execute("""SELECT * FROM dim_users""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(43, 'Jahiem', 'Miles', 'M', 'free')

(26, 'Ryan', 'Smith', 'M', 'free')

(12, 'Austin', 'Rosales', 'M', 'free')



## INSERT INTO dim_songs

In [116]:
# PREVIEW query b4 inserting
query = cur.execute("""SELECT DISTINCT song_id, title, artist_id, year, duration
                    FROM staging_songs
                    WHERE song_id IS NOT NULL
                    """)
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)

('SOCOKEW12A8C136D13', 'Labyrinth', 'AR8Y6HV1187FB5546D', 2008, 352.67873)
('SOVISQQ12AB0184B6D', 'Sina Mory', 'AR3ZGUC1187FB57721', 2010, 267.59791)
('SOKBBXV12AB0186B8A', 'Papel Quemado', 'ARMIQRB12298900AFB', 0, 309.96853)


In [117]:
cur.execute("""INSERT INTO dim_songs (song_id, title, artist_id, year, duration)
                SELECT DISTINCT song_id, title, artist_id, year, duration
                FROM staging_songs
                WHERE song_id IS NOT NULL
                """)


# Preview newly created dim_users table 
query = cur.execute("""SELECT * FROM dim_songs""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")


('SOBBRKE12AB0181388', 'Tricky Woman', 'AR6NJ1P1187FB59DAB', 1995, 215.11791)

('SOMJJJT12AB0182379', 'Foolish Silence', 'ARISGWX119B8669508', 0, 276.89751)

('SONCPQC12A58A7D3A7', 'Spark', 'ARWHM281187FB3D381', 2002, 220.96934)



## INSERTING INTO dim_artist

In [118]:
# PREVIEW query b4 inserting
query = cur.execute("""SELECT DISTINCT artist_id, e.artist as name, s.artist_location, s.artist_latitude, s.artist_longitude
                        FROM staging_events e
                        JOIN staging_songs s ON e.artist = s.artist_name
                        WHERE e.artist IS NOT NULL
                    """)
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)

('AR6C8EJ1187FB3F473', 'Nelson Ned', ' Ubá, Minas Gerais', None, None)
('ARUB0K61187B9B9AC3', 'Hot Water Music', 'Gainesville, FL', 29.65195, -82.32318)
('ARSL5SP1187B9A7AE0', 'N.W.A.', 'Compton, California, USA.', 34.05349, -118.24532)


In [119]:
cur.execute("""INSERT INTO dim_artists(artist_id, name, location, lattitude, longitude)
                   SELECT DISTINCT artist_id, e.artist as name, s.artist_location, s.artist_latitude, s.artist_longitude
                   FROM staging_events e
                   JOIN staging_songs s ON e.artist = s.artist_name
                   WHERE e.artist IS NOT NULL
            """)

# Preview newly created dim_artist table 
query = cur.execute("""SELECT * FROM dim_artists""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

('ARB60IV1187FB370FE', 'Reverend Horton Heat', 'Corpus Christi, TX', 32.77815, -96.7954)

('AR1ZYLH1187B98C159', 'Naughty By Nature', 'East Orange, NJ', None, None)

('AR5J8XN1187B9B712E', 'Extreme', None, None, None)



## INSERTING INTO dim_time

In [120]:
# Preview query to insert
cur.execute("""SELECT DISTINCT ts, 
               extract(h from ts) AS hour, 
               extract(d from ts) AS day, 
               extract(w from ts) AS week, 
               extract(mon from ts) AS month, 
               extract(year from ts) AS year, 
               extract(dow from ts) AS weekday
               FROM staging_events WHERE ts IS NOT NULL
""")
cur.fetchone()

(datetime.datetime(2018, 11, 5, 0, 33, 12, 796000), 0, 5, 45, 11, 2018, 1)

In [121]:
cur.execute("""INSERT INTO dim_time(start_time, hour, day, week, month, year, weekday)
               SELECT DISTINCT ts, 
               extract(h from ts) AS hour, 
               extract(d from ts) AS day, 
               extract(w from ts) AS week, 
               extract(mon from ts) AS month, 
               extract(year from ts) AS year, 
               extract(dow from ts) AS weekday
               FROM staging_events WHERE ts IS NOT NULL
            """)

# Preview newly created dim_time table 
query = cur.execute("""SELECT * FROM dim_time""")
for i in range(3):
        row = cur.fetchone()
        if row == None:
            break
        print(row)
        print("")

(datetime.datetime(2018, 11, 1, 21, 52, 5, 796000), 21, 1, 44, 11, 2018, 4)

(datetime.datetime(2018, 11, 1, 22, 23, 14, 796000), 22, 1, 44, 11, 2018, 4)

(datetime.datetime(2018, 11, 2, 2, 42, 48, 796000), 2, 2, 44, 11, 2018, 5)



In [140]:
# Join each dim table to the fact table and preview it

cur.execute("""SELECT * FROM fact_songplay fs
JOIN dim_users on fs.user_id = dim_users.user_id
JOIN dim_artists on fs.artist_id = dim_artists.artist_id
JOIN dim_songs on fs.song_id = dim_songs.song_id
JOIN dim_time on fs.start_time = dim_time.start_time
""")
df = pd.DataFrame(cur.fetchone()).T


for value in list(df.values[0]):
    print(value)
df

182
2018-11-09 19:35:24.796000
36
paid
SOBJUKG12A58A7DCA8
AR9W3X91187FB3994C
392
Janesville-Beloit, WI
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"
36
Matthew
Jones
M
paid
AR9W3X91187FB3994C
Phil Collins
Chiswick, London, England
None
None
SOBJUKG12A58A7DCA8
Two Hearts
AR9W3X91187FB3994C
1988
204.19873
2018-11-09 19:35:24.796000
19
9
45
11
2018
5


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,182,2018-11-09 19:35:24.796,36,paid,SOBJUKG12A58A7DCA8,AR9W3X91187FB3994C,392,"Janesville-Beloit, WI","""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",36,...,AR9W3X91187FB3994C,1988,204.19873,2018-11-09 19:35:24.796,19,9,45,11,2018,5


# Delete your cluster and resources after no longer needed

In [141]:
# Delete Cluster
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.c2hvwovgwksn.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2021, 7, 30, 13, 57, 36, 686000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-d4539de2',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-6fa1a417',
  'AvailabilityZone': 'us-west-2d',
  'PreferredMaintenanceWindow': 'sun:07:30-sun:08:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible

In [142]:
# Check deletion status
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)


Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.c2hvwovgwksn.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-6fa1a417
7,NumberOfNodes,4


In [143]:
# Detach role policy & DELETE role
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)

{'ResponseMetadata': {'RequestId': '06950b7a-69b9-421f-b225-29f425e4c7c1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '06950b7a-69b9-421f-b225-29f425e4c7c1',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Fri, 30 Jul 2021 16:42:16 GMT'},
  'RetryAttempts': 0}}