In [None]:
# S3-Sample-Loader
import boto3
import configparser
import os

### Check S3 storage for a partial set of the data, download JSON files and load them into DB
print("DATA SAMPLE LOAD ....")

# Some configurations and initialisations
numfil = 10 # number of sample files to download
config = configparser.ConfigParser()
config.read('dwh.cfg')
KEY=config.get('AWS','KEY')
SECRET= config.get('AWS','SECRET')
BUCKET = config.get('S3', 'S3_BUCKET')
songdatafiles = list(())
eventfiles = list(())

# Connect to S3
print("Setting up S3 connection for ", BUCKET)
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                     )
projectBucket =  s3.Bucket(BUCKET)

# Check S3 for song data using "song-data" as prefix
print("Searching song data")
for songfile in projectBucket.objects.filter(Prefix="song-data"):
    if "json" in songfile.key:
        songdatafiles.append(songfile.key)
print("Finished checking, found ", len(songdatafiles), " files")

# Check S3 for events using "log_data" as prefi
print("Searching log events data")
for eventfile in projectBucket.objects.filter(Prefix="log_data"):
    if "json" in eventfile.key:
        eventfiles.append(eventfile.key)
print("Finished checking, found ", len(eventfiles), " files")

# Just use the first x files and download them locally
partialsongdata = songdatafiles[0:(numfil-1)]
partialevents = eventfiles[0:(numfil-1)]

# Download song data
print("Starting download:")
for key in partialsongdata:
    fullpath = ('s3://udacity-dend/' + key)
    splitted_path = fullpath.split("/")
    filename = str(splitted_path[-1])
    location = "tmp/" + filename
    print("Downloading ... ", fullpath, " to ", os.getcwd())
    with open(location, 'wb') as file:
        s3.Bucket("udacity-dend").download_file(key, location)
print("Downloaded ", len(partialsongdata), " files")

# Download event data
print("Checking for event data on S3 storage...")
print("Starting download:")
for key in partialevents:
    fullpath = ('s3://udacity-dend/' + key)
    splitted_path = fullpath.split("/")
    filename = str(splitted_path[-1])
    location = "tmp/" + filename
    print("Downloading ... ", fullpath, " to ", os.getcwd())
    with open(location, 'wb') as file:
        s3.Bucket("udacity-dend").download_file(key, location)
   
print("Downloaded ", len(partialsongdata), " files")
print("DATA SAMPLE LOAD FROM S3 DONE")

In [29]:
# S3-File List Creator (creates a manifest JSON file)
import boto3
import configparser
import os
from datetime import datetime
import pandas as pd

### Check S3 storage for a partial set of the data, download JSON files and load them into DB
print("FILE LIST CREATION START....")

# Some configurations and initialisations
numfil = 10 # number of sample files to download
config = configparser.ConfigParser()
config.read('dwh.cfg')
KEY=config.get('AWS','KEY')
SECRET= config.get('AWS','SECRET')
BUCKET = config.get('S3', 'S3_BUCKET')
S3_BCKT_ROOT = config.get('S3', 'S3_BCKT_ROOT')
songdatafiles = list(())
eventfiles = list(())

# Connect to S3
print("Setting up S3 connection for ", BUCKET)
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                     )
projectBucket =  s3.Bucket(BUCKET)

# Check S3 for song data using "song-data" as prefix
print(datetime.now())
print("Searching song data")
for songfile in projectBucket.objects.filter(Prefix="song-data").limit(6):
    if "json" in songfile.key:
        songdatafiles.append(S3_BCKT_ROOT + '/song_data/' + songfile.key)
print("Finished checking, found ", len(songdatafiles), " files")

# Check S3 for events using "log_data" as prefi
print("Searching log events data")
for eventfile in projectBucket.objects.filter(Prefix="log_data").limit(6):
    if "json" in eventfile.key:
        eventfiles.append(S3_BCKT_ROOT + '/song_data/' + eventfile.key)
print("Finished checking, found ", len(eventfiles), " files")
print(datetime.now())

import json
events = pd.DataFrame(eventfiles, columns=['url'])
events_str = events.to_json(orient='records')
events_str = json.dumps({ 'entries': events_str})
events_str = events_str.replace('\\', '')
print(events_str)

with open('events.manifest', 'w') as f:
    f.write(events_str)
    f.close()
print('File ', f.name, ' created successfully')

songs = pd.DataFrame(eventfiles, columns=['url'])
songs_str = songs.to_json(orient='records')
songs_str = json.dumps({ 'entries': songs_str})
songs_str = songs_str.replace('\\', '')
print(songs_str)

with open('songs.manifest', 'w') as f:
    f.write(songs_str)
    f.close()

print('File ', f.name, ' created successfully')
print("SCRIPT DONE")

FILE LIST CREATION START....
Setting up S3 connection for  udacity-dend
2021-01-18 11:14:16.531331
Searching song data
Finished checking, found  5  files
Searching log events data
Finished checking, found  5  files
2021-01-18 11:14:17.949877
{"entries": "[{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-01-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-02-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-03-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-04-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-05-events.json"}]"}
File  events.manifest  created successfully
{"entries": "[{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-01-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-02-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-11-03-events.json"},{"url":"s3://udacity-dend/song_data/log_data/2018/11/2018-1

In [17]:
import socket
DWH_ENDPOINT = 'dwhclustersts.cdzpwfcnkher.us-west-2.redshift.amazonaws.com'
DWH_PORT = '439'
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.connect((DWH_ENDPOINT, DWH_PORT))
    print(s.getpeername())

TypeError: an integer is required (got type str)

In [None]:
# Data Load into staging tables
import pandas as pd
import psycopg2
import configparser
import os
from sql_queries import staging_events_insert, staging_songs_insert

# Some configurations and initialisations
config = configparser.ConfigParser()
config.read('dwh.cfg')
# Search "tmp/" for JSON files to upload into DB
filelist = os.listdir('tmp/')
print("LOAD DATA TO DB STAGING TABLES")
print("Found ", len(filelist), " files")
print(filelist)

# Set up connection
print("Connecting to database")
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
conn.set_session(autocommit=True)
cur = conn.cursor()

# Load data into DB
for filename in filelist:
    location = "tmp/" + filename
    print("Load ", location)
    if "events" in filename:
        df = pd.read_json(location, lines=True, dtype=True)
        events_table_columns = df[["artist", "auth", "firstName", "gender", "itemInSession", "lastName", "length", "level", "location", "method", "page", "registration", "sessionId", "song", "status", "ts", "userAgent", "userId"]]
        events_data = events_table_columns.values[0].tolist()
        try:
            cur.execute(staging_events_insert, events_data)
            print("Success")
        except psycopg2.Error as e:
            print("Insert failed: ", e)
    else:
        if not filename.startswith("."):
            df = pd.read_json(location, lines=True, dtype=True)
            songs_table_columns = df[["num_songs","artist_id","artist_latitude","artist_longitude","artist_location", "artist_name", "song_id", "title", "duration", "year"]]
            songs_data = songs_table_columns.values[0].tolist()
            try:
                cur.execute(staging_songs_insert, songs_data)
                print("Success")
            except psycopg2.Error as e:
                print("Insert failed: ", e)
 
conn.close()
print("LOADING DONE")

In [5]:
# Check if tables contain data
import configparser
import psycopg2
from sql_queries import create_table_queries, drop_table_queries
import boto3
from botocore.exceptions import ClientError
import pandas as pd
from datetime import datetime

def create_redshift_client(KEY, SECRET):
    try:
        print("1.1 Setting up connections") 
        redshift = boto3.client('redshift',
                               region_name="us-west-2",
                               aws_access_key_id=KEY,
                               aws_secret_access_key=SECRET
                               )
        print("SUCCESS creating REDSHIFT client ", type(redshift))
    except Exception as e:
        print("FAILED creating REDSHIFT client: ", e)
    return redshift

config = configparser.ConfigParser()
config.read('dwh.cfg')
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")
redshift_client = create_redshift_client(KEY, SECRET)
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
myClusterProps = redshift_client.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
params = [DWH_ENDPOINT, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT]

print("Trying db connection...")
try:
    conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*params))
    conn.set_session(autocommit=True)
    cur = conn.cursor()
    print("DB connected...")
except Exception as e:
    print(e)

select = "SELECT * FROM "
tables = ["staging_songs", "staging_events", "users", "songs", "time", "songplays"]
delim = ";"

for tab in tables:
    query = select + tab + delim
    cur.execute(query)
    response = cur.fetchall()
    print("Table ", tab, " has ", len(response), " entries.")
    
    
song_stage_columns=['st_song_id', 'num_songs', 'artist_id', 'artist_latitude', 'artist_longitude', 
                        'artist_location', 'artist_name', 'song_id', 'title', 'length', 'year']
event_stage_columns=['artist', 'auth', 'firstname', 'gender', 'iteminsession', 
                         'lastname', 'length', 'level', 'location', 'method', 'page', 'registration', 
                         'sessionid', 'song', 'status', 'ts', 'useragent', 'userid']
cur.execute('SELECT * from staging_events') 
staging_events =  pd.DataFrame(cur.fetchall(), columns=event_stage_columns)
staging_songs = pd.DataFrame(cur.fetchall(), columns=song_stage_columns)

with open('results.xlsx', 'w') as f:
    staging_events.to_excel(f, sheet_name='Staging_events')
    staging_songs.to_excel(f, sheet_name='Staging_songs')

    
print(datetime.now(), ': Selected ', len(staging_events), ' rows from staging_events')
staging_events = staging_events.drop_duplicates()
cur.execute('SELECT * from staging_songs') 
staging_songs = pd.DataFrame(cur.fetchall(), columns=song_stage_columns)
print(datetime.now(), ': Selected ', len(staging_songs), ' rows from staging_songs')
staging_songs = staging_songs.drop_duplicates()
song_columns=['song_id','title','artist_id','year','length']
songplay_columns=['ts', 'userid', 'level', 'song_id', 'artist_id', 'sessionid', 'location', 'useragent']
artist_columns=['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']


# Filter events table for songplay events and remove missing data for key columns
events_df = staging_events.query('page == "NextSong"').copy()
events_df.dropna(subset=['userid', 'ts', 'song'], how='any')

artist_df = staging_songs[artist_columns].copy()
artist_df.dropna(subset=['artist_name'])
artist_df.drop_duplicates(subset='artist_id')
#print(artist_df.head())

songs_df = staging_songs[song_columns].copy()
songs_df.drop_duplicates('song_id')
sp_events = events_df[['ts', 'userid', 'level', 'sessionid', 'location', 'useragent', 'song']]
sp_songs = songs_df[['song_id', 'artist_id', 'title']]
sp_df = sp_events.merge(sp_songs, left_on='song', right_on='title')
sp_df = sp_df[songplay_columns]
print(songs_df.loc[songs_df['title'] == 'Becoming Insane'].head(), events_df.loc[events_df['song'] == 'Becoming Insane'].head())
#print(sp_df.head())
    
conn.close()

print("Connection closed")

1.1 Setting up connections
SUCCESS creating REDSHIFT client  <class 'botocore.client.Redshift'>
Trying db connection...
DB connected...
Table  staging_songs  has  14896  entries.
Table  staging_events  has  8056  entries.
Table  users  has  105  entries.
Table  songs  has  29792  entries.
Table  time  has  8056  entries.
Table  songplays  has  3432  entries.


IllegalCharacterError: 

In [None]:
import pandas as pd
from datetime import datetime
load = %load_ext sql
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
connected = %sql $conn_string

#%sql insert into songplays (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) select e.ts, e.userid, e.level, s.song_id, s.artist_id, e.sessionid, e.location, e.useragent from songs as s left outer join staging_events as e on e.song like s.title where e.page like 'NextSong'
#%sql select e.ts, e.userid, e.level, s.song_id, s.artist_id, e.sessionid, e.location, e.useragent from songs as s left outer join staging_events as e on e.song like s.title where e.page like 'NextSong'

%sql select * from songplays

In [91]:
import os
import glob
import psycopg2
import pandas as pd
from sql_queries import *


def process_song_file(cur, filepath):
    """Takes the json file provided in filepath and reads the file
    Then song data and artist data are selected and inserted into the corresponding tables"""
    # Open song file using the path variable "filepath" and create a dataframe from it
    df = pd.read_json(filepath, lines=True)
    
    # Insert songs record
    # Select song data columns from df and store in a new dataframe
    songs_table_columns = df[["num_songs","artist_id","artist_latitude","artist_longitude","artist_location", "artist_name", "song_id", "title", "duration", "year"]]

    # Select only the values from df and store in a list
    song_data = songs_table_columns.values[0].tolist()
    # Write to songs table
    cur.execute(song_table_insert, song_data)
    
def process_event_file(cur, filepath)
    df = pd.read_json(filepath, lines=True)
    events_table_columns = df[["artist", "auth", "firstName", "gender", "itemInSession", "lastName", "length", "level", "location", "method", "page", "registration", "sessionId", "song", "status", "ts", "userAgent", "userId"]]
    events_data = events_table_columns.values[0].tolist()

    
def process_data(cur, conn, filepath, func):
    """Recursively search given filepath and process all json files.
    Calls for each file the function "func" (which is either directing from here to "process_song_file" or "process_log_file
    Nothing returned from here, but the changes from the sub-functions are committed"""
    # et all files matching extension from directory
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))

    # get total number of files found
    num_files = len(all_files)
    print('{} files found in {}'.format(num_files, filepath))

    # iterate over files and process
    for i, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(i, num_files))

SyntaxError: invalid syntax (<ipython-input-91-69571cc4365a>, line 23)

In [18]:
### CREATE IAM ROLE AND POLICY
from botocore.exceptions import ClientError
import boto3
import configparser
import pandas as pd
import json

# Read config items from file and setup connections
print("1.1 Reading config file") 
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")
DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")
pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })
print("Done")

try:
    print("1.2 Setting up connections") 
    iam = boto3.client('iam',
                           region_name="us-west-2",
                           aws_access_key_id=KEY,
                           aws_secret_access_key=SECRET
                           )
    print("SUCCESS creating IAM client ", type(iam))
except Exception as e:
    print("FAILED creating IAM client: ", e)

try:
    print("1.3 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
print("1.4 Attaching Policy")
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']
print("1.5 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
print(roleArn)

1.1 Reading config file
Done
1.2 Setting up connections
SUCCESS creating IAM client  <class 'botocore.client.IAM'>
1.3 Creating a new IAM Role
1.4 Attaching Policy
1.5 Get the IAM role ARN
arn:aws:iam::422675603730:role/dwhRole


In [None]:
### CREATE REDSHIFT CLUSTER
from botocore.exceptions import ClientError
import boto3
import configparser
import pandas as pd
import json

# Read config items from file and setup connections
print("1.1 Reading config file") 
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")
DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")
pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })
print("Done")

try:
    print("1.1 Setting up connections") 
    redshift = boto3.client('redshift',
                           region_name="us-west-2",
                           aws_access_key_id=KEY,
                           aws_secret_access_key=SECRET
                           )
    print("SUCCESS creating REDSHIFT client ", type(redshift))
except Exception as e:
    print("FAILED creating REDSHIFT client: ", e)
contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset.



try:
    print("1.2 Create cluster") 
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        #NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        c
    )
    print("SUCCESS creating cluster: ", str(response.get("ClusterCreateTime")))
except Exception as e:
    print("FAILED creating cluster: ", e)

def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

In [None]:
# ENABLE TCP CONNECTION
from botocore.exceptions import ClientError
import boto3
import configparser
import pandas as pd

# Wait for Redshift instance to come up
redshift_waiter = redshift.get_waiter('cluster_available')
redshift_waiter.wait(ClusterIdentstr(sys.argv)ifier='dwhClusterSTS')
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]

try:
    print("1.1 Setting up connections") 
    ec2 = boto3.resource('ec2',
                           region_name="us-west-2",
                           aws_access_key_id=KEY,
                           aws_secret_access_key=SECRET
                           )
    print("SUCCESS creating EC2 client ", type(iam))
except Exception as e:
    print("FAILED creating EC2 client: ", e)

print("1.2 Open TCP Port")
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
    print("SUCCESS opening TCP Port")
except Exception as e:
    print("FAILED opening TCP Port ", e)

In [19]:
%load_ext sql
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
%sql $conn_string
# List Load Errors
%sql select filename, line_number, colname, type, position, raw_line, err_code, err_reason from stl_load_errors;
# Show Load Error details
%sql select d.query, substring(d.filename,14,20), d.line_number as line, substring(d.value,1,16) as value, substring(le.err_reason,1,48) as err_reason from stl_loaderror_detail d, stl_load_errors le where d.query = le.query and d.query = pg_last_copy_id();

 * postgresql://dwhuser:***@dwhclustersts.cdzpwfcnkher.us-west-2.redshift.amazonaws.com:5439/dwh
2 rows affected.
 * postgresql://dwhuser:***@dwhclustersts.cdzpwfcnkher.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


query,substring,line,value,err_reason


In [None]:
### DATA LOAD
import pandas as pd
import psycopg2
import configparser
import os
from datetime import datetime
from sql_queries import staging_events_insert, staging_songs_insert
from create_tables import create_client

# Some configurations and initialisations
now = datetime.now()
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")
redshift = create_client(KEY, SECRET, 'redshift')
config = configparser.ConfigParser()
config.read('dwh.cfg')
S3_BCKT_ROOT                    = config.get('S3','S3_BCKT_ROOT') + "/"
root = pd.Series(S3_BCKT_ROOT)
S3_ROOT                    = config.get('S3','S3_BUCKET')
LOG_JSONPATH                    = config.get('S3','LOG_JSONPATH')
SONG_DATA                    = config.get('S3','SONG_DATA')
LOG_DATA                    = config.get('S3','LOG_DATA')

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
region_name="us-west-2"

# Set up connection
print(now, ": Connecting to database")
conn = psycopg2.connect(host=DWH_ENDPOINT, dbname=DWH_DB, user=DWH_DB_USER, password=DWH_DB_PASSWORD, port=DWH_PORT)
conn.set_session(autocommit=True)
cur = conn.cursor()

# Search "tmp/" for JSON files to upload into DB
filelist = os.listdir('tmp/')
songsample = list()
eventsample = list()
print("LOAD DATA TO DB STAGING TABLES")
for file in filelist:
    if "json" in file:
        if "events" in file:
            eventsample.append(file)
        else:
            songsample.append(file)
print("Found ", (len(eventsample) + len(songsample)), " files")

s3songs = pd.Series(songdatafiles)
s3events = pd.Series(eventfiles)
s3songsample = pd.Series(s3songs[0:10])
s3songsample = root.values + s3songsample.values
s3eventsample = pd.Series(s3events[0:10])
s3eventsample = root.values + s3eventsample.values

for file in s3eventsample:
    copy_events = "copy staging_events from %s iam_role %s compupdate off blanksasnull emptyasnull region %s format as json \'auto ignorecase\'"
    try:
        cur.execute(copy_events, [file, DWH_ROLE_ARN, region_name])
        print(cur.fetchall())
    except Exception as e:
        print(e)
    print("Loading ", file, " done")
    
for file in s3songsample:
    copy_events = "copy staging_songs from %s iam_role %s compupdate off blanksasnull emptyasnull region %s format as json \'auto ignorecase\'"
    try:
        cur.execute(copy_events, [file, DWH_ROLE_ARN, region_name])
        print(cur.fetchall())
        print("Loading ", file, " done")
    except Exception as e:
        print(e)


In [None]:
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
#iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)