# EDA - Log Files

This Notebook is to analyze the Data after loading into Dummy Staging Tables with long Varchar Types

In [1]:
import configparser
import psycopg2
import pandas as pd

In [2]:
def func_connect_redshift():
    """Create a Connection to the Cluster"""
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    config_db_name = config.get('CLUSTER','db_name')
    config_db_user = config.get('CLUSTER','db_user')
    config_db_password = config.get('CLUSTER','db_password')
    config_dwh_end_point = config.get('CLUSTER','dwh_end_point')
    config_db_port = config.get('CLUSTER','db_port')
    
    conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(config_dwh_end_point,config_db_name,config_db_user,config_db_password,config_db_port))
    cur = conn.cursor()
    
    return conn,cur

In [3]:
conn, cur = func_connect_redshift()

In [4]:
conn

<connection object at 0x7f4f0dfd8508; dsn: 'user=sparkify_user password=xxx dbname=sparkify_db host=sparkify-dwh.ct9qgawfx2gi.us-west-2.redshift.amazonaws.com port=5439', closed: 0>

## Creating the Dummy Staging Schema and Tables for Data Ingestion then Profling

In [5]:
cur.execute('CREATE SCHEMA IF NOT EXISTS STAGING_SCHEMA;')

## Song Dummy Staging Table DDL

In [None]:
cur.execute("""
CREATE TABLE IF NOT EXISTS STAGING_SCHEMA.STG_DUMMY_SONG
(
    num_songs VARCHAR(500),
    artist_id VARCHAR(500),
    artist_latitude VARCHAR(500),
    artist_longitude VARCHAR(500),
    artist_location VARCHAR(500),
    artist_name VARCHAR(500),
    song_id VARCHAR(500),
    title VARCHAR(500),
    duration VARCHAR(500),
    year VARCHAR(500)
);
""")

In [None]:
conn.commit()

## Loading Song Data into the Dummy Staging Song Table

In [None]:
cur.execute("""
COPY STAGING_SCHEMA.STG_DUMMY_SONG 
from 's3://udacity-dend/song_data/' 
CREDENTIALS 'aws_iam_role=arn:aws:iam::131785130434:role/redshift_IAM_role'
JSON 'auto' REGION 'us-west-2';
""")

In [None]:
# conn.commit()

In [None]:
# Checking if there are any errors arose during the data load
cur.execute("""SELECT * FROM stl_load_errors LIMIT 5;""")

In [None]:
# No Errors during the Data Load
cur.fetchall()

In [None]:
cur.execute('SELECT COUNT(*) FROM STAGING_SCHEMA.STG_DUMMY_SONG')

In [None]:
# # As we can see the loaded data are 14896 which can be fitted into a pandas dataframe easily without any problems
# cur.fetchall()[0][0]

In [None]:
cur.execute('SELECT * FROM STAGING_SCHEMA.STG_DUMMY_SONG LIMIT 5;')

In [None]:
cur.fetchall()

## Log Dummy Staging Table DDL

In [6]:
cur.execute("""
CREATE TABLE IF NOT EXISTS STAGING_SCHEMA.STG_DUMMY_LOG
(
    artist VARCHAR(500),
    auth VARCHAR(500),
    firstname VARCHAR(500),
    gender VARCHAR(500),
    iteminsession VARCHAR(500),
    lastname VARCHAR(500),
    length VARCHAR(500),
    level VARCHAR(500),
    location VARCHAR(500),
    method VARCHAR(500),
    page VARCHAR(500),
    registration VARCHAR(500),
    sessionid VARCHAR(500),
    song VARCHAR(500),
    status VARCHAR(500),
    ts VARCHAR(500),
    useragent VARCHAR(500),
    userid VARCHAR(500)
)
""")

In [7]:
conn.commit()

## Loading Song Data into the Dummy Staging Log Table

In [None]:
# cur.execute("""DROP TABLE STAGING_SCHEMA.STG_DUMMY_LOG""")
# conn.commit()

In [8]:
cur.execute("""
COPY STAGING_SCHEMA.STG_DUMMY_LOG 
from 's3://udacity-dend/log_data/' 
CREDENTIALS 'aws_iam_role=arn:aws:iam::131785130434:role/redshift_IAM_role'
JSON 's3://udacity-dend/log_json_path.json' REGION 'us-west-2';
""")

In [9]:
conn.commit()

In [10]:
cur.execute("""SELECT COUNT(*) FROM STAGING_SCHEMA.STG_DUMMY_LOG;""")

In [11]:
cur.fetchall()

[(8056,)]

In [12]:
cur.execute("""
SELECT MAX(LEN(artist)),'artist' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(auth)),'auth' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(firstName)),'firstName' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(gender)),'gender' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(ItemInSession)),'ItemInSession' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(lastName)),'lastName' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(length)),'length' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(level)),'level' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(location)),'location' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(method)),'method' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(page)),'page' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(registration)),'registration' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(sessionId)),'sessionId' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(song)),'song' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(status)),'status' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(ts)),'ts' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(userAgent)),'userAgent' FROM STAGING_SCHEMA.STG_DUMMY_LOG
UNION
SELECT MAX(LEN(userId )),'userId' FROM STAGING_SCHEMA.STG_DUMMY_LOG
""")

In [13]:
cur.fetchall()

[(4, 'sessionId'),
 (18, 'length'),
 (89, 'artist'),
 (3, 'method'),
 (9, 'lastName'),
 (3, 'status'),
 (10, 'firstName'),
 (10, 'auth'),
 (46, 'location'),
 (13, 'ts'),
 (151, 'song'),
 (4, 'level'),
 (16, 'page'),
 (1, 'gender'),
 (3, 'userId'),
 (13, 'registration'),
 (139, 'userAgent'),
 (3, 'ItemInSession')]