# EDA - Song Files

This Notebook is to analyze the Data after loading into the main Staging Tables

The Main Objectives are
- Defining the best Data types for all the Columns 
- Checking the Quality of the Data to handle corrupted data
- Defining the PKs, Distribution & Sorting Keys for optimization

In [1]:
import configparser
import psycopg2
import pandas as pd
from IAC_create_redshift_cluster import func_connect_to_redshift

In [2]:
try:
    # Connecting to the Cluster
    conn, cur = func_connect_to_redshift('dwh.cfg')

    dict_conn_info = conn.get_dsn_parameters()

    print(" Connected to {}, Host: {}, User: {}".format \
            (
            dict_conn_info.get('dbname')
            , dict_conn_info.get('host')
            , dict_conn_info.get('user')
        )
    )

except Exception as e:
    print(" Faild to Connect to the Cluster, {}".format(e))
    sys.exit(-1)

 Connected to sparkify_db, Host: sparkify-dwh.ct9qgawfx2gi.us-west-2.redshift.amazonaws.com, User: sparkify_user


In [3]:
conn

<connection object at 0x7f6d1d03a048; dsn: 'user=sparkify_user password=xxx dbname=sparkify_db host=sparkify-dwh.ct9qgawfx2gi.us-west-2.redshift.amazonaws.com port=5439', closed: 0>

In [4]:
config = configparser.ConfigParser()
config.read('dwh.cfg')
config_dwh_arn = config.get('CLUSTER','dwh_arn')

In [5]:
cur.execute('CREATE SCHEMA IF NOT EXISTS STAGING_SCHEMA;')

## Song Data Staging Table DDL

In [6]:
cur.execute("""CREATE TABLE IF NOT EXISTS STAGING_SCHEMA.STG_SONG
(
    num_songs VARCHAR(10),
    artist_id VARCHAR(50),
    artist_latitude VARCHAR(50),
    artist_longitude VARCHAR(50),
    artist_location VARCHAR(250),
    artist_name VARCHAR(250),
    song_id VARCHAR(50),
    title VARCHAR(250),
    duration VARCHAR(50),
    year VARCHAR(10)
)""")
conn.commit()

In [7]:
cur.execute("""
COPY STAGING_SCHEMA.STG_SONG 
from 's3://udacity-dend/song_data/' 
CREDENTIALS 'aws_iam_role={}'
JSON 'auto' REGION 'us-west-2';
""".format(config_dwh_arn))

In [8]:
conn.commit()

In [9]:
cur.execute("""SELECT COUNT(*) FROM STAGING_SCHEMA.STG_SONG;""")

In [10]:
cur.fetchall()[0][0]

14896

In [11]:
cur.execute("""SELECT * FROM STAGING_SCHEMA.STG_SONG;""")

In [12]:
# This Dataframe contains all the Song Data
df_song_data = pd.DataFrame(data=cur.fetchall(),columns=['num_songs','artist_id','artist_latitude','artist_longitude','artist_location','artist_name','song_id','title','duration','year'])

## Songs Data Files Profiling

In [13]:
df_song_data.head()

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,ARBZIN01187FB362CC,1.32026,103.78871,27,Paris Hilton,SOERIDA12A6D4F8506,I Want You (Album Version),192.28689,2006
1,1,ARKIQCZ1187B9A7C7C,52.23974,-0.8857599999999999,"Northampton, Northamptonshire, En",Bauhaus,SOSIJKW12A8C1330E3,A God In An Alcove (Session Version),248.65914,0
2,1,ARQVORN11F50C4EFEC,,,,Bedlight For Blue Eyes,SOMFRKT12A8C146C67,Without You,165.38077,0
3,1,AR7WK5411A348EF5EA,48.85692,2.34121,PARIS - NANTES,Minitel Rose,SOTCOTZ12A8C136BCB,Elevator,248.31955,2008
4,1,AR0IT221187B999C4D,50.50101,4.47684,BELGIUM,The Weathermen,SOFJPHQ12A6D4FBA32,Let Them Come To Berlin,246.17751,0


In [14]:
df_song_data.describe(include='O')

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
count,14896,14896,5277.0,5277.0,14895.0,14896,14896,14896,14896.0,14896
unique,1,9553,1109.0,1110.0,2084.0,9936,14896,14402,8460.0,60
top,1,ARYPTWE1187FB49D64,40.71455,-74.00712,,Badly Drawn Boy,SOFRAZJ12A8C143CDB,Intro,199.78404,0
freq,14896,9,278.0,278.0,6694.0,9,1,25,11.0,4762


From the Above cell we can see the below:
- All Data loaded successfully in the DF
- We have data for 60 Years
- Song ID is the Unique column in the Song Data
- Many Null Values in artist_latitude & artist_longitude, and one null value in artist_location
- num_songs is useless column because it contains only one value

### Checking Data Completness

In [15]:
df_song_data.isnull().sum()

num_songs              0
artist_id              0
artist_latitude     9619
artist_longitude    9619
artist_location        1
artist_name            0
song_id                0
title                  0
duration               0
year                   0
dtype: int64

In [16]:
# Checking the empty Strings in all the Columns
df_song_data.applymap(lambda x:x=="").any()

num_songs           False
artist_id           False
artist_latitude     False
artist_longitude    False
artist_location      True
artist_name         False
song_id             False
title               False
duration            False
year                False
dtype: bool

In [17]:
# 6694 empty Strings in artist_location
len(df_song_data[df_song_data.artist_location == ''])

6694

In [18]:
# Checking '0' in the Data
df_song_data.applymap(lambda x:x=='0').any()

num_songs           False
artist_id           False
artist_latitude     False
artist_longitude    False
artist_location     False
artist_name         False
song_id             False
title               False
duration            False
year                 True
dtype: bool

In [19]:
# 4762 Zeroes in year column
len(df_song_data[df_song_data.year == '0'])

4762

In [20]:
# Checking if Year contains non integer Values
df_song_data[['year']].applymap(lambda x : x.isdigit()).all()

year    True
dtype: bool

### Checking the Lengthes of all the Columns

In [21]:
# Checking the Lengthes of all the Columns to define the approperiate Length & Data Type
for col in df_song_data.dtypes[df_song_data.dtypes.values == 'object'].index:
    print("{} : {}".format(col,max(df_song_data[col].str.len().values)))

num_songs : 1
artist_id : 18
artist_latitude : 19.0
artist_longitude : 22.0
artist_location : 176.0
artist_name : 177
song_id : 18
title : 173
duration : 18
year : 4


In [22]:
len(max(df_song_data[~df_song_data.artist_latitude.isnull()]['artist_latitude']))

18

In [23]:
len(max(df_song_data[~df_song_data.artist_longitude.isnull()]['artist_longitude']))

18

## Loading Song Table Data

In this Section we will define the DDL for Song Table & the Insert/Merge Query

14896 records should be inserted into the Song table

In [24]:
cur.execute("""CREATE SCHEMA IF NOT EXISTS SPARKIFY_SCHEMA;""")

In [25]:
conn.commit()

In [26]:
cur.execute("""CREATE TABLE IF NOT EXISTS SPARKIFY_SCHEMA.SONG_TBL
(
    SONG_ID VARCHAR(50),
    ARTIST_ID VARCHAR(50),
    DURATION DOUBLE PRECISION,
    SONG_TITLE VARCHAR(250) DEFAULT 'Unknwon',
    SONG_YEAR INTEGER DEFAULT -9999,
    PRIMARY KEY (SONG_ID,ARTIST_ID,DURATION)
);""")
conn.commit()

In [27]:
cur.execute("""
DELETE FROM SPARKIFY_SCHEMA.SONG_TBL 
USING STAGING_SCHEMA.STG_SONG
WHERE SONG_TBL.song_id = STG_SONG.SONG_ID
AND SONG_TBL.artist_id = STG_SONG.ARTIST_ID
AND SONG_TBL.duration = CAST(STG_SONG.DURATION AS DOUBLE PRECISION);
""")

In [28]:
conn.commit()

In [29]:
cur.rowcount

14896

In [30]:
cur.execute("""
INSERT INTO SPARKIFY_SCHEMA.SONG_TBL
SELECT DISTINCT 
TRIM(song_id),
TRIM(artist_id),
CAST(duration AS DOUBLE PRECISION) AS DURATION,
TRIM(title) AS SONG_TITLE,
CAST(CASE WHEN year = '0' THEN '-9999' ELSE year END AS INTEGER) AS SONG_YEAR
FROM STAGING_SCHEMA.STG_SONG;
""")

In [31]:
cur.rowcount

14896

In [32]:
conn.commit()

In [33]:
cur.execute('SELECT DISTINCT SONG_YEAR FROM SPARKIFY_SCHEMA.SONG_TBL')

In [34]:
cur.fetchall()

[(1999,),
 (-9999,),
 (2002,),
 (2008,),
 (2006,),
 (1994,),
 (1966,),
 (1977,),
 (1954,),
 (1996,),
 (1995,),
 (1991,),
 (1990,),
 (1988,),
 (1984,),
 (1972,),
 (1989,),
 (1981,),
 (1969,),
 (1963,),
 (1979,),
 (1973,),
 (1958,),
 (2000,),
 (2009,),
 (1992,),
 (1987,),
 (1971,),
 (1974,),
 (1927,),
 (1964,),
 (1944,),
 (1956,),
 (2005,),
 (1997,),
 (1998,),
 (2001,),
 (1975,),
 (2010,),
 (1985,),
 (1980,),
 (1965,),
 (1978,),
 (1959,),
 (1970,),
 (1962,),
 (1960,),
 (1961,),
 (1952,),
 (2004,),
 (2007,),
 (2003,),
 (1993,),
 (1983,),
 (1986,),
 (1982,),
 (1976,),
 (1968,),
 (1967,),
 (1957,)]

## Loading Artist Table Data

In [35]:
list_artist_cols = ['artist_id','artist_name','artist_location','artist_latitude','artist_longitude']

In [36]:
df_song_data[list_artist_cols].head()

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,ARBZIN01187FB362CC,Paris Hilton,27,1.32026,103.78871
1,ARKIQCZ1187B9A7C7C,Bauhaus,"Northampton, Northamptonshire, En",52.23974,-0.8857599999999999
2,ARQVORN11F50C4EFEC,Bedlight For Blue Eyes,,,
3,AR7WK5411A348EF5EA,Minitel Rose,PARIS - NANTES,48.85692,2.34121
4,AR0IT221187B999C4D,The Weathermen,BELGIUM,50.50101,4.47684


### Defining Artist Table PK

According to the Current design of the schema & provided data, we should extract Artist Records with Unique Artist Id, but according to the profiling 9553 out of 14896 are unique Artist Ids, so we will check if these IDs really represents the same data for all the Records with the same Artist Id

9553 Records should be inserted into Artist table

In [37]:
# None of the Columns is Unique to be a PK
for col in list_artist_cols:
    print(col+' : '+str(df_song_data[col].is_unique))

artist_id : False
artist_name : False
artist_location : False
artist_latitude : False
artist_longitude : False


In [38]:
len(df_song_data)

14896

In [39]:
df_song_data.columns

Index(['num_songs', 'artist_id', 'artist_latitude', 'artist_longitude',
       'artist_location', 'artist_name', 'song_id', 'title', 'duration',
       'year'],
      dtype='object')

According to the Cell above Artist ID is the approperiate PK for Artist Table

In [40]:
for i in range(1,len(list_artist_cols)+1):
    print(",".join(list_artist_cols[:i])+" : "+str(len(df_song_data[list_artist_cols[:i]].drop_duplicates())))

artist_id : 9553
artist_id,artist_name : 9993
artist_id,artist_name,artist_location : 10021
artist_id,artist_name,artist_location,artist_latitude : 10025
artist_id,artist_name,artist_location,artist_latitude,artist_longitude : 10025


The above cell shows that, there are many different Names for some artist Ids, so we will check the values and decide what is the best records to be extracted

In [41]:
# creating a RowNumber Column which will be 1's for all the Unique Artist Ids
df_song_data['RN_ARTIST_ID'] = df_song_data.groupby(['artist_id']).cumcount() + 1

In [42]:
df_song_data['RN_ARTIST_ID_NAME'] = df_song_data.groupby(['artist_id','artist_name']).cumcount() + 1

In [43]:
df_song_data.head()

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year,RN_ARTIST_ID,RN_ARTIST_ID_NAME
0,1,ARBZIN01187FB362CC,1.32026,103.78871,27,Paris Hilton,SOERIDA12A6D4F8506,I Want You (Album Version),192.28689,2006,1,1
1,1,ARKIQCZ1187B9A7C7C,52.23974,-0.8857599999999999,"Northampton, Northamptonshire, En",Bauhaus,SOSIJKW12A8C1330E3,A God In An Alcove (Session Version),248.65914,0,1,1
2,1,ARQVORN11F50C4EFEC,,,,Bedlight For Blue Eyes,SOMFRKT12A8C146C67,Without You,165.38077,0,1,1
3,1,AR7WK5411A348EF5EA,48.85692,2.34121,PARIS - NANTES,Minitel Rose,SOTCOTZ12A8C136BCB,Elevator,248.31955,2008,1,1
4,1,AR0IT221187B999C4D,50.50101,4.47684,BELGIUM,The Weathermen,SOFJPHQ12A6D4FBA32,Let Them Come To Berlin,246.17751,0,1,1


In [44]:
list_artist_cols = list_artist_cols+['RN_ARTIST_ID','RN_ARTIST_ID_NAME']
list_artist_cols

['artist_id',
 'artist_name',
 'artist_location',
 'artist_latitude',
 'artist_longitude',
 'RN_ARTIST_ID',
 'RN_ARTIST_ID_NAME']

In [45]:
# Array contains all unique Artist Ids with Multiple Names to be evaluated
arr_artist_ids = df_song_data[df_song_data.RN_ARTIST_ID != df_song_data.RN_ARTIST_ID_NAME].artist_id.unique()

In [46]:
df_song_data[df_song_data.artist_id.isin(arr_artist_ids)][list_artist_cols].sort_values(by=['artist_id','artist_name','RN_ARTIST_ID','RN_ARTIST_ID_NAME']).head(20)

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,RN_ARTIST_ID,RN_ARTIST_ID_NAME
1590,AR03BDP1187FB5B324,Britney Spears,"Kentwood, LA; Los Angeles, CA",34.05349,-118.24532,1,1
10472,AR03BDP1187FB5B324,Britney Spears,"Kentwood, LA; Los Angeles, CA",34.05349,-118.24532,3,2
2624,AR03BDP1187FB5B324,Britney Spears feat. Pharrell Williams,"Kentwood, LA; Los Angeles, CA",34.05349,-118.24532,2,1
7053,AR040M31187B98CA41,The Bug Featuring Ricky Ranking,,,,1,1
12333,AR040M31187B98CA41,The Bug Featuring Spaceape,,,,2,1
1728,AR04S8J1187FB48358,Clifford Brown,"Wilmington, DE",39.74023,-75.55083999999998,1,1
11321,AR04S8J1187FB48358,Clifford Brown / Max Roach Quintet,"Wilmington, DE",39.74023,-75.55083999999998,2,1
14125,AR04S8J1187FB48358,Clifford Brown / Max Roach Quintet,"Wilmington, DE",39.74023,-75.55083999999998,3,2
13581,AR065TW1187FB4C3A5,Nearly God,"Knowle West, Bristol, Avon, Engla",51.43558,-2.57518,4,1
3096,AR065TW1187FB4C3A5,Tricky,"Knowle West, Bristol, Avon, Engla",,,2,1


In [47]:
df_song_data[(df_song_data.artist_id == 'AR1Y2PT1187FB5B9CE')][list_artist_cols]

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,RN_ARTIST_ID,RN_ARTIST_ID_NAME
49,AR1Y2PT1187FB5B9CE,John Wesley,Brandon,27.94017,-82.32546999999998,1,1
1078,AR1Y2PT1187FB5B9CE,John Wesley_ John Wesley,Brandon,27.94017,-82.32546999999998,2,1


In [48]:
df_song_data[(df_song_data.artist_id == 'AR9YWMS1187FB43A34')][list_artist_cols]

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,RN_ARTIST_ID,RN_ARTIST_ID_NAME
1429,AR9YWMS1187FB43A34,Dinosaur Jr.,"Amherst, MA",,,1,1
1821,AR9YWMS1187FB43A34,Dinosaur Jr,"Amherst, MA",,,2,1
4332,AR9YWMS1187FB43A34,Dinosaur Jr.,"Amherst, MA",,,3,2


In [49]:
df_song_data[(df_song_data.artist_id == 'ARD46C811C8A414F3F')][list_artist_cols]

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,RN_ARTIST_ID,RN_ARTIST_ID_NAME
757,ARD46C811C8A414F3F,Kid Cudi,"Cleveland, Ohio",41.50471,-81.69074,1,1
2910,ARD46C811C8A414F3F,Kid Cudi / Kanye West / Common,"Cleveland, Ohio",41.50471,-81.69074,2,1


In [50]:
df_song_data[(df_song_data.artist_id == 'ARYPTWE1187FB49D64')][list_artist_cols]

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,RN_ARTIST_ID,RN_ARTIST_ID_NAME
59,ARYPTWE1187FB49D64,Aphex Twin,,,,1,1
80,ARYPTWE1187FB49D64,Polygon Window,,,,2,1
191,ARYPTWE1187FB49D64,Polygon Window,,,,3,2
691,ARYPTWE1187FB49D64,Aphex Twin,,,,4,2
4197,ARYPTWE1187FB49D64,Aphex Twin,,,,5,3
5945,ARYPTWE1187FB49D64,Aphex Twin,,,,6,4
10022,ARYPTWE1187FB49D64,Aphex Twin,,,,7,5
10461,ARYPTWE1187FB49D64,Aphex Twin,,,,8,6
13177,ARYPTWE1187FB49D64,Aphex Twin,,,,9,7


In [51]:
df_song_data[(df_song_data.artist_id == 'AR2AVSC1187B991634')][list_artist_cols]

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,RN_ARTIST_ID,RN_ARTIST_ID_NAME
5654,AR2AVSC1187B991634,Amon Tobin,Brazil,,,1,1
13014,AR2AVSC1187B991634,Cujo,Brazil,,,2,1
13445,AR2AVSC1187B991634,Amon Tobin,Brazil,,,3,2


### According to the Above Cells, Maultiple Artists Names can come to a single Artist ID, so we will pick the shortest String as it seems to be more informative, because there are multiple 'Featured' artists with the Main artist, so if we pick the Main one will be better.

In [52]:
cur.execute("""
CREATE TABLE IF NOT EXISTS SPARKIFY_SCHEMA.ARTIST_TBL
(
    ARTIST_ID VARCHAR(50),
    ARTIST_NAME VARCHAR(250),
    ARTIST_LOCATION VARCHAR(250) DEFAULT 'Unknown',
    ARTIST_LATITUDE DOUBLE PRECISION DEFAULT -9999,
    ARTIST_LONGITUTE DOUBLE PRECISION DEFAULT -9999,
    PRIMARY KEY (ARTIST_ID)
);
"""
)
conn.commit()

In [53]:
cur.execute("""
DELETE FROM SPARKIFY_SCHEMA.ARTIST_TBL 
USING STAGING_SCHEMA.STG_SONG
WHERE ARTIST_TBL.ARTIST_ID = STG_SONG.artist_id;
""")
conn.commit()

In [54]:
cur.execute("""SELECT COUNT(*) FROM SPARKIFY_SCHEMA.ARTIST_TBL""")

In [55]:
cur.fetchall()

[(0,)]

In [56]:
cur.execute("""
INSERT INTO SPARKIFY_SCHEMA.ARTIST_TBL
SELECT 
ARTIST_ID,
ARTIST_NAME,
ARTIST_LOCATION,
CAST(ARTIST_LATITUDE AS DOUBLE PRECISION),
CAST(ARTIST_LONGITUTE AS DOUBLE PRECISION)
FROM 
(
SELECT DISTINCT 
artist_id AS ARTIST_ID,
artist_name AS ARTIST_NAME,
ROW_NUMBER() OVER (PARTITION BY artist_id ORDER BY LEN(artist_name)) AS ROW_NUM,
CASE WHEN trim(artist_location) = '' OR artist_location IS NULL THEN 'Unknown' ELSE artist_location END AS ARTIST_LOCATION,
CASE WHEN trim(artist_latitude) = '' OR artist_latitude IS NULL THEN '-9999' ELSE artist_latitude END AS ARTIST_LATITUDE,
CASE WHEN trim(artist_longitude) = '' OR artist_longitude IS NULL THEN '-9999' ELSE artist_longitude END AS ARTIST_LONGITUTE
FROM staging_schema.stg_song 
) WHERE ROW_NUM = 1""")
conn.commit()

In [57]:
cur.rowcount

9553