# EDA - Song Data Files

<p>This notebook analyzes the song data files to understand how to load the data, assess data quality, and define the data types and constraints.</p>

In [1]:
import pandas as pd
import os
from itertools import zip_longest
import numpy as np

In [2]:
def FUNC_GetAllSubFiles(param_dir):
    """Recursively collect the paths of all JSON files under a directory.

    Only file names ending in '.json' are kept, and any name containing
    'checkpoint' is skipped (presumably to exclude notebook checkpoint
    copies; confirm against the data layout).

    Args:
        param_dir: Root directory to walk.

    Returns:
        A tuple of paths (directory joined with file name) in the order
        produced by os.walk. The tuple is empty when nothing matches.
    """
    LIST_AllFiles = []

    for dir_path, _dir_names, file_names in os.walk(param_dir):
        for file_name in file_names:
            # Match on the extension rather than a substring, so that a name
            # such as 'x.json.bak' is not mistaken for a JSON file.
            if file_name.endswith('.json') and 'checkpoint' not in file_name:
                LIST_AllFiles.append(os.path.join(dir_path, file_name))

    return tuple(LIST_AllFiles)

In [3]:
# Collect the path of every song JSON file under data/song_data/ and
# display how many files will be processed
TUPLE_AllFiles = FUNC_GetAllSubFiles('data/song_data/')
len(TUPLE_AllFiles)

71

In [4]:
# Empty frame that fixes the expected song-file columns and their order
DF_AllLogs = pd.DataFrame(
    columns=[
        'artist_id',
        'artist_latitude',
        'artist_location',
        'artist_longitude',
        'artist_name',
        'duration',
        'num_songs',
        'song_id',
        'title',
        'year',
    ]
)

In [5]:
# Read every song file once, tag each frame with its source file name, and
# concatenate in a single call. DataFrame.append was removed in pandas 2.0,
# and appending inside a loop re-copies the accumulated frame on every pass.
LIST_FileFrames = []
for filePath in TUPLE_AllFiles:
    DF_Temp = pd.read_json(filePath, lines=True)
    # basename splits on the platform's path separator(s) instead of a hard-coded '/'
    DF_Temp["SRC_FILE"] = os.path.basename(filePath)
    LIST_FileFrames.append(DF_Temp)

# The empty slice of DF_AllLogs contributes only its column order, so re-running
# this cell rebuilds the frame instead of stacking duplicate rows onto it.
DF_AllLogs = pd.concat([DF_AllLogs.iloc[0:0], *LIST_FileFrames], sort=False)

In [6]:
# Preview the first rows of the combined frame
DF_AllLogs.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,SRC_FILE
0,AR8IEZO1187B99055E,,,,Marc Shaiman,149.86404,1,SOINLJW12A8C13314C,City Slickers,2008,TRABCRU128F423F449.json
0,AR558FS1187FB45658,,,,40 Grit,75.67628,1,SOGDBUF12A8C140FAA,Intro,2003,TRABCTK128F934B224.json
0,ARVBRGZ1187FB4675A,,,,Gwen Stefani,290.55955,1,SORRZGD12A6310DBC3,Harajuku Girls,2004,TRABCUQ128E0783E2B.json
0,ARNF6401187FB57032,40.79086,"New York, NY [Manhattan]",-73.96644,Sophie B. Hawkins,305.162,1,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,1994,TRABCIX128F4265903.json
0,AREVWGE1187B9B890A,-13.442,Noci (BA),-41.9952,Bitter End,282.43546,1,SOFCHDR12AB01866EF,Living Hell,0,TRABCYE128F934CE1D.json


In [7]:
# Column dtypes as loaded, before any explicit casting
DF_AllLogs.dtypes

artist_id            object
artist_latitude     float64
artist_location      object
artist_longitude    float64
artist_name          object
duration            float64
num_songs            object
song_id              object
title                object
year                 object
SRC_FILE             object
dtype: object

In [8]:
# Cast the count and year columns to 64-bit integers in one call. The width is
# spelled out because plain 'int' can resolve to a 32-bit type on some platforms
# (e.g. Windows with older NumPy), and the redundant copy=True argument is
# dropped since astype already returns a new object.
DF_AllLogs = DF_AllLogs.astype({'num_songs': 'int64', 'year': 'int64'})

In [9]:
# Keep a snapshot of the column dtypes (reused later to pick the object columns)
# and display it
SERIES_DTypes = DF_AllLogs.dtypes
SERIES_DTypes

artist_id            object
artist_latitude     float64
artist_location      object
artist_longitude    float64
artist_name          object
duration            float64
num_songs             int64
song_id              object
title                object
year                  int64
SRC_FILE             object
dtype: object

### Checking Uniqueness

In [10]:
# Total number of rows loaded
DF_AllLogs.shape[0]

71

In [11]:
# Number of distinct song_id values
len(DF_AllLogs['song_id'].unique())

71

<p>Each file contains exactly one song: 71 files yield 71 rows with 71 unique song_id values.</p>

### Checking Nullability

In [12]:
# Count of missing (NaN/None) values per column
DF_AllLogs.isna().sum()

artist_id            0
artist_latitude     40
artist_location      0
artist_longitude    40
artist_name          0
duration             0
num_songs            0
song_id              0
title                0
year                 0
SRC_FILE             0
dtype: int64

<p>The artist_latitude and artist_longitude columns contain nulls (40 rows each); these nulls will be replaced in Python with a default value.</p>

In [13]:
# Flag the columns that contain at least one empty string. The vectorized eq()
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
DF_AllLogs.eq("").any()

artist_id           False
artist_latitude     False
artist_location      True
artist_longitude    False
artist_name         False
duration            False
num_songs           False
song_id             False
title               False
year                False
SRC_FILE            False
dtype: bool

In [14]:
# Number of rows whose artist_location is an empty string
int((DF_AllLogs['artist_location'] == "").sum())

28

<p>The artist_location column contains 28 empty strings; these values need to be replaced in Python.</p>

In [15]:
# Preview only the song-level columns
DF_AllLogs.loc[:, ['duration', 'num_songs', 'song_id', 'title', 'year']].head()

Unnamed: 0,duration,num_songs,song_id,title,year
0,149.86404,1,SOINLJW12A8C13314C,City Slickers,2008
0,75.67628,1,SOGDBUF12A8C140FAA,Intro,2003
0,290.55955,1,SORRZGD12A6310DBC3,Harajuku Girls,2004
0,305.162,1,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,1994
0,282.43546,1,SOFCHDR12AB01866EF,Living Hell,0


In [16]:
# Count the rows whose year is 0 (43 here); these need to be handled in Python
int(DF_AllLogs['year'].eq(0).sum())

43

Get the maximum string length of each text column in order to size the columns in the table DDLs.

In [17]:
# Print the longest value of each object-dtype column. The columns are selected
# from the frame itself instead of an earlier dtype snapshot, and Series.max()
# skips missing values, whereas the built-in max() over raw values gives an
# order-dependent answer once a NaN is present.
for col in DF_AllLogs.select_dtypes(include='object').columns:
    print("{} : {}".format(col, DF_AllLogs[col].str.len().max()))

artist_id : 18
artist_location : 29
artist_name : 94
song_id : 18
title : 52
SRC_FILE : 23


In [18]:
# Show the row(s) holding the longest artist_name. The length is computed
# rather than hard-coded so the cell stays correct if the data changes.
SERIES_NameLen = DF_AllLogs['artist_name'].str.len()
DF_AllLogs[SERIES_NameLen == SERIES_NameLen.max()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,SRC_FILE
0,ARDR4AC1187FB371A1,,,,Montserrat Caballé;Placido Domingo;Vicente Sar...,511.16363,1,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,0,TRABBOR128F4286200.json


## Song Table Data Processing

In [19]:
def FUNC_ProcessSongData(param_df):
    """Select and clean the columns that make up the Song table.

    Args:
        param_df: DataFrame containing at least the columns song_id, title,
            artist_id, year and duration.

    Returns:
        A new DataFrame with those five columns, in that order, where a year
        of 0 or a missing (NaN/None) year is replaced by -9999. param_df is
        not modified. If processing fails, the error is printed and None is
        returned.
    """
    try:
        DF_SongData = param_df[['song_id', 'title', 'artist_id', 'year', 'duration']].copy()
        # -9999 is the default marker for a missing numeric value. pd.isna is used
        # because NaN never compares equal to anything, so a membership test such
        # as `x in (0, np.nan)` misses the NaN values coming out of a Series.
        DF_SongData['year'] = DF_SongData['year'].apply(
            lambda x: -9999 if (pd.isna(x) or x == 0) else x
        )

        return DF_SongData
    except Exception as e:
        print("ERROR: {}".format(e))
        return None

In [20]:
# Preview the processed Song table rows (default: first five)
FUNC_ProcessSongData(DF_AllLogs).head()

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOINLJW12A8C13314C,City Slickers,AR8IEZO1187B99055E,2008,149.86404
0,SOGDBUF12A8C140FAA,Intro,AR558FS1187FB45658,2003,75.67628
0,SORRZGD12A6310DBC3,Harajuku Girls,ARVBRGZ1187FB4675A,2004,290.55955
0,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,ARNF6401187FB57032,1994,305.162
0,SOFCHDR12AB01866EF,Living Hell,AREVWGE1187B9B890A,-9999,282.43546


In [21]:
# Wider preview: first ten processed Song rows, to see more of the replaced years
FUNC_ProcessSongData(DF_AllLogs).head(10)

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOINLJW12A8C13314C,City Slickers,AR8IEZO1187B99055E,2008,149.86404
0,SOGDBUF12A8C140FAA,Intro,AR558FS1187FB45658,2003,75.67628
0,SORRZGD12A6310DBC3,Harajuku Girls,ARVBRGZ1187FB4675A,2004,290.55955
0,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,ARNF6401187FB57032,1994,305.162
0,SOFCHDR12AB01866EF,Living Hell,AREVWGE1187B9B890A,-9999,282.43546
0,SONSKXP12A8C13A2C9,Native Soul,AR0IAWL1187B9A96D0,2003,197.19791
0,SODAUVL12A8C13D184,Prognosis,ARWB3G61187FB49404,2000,363.85914
0,SOSWKAV12AB018FC91,Midnight Star,ARULZCI1241B9C8611,-9999,335.51628
0,SOLYIBD12A8C135045,Music is what we love,AR051KA1187B98B2FF,-9999,261.51138
0,SOWQTQZ12A58A7B63E,Streets On Fire (Explicit Album Version),ARPFHN61187FB575F6,-9999,279.97995


In [22]:
# Distinct song_id values in the raw frame
len(DF_AllLogs['song_id'].unique())

71

In [23]:
# Distinct song_id values after processing (should match the raw count)
len(FUNC_ProcessSongData(DF_AllLogs)['song_id'].unique())

71

## Artist Table Data Processing

In [24]:
def FUNC_ProcessArtistData(param_df):
    """Select and clean the columns that make up the Artist table.

    Args:
        param_df: DataFrame containing at least the columns artist_id,
            artist_name, artist_location, artist_latitude and
            artist_longitude.

    Returns:
        A new DataFrame with those five columns, in that order, where an empty
        or missing artist_location becomes 'Unknown' and missing coordinates
        become -9999. Rows are not de-duplicated. param_df is not modified.
        If processing fails, the error is printed and None is returned.
    """
    try:
        DF_ArtistData = param_df[['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']].copy()

        # Empty or missing locations become 'Unknown'. The earlier test
        # `x == "" or None` only ever matched the empty string.
        DF_ArtistData['artist_location'] = DF_ArtistData['artist_location'].apply(
            lambda x: 'Unknown' if (pd.isna(x) or x == "") else x
        )
        # Missing coordinates get the default missing-numeric marker -9999. The
        # result is assigned back to the frame because an in-place replace on a
        # column pulled out of the frame is not guaranteed to write through
        # (it does not under pandas Copy-on-Write).
        DF_ArtistData['artist_latitude'] = DF_ArtistData['artist_latitude'].fillna(-9999)
        DF_ArtistData['artist_longitude'] = DF_ArtistData['artist_longitude'].fillna(-9999)

        return DF_ArtistData
    except Exception as e:
        print("ERROR: {}".format(e))
        return None

In [25]:
# Preview the first ten processed Artist rows
FUNC_ProcessArtistData(DF_AllLogs).head(10)

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,AR8IEZO1187B99055E,Marc Shaiman,Unknown,-9999.0,-9999.0
0,AR558FS1187FB45658,40 Grit,Unknown,-9999.0,-9999.0
0,ARVBRGZ1187FB4675A,Gwen Stefani,Unknown,-9999.0,-9999.0
0,ARNF6401187FB57032,Sophie B. Hawkins,"New York, NY [Manhattan]",40.79086,-73.96644
0,AREVWGE1187B9B890A,Bitter End,Noci (BA),-13.442,-41.9952
0,AR0IAWL1187B9A96D0,Danilo Perez,Panama,8.4177,-80.11278
0,ARWB3G61187FB49404,Steve Morse,"Hamilton, Ohio",-9999.0,-9999.0
0,ARULZCI1241B9C8611,Luna Orbit Project,Unknown,-9999.0,-9999.0
0,AR051KA1187B98B2FF,Wilks,Unknown,-9999.0,-9999.0
0,ARPFHN61187FB575F6,Lupe Fiasco,"Chicago, IL",41.88415,-87.63241


In [26]:
# Distinct artist_id values after processing
len(FUNC_ProcessArtistData(DF_AllLogs)['artist_id'].unique())

69

In [27]:
# Distinct artist_id values in the raw frame (should match the processed count)
len(DF_AllLogs['artist_id'].unique())

69