#### Data Exploration
This notebook explores the data within ALL genres included in the original CSV. The model has been trained with a smaller number of genres. See ```data_exploration_model.ipynb``` for data exploration using only the data that the model was trained on.

In [2]:
# dependencies
import pandas as pd
import numpy as np
import requests

In [3]:
# read csv
songs_df = pd.read_csv('../Resources/songs_normalize.csv')
songs_df.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


#### Genre Distribution

In [4]:
# verify number of unique genres in csv
songs_df['genre'].nunique()

59

#### Top 5 Genres

In [5]:
# verify top 5 genres by count in csv
genre_counts = songs_df['genre'].value_counts()
genre_counts.head(5)

genre
pop                      428
hip hop, pop             277
hip hop, pop, R&B        244
pop, Dance/Electronic    221
pop, R&B                 178
Name: count, dtype: int64

In [6]:
# create dataframe containing rows with top 5 genres only
top_5 = [
    'pop', 
    'hip hop, pop', 
    'hip hop, pop, R&B', 
    'pop, Dance/Electronic', 
    'pop, R&B'
    ]

top_5_df = songs_df[songs_df['genre'].isin(top_5)]

# verify genre column only contains top 5 genres
top_5_df['genre'].unique()

array(['pop', 'hip hop, pop, R&B', 'pop, R&B', 'pop, Dance/Electronic',
       'hip hop, pop'], dtype=object)

In [7]:
# preview dataframe
top_5_df.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop
5,Sisqo,Thong Song,253733,True,1999,69,0.706,0.888,2,-6.959,1,0.0654,0.119,9.6e-05,0.07,0.714,121.549,"hip hop, pop, R&B"
8,Destiny's Child,Say My Name,271333,False,1999,75,0.713,0.678,5,-3.525,0,0.102,0.273,0.0,0.149,0.734,138.009,"pop, R&B"
10,Gigi D'Agostino,L'Amour Toujours,238759,False,2011,1,0.617,0.728,7,-7.932,1,0.0292,0.0328,0.0482,0.36,0.808,139.066,pop


#### Top 5 Genres (Energy, Danceability, Valence)

In [8]:
# verify avg energy, danceability, valence in top 5 genres

# ENERGY

print(f'Average Energy for Top 5 Genres:')

# pop
genre_1_energy = top_5_df[top_5_df['genre'] == 'pop']
avg_energy_1 = genre_1_energy['energy'].mean()
avg_energy_1

print(f'[pop] = {avg_energy_1:.4f}')

# hip hop, pop
genre_2_energy = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_energy_2 = genre_2_energy['energy'].mean()
avg_energy_2

print(f'[hip hop, pop] = {avg_energy_2:.4f}')

# hip hop, pop, R&B
genre_3_energy = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_energy_3 = genre_3_energy['energy'].mean()
avg_energy_3

print(f'[hip hop, pop, R&B] = {avg_energy_3:.4f}')

# pop, Dance/Electronic
genre_4_energy = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_energy_4 = genre_4_energy['energy'].mean()
avg_energy_4

print(f'[pop, Dance/Electronic] = {avg_energy_4:.4f}')

# pop, R&B
genre_5_energy = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_energy_5 = genre_5_energy['energy'].mean()
avg_energy_5

print(f'[pop, R&B] = {avg_energy_5:.4f}')

Average Energy for Top 5 Genres:
[pop] = 0.7164
[hip hop, pop] = 0.6987
[hip hop, pop, R&B] = 0.6619
[pop, Dance/Electronic] = 0.7602
[pop, R&B] = 0.6579


In [9]:
# verify avg energy, danceability, valence in top 5 genres

# DANCEABILITY

print(f'Average Danceability for Top 5 Genres')

# pop
genre_1_dance = top_5_df[top_5_df['genre'] == 'pop']
avg_dance_1 = genre_1_dance['danceability'].mean()
avg_dance_1

print(f'[pop] = {avg_dance_1:.4f}')

# hip hop, pop
genre_2_dance = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_dance_2 = genre_2_dance['danceability'].mean()
avg_dance_2

print(f'[hip hop, pop] = {avg_dance_1:.4f}')

# hip hop, pop, R&B
genre_3_dance = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_dance_3 = genre_3_dance['danceability'].mean()
avg_dance_3

print(f'[hip hop, pop, R&B] = {avg_dance_1:.4f}')

# pop, Dance/Electronic
genre_4_dance = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_dance_4 = genre_4_dance['danceability'].mean()
avg_dance_4

print(f'[pop, Dance/Electronic] = {avg_dance_4:.4f}')

# pop, R&B
genre_5_dance = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_dance_5 = genre_5_dance['danceability'].mean()
avg_dance_5

print(f'[pop] = {avg_dance_5:.4f}')

Average Danceability for Top 5 Genres
[pop] = 0.6479
[hip hop, pop] = 0.6479
[hip hop, pop, R&B] = 0.6479
[pop, Dance/Electronic] = 0.6548
[pop] = 0.6571


In [10]:
# verify avg energy, danceability, valence in top 5 genres

# VALENCE

print(f'Average Valence for Top 5 Genres')

# pop
genre_1_valence = top_5_df[top_5_df['genre'] == 'pop']
avg_valence_1 = genre_1_valence['valence'].mean()
avg_valence_1

print(f'[pop] = {avg_valence_1:.4f}')

# hip hop, pop
genre_2_valence = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_valence_2 = genre_2_valence['valence'].mean()
avg_valence_2

print(f'[hip hop, pop] = {avg_valence_2:.4f}')

# hip hop, pop, R&B
genre_3_valence = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_valence_3 = genre_3_valence['valence'].mean()
avg_valence_3

print(f'[hip hop, pop, R&B] = {avg_valence_3:.4f}')

# pop, Dance/Electronic
genre_4_valence = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_valence_4 = genre_4_valence['valence'].mean()
avg_valence_4
print(f'[pop, Dance/Electronic] = {avg_valence_4:.4f}')

# pop, R&B
genre_5_valence = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_valence_5 = genre_1_valence['valence'].mean()
avg_valence_5

print(f'[pop, R&B] = {avg_valence_5:.4f}')

Average Valence for Top 5 Genres
[pop] = 0.5549
[hip hop, pop] = 0.5560
[hip hop, pop, R&B] = 0.5759
[pop, Dance/Electronic] = 0.5287
[pop, R&B] = 0.5549


#### Top 5 Genres (Speechiness, Acousticness, Instrumentalness)

In [11]:
# verify avg speechiness, acousticness, instrumentalness in top 5 genres

# SPEECHINESS 

print(f'Average Speechiness for Top 5 Genres')

# pop
genre_1_speech = top_5_df[top_5_df['genre'] == 'pop']
avg_speech_1 = genre_1_speech['speechiness'].mean()
avg_speech_1

print(f'[pop] = {avg_speech_1:.4f}')

# hip hop, pop
genre_2_speech = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_speech_2 = genre_2_speech['speechiness'].mean()
avg_speech_2

print(f'[hip hop, pop] = {avg_speech_2:.4f}')

# hip hop, pop, R&B
genre_3_speech = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_speech_3 = genre_3_speech['speechiness'].mean()
avg_speech_3

print(f'[hip hop, pop, R&B] = {avg_speech_3:.4f}')

# pop, Dance/Electronic
genre_4_speech = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_speech_4 = genre_4_speech['speechiness'].mean()
avg_speech_4

print(f'[pop, Dance/Electronic] = {avg_speech_4:.4f}')
# pop, R&B
genre_5_speech = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_speech_5 = genre_5_speech['speechiness'].mean()
avg_speech_5

print(f'[pop, R&B] = {avg_speech_5:.4f}')

Average Speechiness for Top 5 Genres
[pop] = 0.0734
[hip hop, pop] = 0.1717
[hip hop, pop, R&B] = 0.1275
[pop, Dance/Electronic] = 0.0732
[pop, R&B] = 0.0956


In [15]:
# verify avg speechiness, acousticness, instrumentalness in top 5 genres

# ACOUSTICNESS
 
print(f'Average Acousticness for Top 5 Genres')

# pop
genre_1_acoustic = top_5_df[top_5_df['genre'] == 'pop']
avg_acoustic_1 = genre_1_acoustic['acousticness'].mean()
avg_acoustic_1

print(f'[pop] = {avg_acoustic_1:.4f}')

# hip hop, pop
genre_2_acoustic = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_acoustic_2 = genre_2_acoustic['acousticness'].mean()
avg_acoustic_2

print(f'[hip hop, pop] = {avg_acoustic_2:.4f}')
# hip hop, pop, R&B
genre_3_acoustic = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_acoustic_3 = genre_3_acoustic['acousticness'].mean()
avg_acoustic_3

print(f'[hip hop, pop, R&B] = {avg_acoustic_3:.4f}')

# pop, Dance/Electronic
genre_4_acoustic = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_acoustic_4 = genre_4_acoustic['acousticness'].mean()
avg_acoustic_4

print(f'[pop, Dance/Electronic] = {avg_acoustic_4:.4f}')

# pop, R&B
genre_5_acoustic = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_acoustic_5 = genre_5_acoustic['acousticness'].mean()
avg_acoustic_5

print(f'[pop, R&B] = {avg_acoustic_5:.4f}')

Average Acousticness for Top 5 Genres
[pop] = 0.1522
[hip hop, pop] = 0.0993
[hip hop, pop, R&B] = 0.1314
[pop, Dance/Electronic] = 0.1146
[pop, R&B] = 0.1826


In [13]:
# verify avg speechiness, acousticness, instrumentalness in top 5 genres

# INSTRUMENTALNESS

print(f'Average Instrumentalness for Top 5 Genres')

# pop
genre_1_instrument = top_5_df[top_5_df['genre'] == 'pop']
avg_instrument_1 = genre_1_instrument['instrumentalness'].mean()
avg_instrument_1

print(f'[pop] = {avg_instrument_1:.4f}')

# hip hop, pop
genre_2_instrument = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_instrument_2 = genre_2_instrument['instrumentalness'].mean()
avg_instrument_2

print(f'[hip hop, pop] = {avg_instrument_2:.4f}')

# hip hop, pop, R&B
genre_3_instrument = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_instrument_3 = genre_3_instrument['instrumentalness'].mean()
avg_instrument_3

print(f'[hip hop, pop, R&B] = {avg_instrument_3:.4f}')

# pop, Dance/Electronic
genre_4_instrument = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_instrument_4 = genre_4_instrument['instrumentalness'].mean()
avg_instrument_4

print(f'[pop, Dance/Electronic] = {avg_instrument_4:.4f}')

# pop, R&B
genre_5_instrument = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_instrument_5 = genre_5_instrument['instrumentalness'].mean()
avg_instrument_5

print(f'[pop, R&B] = {avg_instrument_5:.4f}')

Average Instrumentalness for Top 5 Genres
[pop] = 0.0075
[hip hop, pop] = 0.0021
[hip hop, pop, R&B] = 0.0049
[pop, Dance/Electronic] = 0.0345
[pop, R&B] = 0.0083


#### Top 5 Genres (Tempo)

In [16]:
# verify avg tempo in top 5 genres

print(f'Average Tempo for Top 5 Genres')

# pop
genre_1_tempo = top_5_df[top_5_df['genre'] == 'pop']
avg_tempo_1 = genre_1_tempo['tempo'].mean()
avg_tempo_1

print(f'[pop] = {avg_tempo_1:.4f}')

# hip hop, pop
genre_2_tempo = top_5_df[top_5_df['genre'] == 'hip hop, pop']
avg_tempo_2 = genre_2_tempo['tempo'].mean()
avg_tempo_2

print(f'[hip hop, pop] = {avg_tempo_2:.4f}')

# hip hop, pop, R&B
genre_3_tempo = top_5_df[top_5_df['genre'] == 'hip hop, pop, R&B']
avg_tempo_3 = genre_3_tempo['tempo'].mean()
avg_tempo_3

print(f'[hip hop, pop, R&B] = {avg_tempo_3:.4f}')

# pop, Dance/Electronic
genre_4_tempo = top_5_df[top_5_df['genre'] == 'pop, Dance/Electronic']
avg_tempo_4 = genre_4_tempo['tempo'].mean()
avg_tempo_4

print(f'[pop, Dance/Electronic] = {avg_tempo_1:.4f}')

# pop, R&B
genre_5_tempo = top_5_df[top_5_df['genre'] == 'pop, R&B']
avg_tempo_5 = genre_5_tempo['tempo'].mean()
avg_tempo_5

print(f'[pop, R&B] = {avg_tempo_5:.4f}')

Average Tempo for Top 5 Genres
[pop] = 120.5545
[hip hop, pop] = 118.9581
[hip hop, pop, R&B] = 115.1300
[pop, Dance/Electronic] = 120.5545
[pop, R&B] = 117.0981
