In [128]:
import pandas as pd

# First, let's import the data and clean it.

## I'll take you through step by step. To start, let's import it!

In [129]:
# Load both raw CSV files; the relative paths resolve against the notebook's working directory.
Billboard_Charts = pd.read_csv(filepath_or_buffer='BillboardCharts.csv')
Spotify_Data = pd.read_csv(filepath_or_buffer='SpotifyTracks.csv')

### Let's do a quick check to make sure we properly sourced the data:

In [130]:
# Preview the first five rows of the Billboard frame to confirm the load worked.
Billboard_Charts.head(n=5)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14
3,2021-11-06,4,Fancy Like,Walker Hayes,4.0,3,19
4,2021-11-06,5,Bad Habits,Ed Sheeran,5.0,2,18


In [131]:
# Preview the first five rows of the Spotify frame to confirm the load worked.
Spotify_Data.head(n=5)

Unnamed: 0,number,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


### Next, let's check to see if either of our data sets have any missing data.

In [132]:
# Count the missing values in each Billboard column.
print(Billboard_Charts.isna().sum())

date                  0
rank                  0
song                  0
artist                0
last-week         32312
peak-rank             0
weeks-on-board        0
dtype: int64


In [133]:
# Count the missing values in each Spotify column.
print(Spotify_Data.isna().sum())

number              0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


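### Okay! So, now we see that there are a few missing values in the Spotify data. Let's drop those rows! (The Billboard data is only missing values in `last-week`, a column we remove later on, so we leave its rows alone.)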

In [134]:
# Keep only the rows with no missing value in any column.
# dropna() returns a new frame, so the raw Spotify_Data stays untouched.
Spotify_Data_Cleaned = Spotify_Data.dropna(axis='index', how='any')

### Let's make sure it dropped properly:

In [135]:
# Re-count missing values per column; every count should now be zero.
print(Spotify_Data_Cleaned.isna().sum())

number              0
track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


### Because both of these data sets have so many songs listed in them, let's make sure no duplicate rows have snuck their way into either data set, just in case.

In [136]:
# duplicated() with no `subset` flags a row only when every column matches an earlier row.
# NOTE(review): `number` looks like a per-row counter in the Spotify data, so a whole-row
# comparison may never flag a repeated track there; consider checking a subset such as
# `track_id` if repeated songs matter. TODO confirm.
billboard_duplicates = Billboard_Charts.duplicated()
spotify_duplicates = Spotify_Data_Cleaned.duplicated()

# Summing a boolean Series counts the True entries, i.e. the duplicate rows.
print(f'In the Billboard data, there are {billboard_duplicates.sum()} duplicate rows.')
print(f'In the Spotify Data, there are {spotify_duplicates.sum()} duplicate rows.')

In the Billboard data, there are 0 duplicate rows.
In the Spotify Data, there are 0 duplicate rows.


### Great! Now, here is where we drop some columns from the cleaned data. 

There are a lot of columns, and we don't need them all to get the information we are seeking!



In [None]:
# Trim both frames down to the columns the analysis keeps.
# `last-week` is the only Billboard column reported with missing values, so
# dropping it also clears the NaNs from that frame.
Billboard_Cleaned = Billboard_Charts.drop(columns=['last-week'])

# Remove the Spotify columns the notebook sets aside as unneeded.
# drop() returns a new frame, so Spotify_Data_Cleaned is left intact.
Spotify_Cleaned = Spotify_Data_Cleaned.drop(
    columns=['duration_ms', 'popularity', 'explicit', 'track_id', 'number']
)

### Let's check to make sure the correct columns are gone!

In [138]:
# Preview the first five rows to confirm `last-week` is gone.
Billboard_Cleaned.head(n=5)

Unnamed: 0,date,rank,song,artist,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,1,14
3,2021-11-06,4,Fancy Like,Walker Hayes,3,19
4,2021-11-06,5,Bad Habits,Ed Sheeran,2,18


In [139]:
# Preview the first five rows to confirm the dropped columns are gone.
Spotify_Cleaned.head(n=5)

Unnamed: 0,artists,album_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,Comedy,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,Hold On,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic
