In [8]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress
pd.set_option('max_colwidth', 600)

# Dataset 1 - Spotify Songs

In [13]:
# Read the data into a Pandas DataFrame
raw_spotify_data_df= pd.read_csv('Raw/best_songs_on_spotify_from_2000_to_2023.csv',sep=";")

raw_spotify_data_df.head(100)

raw_spotify_data_df.rename(columns={'top genre':'Genre'},inplace=True)


In [14]:
# Define the mapping of genres
genre_mapping = {
    'hip hop|rap|r&b': 'Hip hop/Rap/R&b',
    'edm|electronic|house|dubstep|trance|electro': 'EDM',
    'pop|dance': 'Pop',
    'rock|metal|thrash|emo|alternative': 'Rock/Metal',
    'latin|reggaeton': 'Latin/Reggaeton'
}

# Iterate over each row in the DataFrame
for index, row in raw_spotify_data_df.iterrows():
    genre = row['Genre']
    updated_genre = 'Other'  # Default value if the genre doesn't match any criteria

    # Check if the genre matches any mapping
    for key, value in genre_mapping.items():
        if any(pd.Series(genre).str.contains(r'\b{}\b'.format(g), case=False, regex=True).any() for g in key.split('|')):
            updated_genre = value
            break  # Stop searching once a match is found

    # Update the 'Genre' column with the mapped genre
    raw_spotify_data_df.at[index, 'Genre'] = updated_genre
    
raw_spotify_data_df.head()



Unnamed: 0,title,artist,Genre,year,bpm,energy,danceability,dB,liveness,valence,duration,acousticness,speechiness,popularity
0,Flowers,Miley Cyrus,Pop,2023,118,68,71,-4,3,65,200,6,7,98
1,Cupid - Twin Ver.,FIFTY FIFTY,Pop,2023,120,59,78,-8,35,73,174,44,3,97
2,BESO,ROSALÍA,Pop,2023,95,64,77,-7,17,53,195,74,14,96
3,Boy's a liar Pt. 2,PinkPantheress,Other,2023,133,81,70,-8,25,86,131,25,5,96
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,Hip hop/Rap/R&b,2022,98,62,72,-6,8,17,222,42,5,96


In [16]:
cols = list(raw_spotify_data_df.columns)
renamed_cols = [str(x).capitalize() for x in cols]
print(renamed_cols)

['Title', 'Artist', 'Genre', 'Year', 'Bpm', 'Energy', 'Danceability ', 'Db', 'Liveness', 'Valence', 'Duration', 'Acousticness', 'Speechiness ', 'Popularity']


In [17]:
column_mapping = {}

for col in cols:
    
    col_index = cols.index(col)
    renamed_col = renamed_cols[col_index]
    column_mapping[col] = renamed_col
print(column_mapping)    

{'title': 'Title', 'artist': 'Artist', 'Genre': 'Genre', 'year': 'Year', 'bpm': 'Bpm', 'energy': 'Energy', 'danceability ': 'Danceability ', 'dB': 'Db', 'liveness': 'Liveness', 'valence': 'Valence', 'duration': 'Duration', 'acousticness': 'Acousticness', 'speechiness ': 'Speechiness ', 'popularity': 'Popularity'}


In [18]:
raw_spotify_data_df.rename(columns=column_mapping,inplace=True)

raw_spotify_data_df.head()

Unnamed: 0,Title,Artist,Genre,Year,Bpm,Energy,Danceability,Db,Liveness,Valence,Duration,Acousticness,Speechiness,Popularity
0,Flowers,Miley Cyrus,Pop,2023,118,68,71,-4,3,65,200,6,7,98
1,Cupid - Twin Ver.,FIFTY FIFTY,Pop,2023,120,59,78,-8,35,73,174,44,3,97
2,BESO,ROSALÍA,Pop,2023,95,64,77,-7,17,53,195,74,14,96
3,Boy's a liar Pt. 2,PinkPantheress,Other,2023,133,81,70,-8,25,86,131,25,5,96
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,Hip hop/Rap/R&b,2022,98,62,72,-6,8,17,222,42,5,96


# Exporting Cleaned Data

In [19]:
raw_spotify_data_df.to_csv("Clean/cleaned_spotify_data.csv",sep=";", index=False)

#raw_country_genres_df.to_csv("Clean/cleaned_country_data.csv", index=False)

In [30]:
raw_spotify_data_df["YearlyPopularityRank"] = raw_spotify_data_df.groupby("Year")["Popularity"].rank(ascending=False)

In [31]:
raw_spotify_data_df.head()

Unnamed: 0,Title,Artist,Genre,Year,Bpm,Energy,Danceability,Db,Liveness,Valence,Duration,Acousticness,Speechiness,Popularity,YearlyPopularityRank
0,Flowers,Miley Cyrus,Pop,2023,118,68,71,-4,3,65,200,6,7,98,1.0
1,Cupid - Twin Ver.,FIFTY FIFTY,Pop,2023,120,59,78,-8,35,73,174,44,3,97,2.0
2,BESO,ROSALÍA,Pop,2023,95,64,77,-7,17,53,195,74,14,96,4.5
3,Boy's a liar Pt. 2,PinkPantheress,Other,2023,133,81,70,-8,25,86,131,25,5,96,4.5
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,Hip hop/Rap/R&b,2022,98,62,72,-6,8,17,222,42,5,96,1.0


In [35]:
#Top 20 songs per year
spotify_top20_by_year= raw_spotify_data_df[raw_spotify_data_df["YearlyPopularityRank"] <= 20]

spotify_top20_by_year.head(100)

Unnamed: 0,Title,Artist,Genre,Year,Bpm,Energy,Danceability,Db,Liveness,Valence,Duration,Acousticness,Speechiness,Popularity,YearlyPopularityRank
0,Flowers,Miley Cyrus,Pop,2023,118,68,71,-4,3,65,200,6,7,98,1.0
1,Cupid - Twin Ver.,FIFTY FIFTY,Pop,2023,120,59,78,-8,35,73,174,44,3,97,2.0
2,BESO,ROSALÍA,Pop,2023,95,64,77,-7,17,53,195,74,14,96,4.5
3,Boy's a liar Pt. 2,PinkPantheress,Other,2023,133,81,70,-8,25,86,131,25,5,96,4.5
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,Hip hop/Rap/R&b,2022,98,62,72,-6,8,17,222,42,5,96,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,7 rings,Ariana Grande,Pop,2019,140,32,78,-11,9,33,179,59,33,86,8.5
101,A Sky Full of Stars,Coldplay,Other,2014,125,68,55,-6,21,16,268,1,3,86,8.0
102,Adore You,Harry Styles,Pop,2019,99,77,68,-4,10,57,207,2,5,86,8.5
103,All of Me,John Legend,Other,2013,120,26,42,-7,13,33,270,92,3,86,6.5


In [37]:
Year_bin = spotify_top20_by_year[["Year","Title","Artist","Genre","Popularity"]]
Year_bin["Song"] = Year_bin.apply(lambda x:dict(Title=x.get("Title"),
                                              Artist=x.get("Artist"),
                                              Genre=x.get("Genre"),
                                              Popularity=x.get("Popularity")),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Year_bin["Song"] = Year_bin.apply(lambda x:dict(Title=x.get("Title"),


In [38]:
Year_bin.head()

Unnamed: 0,Year,Title,Artist,Genre,Popularity,Song
0,2023,Flowers,Miley Cyrus,Pop,98,"{'Title': 'Flowers', 'Artist': 'Miley Cyrus', 'Genre': 'Pop', 'Popularity': 98}"
1,2023,Cupid - Twin Ver.,FIFTY FIFTY,Pop,97,"{'Title': 'Cupid - Twin Ver.', 'Artist': 'FIFTY FIFTY', 'Genre': 'Pop', 'Popularity': 97}"
2,2023,BESO,ROSALÍA,Pop,96,"{'Title': 'BESO', 'Artist': 'ROSALÍA', 'Genre': 'Pop', 'Popularity': 96}"
3,2023,Boy's a liar Pt. 2,PinkPantheress,Other,96,"{'Title': 'Boy's a liar Pt. 2', 'Artist': 'PinkPantheress', 'Genre': 'Other', 'Popularity': 96}"
4,2022,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,Hip hop/Rap/R&b,96,"{'Title': 'Creepin' (with The Weeknd & 21 Savage)', 'Artist': 'Metro Boomin', 'Genre': 'Hip hop/Rap/R&b', 'Popularity': 96}"


In [39]:
Year_bin = Year_bin.groupby("Year")["Song"].apply(list)
Year_bin = pd.DataFrame(Year_bin)
Year_bin.head()

Unnamed: 0_level_0,Song
Year,Unnamed: 1_level_1
1985,"[{'Title': 'Running Up That Hill (A Deal With God)', 'Artist': 'Kate Bush', 'Genre': 'Pop', 'Popularity': 86}]"
1996,"[{'Title': 'Wannabe', 'Artist': 'Spice Girls', 'Genre': 'Pop', 'Popularity': 82}]"
1997,"[{'Title': 'Barbie Girl', 'Artist': 'Aqua', 'Genre': 'Pop', 'Popularity': 77}, {'Title': 'Everybody (Backstreet's Back) - Radio Edit', 'Artist': 'Backstreet Boys', 'Genre': 'Other', 'Popularity': 77}, {'Title': 'That Don't Impress Me Much - International Mix', 'Artist': 'Shania Twain', 'Genre': 'Other', 'Popularity': 64}]"
1998,"[{'Title': 'Hot Boyz', 'Artist': 'Missy Elliott', 'Genre': 'Pop', 'Popularity': 52}]"
1999,"[{'Title': 'Maria Maria (feat. The Product G&B)', 'Artist': 'Santana', 'Genre': 'Rock/Metal', 'Popularity': 84}, {'Title': 'All The Small Things', 'Artist': 'blink-182', 'Genre': 'Rock/Metal', 'Popularity': 83}, {'Title': 'I Want It That Way', 'Artist': 'Backstreet Boys', 'Genre': 'Other', 'Popularity': 83}, {'Title': 'Otherside', 'Artist': 'Red Hot Chili Peppers', 'Genre': 'Rock/Metal', 'Popularity': 82}, {'Title': 'The Next Episode', 'Artist': 'Dr. Dre', 'Genre': 'Other', 'Popularity': 82}, {'Title': '...Baby One More Time', 'Artist': 'Britney Spears', 'Genre': 'Pop', 'Popularity': 81}, ..."


In [40]:
Year_bin = Year_bin.reset_index()
Year_bin.head()

Unnamed: 0,Year,Song
0,1985,"[{'Title': 'Running Up That Hill (A Deal With God)', 'Artist': 'Kate Bush', 'Genre': 'Pop', 'Popularity': 86}]"
1,1996,"[{'Title': 'Wannabe', 'Artist': 'Spice Girls', 'Genre': 'Pop', 'Popularity': 82}]"
2,1997,"[{'Title': 'Barbie Girl', 'Artist': 'Aqua', 'Genre': 'Pop', 'Popularity': 77}, {'Title': 'Everybody (Backstreet's Back) - Radio Edit', 'Artist': 'Backstreet Boys', 'Genre': 'Other', 'Popularity': 77}, {'Title': 'That Don't Impress Me Much - International Mix', 'Artist': 'Shania Twain', 'Genre': 'Other', 'Popularity': 64}]"
3,1998,"[{'Title': 'Hot Boyz', 'Artist': 'Missy Elliott', 'Genre': 'Pop', 'Popularity': 52}]"
4,1999,"[{'Title': 'Maria Maria (feat. The Product G&B)', 'Artist': 'Santana', 'Genre': 'Rock/Metal', 'Popularity': 84}, {'Title': 'All The Small Things', 'Artist': 'blink-182', 'Genre': 'Rock/Metal', 'Popularity': 83}, {'Title': 'I Want It That Way', 'Artist': 'Backstreet Boys', 'Genre': 'Other', 'Popularity': 83}, {'Title': 'Otherside', 'Artist': 'Red Hot Chili Peppers', 'Genre': 'Rock/Metal', 'Popularity': 82}, {'Title': 'The Next Episode', 'Artist': 'Dr. Dre', 'Genre': 'Other', 'Popularity': 82}, {'Title': '...Baby One More Time', 'Artist': 'Britney Spears', 'Genre': 'Pop', 'Popularity': 81}, ..."
