In [9]:
import pandas as pd
import datetime

url = '../data/raw/universal_top_spotify_songs.csv'
df = pd.read_csv(url)

country_mapping = {
    'ZA': {'name': 'South Africa', 'continent': 'Africa'},
    'VN': {'name': 'Vietnam', 'continent': 'Asia'},
    'VE': {'name': 'Venezuela', 'continent': 'South America'},
    'UY': {'name': 'Uruguay', 'continent': 'South America'},
    'US': {'name': 'United States', 'continent': 'North America'},
    'UA': {'name': 'Ukraine', 'continent': 'Europe'},
    'TW': {'name': 'Taiwan', 'continent': 'Asia'},
    'TR': {'name': 'Turkey', 'continent': 'Europe/Asia'},
    'TH': {'name': 'Thailand', 'continent': 'Asia'},
    'SV': {'name': 'El Salvador', 'continent': 'North America'},
    'SK': {'name': 'Slovakia', 'continent': 'Europe'},
    'SG': {'name': 'Singapore', 'continent': 'Asia'},
    'SE': {'name': 'Sweden', 'continent': 'Europe'},
    'SA': {'name': 'Saudi Arabia', 'continent': 'Asia'},
    'RO': {'name': 'Romania', 'continent': 'Europe'},
    'PY': {'name': 'Paraguay', 'continent': 'South America'},
    'PT': {'name': 'Portugal', 'continent': 'Europe'},
    'PL': {'name': 'Poland', 'continent': 'Europe'},
    'PK': {'name': 'Pakistan', 'continent': 'Asia'},
    'PH': {'name': 'Philippines', 'continent': 'Asia'},
    'PE': {'name': 'Peru', 'continent': 'South America'},
    'PA': {'name': 'Panama', 'continent': 'North America'},
    'NZ': {'name': 'New Zealand', 'continent': 'Oceania'},
    'NO': {'name': 'Norway', 'continent': 'Europe'},
    'NL': {'name': 'Netherlands', 'continent': 'Europe'},
    'NI': {'name': 'Nicaragua', 'continent': 'North America'},
    'NG': {'name': 'Nigeria', 'continent': 'Africa'},
    'MY': {'name': 'Malaysia', 'continent': 'Asia'},
    'MX': {'name': 'Mexico', 'continent': 'North America'},
    'MA': {'name': 'Morocco', 'continent': 'Africa'},
    'LV': {'name': 'Latvia', 'continent': 'Europe'},
    'LU': {'name': 'Luxembourg', 'continent': 'Europe'},
    'LT': {'name': 'Lithuania', 'continent': 'Europe'},
    'KZ': {'name': 'Kazakhstan', 'continent': 'Asia'},
    'KR': {'name': 'South Korea', 'continent': 'Asia'},
    'JP': {'name': 'Japan', 'continent': 'Asia'},
    'IT': {'name': 'Italy', 'continent': 'Europe'},
    'IS': {'name': 'Iceland', 'continent': 'Europe'},
    'IN': {'name': 'India', 'continent': 'Asia'},
    'IL': {'name': 'Israel', 'continent': 'Asia'},
    'IE': {'name': 'Ireland', 'continent': 'Europe'},
    'ID': {'name': 'Indonesia', 'continent': 'Asia'},
    'HU': {'name': 'Hungary', 'continent': 'Europe'},
    'HN': {'name': 'Honduras', 'continent': 'North America'},
    'HK': {'name': 'Hong Kong', 'continent': 'Asia'},
    'GT': {'name': 'Guatemala', 'continent': 'North America'},
    'GR': {'name': 'Greece', 'continent': 'Europe'},
    'FR': {'name': 'France', 'continent': 'Europe'},
    'FI': {'name': 'Finland', 'continent': 'Europe'},
    'ES': {'name': 'Spain', 'continent': 'Europe'},
    'EG': {'name': 'Egypt', 'continent': 'Africa'},
    'EE': {'name': 'Estonia', 'continent': 'Europe'},
    'EC': {'name': 'Ecuador', 'continent': 'South America'},
    'DO': {'name': 'Dominican Republic', 'continent': 'North America'},
    'DK': {'name': 'Denmark', 'continent': 'Europe'},
    'DE': {'name': 'Germany', 'continent': 'Europe'},
    'CZ': {'name': 'Czech Republic', 'continent': 'Europe'},
    'CR': {'name': 'Costa Rica', 'continent': 'North America'},
    'CO': {'name': 'Colombia', 'continent': 'South America'},
    'CL': {'name': 'Chile', 'continent': 'South America'},
    'CH': {'name': 'Switzerland', 'continent': 'Europe'},
    'CA': {'name': 'Canada', 'continent': 'North America'},
    'BY': {'name': 'Belarus', 'continent': 'Europe'},
    'BR': {'name': 'Brazil', 'continent': 'South America'},
    'BO': {'name': 'Bolivia', 'continent': 'South America'},
    'BG': {'name': 'Bulgaria', 'continent': 'Europe'},
    'BE': {'name': 'Belgium', 'continent': 'Europe'},
    'AU': {'name': 'Australia', 'continent': 'Oceania'},
    'AT': {'name': 'Austria', 'continent': 'Europe'},
    'AR': {'name': 'Argentina', 'continent': 'South America'},
    'AE': {'name': 'United Arab Emirates', 'continent': 'Asia'},
    'GB': {'name': 'United Kingdom', 'continent': 'Europe'}
}

#Make columns lower case for readibility.
df.rename(columns={ col: col.rstrip().replace(" ", "_").lower() for col in df.columns}, inplace=True)

#Dropping null values
df = df.dropna(subset=['country','artists'])

#converting date column to date.
df['snapshot_date'] = pd.to_datetime(df['snapshot_date'])
df['month'] = df['snapshot_date'].apply(lambda x: x.month if isinstance(x, pd.Timestamp) else x)
df['year'] = df['snapshot_date'].apply(lambda x: x.year if isinstance(x, pd.Timestamp) else x)

#Data from the last 3 months.
df = df[df['snapshot_date'] >= '2024-09-01']

#Top 20 ranked
df = df[df['daily_rank'] <= 20]

#added columns for full country name and continent
df["country_full"] = df["country"].map(lambda x: country_mapping.get(x, {}).get('name', 'Unknown'))
df["continent"] = df["country"].map(lambda x: country_mapping.get(x, {}).get('continent', 'Unknown'))

#dropped irrelevant columns
df = df[['spotify_id','name','artists','daily_rank','daily_movement','weekly_movement','snapshot_date','popularity','is_explicit','album_name','album_release_date','danceability','energy','loudness','month','year','country_full','continent']]

#filtered out by continent
df = df[df['continent'] == 'Europe']

df

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,snapshot_date,popularity,is_explicit,album_name,album_release_date,danceability,energy,loudness,month,year,country_full,continent
300,05vZtu5obzMiZdreJK7HH3,Врубай,Parfeniuk,1,0,0,2025-02-02,56,False,Врубай,2024-11-28,0.871,0.628,-4.952,2,2025,Ukraine,Europe
301,7CuxOoQMEaiSXIGwJJkQHW,Кульбаби,Тоня Матвієнко,2,0,0,2025-02-02,61,False,Кульбаби,2022-12-09,0.832,0.668,-5.033,2,2025,Ukraine,Europe
302,4MQmxIxM5CXKjdq4IrCWhd,Касета,SadSvit,3,0,0,2025-02-02,61,False,Cassette,2021-12-13,0.654,0.743,-7.189,2,2025,Ukraine,Europe
303,7DY756WOLyOz2Xnhw4EFiC,São Paulo (feat. Anitta),"The Weeknd, Anitta",4,0,46,2025-02-02,71,True,Hurry Up Tomorrow,2025-01-31,0.538,0.658,-7.273,2,2025,Ukraine,Europe
304,6zECxDYlNtEybq8hZ9Tp12,ParisLove,FORTUNA 812,5,7,3,2025-02-02,65,True,ParisLove,2024-11-08,0.614,0.622,-6.989,2,2025,Ukraine,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523906,3ovTEUMYEoiT8ikEMVXBmS,Lilien,AYLIVA,16,-1,5,2024-09-01,73,False,In Liebe,2024-08-16,0.862,0.549,-6.635,9,2024,Austria,Europe
523907,2uqYupMHANxnwgeiXTZXzd,Austin (Boots Stop Workin'),Dasha,17,0,8,2024-09-01,91,True,What Happens Now?,2024-02-16,0.756,0.672,-6.743,9,2024,Austria,Europe
523908,3U5JVgI2x4rDyHGObzJfNf,Unwritten,Natasha Bedingfield,18,1,-1,2024-09-01,88,False,Unwritten,2004-08-30,0.706,0.800,-6.333,9,2024,Austria,Europe
523909,31IhzT94l1iqqTcwohZzbY,AMA DOREN,Bobby Vandamme,19,-8,31,2024-09-01,59,False,AMA DOREN,2024-08-29,0.542,0.487,-15.587,9,2024,Austria,Europe


In [191]:
df['album_id'] = pd.factorize(df['album_name'])[0]
df[df['album_id'] == 50]

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,snapshot_date,popularity,is_explicit,album_name,album_release_date,danceability,energy,loudness,month,year,country_full,continent,album_id
760,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,11,11,-5,2025-02-02,55,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,2,2025,Romania,Europe,50
7965,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,16,-1,-9,2025-01-31,55,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
11564,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,15,-6,-8,2025-01-30,55,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
15158,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,9,1,-4,2025-01-29,55,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
18759,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,10,-1,-5,2025-01-28,54,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
22358,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,9,-3,-4,2025-01-27,54,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
25951,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,6,0,0,2025-01-26,54,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
29551,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,6,1,-1,2025-01-25,54,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
33202,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,7,0,-2,2025-01-24,54,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50
36802,40GLYWLoMFhkuoDImEwHm4,Jenifer buci de fier,Raihold,7,-2,-4,2025-01-23,54,False,Jenifer buci de fier,2023-03-25,0.767,0.809,-5.591,1,2025,Romania,Europe,50


In [197]:
#IDs

#Generating songs table
unique_album = df['album_name'].unique()
album_id_mapping = {album: i+1 for i, album in enumerate(unique_album)}
album_df = pd.DataFrame(album_id_mapping.items(),columns=['album_name','album_id'])
album_df['album_name'] = album_df['album_name'].astype(str)
album_df
#Generating song_artists table


#Generating song table
song_df = df[['spotify_id', 'name', 'is_explicit','artists','album_id']].drop_duplicates()
song_df

#album table
album_df = df[['album_id','album_name','album_release_date']]
album_df

#

Unnamed: 0,album_id,album_name,album_release_date
300,0,Врубай,2024-11-28
301,1,Кульбаби,2022-12-09
302,2,Cassette,2021-12-13
303,3,Hurry Up Tomorrow,2025-01-31
304,4,ParisLove,2024-11-08
...,...,...,...
523906,502,In Liebe,2024-08-16
523907,670,What Happens Now?,2024-02-16
523908,662,Unwritten,2004-08-30
523909,1601,AMA DOREN,2024-08-29


In [21]:
url2 = '../data/raw/music_festivals.csv'
df2 = pd.read_csv(url2)

#Make columns lower case for readibility.
df2.rename(columns={ col: col.rstrip().replace(" ", "_").lower() for col in df2.columns}, inplace=True)

# Extracting the first character as currency symbol
df2['currency'] = df2['economic_impact'].astype(str).str.strip().str[0]

# Cleaning economic_impact column with a lambda function
df2['economic_impact'] = df2['economic_impact'].astype(str).apply(
    lambda x: float(x.lstrip("£€").strip().replace("million", "").strip()) * 1_000_000
    if any(sym in x for sym in ["£", "€"]) else None
)


# Filtering for specific currencies
df2 = df2[df2['currency'].isin(['£', '€'])]

# Rename the column 'location' to 'attendance'
df2.rename(columns={'location': 'attendance','festival_name':'country','attendance_numbers':'age_group','music_genre':'genre'}, inplace=True)

#making attendance and economic_impact numeric values.
df2['attendance'] = df2['attendance'].astype(float)
df2['economic_impact'] = df2['economic_impact'].astype(float)

#countries with the most economic impact
pivot_table = df2.pivot_table(index='country',values='economic_impact').sort_values(by='economic_impact',ascending=False)
pivot_table = pivot_table['economic_impact'].apply(lambda x: f"{x:,.2f}")

#clean festival_name values
df2['country'] = df2['country'].apply(lambda x: x.strip())

#assigned festivalID to table.
df2 = df2.reset_index()
df2.rename(columns={'index':'festival_id'},inplace=True)

#drop irrelevant columns
df2.drop(columns={'unnamed:_0'},inplace=True)

#replace 'UK' for 'United Kingdom'
df2['country'] = df2['country'].replace('UK','United Kingdom')

df2

Index(['festival_id', 'country', 'attendance', 'age_group',
       'visitor_demographics', 'economic_impact', 'genre', 'currency'],
      dtype='object')

In [63]:
#Generating genre table to export.
genres_df = pd.DataFrame(df2["genre"].unique(), columns=['genre'])
genres_df_split = genres_df["genre"].apply(lambda x: x.split("/")[-1] if len(x.split("/")) == 2 else x)
genres_df_ = genres_df["genre"].apply(lambda x: x.split("/")[0] if len(x.split("/")) == 2 else x)
genres_df = pd.concat([genres_df_split,genres_df_])
genre_df = pd.DataFrame(genres_df.unique(), columns=['genre'])



Unnamed: 0,genre
0,Various
1,EDM
2,Rock
3,Indie
4,Jazz
5,Electronic
6,Alternative
7,Metal
8,Pop
9,Folk


In [4]:
#Create final dataset csv file

df.to_csv('../data/clean/top_spotify_songs_cleaned.csv', index=False)
df2.to_csv('../data/clean/music_festivals_cleaned.csv',index=False)

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,snapshot_date,popularity,is_explicit,album_name,album_release_date,danceability,energy,loudness,month,year,country_full,continent
300,05vZtu5obzMiZdreJK7HH3,Врубай,Parfeniuk,1,0,0,2025-02-02,56,False,Врубай,2024-11-28,0.871,0.628,-4.952,2,2025,Ukraine,Europe
301,7CuxOoQMEaiSXIGwJJkQHW,Кульбаби,Тоня Матвієнко,2,0,0,2025-02-02,61,False,Кульбаби,2022-12-09,0.832,0.668,-5.033,2,2025,Ukraine,Europe
302,4MQmxIxM5CXKjdq4IrCWhd,Касета,SadSvit,3,0,0,2025-02-02,61,False,Cassette,2021-12-13,0.654,0.743,-7.189,2,2025,Ukraine,Europe
303,7DY756WOLyOz2Xnhw4EFiC,São Paulo (feat. Anitta),"The Weeknd, Anitta",4,0,46,2025-02-02,71,True,Hurry Up Tomorrow,2025-01-31,0.538,0.658,-7.273,2,2025,Ukraine,Europe
304,6zECxDYlNtEybq8hZ9Tp12,ParisLove,FORTUNA 812,5,7,3,2025-02-02,65,True,ParisLove,2024-11-08,0.614,0.622,-6.989,2,2025,Ukraine,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523906,3ovTEUMYEoiT8ikEMVXBmS,Lilien,AYLIVA,16,-1,5,2024-09-01,73,False,In Liebe,2024-08-16,0.862,0.549,-6.635,9,2024,Austria,Europe
523907,2uqYupMHANxnwgeiXTZXzd,Austin (Boots Stop Workin'),Dasha,17,0,8,2024-09-01,91,True,What Happens Now?,2024-02-16,0.756,0.672,-6.743,9,2024,Austria,Europe
523908,3U5JVgI2x4rDyHGObzJfNf,Unwritten,Natasha Bedingfield,18,1,-1,2024-09-01,88,False,Unwritten,2004-08-30,0.706,0.800,-6.333,9,2024,Austria,Europe
523909,31IhzT94l1iqqTcwohZzbY,AMA DOREN,Bobby Vandamme,19,-8,31,2024-09-01,59,False,AMA DOREN,2024-08-29,0.542,0.487,-15.587,9,2024,Austria,Europe
