## A Simple Analysis of the Correlation Between GDP Per Capita and Music Tastes
Access of WorldBank API and GDP Data

In [1]:
import requests

def get_gdp(country_code, year):
    url = (
        f"https://api.worldbank.org/v2/country/{country_code}"
        f"/indicator/NY.GDP.PCAP.CD?format=json&date={year}"
    )
    r = requests.get(url)
    data = r.json()

    if len(data) < 2 or data[1] is None or len(data[1]) == 0:
        return None
    
    return data[1][0]["value"]


Reading the Database

In [3]:
import pandas as pd
df = pd.read_csv("../data/final_database.csv")

  df = pd.read_csv("../data/final_database.csv")


Genres by Country

In [4]:
# Number of songs based on country and genre
country_genre_counts = (
    df.groupby(["Country", "Genre"])
      .size()
      .reset_index(name="Count")
)


In [5]:
# Total number of songs for a country
country_totals = (
    country_genre_counts.groupby("Country")["Count"]
    .sum()
    .reset_index(name="Total")
)

# Merging the genre table with total songs for countries
merged = country_genre_counts.merge(country_totals, on="Country")

merged["Percent"] = merged["Count"] / merged["Total"]


In [6]:
# Pivot: country-rows, genre-columns, percent-values
genre_matrix = merged.pivot_table(
    index="Country",
    columns="Genre",
    values="Percent",
    fill_value=0
)

In [7]:
# Returns the top n genres of countries
def top_n_genres(country, n=10):
    row = genre_matrix.loc[country]
    return row.sort_values(ascending=False).head(n)



In [8]:
# Mapping country names to ISO2 codes
country_to_iso = {
    "Argentina": "AR",
    "Australia": "AU",
    "Austria": "AT",
    "Belgium": "BE",
    "Brazil": "BR",
    "Canada": "CA",
    "Chile": "CL",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Denmark": "DK",
    "Ecuador": "EC",
    "Finland": "FI",
    "France": "FR",
    "Germany": "DE",
    "Indonesia": "ID",
    "Ireland": "IE",
    "Italy": "IT",
    "Malaysia": "MY",
    "Mexico": "MX",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Norway": "NO",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Singapore": "SG",
    "Spain": "ES",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Turkey": "TR",
    "UK": "GB",
    "USA": "US",
}

# Removing global
countries_clean = [c for c in genre_matrix.index if c != "Global"]



In [9]:
# The year selected for this analysis was 2020 as the dataset contained data from 2017-2020
YEAR = 2020

gdp_data = {}

for country in countries_clean:
    iso = country_to_iso[country]
    gdp_value = get_gdp(iso, YEAR)
    gdp_data[country] = gdp_value

gdp_data


{'Argentina': 8535.59938004389,
 'Australia': 51791.540179984,
 'Austria': 48716.4098900349,
 'Belgium': 45906.2875805246,
 'Brazil': 7074.19378337644,
 'Canada': 43537.839298904,
 'Chile': 13114.815470545,
 'Colombia': 5339.68711357943,
 'Costa Rica': 12394.0493969354,
 'Denmark': 60985.4885601514,
 'Ecuador': 5463.64515348598,
 'Finland': 48828.6846862799,
 'France': 39169.8606000707,
 'Germany': 47379.765194548,
 'Indonesia': 3853.70288774146,
 'Ireland': 86622.5067251295,
 'Italy': 32091.4866621366,
 'Malaysia': 9957.52626697613,
 'Mexico': 8841.2707511328,
 'Netherlands': 53467.9277413737,
 'New Zealand': 41850.92032802,
 'Norway': 68340.0181033702,
 'Peru': 6133.32552410183,
 'Philippines': 3227.57910235199,
 'Poland': 16150.9291009726,
 'Portugal': 22299.4044062173,
 'Singapore': 61410.0792630788,
 'Spain': 27233.9426461608,
 'Sweden': 52653.7565934247,
 'Switzerland': 85897.7843338323,
 'Taiwan': None,
 'Turkey': 8638.73903848102,
 'UK': 40404.8062238951,
 'USA': 64401.50743542

Genre Matrix and GDP Dataset Merge

In [10]:
import pandas as pd

gdp_df = pd.DataFrame.from_dict(gdp_data, orient="index", columns=["GDP"])

final_df = genre_matrix.merge(gdp_df, left_index=True, right_index=True)


Ranking Genres by "Importance" in order to mitigate outliers

In [11]:
# Average share across countries for each genre (range 0â€“1)
genre_importance = genre_matrix.mean(axis=0)

# "Top" 20 genres
top20_genres = genre_importance.sort_values(ascending=False).head(20)

top20_genres


Genre
dance pop            0.152369
latin                0.064073
pop                  0.043048
k-pop                0.028848
n-a                  0.022477
atl hip hop          0.018618
big room             0.017254
hip hop              0.016231
colombian pop        0.016071
german hip hop       0.015897
canadian hip hop     0.015305
francoton            0.014455
dutch hip hop        0.014175
emo rap              0.013624
boy band             0.012766
canadian pop         0.011690
alternative metal    0.011550
modern rock          0.011004
italian hip hop      0.010574
melodic rap          0.010508
dtype: float64

Top 20 Genres Correlated with GDP

In [12]:
# GDP correlation only for the top 20 genres
corr_with_gdp = final_df.corr(numeric_only=True)["GDP"].dropna()

# Only filtering genres in the top 20
corr_top20 = corr_with_gdp.loc[corr_with_gdp.index.isin(top20_genres.index)]

print("'Important' genres that are most positively correlated with GDP:\n")
print(corr_top20.sort_values(ascending=False).head(10))

print("\n'Important' genres that are most negatively correlated with GDP:\n")
print(corr_top20.sort_values(ascending=True).head(10))


'Important' genres that are most positively correlated with GDP:

emo rap             0.572140
hip hop             0.505523
atl hip hop         0.486617
melodic rap         0.479618
big room            0.355317
canadian hip hop    0.320801
german hip hop      0.288154
dutch hip hop       0.153121
pop                 0.117880
francoton           0.104820
Name: GDP, dtype: float64

'Important' genres that are most negatively correlated with GDP:

latin               -0.534576
colombian pop       -0.504821
k-pop               -0.344205
boy band            -0.102957
alternative metal   -0.083935
canadian pop        -0.038028
n-a                 -0.017518
italian hip hop     -0.008577
dance pop            0.042085
modern rock          0.097147
Name: GDP, dtype: float64


In [13]:
import matplotlib.pyplot as plt
# Scatter plot for a selected country
def scatter_genre_vs_gdp(genre_name):
    sub = final_df[[genre_name, "GDP"]].dropna()

    plt.figure()
    plt.scatter(sub[genre_name], sub["GDP"])
    plt.xlabel(f"{genre_name} ratio")
    plt.ylabel("GDP per capita (USD)")
    plt.title(f"{genre_name} ratio vs GDP")
    plt.show()


# scatter_genre_vs_gdp("INSERT_GENRE_NAME")



## GDP Prediction Based On Genres


In [14]:
# Music taste vectors by country 
X = final_df[top20_genres.index]

# Target variable
y = final_df["GDP"]


In [15]:
from sklearn.model_selection import train_test_split

feature_cols = list(top20_genres.index)

# Creating a small dataframe with just these columns and GDP
ml_df = final_df[feature_cols + ["GDP"]].dropna()  # Removing NaNs

# Producing X and y
X = ml_df[feature_cols].values
y = ml_df["GDP"].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=67
)


##  Linear Regression
A simple linear model


In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Setting up and training the linreg model
linreg = LinearRegression()
linreg.fit(X_train_scaled, y_train)

# Predictions based on training data
y_pred_lin = linreg.predict(X_test_scaled)

# Performance metrics
r2_lin = r2_score(y_test, y_pred_lin)
mae_lin = mean_absolute_error(y_test, y_pred_lin)

print("Linear Regression Results:")
print(f"R^2:  {r2_lin:.3f}")
print(f"MAE:  {mae_lin:,.2f}")

# Result sample
print("\nActual data and predicted data for the first 10 countries (linreg) :")
for true, pred in list(zip(y_test, y_pred_lin))[:10]:
    print(f"Actual: {true:,.0f}  |  Prediction: {pred:,.0f}")


Linear Regression Results:
R^2:  -831642.152
MAE:  7,067,323.81

Actual data and predicted data for the first 10 countries (linreg) :
Actual: 8,536  |  Prediction: -44,872
Actual: 61,410  |  Prediction: -10,222
Actual: 53,468  |  Prediction: -611,902
Actual: 9,958  |  Prediction: 13,600
Actual: 39,170  |  Prediction: 190,314
Actual: 13,115  |  Prediction: 88,071
Actual: 85,898  |  Prediction: -1,245,143
Actual: 32,091  |  Prediction: -68,065,384
Actual: 48,716  |  Prediction: -28,116
Actual: 48,829  |  Prediction: 196,566


## Random Forest
In order to catch non-linear relationships


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Set up the random forest model
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

# Train
rf.fit(X_train, y_train)

# Predict
y_pred_rf = rf.predict(X_test)

# Metrics
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print("Random Forest Results")
print("-" * 40)
print(f"R^2:  {r2_rf:.3f}")
print(f"MAE:  {mae_rf:,.2f}")

print("\nActual data and predicted data for the first 10 countries (RF) :")
for true, pred in list(zip(y_test, y_pred_rf))[:10]:
    print(f"Actual: {true:,.0f}  |  Prediction: {pred:,.0f}")


Random Forest Results
----------------------------------------
R^2:  0.638
MAE:  8,838.31

Actual data and predicted data for the first 10 countries (RF) :
Actual: 8,536  |  Prediction: 8,574
Actual: 61,410  |  Prediction: 39,378
Actual: 53,468  |  Prediction: 50,978
Actual: 9,958  |  Prediction: 17,183
Actual: 39,170  |  Prediction: 42,028
Actual: 13,115  |  Prediction: 8,288
Actual: 85,898  |  Prediction: 48,544
Actual: 32,091  |  Prediction: 27,906
Actual: 48,716  |  Prediction: 46,183
Actual: 48,829  |  Prediction: 43,989


## Clustering Using KMeans
Grouping of countries based on music tastes


In [20]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Scale genre vectors for all countries
scaler_all = StandardScaler()
X_all_scaled = scaler_all.fit_transform(X)

# KMeans model (k=3 cluster)
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_all_scaled)

# Adding cluster results to dataframe
cluster_df = ml_df.copy()
cluster_df["cluster"] = clusters

# Average GDP for clusters
cluster_stats = cluster_df.groupby("cluster")["GDP"].agg(["mean", "median", "count"])


In [19]:
# Listing countries based on the clusters
cluster_df = cluster_df.copy()
cluster_df["country"] = cluster_df.index


for c in sorted(cluster_df["cluster"].unique()):
    print(f"\n--- Cluster {c} ---")
    sub = cluster_df[cluster_df["cluster"] == c]

    print(f"Num. of Countries: {len(sub)}")
    print("Avg. GDP:", round(sub["GDP"].mean(), 2))

    print("Countries:")
    for name in sub["country"].tolist():
        print("  -", name)



--- Cluster 0 ---
Num. of Countries: 20
Avg. GDP: 33690.63
Countries:
  - Argentina
  - Austria
  - Belgium
  - Brazil
  - Chile
  - Colombia
  - Denmark
  - Ecuador
  - Finland
  - France
  - Germany
  - Italy
  - Mexico
  - Netherlands
  - Norway
  - Peru
  - Spain
  - Sweden
  - Switzerland
  - Turkey

--- Cluster 1 ---
Num. of Countries: 4
Avg. GDP: 19612.22
Countries:
  - Indonesia
  - Malaysia
  - Philippines
  - Singapore

--- Cluster 2 ---
Num. of Countries: 9
Avg. GDP: 42161.5
Countries:
  - Australia
  - Canada
  - Costa Rica
  - Ireland
  - New Zealand
  - Poland
  - Portugal
  - UK
  - USA
