
# 1. Setup



In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

# Display options: never truncate columns when a frame is rendered, and turn on
# pandas copy-on-write semantics.
pd.options.display.max_columns = None
pd.set_option('mode.copy_on_write', True)

# Load the raw tracks table from the local CSV file
spotify_original = pd.read_csv("dataset.csv")

# Keep every column except the first one (positional slice)
spotify_original_reshape = spotify_original.iloc[:, 1:]

# 2. Data Cleaning

In [None]:
# Cleaning rows with missing information
missing_data_rows = spotify_original_reshape[spotify_original_reshape.isnull().any(axis=1)]

# A bare expression is only rendered when it is the last line of a cell, so
# display() is needed to actually show these rows from the middle of the cell.
display(missing_data_rows)

# Drop every row that has at least one missing value
spotify_original_reshape_drop = spotify_original_reshape.dropna()

# Shapes before and after the drop
print(spotify_original_reshape.shape)
print(spotify_original_reshape_drop.shape)

In [None]:
# Build normalised matching keys (spaces stripped, lower-cased) on a working
# copy, so the null-free frame produced by the previous cell is not mutated and
# this cell can be re-run safely.
spotify_dedup_work = spotify_original_reshape_drop.assign(
    track_name_clean=spotify_original_reshape_drop['track_name'].str.strip().str.lower(),
    artists_clean=spotify_original_reshape_drop['artists'].str.strip().str.lower(),
)

# Priority list for genres to handle duplicates (earlier entry = preferred).
# Entries must equal `track_genre` values exactly. The genre mapping later in
# this notebook keys on the hyphenated spellings 'hip-hop' and 'r-n-b', so the
# same spellings are used here ('hip hop' / 'r&b' would never match).
genre_priority = ['pop', 'rock', 'hip-hop', 'rap', 'reggaeton', 'latin', 'electronic', 'r-n-b', 'reggae', 'dance', 'classical']
genre_rank = {genre: rank for rank, genre in enumerate(genre_priority)}

# Genres outside the list all share the lowest priority (rank == len(list)).
spotify_dedup_work['genre_priority'] = (
    spotify_dedup_work['track_genre']
    .map(genre_rank)
    .fillna(len(genre_priority))
    .astype(int)
)

# Sort so that the preferred row of each (track, artist) group comes first.
# NOTE(review): popularity is sorted ascending, so among rows with the same
# genre priority the *least* popular one is kept -- confirm this is intended.
spotify_data_sorted = spotify_dedup_work.sort_values(
    by=['track_name_clean', 'artists_clean', 'genre_priority', 'popularity', 'duration_ms'],
    ascending=[True, True, True, True, False],
)

# Remove duplicates: keep the first row per normalised (track, artist) pair
spotify_cleaned = spotify_data_sorted.drop_duplicates(subset=['track_name_clean', 'artists_clean'], keep='first')

# Checking size (both shapes include the three helper columns)
print(f"Shape of the dataset before cleaning: {spotify_dedup_work.shape}")
print(f"Shape of the dataset after cleaning: {spotify_cleaned.shape}")

# Removing the helper columns added above
spotify_cleaned_final = spotify_cleaned.drop(columns=['track_name_clean', 'artists_clean', 'genre_priority'])

# Check size again
print(f"Shape of the dataset after removing extra columns: {spotify_cleaned_final.shape}")

# Preview only the first rows instead of rendering the whole frame
spotify_cleaned_final.head()

In [None]:
# Merge Genres
# Each fine-grained `track_genre` value is assigned to exactly one merged
# category. A Python dict keeps only the LAST value for a repeated key, so the
# previously duplicated keys ('pagode', 'afrobeat', 'brazil', 'detroit-techno',
# 'idm', 'study') are listed once, under the category that was actually in
# effect (the later entry).
# NOTE(review): confirm those six assignments are the intended ones.
genre_mapping = {
    # Pop
    'pop': 'Pop', 'power-pop': 'Pop', 'synth-pop': 'Pop', 'indie-pop': 'Pop',
    'k-pop': 'Pop', 'j-pop': 'Pop', 'cantopop': 'Pop', 'mandopop': 'Pop',
    'british': 'Pop', 'spanish': 'Pop', 'latino': 'Pop', 'pop-film': 'Pop',
    'j-idol': 'Pop', 'sad': 'Pop',

    # Rock ('j-rock' belongs here; it was previously mapped to Electronic/Dance)
    'rock': 'Rock', 'rock-n-roll': 'Rock', 'alt-rock': 'Rock', 'indie': 'Rock',
    'hard-rock': 'Rock', 'punk-rock': 'Rock', 'garage': 'Rock', 'psych-rock': 'Rock',
    'grunge': 'Rock', 'guitar': 'Rock', 'ska': 'Rock', 'emo': 'Rock', 'punk': 'Rock',
    'death-metal': 'Rock', 'hardcore': 'Rock', 'metal': 'Rock', 'heavy-metal': 'Rock',
    'black-metal': 'Rock', 'metalcore': 'Rock', 'j-rock': 'Rock', 'rockabilly': 'Rock',
    'alternative': 'Rock',

    # Electronic/Dance
    'electronic': 'Electronic/Dance', 'edm': 'Electronic/Dance', 'house': 'Electronic/Dance',
    'deep-house': 'Electronic/Dance', 'progressive-house': 'Electronic/Dance',
    'techno': 'Electronic/Dance', 'minimal-techno': 'Electronic/Dance',
    'trance': 'Electronic/Dance',
    'dubstep': 'Electronic/Dance', 'drum-and-bass': 'Electronic/Dance',
    'breakbeat': 'Electronic/Dance', 'club': 'Electronic/Dance',
    'dancehall': 'Electronic/Dance', 'j-dance': 'Electronic/Dance', 'disco': 'Electronic/Dance',
    'hardstyle': 'Electronic/Dance', 'chill': 'Electronic/Dance',
    'electro': 'Electronic/Dance', 'dance': 'Electronic/Dance',

    # Hip-Hop/R&B
    'hip-hop': 'Hip-Hop/R&B', 'r-n-b': 'Hip-Hop/R&B', 'funk': 'Hip-Hop/R&B',
    'reggaeton': 'Hip-Hop/R&B', 'dub': 'Hip-Hop/R&B',
    'groove': 'Hip-Hop/R&B', 'reggae': 'Hip-Hop/R&B',

    # Jazz/Blues
    'jazz': 'Jazz/Blues', 'blues': 'Jazz/Blues', 'bluegrass': 'Jazz/Blues',
    'gospel': 'Jazz/Blues', 'soul': 'Jazz/Blues',

    # Classical/Instrumental
    'classical': 'Classical/Instrumental', 'piano': 'Classical/Instrumental',
    'opera': 'Classical/Instrumental', 'ambient': 'Classical/Instrumental',
    'trip-hop': 'Classical/Instrumental',
    'new-age': 'Classical/Instrumental', 'singer-songwriter': 'Classical/Instrumental',

    # Folk/World
    'folk': 'Folk/World', 'acoustic': 'Folk/World', 'country': 'Folk/World',
    'honky-tonk': 'Folk/World', 'turkish': 'Folk/World',
    'samba': 'Folk/World', 'forro': 'Folk/World', 'indian': 'Folk/World',
    'iranian': 'Folk/World', 'malay': 'Folk/World', 'afrobeat': 'Folk/World',
    'world-music': 'Folk/World',

    # Latin
    'latin': 'Latin', 'salsa': 'Latin', 'tango': 'Latin', 'pagode': 'Latin',
    'mpb': 'Latin', 'sertanejo': 'Latin', 'brazil': 'Latin',

    # Children's/Family
    'kids': "Children's/Family", 'children': "Children's/Family", 'disney': "Children's/Family",
    'show-tunes': "Children's/Family", 'romance': "Children's/Family", 'happy': "Children's/Family",

    # Experimental/Alternative
    'industrial': 'Experimental/Alternative', 'grindcore': 'Experimental/Alternative',
    'goth': 'Experimental/Alternative', 'detroit-techno': 'Experimental/Alternative',
    'idm': 'Experimental/Alternative',

    # Other/Functional
    'anime': 'Other/Functional', 'study': 'Other/Functional', 'party': 'Other/Functional',
    'sleep': 'Other/Functional', 'comedy': 'Other/Functional', 'french': 'Other/Functional',
    'german': 'Other/Functional', 'swedish': 'Other/Functional', 'chicago-house': 'Other/Functional'
}

# Genres missing from the mapping become NaN under .map() and therefore get no
# one-hot flag at all (get_dummies ignores NaN by default). Surface them rather
# than losing them silently.
unmapped_genres = sorted(set(spotify_cleaned_final['track_genre'].unique()) - set(genre_mapping))
if unmapped_genres:
    print(f"Genres without a merged category: {unmapped_genres}")

# Apply the mapping, drop the original genre column, then one-hot encode the
# merged genre. NOTE: this reassigns `spotify_cleaned_final`, so the cell cannot
# be re-run without re-running the previous one ('track_genre' is gone).
spotify_cleaned_final = pd.get_dummies(
    spotify_cleaned_final
    .assign(merged_genre=lambda frame: frame['track_genre'].map(genre_mapping))
    .drop(columns=['track_genre']),
    columns=['merged_genre'],
)
print(spotify_cleaned_final.columns)

# 3. Neural Network
- This section trains a small feed-forward neural network (a regression model) to predict popularity. Since popularity represents the general 'likeness' of a song, finding out what contributes the most to popularity will help us identify the most important features of a song that could influence a user. In real life the artist would have a major influence on popularity, but for the sake of simplicity we do not consider artists in this study.
- We also drop duration_ms, on the assumption that a song is not liked for its duration and that this information would distract the model. The correlation heatmap also shows little correlation between duration and popularity.

In [None]:
"""
Train a model to predict the popularity of a song based on its audio features.
"""

!pip install scikit-learn
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

# Work on the cleaned, genre-encoded frame
spotify_processed = spotify_cleaned_final

# Column groups. `numerical_features` is what gets standardised later;
# `category_features` is defined for reference but not used in this cell
# (the one-hot genre columns are not part of X).
numerical_features = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
category_features = ['mode', 'merged_genre_Classical/Instrumental', 'merged_genre_Electronic/Dance', 'merged_genre_Folk/World', 'merged_genre_Hip-Hop/R&B', 'merged_genre_Jazz/Blues', 'merged_genre_Latin', 'merged_genre_Other/Functional', 'merged_genre_Pop', 'merged_genre_Rock']

# Model inputs (audio features) and target (popularity)
X = spotify_processed[[
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]]
Y = spotify_processed['popularity']

# 80/20 hold-out split with a fixed seed. This split is taken on the raw
# (unscaled) feature values; a later cell re-splits X with the same test_size
# and seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)


Gridsearch for best hyper-parameters

In [None]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import StandardScaler

# Data Preprocessing
# Split FIRST, then fit the scaler on the training rows only. Fitting it on all
# of X (as before) lets the validation rows influence the mean/variance used to
# standardise the training data, i.e. leaks validation information.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scale copies so that X itself stays untouched and this cell is re-runnable.
X_train = X_train.copy()
X_val = X_val.copy()

scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
# The validation split reuses the statistics learned from the training split.
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

# Convert to PyTorch tensors (targets as column vectors of shape (n, 1))
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
Y_val_tensor = torch.tensor(Y_val.values, dtype=torch.float32).view(-1, 1)

# Define Neural Network Model
class SongPopularityModel(nn.Module):
    """Fully connected regressor: input_dim -> 64 -> 32 -> 1 with ReLU between layers."""

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in this order so parameter initialisation is unchanged.
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one raw (un-activated) prediction per input row, shape (n, 1)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        # The output layer has no activation.
        return self.fc3(hidden)

# Define a Wrapper
class SongPopularityModelWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn compatible wrapper around ``SongPopularityModel``.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    lr : float, default=0.001
        Learning rate passed to the Adam optimizer.
    epochs : int, default=100
        Number of passes over the training data in ``fit``.
    batch_size : int, default=32
        Mini-batch size used by the training ``DataLoader``.
    """

    def __init__(self, input_dim, lr=0.001, epochs=100, batch_size=32):
        self.input_dim = input_dim
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        # Kept so that `model` / `criterion` / `optimizer` exist right after
        # construction, as before; `fit` rebuilds them (see `_build`).
        self._build()

    def _build(self):
        """(Re)create the network, loss and optimizer from the current hyper-parameters."""
        self.model = SongPopularityModel(self.input_dim)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

    def fit(self, X, y):
        """Train a fresh network on ``X`` (n, input_dim) and ``y`` (n,); returns ``self``."""
        # GridSearchCV applies candidate values via clone(...).set_params(...),
        # which only sets attributes AFTER __init__ has run. Rebuilding here makes
        # sure the optimizer really uses the current `lr` (previously every `lr`
        # candidate was silently trained with the constructor-time value).
        self._build()

        # np.asarray lets DataFrames / Series be passed as well as ndarrays.
        X_tensor = torch.tensor(np.asarray(X), dtype=torch.float32)
        y_tensor = torch.tensor(np.asarray(y), dtype=torch.float32).view(-1, 1)

        dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            for X_batch, y_batch in loader:
                self.optimizer.zero_grad()
                predictions = self.model(X_batch)
                loss = self.criterion(predictions, y_batch)
                loss.backward()
                self.optimizer.step()
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a NumPy array of shape (n, 1)."""
        self.model.eval()
        X_tensor = torch.tensor(np.asarray(X), dtype=torch.float32)
        # No gradients are needed for inference.
        with torch.no_grad():
            return self.model(X_tensor).numpy()

# Hyper-parameter candidates explored by the grid search (3 x 3 combinations)
param_grid = dict(
    lr=[0.001, 0.01, 0.1],
    batch_size=[16, 32, 64],
)

# Base estimator; its input size is the number of feature columns
n_input_features = X_train.shape[1]
torch_regressor = SongPopularityModelWrapper(input_dim=n_input_features)

# 3-fold cross-validated search, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=torch_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)
grid_search.fit(X_train.values, Y_train.values)

# Report the winning combination
print("Best Hyperparameters:", grid_search.best_params_)


Evaluate model with best hyper-parameters

In [None]:
# Retrain on the full training split with the selected hyper-parameters.
# (Values are hard-coded; keep them in sync with grid_search.best_params_.)
best_model = SongPopularityModelWrapper(input_dim=X_train.shape[1], lr=0.1, batch_size=64)
best_model.fit(X_train.values, Y_train.values)

# Training Performance
y_train_pred = best_model.predict(X_train.values)
train_mse = mean_squared_error(Y_train, y_train_pred)
train_r2 = r2_score(Y_train, y_train_pred)
print(f"Training MSE: {train_mse:.4f}")
print(f"Training R2: {train_r2:.4f}")

# Test Performance
# Evaluate on X_val / Y_val, not X_test / Y_test: X_test was split off before
# the features were standardised, so it holds raw values on a different scale
# from the data the network was trained on. X_val comes from the same
# seed/test_size split of X and is on the same scale as X_train.
y_test_pred = best_model.predict(X_val.values)
test_mse = mean_squared_error(Y_val, y_test_pred)
test_r2 = r2_score(Y_val, y_test_pred)
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R2: {test_r2:.4f}")

Permutation Feature Importance

In [None]:
# Permutation Importance
def compute_permutation_importance(model, X_test, Y_test, metric_function, feature_names,
                                   n_repeats=5, random_state=None):
    """
    Compute permutation importance for each feature.

    For every feature, its column is shuffled ``n_repeats`` times, the model is
    re-scored on the shuffled data, and the importance is reported as
    ``baseline_score - mean(shuffled_scores)``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(ndarray)``.
    X_test : pandas.DataFrame
        Evaluation features; columns are addressed by position.
    Y_test : array-like
        True target values, passed as the first argument of ``metric_function``.
    metric_function : callable
        ``metric_function(y_true, y_pred) -> float``.
    feature_names : iterable of str
        One label per column of ``X_test``, in column order.
    n_repeats : int, default=5
        Number of shuffles per feature (must be >= 1).
    random_state : int or None, default=None
        Seed for reproducible shuffles. ``None`` keeps the previous behaviour
        of drawing from NumPy's global random state.

    Returns
    -------
    dict
        ``{feature_name: importance}``. With an error metric such as MSE
        (lower is better) an influential feature gets a NEGATIVE value,
        because shuffling it increases the error.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    # Both np.random (global state) and a Generator expose .permutation().
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    base_score = metric_function(Y_test, model.predict(X_test.values))
    importances = {}

    for col_idx, col_name in enumerate(feature_names):
        original_values = X_test.iloc[:, col_idx].to_numpy()
        # One copy per feature is enough: only this column is overwritten on
        # each repeat, all other columns keep their original values.
        X_test_permuted = X_test.copy()
        permuted_scores = []
        for _ in range(n_repeats):
            X_test_permuted.iloc[:, col_idx] = rng.permutation(original_values)
            # Evaluate the model on the permuted data
            permuted_scores.append(
                metric_function(Y_test, model.predict(X_test_permuted.values))
            )
        # Importance is the difference between baseline and permuted scores
        importances[col_name] = base_score - np.mean(permuted_scores)

    return importances


# Compute permutation importance
# Use X_val / Y_val rather than X_test / Y_test: X_test was split off before the
# features were standardised, so scoring the trained network on it measures
# importances on inputs of a different scale from what the model was fitted on.
feature_names = X_val.columns
importances = compute_permutation_importance(
    best_model, X_val, Y_val,
    metric_function=mean_squared_error,
    feature_names=feature_names
)

# Display results in descending order of importance (absolute value).
# With MSE as the metric, values are baseline minus permuted error, so a more
# negative number means shuffling the feature hurt the model more.
print("\nFeature Importances (Permutation Importance):")
importances = dict(sorted(importances.items(), key=lambda item: abs(item[1]), reverse=True))
for feature, importance in importances.items():
    print(f"{feature}: {importance:.4f}")
