
# Setup



In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

# Notebook-wide pandas behaviour: show every column when a frame is displayed
# and turn on copy-on-write mode.
pd.options.display.max_columns = None
pd.set_option('mode.copy_on_write', True)

# Read the raw table from a CSV on disk (a relative local path, not a URL).
spotify_original = pd.read_csv("spotify_data/dataset.csv")

# Working view of the raw table: all rows, every column except the first one.
spotify_original_reshape = spotify_original.iloc[:, 1:]

# Data Cleaning


*   Todo 1
*   Todo 2



In [None]:
# Display the working frame as this cell's output.
spotify_original_reshape




In [None]:
# Summary statistics for the track_name column.
spotify_original_reshape.loc[:, 'track_name'].describe()

In [None]:
# Summary statistics for the popularity column.
spotify_original_reshape.loc[:, 'popularity'].describe()

In [None]:
# Summary statistics for the track_genre column.
spotify_original_reshape.loc[:, 'track_genre'].describe()

In [None]:
# Pull out the row(s) whose track_id equals one specific id and print them.
is_target_track = spotify_original_reshape['track_id'].eq('1kR4gIb7nGxHPI3D2ifs59')
missing_track = spotify_original_reshape.loc[is_target_track]
print(missing_track)

In [None]:
# Cleaning rows with missing information.
# Collect every row that has a null in at least one column so it can be inspected.
missing_data_rows = spotify_original_reshape[spotify_original_reshape.isnull().any(axis=1)]

# A bare expression in the middle of a cell is evaluated and then discarded,
# so the rows are printed explicitly to make them visible in the output.
print(missing_data_rows)

# New frame without those rows; the source frame itself is not modified.
spotify_original_reshape_drop = spotify_original_reshape.dropna()

# (rows, columns) before and after the drop.
print(spotify_original_reshape.shape)
print(spotify_original_reshape_drop.shape)

In [None]:
# De-duplicate rows that share the same (track name, artists) pair after
# normalising both fields (whitespace stripped, lower-cased).

# Priority list for genres to handle duplicates: an earlier position wins,
# and any genre that is not listed ranks after all of the listed ones.
# NOTE(review): entries only take effect if they match the spelling used in
# `track_genre` exactly (e.g. 'hip hop' vs 'hip-hop') — confirm against the data.
genre_priority = ['pop', 'rock', 'hip hop', 'rap', 'reggaeton', 'latin', 'electronic', 'r&b', 'reggae', 'dance', 'classical']
genre_rank = {genre: rank for rank, genre in enumerate(genre_priority)}
unlisted_genre_rank = len(genre_priority)

# The helper columns are added to a working copy rather than written onto
# `spotify_original_reshape_drop`, so that frame (reused by later cells) keeps
# its original columns and the "before cleaning" shape below is not inflated
# by the three temporary columns.
spotify_with_dedup_keys = spotify_original_reshape_drop.assign(
    track_name_clean=lambda frame: frame['track_name'].str.strip().str.lower(),
    artists_clean=lambda frame: frame['artists'].str.strip().str.lower(),
    genre_priority=lambda frame: (
        frame['track_genre'].map(genre_rank).fillna(unlisted_genre_rank).astype('int64')
    ),
)

# Sort so the row to keep comes first inside each (track, artists) group:
# best genre rank, then popularity, then longest duration.
# NOTE(review): popularity is sorted ascending, so the LEAST popular duplicate
# is the one kept — confirm this is intended before relying on `popularity`.
spotify_data_sorted = spotify_with_dedup_keys.sort_values(
    by=['track_name_clean', 'artists_clean', 'genre_priority', 'popularity', 'duration_ms'],
    ascending=[True, True, True, True, False],
)

# Keep only the first row of every (track, artists) group.
spotify_cleaned = spotify_data_sorted.drop_duplicates(subset=['track_name_clean', 'artists_clean'], keep='first')

# Checking size
print(f"Shape of the dataset before cleaning: {spotify_original_reshape_drop.shape}")
print(f"Shape of the dataset after cleaning: {spotify_cleaned.shape}")

# Removing the temporary helper columns
spotify_cleaned_final = spotify_cleaned.drop(columns=['track_name_clean', 'artists_clean', 'genre_priority'])

# Check size again
print(f"Shape of the dataset after removing extra columns: {spotify_cleaned_final.shape}")


spotify_cleaned_final


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix, with danceability as the column of interest.
# Only float64 / int64 columns take part; every other dtype is left out.
numeric_data = spotify_cleaned_final.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_data.corr()

# Every numeric column ranked by its correlation with danceability.
danceability_correlation = correlation_matrix["danceability"].sort_values(ascending=False)
print(danceability_correlation)

# Annotated heatmap of the whole matrix, colour scale pinned to [-1, 1].
heatmap_options = dict(annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap for Spotify Dataset')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import Lasso


# Predict energy from every other float64 / int64 column.
# NOTE: despite its name, X_loudness holds ALL numeric features except energy here.
numeric_columns_frame = spotify_cleaned_final.select_dtypes(include=['float64', 'int64'])
X_loudness = numeric_columns_frame.drop(columns=['energy'])
y_energy = spotify_cleaned_final['energy']

# One third is held out as the test set; the remainder is halved into
# training and validation sets.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_loudness, y_energy, test_size=1/3, random_state=35
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.5, random_state=35
)

# Feature expansion with degree=1, i.e. the original features plus a bias column.
poly = PolynomialFeatures(degree=1)
X_train_poly = poly.fit_transform(X_train)
X_val_poly, X_test_poly = poly.transform(X_val), poly.transform(X_test)

# Ordinary least squares on the expanded training features.
model = LinearRegression()
model.fit(X_train_poly, y_train)

print("Coefficients (Theta):", model.coef_)

print("Columns", X_loudness.columns)

# Score on the validation split first ...
y_pred_val = model.predict(X_val_poly)
mse_val, r2_val = mean_squared_error(y_val, y_pred_val), r2_score(y_val, y_pred_val)
print(f"Mean Squared Error (Validation Data): {mse_val}")
print(f"R-squared (Validation Data): {r2_val}")

# ... then on the held-out test split.
y_pred_test = model.predict(X_test_poly)
mse_test, r2_test = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print(f"Mean Squared Error (Test Data): {mse_test}")
print(f"R-squared (Test Data): {r2_test}")

# Sizes of the three splits.
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

# Test-set residuals (actual minus predicted) against the predictions.
residuals = y_test - y_pred_test

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred_test, residuals, color='blue', alpha=0.3)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Energy')
ax.set_ylabel('Residuals')
ax.set_title('Residuals Plot')
ax.grid(True)
plt.show()

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Degree-2 polynomial expansion, refit on the X_train / X_val / X_test
# currently in scope (this cell does not create its own split).
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly, X_test_poly = poly.transform(X_val), poly.transform(X_test)

# Step 2: Fit an L1-regularised linear model; alpha is the tunable penalty strength.
lasso_model = Lasso(alpha=0.005).fit(X_train_poly, y_train)

# Step 3: Predictions for the validation and test splits.
y_pred_val_lasso = lasso_model.predict(X_val_poly)
y_pred_test_lasso = lasso_model.predict(X_test_poly)

# Step 4: Error and R-squared, validation first and then test.
mse_val_lasso, r2_val_lasso = (
    mean_squared_error(y_val, y_pred_val_lasso),
    r2_score(y_val, y_pred_val_lasso),
)
print(f"Lasso - Mean Squared Error (Validation Data): {mse_val_lasso}")
print(f"Lasso - R-squared (Validation Data): {r2_val_lasso}")

mse_test_lasso, r2_test_lasso = (
    mean_squared_error(y_test, y_pred_test_lasso),
    r2_score(y_test, y_pred_test_lasso),
)
print(f"Lasso - Mean Squared Error (Test Data): {mse_test_lasso}")
print(f"Lasso - R-squared (Test Data): {r2_test_lasso}")

# Step 5: Test-set residuals (actual minus predicted) against the predictions.
residuals_lasso = y_test - y_pred_test_lasso

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred_test_lasso, residuals_lasso, color='blue', alpha=0.3)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Energy (Lasso)')
ax.set_ylabel('Residuals')
ax.set_title('Residuals Plot for Lasso Regression')
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Model energy as a degree-2 polynomial of loudness alone.
X_loudness = numeric_data[['loudness']]
y_energy = numeric_data['energy']

# One third is held out as the test set; the remainder is halved into
# training and validation sets.
X_train_val, X_test, y_train_val, y_test = train_test_split(X_loudness, y_energy, test_size=1/3, random_state=35)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=35)

# polynomial transformation to the features (degree=2)
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
X_test_poly = poly.transform(X_test)

# train the linear regression model using the transformed polynomial features
model = LinearRegression()
model.fit(X_train_poly, y_train)

# evaluate on validation set
y_pred_val = model.predict(X_val_poly)
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)
print(f"Mean Squared Error (Validation Data): {mse_val}")
print(f"R-squared (Validation Data): {r2_val}")

# once validated, evaluate on the test set (unseen data)
y_pred_test = model.predict(X_test_poly)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
print(f"Mean Squared Error (Test Data): {mse_test}")
print(f"R-squared (Test Data): {r2_test}")

# plot the validation data points and regression line
# (points are ordered by loudness so the fitted curve is drawn left to right)
plt.scatter(X_val, y_val, color='green', label='Validation Data points', s=10, alpha=0.3)
sorted_idx = np.argsort(X_val.values.flatten())
plt.plot(X_val.values[sorted_idx], y_pred_val[sorted_idx], color='orange', linewidth=2, label='Polynomial regression line (Validation)')

# Tick every 2 units across the observed loudness range. The bounds are taken
# with Series.min()/max() so np.arange receives plain scalars; the builtin
# min()/max() over the 2-D `.values` array returned one-element arrays instead.
val_loudness = X_val['loudness']
plt.xticks(np.arange(val_loudness.min(), val_loudness.max() + 1, 2), rotation=45, fontsize=10)
plt.yticks(np.arange(0, 1.01, 0.1))

plt.xlabel('Loudness')
plt.ylabel('Energy (scale 0-1)')
plt.title('Energy vs Loudness with Polynomial Regression Line (Validation Data)')
plt.legend()
plt.grid(True)
plt.show()

# plot the test data points and regression line
plt.scatter(X_test, y_test, color='blue', label='Test Data points', s=10, alpha=0.3)
sorted_idx = np.argsort(X_test.values.flatten())
plt.plot(X_test.values[sorted_idx], y_pred_test[sorted_idx], color='red', linewidth=2, label='Polynomial regression line (Test)')

# Same tick spacing for the test split, again from scalar bounds.
test_loudness = X_test['loudness']
plt.xticks(np.arange(test_loudness.min(), test_loudness.max() + 1, 2), rotation=45, fontsize=10)
plt.yticks(np.arange(0, 1.01, 0.1))

plt.xlabel('Loudness')
plt.ylabel('Energy (scale 0-1)')
plt.title('Energy vs Loudness with Polynomial Regression Line (Test Data)')
plt.legend()
plt.grid(True)
plt.show()

# Check the shape of the splits
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")


In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Predictors are loudness plus three further columns; the target is energy.
feature_columns = ['loudness', 'tempo', 'valence', 'danceability']
X_features = numeric_data[feature_columns]
y_energy = numeric_data['energy']

# Step 2: One third held out for testing, the remainder halved into train / validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_features, y_energy, test_size=1/3, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.5, random_state=42
)

# Step 3: Degree-2 polynomial expansion of all selected features
# (the degree can be raised to 3 if needed).
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly, X_test_poly = poly.transform(X_val), poly.transform(X_test)

# Step 4: Fit an L2-regularised linear model; alpha is the tunable penalty strength.
ridge_model = Ridge(alpha=1.0).fit(X_train_poly, y_train)

# Step 5: Predictions for the validation and test splits.
y_pred_val_ridge = ridge_model.predict(X_val_poly)
y_pred_test_ridge = ridge_model.predict(X_test_poly)

# Step 6: Error and R-squared, validation first and then test.
mse_val_ridge, r2_val_ridge = (
    mean_squared_error(y_val, y_pred_val_ridge),
    r2_score(y_val, y_pred_val_ridge),
)
print(f"Ridge - Mean Squared Error (Validation Data): {mse_val_ridge}")
print(f"Ridge - R-squared (Validation Data): {r2_val_ridge}")

mse_test_ridge, r2_test_ridge = (
    mean_squared_error(y_test, y_pred_test_ridge),
    r2_score(y_test, y_pred_test_ridge),
)
print(f"Ridge - Mean Squared Error (Test Data): {mse_test_ridge}")
print(f"Ridge - R-squared (Test Data): {r2_test_ridge}")

# Step 7: Test-set residuals (actual minus predicted), plotted against loudness.
residuals_ridge = y_test - y_pred_test_ridge

fig, ax = plt.subplots()
ax.scatter(X_test['loudness'], residuals_ridge)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Loudness')
ax.set_ylabel('Residuals')
ax.set_title('Residuals Plot for Ridge Regression with Additional Features')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; every combination is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],       # number of trees in the forest
    max_depth=[None, 10, 20, 30],      # maximum depth of each tree
    min_samples_split=[2, 10, 20],     # minimum samples required to split a node
)

# Base estimator with a fixed seed.
rf = RandomForestRegressor(random_state=42)

# 3-fold cross-validated grid search scored by R-squared, using all cores.
# It is fitted on whichever X_train / y_train are currently in scope; this cell
# does not create its own split.
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Winning parameter combination.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# The estimator refitted with the winning combination.
best_rf_model = grid_search.best_estimator_

# Predictions for the validation and test splits.
y_pred_val_best_rf = best_rf_model.predict(X_val)
y_pred_test_best_rf = best_rf_model.predict(X_test)

# Error and R-squared, validation first and then test.
mse_val_best_rf, r2_val_best_rf = (
    mean_squared_error(y_val, y_pred_val_best_rf),
    r2_score(y_val, y_pred_val_best_rf),
)
print(f"Best Random Forest - Mean Squared Error (Validation Data): {mse_val_best_rf}")
print(f"Best Random Forest - R-squared (Validation Data): {r2_val_best_rf}")

mse_test_best_rf, r2_test_best_rf = (
    mean_squared_error(y_test, y_pred_test_best_rf),
    r2_score(y_test, y_pred_test_best_rf),
)
print(f"Best Random Forest - Mean Squared Error (Test Data): {mse_test_best_rf}")
print(f"Best Random Forest - R-squared (Test Data): {r2_test_best_rf}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature importances of the tuned forest, largest first.
importances = best_rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
bar_positions = range(X_train.shape[1])

plt.figure(figsize=(10, 6))
plt.title("Feature Importances (Best Random Forest)")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, X_train.columns[indices], rotation=90)
plt.ylabel('Importance')
plt.show()

# Actual vs predicted energy, one figure per split (validation, then test).
# The red diagonal marks where a perfect prediction would fall.
for actual, predicted, point_color, plot_title in (
    (y_val, y_pred_val_best_rf, 'green',
     'Validation Data: Actual vs Predicted Energy (Best Random Forest)'),
    (y_test, y_pred_test_best_rf, 'blue',
     'Test Data: Actual vs Predicted Energy (Best Random Forest)'),
):
    diagonal = [actual.min(), actual.max()]
    plt.figure(figsize=(8, 6))
    plt.scatter(actual, predicted, color=point_color, alpha=0.3, label='Predicted vs Actual')
    plt.plot(diagonal, diagonal, color='red', lw=2, label='Perfect Fit Line')
    plt.xlabel('Actual Energy')
    plt.ylabel('Predicted Energy')
    plt.title(plot_title)
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Actual vs predicted energy for the Ridge model, one figure per split:
# Step 1 is the validation set, Step 2 is the test set.
# The red diagonal marks where a perfect prediction would fall.
for actual, predicted, point_color, plot_title in (
    (y_val, y_pred_val_ridge, 'green', 'Validation Data: Actual vs Predicted Energy'),
    (y_test, y_pred_test_ridge, 'blue', 'Test Data: Actual vs Predicted Energy'),
):
    diagonal = [actual.min(), actual.max()]
    plt.figure(figsize=(8, 6))
    plt.scatter(actual, predicted, color=point_color, alpha=0.3, label='Predicted vs Actual')
    plt.plot(diagonal, diagonal, color='red', lw=2, label='Perfect Fit Line')
    plt.xlabel('Actual Energy')
    plt.ylabel('Predicted Energy')
    plt.title(plot_title)
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Residuals of the loudness-only polynomial model, plotted against loudness.
#
# `y_pred_test` holds that model's predictions for the loudness-only test split
# (test_size=1/3, random_state=35). In notebook order, `X_test` / `y_test` have
# since been reassigned to the four-feature split made with random_state=42, so
# subtracting them from `y_pred_test` would pair unrelated rows, and `X_test`
# would no longer be a single column to scatter against. The matching split is
# therefore rebuilt here with the same arguments and stored under separate
# names, leaving the current `X_test` / `y_test` untouched.
_, X_test_loudness, _, y_test_loudness = train_test_split(
    numeric_data[['loudness']], numeric_data['energy'], test_size=1/3, random_state=35
)

# Actual minus predicted; both sides are in the order of the rebuilt test split.
residuals = y_test_loudness - y_pred_test
plt.scatter(X_test_loudness['loudness'], residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Loudness')
plt.ylabel('Residuals')
plt.title('Residuals Plot')
plt.show()


# Exploratory Data Analysis


*   Todo 1
*   Todo 2



In [None]:
# Distribution of the liveness column as a 30-bin, half-transparent blue histogram.
spotify_original_reshape.loc[:, 'liveness'].hist(bins=30, alpha=0.5, color='blue')

In [None]:
# Visualizing correlation between features: danceability (x) against popularity (y).
# Very small markers keep the dense point cloud readable.
plt.scatter(
    x=spotify_original_reshape['danceability'],
    y=spotify_original_reshape['popularity'],
    s=0.1,
)
plt.xlabel('Danceability')
plt.ylabel('Popularity')
plt.title('Danceability vs Popularity')

In [None]:
# Visualizing correlation between features: energy (x) against popularity (y).
# Very small markers keep the dense point cloud readable.
plt.scatter(
    x=spotify_original_reshape['energy'],
    y=spotify_original_reshape['popularity'],
    s=0.1,
)
plt.xlabel('Energy')
plt.ylabel('Popularity')
plt.title('Energy vs Popularity')

In [None]:
# Correlation coefficient between popularity and each numeric column
# (popularity itself is one of the columns visited).
numeric_column_names = spotify_original_reshape.select_dtypes(include=[np.number]).columns
for features in numeric_column_names:
    pair_matrix = np.corrcoef(spotify_original_reshape['popularity'], spotify_original_reshape[features])
    # The off-diagonal entry of the 2x2 matrix is the coefficient for the pair.
    print(features, 'vs. Popularity Correlation:', pair_matrix[0, 1])


In [None]:
# Correlation coefficient between tempo and each numeric column
# (tempo itself is one of the columns visited).
numeric_column_names = spotify_original_reshape.select_dtypes(include=[np.number]).columns
for features in numeric_column_names:
    pair_matrix = np.corrcoef(spotify_original_reshape['tempo'], spotify_original_reshape[features])
    # The off-diagonal entry of the 2x2 matrix is the coefficient for the pair.
    print(features, 'vs. Tempo Correlation:', pair_matrix[0, 1])


# Regression

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklego.linear_model import LADRegression

# Least-squares fit of danceability on tempo, using the rows left after the
# null drop. The single predictor is reshaped into a one-column 2-D array.
tempo_column = np.array(spotify_original_reshape_drop['tempo']).reshape(-1, 1)
danceability_target = spotify_original_reshape_drop['danceability']

ls_tempo_danceability_fit = LinearRegression()
ls_tempo_danceability_fit.fit(X=tempo_column, y=danceability_target)

In [None]:
# Scatter of danceability against tempo with the fitted least-squares line on top.
fig = px.scatter(spotify_original_reshape_drop, x='tempo', y='danceability')

# Line trace evaluated as intercept + slope * tempo at every observed tempo.
tempo_values = spotify_original_reshape_drop['tempo']
ls_line_trace = go.Scatter(
    x=tempo_values,
    y=ls_tempo_danceability_fit.intercept_ + tempo_values * ls_tempo_danceability_fit.coef_[0],
    mode='lines',
    name='LS',
    line=dict(dash='solid', color='black'),
)
fig.add_trace(ls_line_trace)

In [None]:
# Observed danceability next to the least-squares prediction for the same rows
# (these are the rows the model was fitted on).
tempo_column = np.array(spotify_original_reshape_drop['tempo']).reshape(-1, 1)
ls_predictions = ls_tempo_danceability_fit.predict(tempo_column)

pred_train_df = pd.DataFrame({
    'true': spotify_original_reshape_drop['danceability'],
    'ls_pred': ls_predictions,
})
pred_train_df

In [None]:
# rMSE, MAE, MAD (median absolute error), correlation and R2 of the true
# danceability against the LS predictions.
observed, predicted = pred_train_df['true'], pred_train_df['ls_pred']

print('LS rMSE:', np.sqrt(mean_squared_error(observed, predicted)))
print('LS MAE:', mean_absolute_error(observed, predicted))
print('LS MAD:', np.median(np.abs(observed - predicted)))
print('LS correlation:', np.corrcoef(observed, predicted)[0, 1])
print('LS R2:', r2_score(observed, predicted))

In [None]:
# Least-squares fit of energy on loudness, using the rows left after the null
# drop. The single predictor is reshaped into a one-column 2-D array.
loudness_column = np.array(spotify_original_reshape_drop['loudness']).reshape(-1, 1)
energy_target = spotify_original_reshape_drop['energy']

ls_loudness_energy_fit = LinearRegression()
ls_loudness_energy_fit.fit(X=loudness_column, y=energy_target)

# Scatter of energy against loudness with the fitted line on top; the line is
# evaluated as intercept + slope * loudness at every observed loudness.
fig = px.scatter(spotify_original_reshape_drop, x='loudness', y='energy')

loudness_values = spotify_original_reshape_drop['loudness']
ls_line_trace = go.Scatter(
    x=loudness_values,
    y=ls_loudness_energy_fit.intercept_ + loudness_values * ls_loudness_energy_fit.coef_[0],
    mode='lines',
    name='LS',
    line=dict(dash='solid', color='black'),
)
fig.add_trace(ls_line_trace)

In [None]:
# Observed energy next to the least-squares prediction for the same rows
# (these are the rows the model was fitted on).
loudness_column = np.array(spotify_original_reshape_drop['loudness']).reshape(-1, 1)
ls_predictions = ls_loudness_energy_fit.predict(loudness_column)

pred_train_df = pd.DataFrame({
    'true': spotify_original_reshape_drop['energy'],
    'ls_pred': ls_predictions,
})
pred_train_df

In [None]:
# rMSE, MAE, MAD (median absolute error), correlation and R2 of the true
# energy against the LS predictions.
observed, predicted = pred_train_df['true'], pred_train_df['ls_pred']

print('LS rMSE:', np.sqrt(mean_squared_error(observed, predicted)))
print('LS MAE:', mean_absolute_error(observed, predicted))
print('LS MAD:', np.median(np.abs(observed - predicted)))
print('LS correlation:', np.corrcoef(observed, predicted)[0, 1])
print('LS R2:', r2_score(observed, predicted))