
# 1. Setup



In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Enable pandas copy-on-write mode for the rest of the notebook.
pd.options.mode.copy_on_write = True

# Load the dataset from a CSV path relative to the working directory
spotify_original = pd.read_csv("spotify_data/dataset.csv")

# Keep every row but drop the first column by position.
# NOTE(review): presumably the first column is a saved row index -- confirm against the CSV.
spotify_original_reshape = spotify_original.iloc[:,1:]

# 2. Data Cleaning


*   Todo 1
*   Todo 2



In [None]:
# Alternative quick looks, currently disabled:
#spotify_original_reshape.head(20)
#spotify_original.shape
# Display the frame with the first column removed.
spotify_original_reshape




In [None]:
# Summary statistics for the track_name column.
spotify_original_reshape['track_name'].describe()

In [None]:
# Summary statistics for the popularity column.
spotify_original_reshape['popularity'].describe()

In [None]:
# Summary statistics for the track_genre column.
spotify_original_reshape['track_genre'].describe()

In [None]:
# Select the row(s) whose track_id equals one specific id and print them.
# NOTE(review): presumably this is the track with missing fields -- confirm.
is_target_track = spotify_original_reshape['track_id'] == '1kR4gIb7nGxHPI3D2ifs59'
missing_track = spotify_original_reshape[is_target_track]
print(missing_track)

In [None]:
# Cleaning rows with missing information
# Boolean mask that is True for every row holding at least one null value.
has_missing_value = spotify_original_reshape.isna().any(axis=1)
missing_data_rows = spotify_original_reshape[has_missing_value]

# Not the last expression of the cell, so this line displays nothing.
missing_data_rows

# Keep only the rows without any null value.
spotify_original_reshape_drop = spotify_original_reshape.dropna()

# Shapes before and after dropping, in that order.
for frame in (spotify_original_reshape, spotify_original_reshape_drop):
    print(frame.shape)

In [None]:
# Build normalised duplicate-detection keys: strip surrounding whitespace and
# lowercase the track name and the artist string.
for source_col, clean_col in (('track_name', 'track_name_clean'), ('artists', 'artists_clean')):
    spotify_original_reshape_drop[clean_col] = spotify_original_reshape_drop[source_col].str.strip().str.lower()

# Genres in preference order; a lower position wins when a track is duplicated.
# NOTE(review): labels must match the track_genre values exactly, otherwise
# they fall through to the default rank -- verify spelling against the data.
genre_priority = ['pop', 'rock', 'hip hop', 'rap', 'reggaeton', 'latin', 'electronic', 'r&b', 'reggae', 'dance', 'classical']
genre_rank = {genre: rank for rank, genre in enumerate(genre_priority)}
unlisted_rank = len(genre_priority)  # shared rank for every genre not in the list
spotify_original_reshape_drop['genre_priority'] = spotify_original_reshape_drop['track_genre'].apply(
    lambda genre: genre_rank.get(genre, unlisted_rank)
)

# Order rows so that the preferred copy of each (track, artist) pair comes first.
# NOTE(review): popularity is sorted ascending, so among equal genre ranks the
# least popular copy is the one kept below -- confirm this is intended.
sort_columns = ['track_name_clean', 'artists_clean', 'genre_priority', 'popularity', 'duration_ms']
sort_ascending = [True, True, True, True, False]
spotify_data_sorted = spotify_original_reshape_drop.sort_values(by=sort_columns, ascending=sort_ascending)

# Keep the first row of every (track name, artist) pair.
spotify_cleaned = spotify_data_sorted.drop_duplicates(
    subset=['track_name_clean', 'artists_clean'],
    keep='first',
)

# Compare sizes before and after removing duplicates.
print(f"Shape of the dataset before cleaning: {spotify_original_reshape_drop.shape}")
print(f"Shape of the dataset after cleaning: {spotify_cleaned.shape}")

# The helper columns were only needed for sorting and de-duplication.
helper_columns = ['track_name_clean', 'artists_clean', 'genre_priority']
spotify_cleaned_final = spotify_cleaned.drop(columns=helper_columns)

# Check the size again
print(f"Shape of the dataset after removing extra columns: {spotify_cleaned_final.shape}")


spotify_cleaned_final


# 3. Exploratory Data Analysis




In [None]:
"""
Histograms
"""
# Histogram of the liveness column: 30 bins, half-transparent blue bars.
# NOTE(review): this reads spotify_original_reshape (before null/duplicate
# removal) rather than spotify_cleaned_final -- confirm that is intended.
spotify_original_reshape['liveness'].hist(bins = 30, alpha = 0.5, color='blue')

In [None]:
"""
Visualizing correlation between danceability and popularity features
"""

# Scatter of danceability (x) against popularity (y); s = 0.1 keeps the
# markers tiny so the many points remain distinguishable.
plt.scatter(spotify_original_reshape['danceability'], spotify_original_reshape['popularity'], s = 0.1)
plt.xlabel('Danceability')
plt.ylabel('Popularity')
plt.title('Danceability vs Popularity')

In [None]:
"""
Visualizing correlation between energy and popularity
"""

# Scatter of energy (x) against popularity (y), using very small markers.
plt.scatter(spotify_original_reshape['energy'], spotify_original_reshape['popularity'], s = 0.1)
plt.xlabel('Energy')
plt.ylabel('Popularity')
plt.title('Energy vs Popularity')

In [None]:
"""
Visualizing the Correlation Between Popularity and other features
"""

# Print the correlation coefficient between popularity and each numeric column.
for column in spotify_original_reshape.select_dtypes(include=[np.number]).columns:
    corr_matrix = np.corrcoef(spotify_original_reshape['popularity'], spotify_original_reshape[column])
    # The off-diagonal entry of the 2x2 matrix is the coefficient for the pair.
    print(column, 'vs. Popularity Correlation:', corr_matrix[0, 1])


In [None]:
"""
Visualizing the Correlation Between Tempo and other features
"""

# Print the correlation coefficient between tempo and each numeric column.
for column in spotify_original_reshape.select_dtypes(include=[np.number]).columns:
    corr_matrix = np.corrcoef(spotify_original_reshape['tempo'], spotify_original_reshape[column])
    # The off-diagonal entry of the 2x2 matrix is the coefficient for the pair.
    print(column, 'vs. Tempo Correlation:', corr_matrix[0, 1])


In [None]:
"""
Correlation matrix and Heatmap
"""

import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations over the float64/int64 columns of the de-duplicated data.
numeric_data = spotify_cleaned_final.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_data.corr()

# Correlation of every feature with danceability, strongest positive first.
danceability_correlation = correlation_matrix["danceability"].sort_values(ascending=False)
print(danceability_correlation)

# Annotated heatmap of the full matrix; the colour scale is pinned to [-1, 1].
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Heatmap for Spotify Dataset')
plt.show()


# PCA

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Bare expression in the middle of the cell: evaluated but not displayed.
spotify_cleaned_final.shape

# Split the column labels by dtype: numeric versus everything else.
numeric_columns = spotify_cleaned_final.select_dtypes(include=[np.number]).columns
non_numeric_columns = spotify_cleaned_final.select_dtypes(exclude=[np.number]).columns

# Report how many columns fall in each group and list their names.
print(f"Numeric columns ({len(numeric_columns)}): {numeric_columns.tolist()}")
print(f"Non-numeric columns ({len(non_numeric_columns)}): {non_numeric_columns.tolist()}")



In [None]:
from sklearn.preprocessing import StandardScaler

# Keep only the numeric columns of the de-duplicated data.
# This rebinds numeric_data, which the heatmap cell set with a float64/int64 filter.
numeric_data = spotify_cleaned_final.select_dtypes(include=[np.number])

# Standardize every numeric column with scikit-learn's StandardScaler before PCA.
scaler = StandardScaler()
data_standardized = scaler.fit_transform(numeric_data)


In [None]:
# Fit scikit-learn PCA on the standardized data without limiting the number of components.
pca = PCA(n_components=None)
pca.fit(data_standardized)

# Project the standardized data onto the fitted components.
data_pca = pca.transform(data_standardized)



In [None]:
# Perform SVD on the standardized dataset (reduced form: full_matrices=False).
pca_U, pca_d, pca_V = np.linalg.svd(data_standardized, full_matrices=False)

# Per-component variance: squared singular value divided by (number of rows - 1).
sample_count = len(data_standardized)
explained_variance = np.square(pca_d) / (sample_count - 1)

# Normalise so the per-component shares sum to one.
total_variance = explained_variance.sum()
explained_variance_ratio = explained_variance / total_variance

print("Explained Variance Ratio:", explained_variance_ratio)
print(pca_d)




In [None]:
# Share of total variability carried by each component, from the squared singular values.
squared_singular_values = np.square(pca_d)
prop_var = squared_singular_values / sum(squared_singular_values)

# One row per component (numbered from 1) with individual and running shares,
# both rounded to two decimals; show at most the first 14 rows.
pc_numbers = np.arange(1, prop_var.shape[0] + 1)
pd.DataFrame(
    {
        "PC": pc_numbers,
        "variability_explained": prop_var.round(2),
        "cumulative_variability_explained": prop_var.cumsum().round(2),
    }
).head(14)

In [None]:
# Loadings of the first principal component: one weight per standardized feature.
# pca_V is the third output of np.linalg.svd(data_standardized), so its columns
# follow the column order of numeric_data (the frame that was standardized).
# The feature labels therefore come from numeric_data; the previously referenced
# spotify_cleaned_final_numeric is not defined anywhere in the notebook.
loadings1 = pd.DataFrame(
    {
        "features": numeric_data.columns,
        "pc1_loading": pca_V[0],
    }
)

# look at the 10 largest (absolute value) loadings for PC1 but show the signed value
loadings1.reindex(loadings1["pc1_loading"].abs().sort_values(ascending=False).index) \
    .head(10)


In [None]:
# Loadings of the second principal component: one weight per standardized feature.
# Feature labels come from numeric_data, the frame that was standardized and
# decomposed; the previously referenced spotify_cleaned_final_numeric is not
# defined anywhere in the notebook.
loadings2 = pd.DataFrame(
    {"features": numeric_data.columns,
     "pc2_loading": pca_V[1]
     })
# look at the 10 largest (absolute value) loadings for PC2 but print out the signed value
loadings2.reindex(loadings2["pc2_loading"].abs().sort_values(ascending=False).index) \
    .head(10)

In [None]:
import pandas as pd

# Project the standardized data onto every principal axis (the rows of pca_V).
pc_scores = data_standardized @ pca_V.T

# Wrap the scores in a DataFrame whose columns are named PC1, PC2, ...
pc_labels = [f"PC{number}" for number in range(1, pc_scores.shape[1] + 1)]
pca_scaled_x = pd.DataFrame(pc_scores, columns=pc_labels)

# Label each row with the corresponding track name.
pca_scaled_x.index = spotify_cleaned_final['track_name']

pca_scaled_x.head()


In [None]:
import matplotlib.pyplot as plt

# Plotting PC1 vs PC2
# One point per row of pca_scaled_x, slightly transparent so overlaps stay visible.
plt.figure(figsize=(10, 6))
plt.scatter(pca_scaled_x['PC1'], pca_scaled_x['PC2'], alpha=0.7)


plt.xlabel('Principal Component 1 (PC1)')
plt.ylabel('Principal Component 2 (PC2)')
plt.title('PCA: PC1 vs PC2')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# PC1 vs PC2 with each point coloured by the track's energy value.
# The colours are read from spotify_cleaned_final, the frame whose rows were
# standardized and projected, so they line up row-for-row with pca_scaled_x.
# The previously referenced spotify_cleaned_final_numeric is not defined
# anywhere in the notebook.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(pca_scaled_x['PC1'], pca_scaled_x['PC2'], c=spotify_cleaned_final['energy'], cmap='viridis', alpha=0.7)
plt.colorbar(scatter, label='Energy')
plt.xlabel('Principal Component 1 (PC1)')
plt.ylabel('Principal Component 2 (PC2)')
plt.title('PCA: PC1 vs PC2 (Colored by Energy)')
plt.show()


In [None]:
# Share of total variance per component, from the squared singular values.
pca_prop_explained = np.square(pca_d) / sum(np.square(pca_d))

# Create a line plot showing the proportion of variance explained by each principal component
fig = px.line(
    x=np.arange(1, pca_prop_explained.shape[0] + 1),  # PC indices start at 1
    y=pca_prop_explained,
    labels={"x": "Principal Component (PC)", "y": "Proportion of Variance Explained"},
    title="Proportion of Variance Explained by Each Principal Component"
)

# Display the plot
fig.show()

In [None]:
import numpy as np
import plotly.express as px

# Calculate cumulative explained variance
# (running total of the per-component shares computed in the previous cell)
cumulative_variance = np.cumsum(pca_prop_explained)

# Create a line plot showing the cumulative variance explained by each component
fig = px.line(
    x=np.arange(1, cumulative_variance.shape[0] + 1),  # PC indices start at 1
    y=cumulative_variance,
    labels={"x": "Principal Component (PC)", "y": "Cumulative Variance Explained"},
    title="Cumulative Variance Explained by Principal Components"
)

fig.show()



In [None]:
# Number of leading principal components kept for the reduced representation.
n_components_kept = 7

# Project onto the first n_components_kept principal axes (rows of pca_V).
reduced_data = data_standardized @ pca_V[:n_components_kept].T
reduced_data_df = pd.DataFrame(
    reduced_data,
    columns=[f'PC{i+1}' for i in range(n_components_kept)],
)

# Label each row with the corresponding track name.
reduced_data_df.index = spotify_cleaned_final['track_name']
reduced_data_df.head()

In [None]:
import plotly.express as px

# Scatter plot of PC1 vs PC2
# The frame's index (track names) is used as the hover label for each point.
fig = px.scatter(
    reduced_data_df,
    x='PC1',
    y='PC2',
    hover_name=reduced_data_df.index,
    title="PCA Scatter Plot: PC1 vs PC2"
)
fig.show()


Explanations: 

1. For this project check-in your team must demonstrate at least one unsupervised learning method: PCA or clustering. You may even combine them.

    *We chose to apply PCA to our dataset.*

2. If you apply PCA to your data, include code, cumulative variability explained, and scree plot. Explain how you are using PCA in your project for dimensionality reduction or to learn structure in the data.

    *We used PCA to reduce the dataset from 14 features to 7 principal components. We projected the data onto the first 7 PCs, and we used the scree plot and the cumulative variance plot to determine which PCs were the most influential.*

3. If you apply clustering, include code, quantitative metrics to evaluate clustering, and how you determined the number of clusters for your data and why. Explain how clustering helped you learn about the structure of data for your project.

    *N/A*


# Neural Network
* We want to predict the mode of a song using a neural network with 6 hidden layers and 1 output layer

## Data Cleaning
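* Here we will drop track_id, track_name, artists, album_name, track_genre, and popularity because they are irrelevant to predicting the mode of a song; mode itself is also removed from the inputs because it is the prediction target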

In [None]:
!pip install torch
import torch
import torch.nn as nn

# drop track_id, track_name, artists, album_name, track_genre, mode, and popularity columns
# (mode is the prediction target, so it must not appear among the inputs)
x_train = spotify_cleaned_final.drop(columns=['track_id', 'track_name', 'artists', 'track_genre', 'popularity', 'mode', 'album_name'])

# standardize the data
# NOTE: this rebinds `scaler`, replacing the scaler fitted in the PCA section.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

## Try PCA

In [None]:
# Perform SVD on the standardized dataset
# NOTE: this rebinds pca_U, pca_d, pca_V and prop_var from the PCA section.
pca_U, pca_d, pca_V = np.linalg.svd(x_train, full_matrices=False)

# Share of total variance carried by each component.
prop_var = np.square(pca_d) / sum(np.square(pca_d))
print(prop_var)

# PC transformation: project the standardized features onto the principal axes
x_train_pca = x_train @ pca_V.T

# pick the first 10 principal components
# (slice the projected scores; slicing x_train here would silently discard the
# projection and feed the first 10 raw standardized features to the network)
x_train_pca = x_train_pca[:, :10]

# create tensors from the PC scores and from the mode labels
x_train_pca = torch.tensor(x_train_pca, dtype=torch.float32)
y_train_pca = torch.tensor(spotify_cleaned_final['mode'].values, dtype=torch.float32)
input_pca_size = x_train_pca.shape[1]

## Define Neural Network with 6 hidden layers and 1 output

In [None]:
class MultiLayerNN(nn.Module):
    """Fully connected network with six hidden layers and one output layer.

    Every linear layer, including the output layer, is followed by a sigmoid,
    so each output value lies strictly between 0 and 1.

    Args:
        input_size: number of input features per sample.
        hidden_size: width shared by all six hidden layers.
        output_size: number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are registered in forward order: layer1 .. layer6, output_layer.
        self.layer1 = nn.Linear(input_size, hidden_size)  # input -> hidden 1
        for position in range(2, 7):
            # hidden (position - 1) -> hidden (position)
            setattr(self, f"layer{position}", nn.Linear(hidden_size, hidden_size))
        self.output_layer = nn.Linear(hidden_size, output_size)  # hidden 6 -> output
        self.sigmoid = nn.Sigmoid()  # activation shared by every layer

    def forward(self, x):
        """Pass ``x`` through each linear layer followed by the sigmoid."""
        stages = (
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.layer5,
            self.layer6,
            self.output_layer,
        )
        for stage in stages:
            x = self.sigmoid(stage(x))
        return x

## Neural Network with PC inputs

### Model, Loss_Fn, Optimizer

In [None]:
# Network over the PCA inputs: 16 units per hidden layer and a single output unit.
model = MultiLayerNN(input_size=input_pca_size, hidden_size=16, output_size=1)
loss_function = nn.BCELoss()  # Binary Cross-Entropy Loss
# SGD optimizer over all model parameters with a fixed learning rate of 0.01.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

### Train The Model

In [None]:

# Train the model
print("\n#################### Training the neural network ####################\n")
num_epochs = 1000

# Reshape the labels to a column so they match the (N, 1) predictions.
target_column = y_train_pca.view(-1, 1)

for epoch in range(num_epochs):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass over the whole training set, then the loss against the labels.
    prediction = model(x_train_pca)
    loss = loss_function(prediction, target_column)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Print the loss every 10 epochs
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
    

### Learning Rate Experiment

In [None]:
# Learning Rate Finder
# Train a freshly initialised network for 1000 full-batch epochs at each
# learning rate and record the final training loss.
lrs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
losses = []

loss_function = nn.BCELoss()
# Labels reshaped to a column so they match the (N, 1) predictions.
targets = y_train_pca.view(-1, 1)

for lr in lrs:
    # New model and optimizer per rate so runs do not share trained weights.
    model = MultiLayerNN(input_size=input_pca_size, hidden_size=16, output_size=1)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for epoch in range(1000):
        prediction = model(x_train_pca)
        loss = loss_function(prediction, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Loss of the last epoch for this learning rate.
    losses.append(loss.item())

plt.plot(lrs, losses, marker='o')
# The rates span five orders of magnitude; on a linear axis every rate below
# 0.1 collapses onto the left edge, so use a logarithmic x-axis.
plt.xscale('log')
# label the axes
plt.xlabel('Learning Rate')
plt.ylabel('Loss')

### Evaluate model performance with LR = 0.1

In [None]:
# Evaluate the model with lr=0.1
model = MultiLayerNN(input_size=input_pca_size, hidden_size=16, output_size=1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_function = nn.BCELoss()

for epoch in range(1000):
    prediction = model(x_train_pca)
    loss = loss_function(prediction, y_train_pca.view(-1, 1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Evaluate the model with accuracy, precision, recall, and F1 score.
# The model was trained on the PCA tensors, so it must be evaluated on
# x_train_pca / y_train_pca: x_train is a NumPy array with a different number
# of features, and no y_train exists before this point.
# NOTE: these are training-set metrics; there is no held-out split.
with torch.no_grad():  # inference only, no gradient tracking needed
    prediction = torch.round(model(x_train_pca))
y_train = y_train_pca.view(-1, 1)

true_positives = torch.sum(prediction[y_train == 1] == 1).item()
false_positives = torch.sum(prediction[y_train == 0] == 1).item()
true_negatives = torch.sum(prediction[y_train == 0] == 0).item()
false_negatives = torch.sum(prediction[y_train == 1] == 0).item()

accuracy = (true_positives + true_negatives) / len(y_train)

# Guard the ratios: if the model predicts a single class, a denominator can be
# zero; report 0.0 in that case instead of raising ZeroDivisionError.
predicted_positives = true_positives + false_positives
actual_positives = true_positives + false_negatives
precision = true_positives / predicted_positives if predicted_positives else 0.0
recall = true_positives / actual_positives if actual_positives else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

# confusion matrix (rows: actual 0/1, columns: predicted 0/1)
confusion_matrix = torch.zeros(2, 2)
confusion_matrix[0, 0] = true_negatives
confusion_matrix[0, 1] = false_positives
confusion_matrix[1, 0] = false_negatives
confusion_matrix[1, 1] = true_positives

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Confusion Matrix:\n{confusion_matrix}")
