In [3]:
import numpy as np
import pandas as pd

# Load the dataset using Pandas
file_path = '/content/drive/MyDrive/Colab Notebooks/Spotify_Youtube.xlsx'
data = pd.read_excel(file_path)

# Filter rows with valid 'Likes' and feature column values (exclude rows with missing or non-numeric values)
data = data.dropna(subset=['Likes', 'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms'])

# Extract the 'Likes' column as the target variable
y = data['Likes']
# Apply logarithmic transformation to the target variable
y = np.log1p(y)

# Extract the features you want to use for regression
X = data[['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms']]

# Standardize the features (z-score normalization)
mean_vals = np.mean(X, axis=0)
std_devs = np.std(X, axis=0)
X_standardized = (X - mean_vals) / std_devs

# Number of folds for cross-validation
num_folds = 5
fold_size = len(data) // num_folds

mae_list, mse_list, rmse_list, r2_list = [], [], [], []

for fold in range(num_folds):
    # Split the dataset into training and testing sets (using different folds for each iteration)
    test_start, test_end = fold * fold_size, (fold + 1) * fold_size
    test_indices = list(range(test_start, test_end))
    train_indices = [i for i in range(len(data)) if i not in test_indices]

    X_train, X_test = X_standardized.iloc[train_indices], X_standardized.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

    # Convert Pandas Series to NumPy array and reshape
    y_train = y_train.to_numpy().reshape(-1, 1)

    # Define the neural network architecture
    input_size = X_train.shape[1]
    hidden_size = 64  # Number of units in the hidden layer
    output_size = 1  # For regression

    # Initialize weights and bias with small random values
    np.random.seed(0)
    w1 = np.random.rand(input_size, hidden_size) * 0.01
    b1 = np.zeros((1, hidden_size))
    w2 = np.random.rand(hidden_size, output_size) * 0.01
    b2 = np.zeros((1, output_size))

    # Hyperparameters
    learning_rate = 0.0001  # Adjusted learning rate
    epochs = 1000

    # Gradient clipping threshold
    max_gradient_norm = 1.0

    for epoch in range(epochs):
        # Forward propagation
        z1 = np.dot(X_train, w1) + b1
        a1 = np.maximum(0, z1)  # ReLU activation
        z2 = np.dot(a1, w2) + b2
        y_pred = z2

        # Calculate the mean absolute error
        error = np.mean(np.abs(y_pred - y_train))

        # Backpropagation
        delta2 = y_pred - y_train
        dw2 = np.dot(a1.T, delta2)
        db2 = np.sum(delta2, axis=0)

        # Implement gradient clipping
        dw2 = np.clip(dw2, -max_gradient_norm, max_gradient_norm)

        # Calculate delta1 using chain rule
        delta1 = np.dot(delta2, w2.T) * (a1 > 0)  # ReLU derivative
        dw1 = np.dot(X_train.T, delta1)
        db1 = np.sum(delta1, axis=0)

        # Implement gradient clipping
        dw1 = np.clip(dw1, -max_gradient_norm, max_gradient_norm)

        # Update weights and biases
        w1 -= learning_rate * dw1
        b1 -= learning_rate * db1
        w2 -= learning_rate * dw2
        b2 -= learning_rate * db2

        if epoch % 100 == 0:
            print(f'Fold: {fold + 1}, Epoch {epoch}, Error: {error}')

    # Make predictions on the test set
    z1_test = np.dot(X_test, w1) + b1
    a1_test = np.maximum(0, z1_test)  # ReLU activation
    z2_test = np.dot(a1_test, w2) + b2
    y_test_pred = z2_test.reshape(-1)  # Adjust the shape to 1-dimensional array

    # Calculate performance metrics for the fold
    mae = np.mean(np.abs(y_test_pred - y_test))  # Mean Absolute Error
    mse = np.mean((y_test_pred - y_test) ** 2)     # Mean Squared Error
    rmse = np.sqrt(mse)
    ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
    ss_res = np.sum((y_test - y_test_pred) ** 2)
    r2 = 1 - (ss_res / ss_tot)

    # Append metrics to lists
    mae_list.append(mae)
    mse_list.append(mse)
    rmse_list.append(rmse)
    r2_list.append(r2)

    # Print or log the metrics for the fold
    print(f'Fold: {fold + 1}, MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R^2: {r2}')

# After the loop, calculate and print average metrics across all folds
print(f'Average MAE: {np.mean(mae_list)}, Average MSE: {np.mean(mse_list)}, Average RMSE: {np.mean(rmse_list)}, Average R^2: {np.mean(r2_list)}')


Fold: 1, Epoch 0, Error: 11.345454391796402
Fold: 1, Epoch 100, Error: 1.9914886809845538
Fold: 1, Epoch 200, Error: 1.975266894878961
Fold: 1, Epoch 300, Error: 1.953870401851647
Fold: 1, Epoch 400, Error: 1.9314544923283015
Fold: 1, Epoch 500, Error: 1.9124372484494467
Fold: 1, Epoch 600, Error: 1.8958309319519064
Fold: 1, Epoch 700, Error: 2.4139438183550554
Fold: 1, Epoch 800, Error: 1.974601008399274
Fold: 1, Epoch 900, Error: 1.9548211504854462
Fold: 1, MAE: 1.9001439663578374, MSE: 5.807065762692279, RMSE: 2.4097854183914964, R^2: 0.08400166494563199
Fold: 2, Epoch 0, Error: 11.625267954307159
Fold: 2, Epoch 100, Error: 1.9406791130161987
Fold: 2, Epoch 200, Error: 1.9253787193080847
Fold: 2, Epoch 300, Error: 1.9051053913683598
Fold: 2, Epoch 400, Error: 1.8839431250611731
Fold: 2, Epoch 500, Error: 1.8664369614710226
Fold: 2, Epoch 600, Error: 1.8540344601645873
Fold: 2, Epoch 700, Error: 3.4145224942961496
Fold: 2, Epoch 800, Error: 1.879563727387597
Fold: 2, Epoch 900, Error