# Initial Setup

In [1]:
# Install a headless Java 8 JDK (quiet mode, normal output discarded).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.2.0 / Hadoop 3.2 binary distribution from the Apache archive.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
# Unpack the archive into the current working directory.
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
# Install findspark, which a later cell imports and initialises.
!pip install -q findspark

In [2]:
# Check GPU: print the NVIDIA driver / device status table for this runtime.
!nvidia-smi

Wed May  3 00:42:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    40W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# This gets the RAPIDS-Colab install files and checks your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab instance is not RAPIDS compatible, it will warn you and give you remediation steps.
# Clone the rapidsai-csp-utils helper repository into the working directory.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
# Run the Colab pip installer script shipped in that repository.
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 385, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 385 (delta 86), reused 51 (delta 51), pack-reused 269[K
Receiving objects: 100% (385/385), 105.74 KiB | 341.00 KiB/s, done.
Resolving deltas: 100% (188/188), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 2.4 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has the right kind of GPU, a NVIDIA A100-SXM4-40GB!
We will now install RAPIDS cuDF, cuML, and cuGraph via pip! 
Please stand by, should be quick...
***********************************************************************



In [4]:
import os

# Java installation used by Spark (path of the apt openjdk-8 package).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve the unpacked Spark directory to an absolute path so SPARK_HOME stays
# valid even if a later cell changes the working directory.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [5]:
# Host-side numerical / dataframe libraries.
import numpy as np
import pandas as pd

# findspark.init() is deliberately called before the pyspark imports below;
# keep this order (presumably it makes the Spark install named by SPARK_HOME
# importable — confirm before moving it).
import findspark
findspark.init()

# GPU dataframe / ML libraries (plus stdlib io).
import cudf
import io
import cuml
import cupy as cp

# Spark entry points, column functions and schema types.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, DoubleType, LongType

# Reuse the running session if there is one, otherwise create one named "rapids".
spark = SparkSession.builder.appName("rapids").getOrCreate()

In [6]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import json

from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import explode, split, lower, regexp_replace, count, when, col

from cuml.svm import SVC, LinearSVC
from cuml.metrics import accuracy_score, mean_squared_error, confusion_matrix, mean_absolute_error
from cuml.model_selection import GridSearchCV
from cuml.linear_model import Ridge, Lasso, LinearRegression
from cuml.ensemble import RandomForestClassifier

from sklearn.datasets import make_classification

from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score

# Data Preparation

In [8]:
# Location of the lyrics / word-count CSV on the mounted Drive.  Adjust this if
# your Drive layout differs (for example an extra /H516 folder in the path).
lyrics_csv_path = "/content/drive/My Drive/Colab Notebooks/spotify_with_word_counts.csv"

# header=True takes the column names from the first row; no schema is supplied.
lyrics_df = spark.read.csv(lyrics_csv_path, header=True)

# Preview the first rows, then report the total row count as the cell output.
lyrics_df.show()
lyrics_df.count()

+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------+-------+----+-------+----------+----------------+--------------+------------+-------------+------------+------------+---------+--------------------+
|            track_id|          track_name|        track_artist|track_popularity|      track_album_id|    track_album_name|track_album_release_date|       playlist_name|         playlist_id|playlist_genre|   playlist_subgenre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|language|genreID|year|minutes|word_count|words_per_minute|repetition_pct|stopword_pct|profanity_pct|negative_pct|positive_pct|Sentiment|   words_only_lyrics

14398

In [9]:
# Popularity bucket used as the classification target:
#   0 -> popularity below 21, 2 -> popularity above 79, 1 -> everything else.
popularity = lyrics_df.track_popularity
hit_or_miss = when(popularity < 21, 0).when(popularity > 79, 2).otherwise(1)
df_With_Extra_Columns = lyrics_df.withColumn("HitOrMiss", hit_or_miss)

# Binary flags: 1 when the audio feature exceeds 0.5, else 0.
for source_name in ("instrumentalness", "liveness", "acousticness"):
    df_With_Extra_Columns = df_With_Extra_Columns.withColumn(
        source_name + "Flag",
        when(df_With_Extra_Columns[source_name] > .5, 1).otherwise(0),
    )

# Interaction terms: (new column, left factor, right factor), added in this order.
interaction_terms = (
    ("SpeechinessVsWordCount", "speechiness", "word_count"),
    ("instrumentalnessFlagVsrepetition_pct", "instrumentalnessFlag", "repetition_pct"),
    ("acousticnessFlagnessVsrepetition_pct", "acousticnessFlag", "repetition_pct"),
    ("acousticnessFlagnessVsloudness", "acousticnessFlag", "loudness"),
)
for product_name, left_name, right_name in interaction_terms:
    df_With_Extra_Columns = df_With_Extra_Columns.withColumn(
        product_name,
        df_With_Extra_Columns[left_name] * df_With_Extra_Columns[right_name],
    )



In [10]:
# Row count of the DataFrame that now carries the derived columns.
df_With_Extra_Columns.count()

14398

In [11]:
# Columns cast to Spark 'int'.
int_columns = [
    'track_popularity', 'genreID', 'key', 'mode',
    'words_per_minute', 'word_count', 'Sentiment',
]

# Columns cast to Spark 'double' (audio features, lyric percentages and the
# interaction terms).
double_columns = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
    'repetition_pct', 'stopword_pct', 'profanity_pct', 'minutes',
    'negative_pct', 'positive_pct',
    'SpeechinessVsWordCount',
    'instrumentalnessFlagVsrepetition_pct',
    'acousticnessFlagnessVsrepetition_pct',
    'acousticnessFlagnessVsloudness',
]

# withColumn on an existing name replaces that column in place, so the column
# order of the DataFrame is unchanged by these casts.
for column_name in int_columns:
    df_With_Extra_Columns = df_With_Extra_Columns.withColumn(column_name, col(column_name).cast('int'))

for column_name in double_columns:
    df_With_Extra_Columns = df_With_Extra_Columns.withColumn(column_name, col(column_name).cast('double'))


In [12]:
# Fixed seed so the shuffle below — and therefore the head/tail train/test
# split that is cut from the shuffled frame — is identical on every run.
SHUFFLE_SEED = 42

# Collect the Spark DataFrame to the driver as a pandas DataFrame.
pdf_full = df_With_Extra_Columns.toPandas()

# Shuffle the rows reproducibly (frac=1 keeps every row, only the order changes).
pdf_full = pdf_full.sample(frac=1, random_state=SHUFFLE_SEED)

# Move the data to the GPU and drop every row that has a missing value in any column.
df_cudf_full = cudf.DataFrame.from_pandas(pdf_full)
df_cudf_full = df_cudf_full.dropna()

# Cell output: per-column non-null counts of the cleaned frame.
df_cudf_full.count()

track_id                                14396
track_name                              14396
track_artist                            14396
track_popularity                        14396
track_album_id                          14396
track_album_name                        14396
track_album_release_date                14396
playlist_name                           14396
playlist_id                             14396
playlist_genre                          14396
playlist_subgenre                       14396
danceability                            14396
energy                                  14396
key                                     14396
loudness                                14396
mode                                    14396
speechiness                             14396
acousticness                            14396
instrumentalness                        14396
liveness                                14396
valence                                 14396
tempo                             

In [13]:
# Number of leading (already shuffled) rows held out for testing in each subset.
N_TEST_ALL = 3000
N_TEST_RAP = 500
N_TEST_NO_RAP = 2500

# Cut each frame at a single boundary: the first N rows are the test set and
# everything after them is the training set.  Unlike separate head(n)/tail(m)
# calls with hand-computed sizes, this cannot overlap or skip rows when the
# number of rows changes.
df_cudf_test = df_cudf_full.iloc[:N_TEST_ALL]
df_cudf_train = df_cudf_full.iloc[N_TEST_ALL:]

# genreID 3 is the genre the variable names below call "Rap".
expression = "(genreID != 3)"
df_cudf_No_Rap = df_cudf_full.query(expression)
expression = "(genreID == 3)"
df_cudf_Rap = df_cudf_full.query(expression)

df_cudf_test_Rap = df_cudf_Rap.iloc[:N_TEST_RAP]
df_cudf_train_Rap = df_cudf_Rap.iloc[N_TEST_RAP:]

df_cudf_test_No_Rap = df_cudf_No_Rap.iloc[:N_TEST_NO_RAP]
df_cudf_train_No_Rap = df_cudf_No_Rap.iloc[N_TEST_NO_RAP:]

# Linear Regression Models

In [None]:
# Feature columns, written once and used for both the training and test matrices.
feature_cols = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "valence", "tempo", "minutes", "word_count", "words_per_minute",
    "repetition_pct", "stopword_pct", "profanity_pct", "negative_pct",
    "positive_pct", "Sentiment", "instrumentalnessFlag", "livenessFlag",
    "acousticnessFlag", "SpeechinessVsWordCount",
    "instrumentalnessFlagVsrepetition_pct",
    "acousticnessFlagnessVsrepetition_pct",
    "acousticnessFlagnessVsloudness",
]

# Training inputs/target and the held-out inputs/target.
X = df_cudf_train[feature_cols]
y = df_cudf_train['track_popularity']
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['track_popularity']

# One-point "grid": GridSearchCV is used here only to run plain 5-fold CV with
# the same code shape as the tuned models.
linreg = LinearRegression()
parameters = {'fit_intercept': [True], 'normalize': [True]}
grid_search = GridSearchCV(linreg, parameters, cv=5, scoring='neg_mean_squared_error')

# Fit on host (NumPy) copies of the GPU data.
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# Refit estimator -> predictions for the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.to_cupy().get())

# Host copy of the true targets, shared by both metrics.
y_true_host = y_test.to_cupy().get()
rmse = mean_squared_error(y_true_host, y_pred, squared=False)
mae = mean_absolute_error(y_true_host, y_pred)

# Report both error metrics.
print(f"Root mean squared error: {rmse:.4f}")
print("Mean Abosolute Error:", mae)


Root mean squared error: 23.6168
Mean Abosolute Error: 19.747772904597976


In [None]:
# Reduced feature set, written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Training inputs/target and the held-out inputs/target.
X = df_cudf_train[feature_cols]
y = df_cudf_train['track_popularity']
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['track_popularity']

# One-point "grid": GridSearchCV is used here only to run plain 5-fold CV with
# the same code shape as the tuned models.
linreg = LinearRegression()
parameters = {'fit_intercept': [True], 'normalize': [True]}
grid_search = GridSearchCV(linreg, parameters, cv=5, scoring='neg_mean_squared_error')

# Fit on host (NumPy) copies of the GPU data.
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# Refit estimator -> predictions for the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.to_cupy().get())

# Host copy of the true targets, shared by both metrics.
y_true_host = y_test.to_cupy().get()
rmse = mean_squared_error(y_true_host, y_pred, squared=False)
mae = mean_absolute_error(y_true_host, y_pred)

# Report both error metrics.
print(f"Root mean squared error: {rmse:.4f}")
print("Mean Abosolute Error:", mae)


Root mean squared error: 23.6102
Mean Abosolute Error: 19.731994804441857


In [None]:
# Reduced feature set, written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Non-rap subset: training inputs/target and the held-out inputs/target.
X = df_cudf_train_No_Rap[feature_cols]
y = df_cudf_train_No_Rap['track_popularity']
X_test = df_cudf_test_No_Rap[feature_cols]
y_test = df_cudf_test_No_Rap['track_popularity']

# One-point "grid": GridSearchCV is used here only to run plain 5-fold CV with
# the same code shape as the tuned models.
linreg = LinearRegression()
parameters = {'fit_intercept': [True], 'normalize': [True]}
grid_search = GridSearchCV(linreg, parameters, cv=5, scoring='neg_mean_squared_error')

# Fit on host (NumPy) copies of the GPU data.
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# Refit estimator -> predictions for the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.to_cupy().get())

# Host copy of the true targets, shared by both metrics.
y_true_host = y_test.to_cupy().get()
rmse = mean_squared_error(y_true_host, y_pred, squared=False)
mae = mean_absolute_error(y_true_host, y_pred)

# Report both error metrics.
print(f"Root mean squared error: {rmse:.4f}")
print("Mean Abosolute Error:", mae)


Root mean squared error: 23.3707
Mean Abosolute Error: 19.617312250627933


In [None]:
# Rap-only feature set (no word_count), written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "words_per_minute", "profanity_pct"]

# Rap subset: training inputs/target and the held-out inputs/target.
X = df_cudf_train_Rap[feature_cols]
y = df_cudf_train_Rap['track_popularity']
X_test = df_cudf_test_Rap[feature_cols]
y_test = df_cudf_test_Rap['track_popularity']

# One-point "grid": GridSearchCV is used here only to run plain 5-fold CV with
# the same code shape as the tuned models.
linreg = LinearRegression()
parameters = {'fit_intercept': [True], 'normalize': [True]}
grid_search = GridSearchCV(linreg, parameters, cv=5, scoring='neg_mean_squared_error')

# Fit on host (NumPy) copies of the GPU data.
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# Refit estimator -> predictions for the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.to_cupy().get())

# Host copy of the true targets, shared by both metrics.
y_true_host = y_test.to_cupy().get()
rmse = mean_squared_error(y_true_host, y_pred, squared=False)
mae = mean_absolute_error(y_true_host, y_pred)

# Report both error metrics.
print(f"Root mean squared error: {rmse:.4f}")
print("Mean Abosolute Error:", mae)


Root mean squared error: 24.8378
Mean Abosolute Error: 20.350762491122445


# Ridge Regression Models

In [None]:
# Feature columns, written once and used for both the training and test matrices.
feature_cols = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "valence", "tempo", "minutes", "word_count", "words_per_minute",
    "repetition_pct", "stopword_pct", "profanity_pct", "negative_pct",
    "positive_pct", "Sentiment", "instrumentalnessFlag", "livenessFlag",
    "acousticnessFlag", "SpeechinessVsWordCount",
    "instrumentalnessFlagVsrepetition_pct",
    "acousticnessFlagnessVsrepetition_pct",
    "acousticnessFlagnessVsloudness",
]

# Training inputs/target.
X = df_cudf_train[feature_cols]
y = df_cudf_train['track_popularity']

# Held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['track_popularity']

# Ridge estimator and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
ridge = Ridge(fit_intercept=True, normalize=False, solver='eig')

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
param_grid = {'alpha': alpha_range}
grid_search = GridSearchCV(ridge, param_grid, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

# Predictions from the refit best estimator.
y_pred = grid_search.predict(X_test)

# Host copy of the true targets, shared by both metrics.
y_true_host = y_test.to_cupy().get()
rmse = mean_squared_error(y_true_host, y_pred, squared=False)
mae = mean_absolute_error(y_true_host, y_pred)

# Report both error metrics.
print(f"Root mean squared error: {rmse:.4f}")
print("Mean Abosolute Error:", mae)



Best alpha: 1.0
Root mean squared error: 23.6092
Mean Abosolute Error: 19.742151753731786


In [None]:
# Reduced feature set, written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Training inputs/target.
X = df_cudf_train[feature_cols]
y = df_cudf_train['track_popularity']

# Held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['track_popularity']

# Ridge estimator and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
ridge = Ridge(fit_intercept=True, normalize=False, solver='eig')

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
grid_search = GridSearchCV(ridge, {'alpha': alpha_range}, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

y_pred = grid_search.predict(X_test)

# Square root of the mean squared error.  This is a regression error in
# popularity points (lower is better), not an accuracy, so it is named and
# labelled as RMSE.
rmse = mean_squared_error(y_test, y_pred)**0.5

print(f"Root mean squared error: {rmse:.4f}")

Best alpha: 0.1
Accuracy: 23.6098


In [None]:
# Reduced feature set, written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Non-rap subset: training inputs/target.
X = df_cudf_train_No_Rap[feature_cols]
y = df_cudf_train_No_Rap['track_popularity']

# Non-rap subset: held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test_No_Rap[feature_cols]
y_test = df_cudf_test_No_Rap['track_popularity']

# Ridge estimator and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
ridge = Ridge(fit_intercept=True, normalize=False, solver='eig')

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
grid_search = GridSearchCV(ridge, {'alpha': alpha_range}, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

y_pred = grid_search.predict(X_test)

# Square root of the mean squared error.  This is a regression error in
# popularity points (lower is better), not an accuracy, so it is named and
# labelled as RMSE.
rmse = mean_squared_error(y_test, y_pred)**0.5

print(f"Root mean squared error: {rmse:.4f}")

Best alpha: 0.1
Accuracy: 23.3703


In [None]:
# Rap-only feature set (no word_count), written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "words_per_minute", "profanity_pct"]

# Rap subset: training inputs/target.
X = df_cudf_train_Rap[feature_cols]
y = df_cudf_train_Rap['track_popularity']

# Rap subset: held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test_Rap[feature_cols]
y_test = df_cudf_test_Rap['track_popularity']

# Ridge estimator and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
ridge = Ridge(fit_intercept=True, normalize=False, solver='eig')

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
grid_search = GridSearchCV(ridge, {'alpha': alpha_range}, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

y_pred = grid_search.predict(X_test)

# Square root of the mean squared error.  This is a regression error in
# popularity points (lower is better), not an accuracy, so it is named and
# labelled as RMSE.
rmse = mean_squared_error(y_test, y_pred)**0.5

print(f"Root mean squared error: {rmse:.4f}")

Best alpha: 0.1
Accuracy: 24.8262


# Lasso Models

In [None]:
# Feature columns, written once and used for both the training and test matrices.
feature_cols = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "valence", "tempo", "minutes", "word_count", "words_per_minute",
    "repetition_pct", "stopword_pct", "profanity_pct", "negative_pct",
    "positive_pct", "Sentiment", "instrumentalnessFlag", "livenessFlag",
    "acousticnessFlag", "SpeechinessVsWordCount",
    "instrumentalnessFlagVsrepetition_pct",
    "acousticnessFlagnessVsrepetition_pct",
    "acousticnessFlagnessVsloudness",
]

# Training inputs/target.
X = df_cudf_train[feature_cols]
y = df_cudf_train['track_popularity']

# Held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['track_popularity']

# Lasso estimator (library defaults) and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
lasso = Lasso()

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
param_grid = {'alpha': alpha_range}
grid_search = GridSearchCV(lasso, param_grid, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

# Predictions from the refit best estimator.
y_pred = grid_search.predict(X_test)

# Host copy of the true targets, shared by both metrics.
y_true_host = y_test.to_cupy().get()
rmse = mean_squared_error(y_true_host, y_pred, squared=False)
mae = mean_absolute_error(y_true_host, y_pred)

# Report both error metrics.
print(f"Root mean squared error: {rmse:.4f}")
print("Mean Abosolute Error:", mae)


Best alpha: 0.001
Root mean squared error: 23.6162
Mean Abosolute Error: 19.738793983239095


In [None]:
# Reduced feature set, written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Training inputs/target.
X = df_cudf_train[feature_cols]
y = df_cudf_train['track_popularity']

# Held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['track_popularity']

# Lasso estimator (library defaults) and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
lasso = Lasso()

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
grid_search = GridSearchCV(lasso, {'alpha': alpha_range}, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

y_pred = grid_search.predict(X_test)

# Square root of the mean squared error.  This is a regression error in
# popularity points (lower is better), not an accuracy, so it is named and
# labelled as RMSE.
rmse = mean_squared_error(y_test, y_pred)**0.5

print(f"Root mean squared error: {rmse:.4f}")


Best alpha: 0.001
Accuracy: 23.6173


In [None]:
# Non-rap feature set (adds speechiness and tempo), written once and used for both matrices.
feature_cols = ["energy", "loudness", "speechiness", "SpeechinessVsWordCount", "tempo", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Non-rap subset: training inputs/target.
X = df_cudf_train_No_Rap[feature_cols]
y = df_cudf_train_No_Rap['track_popularity']

# Non-rap subset: held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test_No_Rap[feature_cols]
y_test = df_cudf_test_No_Rap['track_popularity']

# Lasso estimator (library defaults) and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
lasso = Lasso()

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
grid_search = GridSearchCV(lasso, {'alpha': alpha_range}, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

y_pred = grid_search.predict(X_test)

# Square root of the mean squared error.  This is a regression error in
# popularity points (lower is better), not an accuracy, so it is named and
# labelled as RMSE.
rmse = mean_squared_error(y_test, y_pred)**0.5

print(f"Root mean squared error: {rmse:.4f}")


Best alpha: 0.001
Accuracy: 23.3717


In [None]:
# Rap-only feature set (no word_count), written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "words_per_minute", "profanity_pct"]

# Rap subset: training inputs/target.
X = df_cudf_train_Rap[feature_cols]
y = df_cudf_train_Rap['track_popularity']

# Rap subset: held-out inputs/target (the split itself was made earlier).
X_test = df_cudf_test_Rap[feature_cols]
y_test = df_cudf_test_Rap['track_popularity']

# Lasso estimator (library defaults) and the candidate regularisation strengths.
alpha_range = [0.001, 0.01, 0.1, 1.0]
lasso = Lasso()

# 5-fold grid search over alpha; refit=True retrains the best setting on all of X.
grid_search = GridSearchCV(lasso, {'alpha': alpha_range}, cv=5, refit = True)
grid_search.fit(X, y)

# Selected alpha.
print('Best alpha:', grid_search.best_params_['alpha'])

y_pred = grid_search.predict(X_test)

# Square root of the mean squared error.  This is a regression error in
# popularity points (lower is better), not an accuracy, so it is named and
# labelled as RMSE.
rmse = mean_squared_error(y_test, y_pred)**0.5

print(f"Root mean squared error: {rmse:.4f}")


Best alpha: 0.001
Accuracy: 24.8357


# Logistic Regression Models

In [None]:
# Built with library defaults.  The previous `multi_class='multinomial'`
# argument was dropped: cuML reported it as an unused keyword parameter, so it
# never affected the fit.
model = cuml.LogisticRegression()

# Feature columns, written once and used for both the training and test matrices.
feature_cols = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "valence", "tempo", "minutes", "word_count", "words_per_minute",
    "repetition_pct", "stopword_pct", "profanity_pct", "negative_pct",
    "positive_pct", "Sentiment", "instrumentalnessFlag", "livenessFlag",
    "acousticnessFlag", "SpeechinessVsWordCount",
    "instrumentalnessFlagVsrepetition_pct",
    "acousticnessFlagnessVsrepetition_pct",
    "acousticnessFlagnessVsloudness",
]

# Training inputs and the HitOrMiss class label (0, 1 or 2).
X = df_cudf_train[feature_cols]
y = df_cudf_train['HitOrMiss']

model.fit(X, y)

# Held-out inputs and labels.
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['HitOrMiss']

y_pred = model.predict(X_test)

# Overall fraction of correctly classified held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class F1 (average=None returns one score per class), computed on host copies.
f1 = f1_score(y_test.to_cupy().get(), y_pred.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


[I] [00:06:28.663364] Unused keyword parameter: multi_class during cuML estimator initialization
[W] [00:06:31.434540] L-BFGS: max iterations reached
[W] [00:06:31.435720] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
Accuracy: 0.7377
F1 scores for each class: [0.         0.84855491 0.24719101]


In [None]:
# Built with library defaults.  The previous `multi_class='multinomial'`
# argument was dropped: cuML reported it as an unused keyword parameter, so it
# never affected the fit.
model = cuml.LogisticRegression()

# Reduced feature set, written once and used for both matrices.
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Training inputs and the HitOrMiss class label (0, 1 or 2).
X = df_cudf_train[feature_cols]
y = df_cudf_train['HitOrMiss']

model.fit(X, y)

# Held-out inputs and labels.
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['HitOrMiss']

y_pred = model.predict(X_test)

# Overall fraction of correctly classified held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class F1 (average=None returns one score per class), computed on host copies.
f1 = f1_score(y_test.to_cupy().get(), y_pred.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


[I] [00:06:31.460267] Unused keyword parameter: multi_class during cuML estimator initialization
[W] [00:06:32.115132] L-BFGS: max iterations reached
[W] [00:06:32.116566] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
Accuracy: 0.7377
F1 scores for each class: [0.         0.84856648 0.19512195]


In [None]:
# Built with library defaults.  The previous `multi_class='multinomial'`
# argument was dropped: cuML reported it as an unused keyword parameter, so it
# never affected the fit.
model = cuml.LogisticRegression()

# Non-rap feature set (adds speechiness and tempo), written once and used for both matrices.
feature_cols = ["energy", "loudness", "speechiness", "SpeechinessVsWordCount", "tempo", "minutes", "word_count", "words_per_minute", "profanity_pct"]

# Non-rap subset: training inputs and the HitOrMiss class label (0, 1 or 2).
X = df_cudf_train_No_Rap[feature_cols]
y = df_cudf_train_No_Rap['HitOrMiss']

model.fit(X, y)

# Non-rap subset: held-out inputs and labels.
X_test = df_cudf_test_No_Rap[feature_cols]
y_test = df_cudf_test_No_Rap['HitOrMiss']

y_pred = model.predict(X_test)

# Overall fraction of correctly classified held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class F1 (average=None returns one score per class), computed on host copies.
f1 = f1_score(y_test.to_cupy().get(), y_pred.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


[I] [00:06:32.134415] Unused keyword parameter: multi_class during cuML estimator initialization
[W] [00:06:32.748591] L-BFGS: max iterations reached
[W] [00:06:32.749809] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
Accuracy: 0.7460
F1 scores for each class: [0.         0.85405654 0.21538462]


In [None]:
# Logistic regression on the rap subset: fit on the training frame,
# then report accuracy and per-class F1 on the held-out test frame.
#
# The former `multi_class='multinomial'` argument is dropped: cuML reported it
# as an "Unused keyword parameter" at construction time, so it never had any
# effect on the fitted model and only added noise to the output.
model = cuml.LogisticRegression()

# single source of truth for the feature columns, so the train and test
# selections cannot drift apart
feature_cols = ["energy", "loudness", "SpeechinessVsWordCount", "minutes", "words_per_minute", "profanity_pct"]

# define the input features and target variable
X = df_cudf_train_Rap[feature_cols]
y = df_cudf_train_Rap['HitOrMiss']

model.fit(X, y)

# Make predictions on the held-out test data
X_test = df_cudf_test_Rap[feature_cols]
y_test = df_cudf_test_Rap['HitOrMiss']

y_pred = model.predict(X_test)

# calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

# print the accuracy
print(f"Accuracy: {accuracy:.4f}")

# calculate the per-class f1 scores (average=None keeps one score per class);
# both series are copied to host arrays before being handed to f1_score
f1 = f1_score(y_test.to_cupy().get(), y_pred.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


[I] [00:06:32.771185] Unused keyword parameter: multi_class during cuML estimator initialization
Accuracy: 0.6900
F1 scores for each class: [0.         0.81666667 0.18181818]


# Random Forest Models

In [None]:
# Random forest on the full feature set: grid-search the hyperparameters with
# 5-fold cross-validation on the training frame, then score the selected
# forest on the separate test frame.
feature_cols = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "valence", "tempo", "minutes", "word_count", "words_per_minute",
    "repetition_pct", "stopword_pct", "profanity_pct", "negative_pct",
    "positive_pct", "Sentiment", "instrumentalnessFlag", "livenessFlag",
    "acousticnessFlag", "SpeechinessVsWordCount",
    "instrumentalnessFlagVsrepetition_pct",
    "acousticnessFlagnessVsrepetition_pct",
    "acousticnessFlagnessVsloudness",
]

# training inputs / target
X = df_cudf_train[feature_cols]
y = df_cudf_train['HitOrMiss']

# test inputs / target (the train/test frames were already split upstream;
# this only selects the same columns from the test frame)
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['HitOrMiss']

# hyperparameter values explored by the search
param_grid = dict(n_estimators=[50, 100, 200], max_depth=[5, 10, 15])

# base estimator handed to the search
rf = RandomForestClassifier(max_features='auto')

# cross-validated search, scored on accuracy; it is fed host-side array
# copies of the training data
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# report the winning configuration with its mean CV score and fold spread
best_idx = grid_search.best_index_
print("Best hyperparameters:", grid_search.best_params_)
print("Average score:", grid_search.best_score_)
print("Standard deviation:", grid_search.cv_results_['std_test_score'][best_idx])

# take the winning forest and fit it once more on the full training frame
# NOTE(review): if GridSearchCV runs with its default refit behaviour this
# second fit is probably redundant — confirm before removing.
best_rf = grid_search.best_estimator_
best_rf.fit(X, y)

# predict on the test frame and measure accuracy
predictions = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# print the accuracy
print(f"Accuracy: {accuracy:.4f}")

# per-class f1 scores, computed on host copies of labels and predictions
f1 = f1_score(y_test.to_cupy().get(), predictions.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


  ret = func(*args, **kwargs)


Best hyperparameters: {'max_depth': 10, 'n_estimators': 50}
Average score: 0.7418393339645736
Standard deviation: 0.001175853789597843
Accuracy: 0.7380
F1 scores for each class: [0.00825309 0.84898746 0.22727273]


In [None]:
# Random forest on the reduced feature set: grid-search the hyperparameters
# with 5-fold cross-validation on the training frame, then score the selected
# forest on the separate test frame.
feature_cols = [
    "energy", "loudness", "SpeechinessVsWordCount", "minutes",
    "word_count", "words_per_minute", "profanity_pct",
]

# training inputs / target
X = df_cudf_train[feature_cols]
y = df_cudf_train['HitOrMiss']

# test inputs / target (the train/test frames were already split upstream;
# this only selects the same columns from the test frame)
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['HitOrMiss']

# hyperparameter values explored by the search
param_grid = dict(n_estimators=[50, 100, 200], max_depth=[5, 10, 15])

# base estimator handed to the search
rf = RandomForestClassifier(max_features='auto')

# cross-validated search, scored on accuracy; it is fed host-side array
# copies of the training data
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# report the winning configuration with its mean CV score and fold spread
best_idx = grid_search.best_index_
print("Best hyperparameters:", grid_search.best_params_)
print("Average score:", grid_search.best_score_)
print("Standard deviation:", grid_search.cv_results_['std_test_score'][best_idx])

# take the winning forest and fit it once more on the full training frame
# NOTE(review): if GridSearchCV runs with its default refit behaviour this
# second fit is probably redundant — confirm before removing.
best_rf = grid_search.best_estimator_
best_rf.fit(X, y)

# predict on the test frame and measure accuracy
predictions = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# print the accuracy
print(f"Accuracy: {accuracy:.4f}")

# per-class f1 scores, computed on host copies of labels and predictions
f1 = f1_score(y_test.to_cupy().get(), predictions.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


  ret = func(*args, **kwargs)


Best hyperparameters: {'max_depth': 5, 'n_estimators': 200}
Average score: 0.7414005450220549
Standard deviation: 0.001216391937138455
Accuracy: 0.7373
F1 scores for each class: [0.         0.84840323 0.17283951]


In [None]:
# Random forest on the non-rap subset: grid-search the hyperparameters with
# 5-fold cross-validation on the training frame, then score the selected
# forest on the separate test frame.
feature_cols = [
    "energy", "loudness", "speechiness", "SpeechinessVsWordCount", "tempo",
    "minutes", "word_count", "words_per_minute", "profanity_pct",
]

# training inputs / target
X = df_cudf_train_No_Rap[feature_cols]
y = df_cudf_train_No_Rap['HitOrMiss']

# test inputs / target (the train/test frames were already split upstream;
# this only selects the same columns from the test frame)
X_test = df_cudf_test_No_Rap[feature_cols]
y_test = df_cudf_test_No_Rap['HitOrMiss']

# hyperparameter values explored by the search
param_grid = dict(n_estimators=[50, 100, 200], max_depth=[5, 10, 15])

# base estimator handed to the search
rf = RandomForestClassifier(max_features='auto')

# cross-validated search, scored on accuracy; it is fed host-side array
# copies of the training data
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# report the winning configuration with its mean CV score and fold spread
best_idx = grid_search.best_index_
print("Best hyperparameters:", grid_search.best_params_)
print("Average score:", grid_search.best_score_)
print("Standard deviation:", grid_search.cv_results_['std_test_score'][best_idx])

# take the winning forest and fit it once more on the full training frame
# NOTE(review): if GridSearchCV runs with its default refit behaviour this
# second fit is probably redundant — confirm before removing.
best_rf = grid_search.best_estimator_
best_rf.fit(X, y)

# predict on the test frame and measure accuracy
predictions = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# print the accuracy
print(f"Accuracy: {accuracy:.4f}")

# per-class f1 scores, computed on host copies of labels and predictions
f1 = f1_score(y_test.to_cupy().get(), predictions.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


  ret = func(*args, **kwargs)


Best hyperparameters: {'max_depth': 10, 'n_estimators': 200}
Average score: 0.739305991190425
Standard deviation: 0.0019352570002557244
Accuracy: 0.7464
F1 scores for each class: [0.03311258 0.85364162 0.28169014]


In [None]:
# Random forest on the rap subset: grid-search the hyperparameters with
# 5-fold cross-validation on the training frame, then score the selected
# forest on the separate test frame.
feature_cols = [
    "energy", "loudness", "SpeechinessVsWordCount", "minutes",
    "words_per_minute", "profanity_pct",
]

# training inputs / target
X = df_cudf_train_Rap[feature_cols]
y = df_cudf_train_Rap['HitOrMiss']

# test inputs / target (the train/test frames were already split upstream;
# this only selects the same columns from the test frame)
X_test = df_cudf_test_Rap[feature_cols]
y_test = df_cudf_test_Rap['HitOrMiss']

# hyperparameter values explored by the search
param_grid = dict(n_estimators=[50, 100, 200], max_depth=[5, 10, 15])

# base estimator handed to the search
rf = RandomForestClassifier(max_features='auto')

# cross-validated search, scored on accuracy; it is fed host-side array
# copies of the training data
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X.to_cupy().get(), y.to_cupy().get())

# report the winning configuration with its mean CV score and fold spread
best_idx = grid_search.best_index_
print("Best hyperparameters:", grid_search.best_params_)
print("Average score:", grid_search.best_score_)
print("Standard deviation:", grid_search.cv_results_['std_test_score'][best_idx])

# take the winning forest and fit it once more on the full training frame
# NOTE(review): if GridSearchCV runs with its default refit behaviour this
# second fit is probably redundant — confirm before removing.
best_rf = grid_search.best_estimator_
best_rf.fit(X, y)

# predict on the test frame and measure accuracy
predictions = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# print the accuracy
print(f"Accuracy: {accuracy:.4f}")

# per-class f1 scores, computed on host copies of labels and predictions
f1 = f1_score(y_test.to_cupy().get(), predictions.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")


  ret = func(*args, **kwargs)


Best hyperparameters: {'max_depth': 5, 'n_estimators': 200}
Average score: 0.7537556390977443
Standard deviation: 0.0023437322956368816
Accuracy: 0.6880
F1 scores for each class: [0.        0.8156956 0.0952381]


# SVM Model

In [17]:
# Linear SVM on the full feature set: a single fit with a fixed C (no
# hyperparameter search in this cell), then accuracy and per-class F1 on the
# separate test frame.
feature_cols = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "valence", "tempo", "minutes", "word_count", "words_per_minute",
    "repetition_pct", "stopword_pct", "profanity_pct", "negative_pct",
    "positive_pct", "Sentiment", "instrumentalnessFlag", "livenessFlag",
    "acousticnessFlag", "SpeechinessVsWordCount",
    "instrumentalnessFlagVsrepetition_pct",
    "acousticnessFlagnessVsrepetition_pct",
    "acousticnessFlagnessVsloudness",
]

# training inputs / target
X = df_cudf_train[feature_cols]
y = df_cudf_train['HitOrMiss']

# test inputs / target (same columns, taken from the already-split test frame)
X_test = df_cudf_test[feature_cols]
y_test = df_cudf_test['HitOrMiss']

# linear support-vector classifier with regularisation strength C=1.0
svc = LinearSVC(C=1.0)

# fit on host-side array copies of the training data
svc.fit(X.to_cupy().get(), y.to_cupy().get())

# predict on the test frame with the fitted classifier
y_pred = svc.predict(X_test)

# accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print('svc accuracy:', accuracy)

# per-class f1 scores, computed on host copies of labels and predictions
f1 = f1_score(y_test.to_cupy().get(), y_pred.to_cupy().get(), average=None)
print(f"F1 scores for each class: {f1}")

svc accuracy: 0.7363333106040955
F1 scores for each class: [0.         0.84791386 0.12307692]
