In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, desc, countDistinct, max as spark_max, min as spark_min, explode

# Load environment variables from .env
load_dotenv()

print(" COCKROACH_USER:", os.getenv("COCKROACH_USER"))
print(" COCKROACH_PASS:", os.getenv("COCKROACH_PASS"))
print(" COCKROACH_HOST:", os.getenv("COCKROACH_HOST"))
print(" COCKROACH_PORT:", os.getenv("COCKROACH_PORT"))
print(" MONGO URI:", os.getenv("MONGO_ATLAS_URI"))

 COCKROACH_USER: shubh
 COCKROACH_PASS: nrTPuxNNk9Oggf1lCXDkYw
 COCKROACH_HOST: bowing-slime-10451.j77.aws-ap-south-1.cockroachlabs.cloud
 COCKROACH_PORT: 26257
 MONGO URI: mongodb+srv://shbpndr:CrVz9nzipaLOZFVk@sounds-similar.8sd8tnl.mongodb.net/


In [2]:
spark = SparkSession.builder \
    .appName("CockroachDB_PySpark_Project") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.18") \
    .getOrCreate()

In [3]:
spark

In [4]:
COCKROACH_USER = os.getenv("COCKROACH_USER")
COCKROACH_PASS = os.getenv("COCKROACH_PASS")
COCKROACH_HOST = os.getenv("COCKROACH_HOST")
COCKROACH_PORT = os.getenv("COCKROACH_PORT")
DATABASE_NAME = "music"

jdbc_url = f"jdbc:postgresql://{COCKROACH_HOST}:{COCKROACH_PORT}/{DATABASE_NAME}?sslmode=require"

connection_properties = {
    "user": COCKROACH_USER,
    "password": COCKROACH_PASS,
    "driver": "org.postgresql.Driver"
}

In [7]:
def load_table(table_name):
    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=connection_properties
    )

track_link_df = load_table("track_links")
audio_df = load_table("audio_features")
track_reference_df = load_table("track_reference")
lyrics_df = load_table("lyrics")
sentiments_df = load_table('lyrics_emotions')

## Merge Tables

In [8]:
merged_df = lyrics_df.join(sentiments_df, on='musicbrainz_id', how='inner') \
                     .join(audio_df, on='musicbrainz_id', how='inner')

In [9]:
merged_df.show(5, truncate=False)

+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# Filter only columns that exist in merged_df
existing_cols = [col for col in assembler_input_cols if col in merged_df.columns]

# Now fill only those
df_filled = merged_df.fillna("", subset=existing_cols)

In [21]:
existing_cols

['musicbrainz_id',
 'genius_lyrics',
 'lastfm_wiki_content',
 'goemotion_sadness',
 'goemotion_realization',
 'goemotion_neutral',
 'goemotion_love',
 'goemotion_grief',
 'goemotion_amusement',
 'goemotion_gratitude',
 'goemotion_disappointment',
 'goemotion_surprise',
 'goemotion_nervousness',
 'goemotion_embarrassment',
 'goemotion_remorse',
 'goemotion_joy',
 'goemotion_fear',
 'goemotion_excitement',
 'goemotion_anger',
 'goemotion_pride',
 'goemotion_caring',
 'goemotion_disgust',
 'goemotion_confusion',
 'goemotion_optimism',
 'goemotion_relief',
 'goemotion_desire',
 'goemotion_annoyance',
 'goemotion_approval',
 'goemotion_admiration',
 'goemotion_disapproval',
 'goemotion_curiosity',
 'nrc_anger',
 'nrc_anticipation',
 'nrc_disgust',
 'nrc_fear',
 'nrc_joy',
 'nrc_negative',
 'nrc_positive',
 'nrc_sadness',
 'nrc_surprise',
 'nrc_trust',
 'textblob_polarity',
 'textblob_subjectivity',
 'vader_neg',
 'vader_neu',
 'vader_pos',
 'vader_compound',
 'duration_seconds',
 'sample_ra

In [22]:
missing_cols = [col for col in assembler_input_cols if col not in merged_df.columns]
print("Missing columns:", missing_cols)

Missing columns: []


## Missing values

In [14]:
assembler_input_cols = ['musicbrainz_id', 
  'genius_lyrics',
  'lastfm_wiki_content',
  'goemotion_sadness',
  'goemotion_realization',
  'goemotion_neutral',
  'goemotion_love','goemotion_grief','goemotion_amusement','goemotion_gratitude',
  'goemotion_disappointment','goemotion_surprise','goemotion_nervousness',
  'goemotion_embarrassment','goemotion_remorse','goemotion_joy','goemotion_fear','goemotion_excitement',
  'goemotion_anger','goemotion_pride','goemotion_caring',
  'goemotion_disgust','goemotion_confusion','goemotion_optimism','goemotion_relief',
  'goemotion_desire','goemotion_annoyance','goemotion_approval',
  'goemotion_admiration','goemotion_disapproval','goemotion_curiosity','nrc_anger','nrc_anticipation','nrc_disgust',
  'nrc_fear','nrc_joy','nrc_negative','nrc_positive','nrc_sadness','nrc_surprise','nrc_trust',
  'textblob_polarity','textblob_subjectivity','vader_neg','vader_neu','vader_pos','vader_compound','duration_seconds'  ,'sample_rate','tempo'  ,'loudness', 
  'danceability','energy'   ,'speechiness'  ,'acousticness'   ,'instrumentalness'  ,'liveness'  ,'valence' ,'spectral_centroid' ,'spectral_rolloff'  ,'spectral_bandwidth',
  'spectral_flatness'    ,'zero_crossing_rate'  ,'rms_energy'    ,'tempo_variability' ,'f0_mean' ,'mel_mean' ,
  'dynamic_range'  ,'mfcc_1'  ,'mfcc_2' ,
  'mfcc_3'  ,'mfcc_4' ,'mfcc_5' ,'mfcc_6' ,'mfcc_7'  ,'mfcc_8' ,'mfcc_9'  ,'mfcc_10' ,'mfcc_11' ,
  'mfcc_12' ,'mfcc_13' ,'spectral_contrast_1','spectral_contrast_2','spectral_contrast_3','spectral_contrast_4','spectral_contrast_5','spectral_contrast_6','spectral_contrast_7','chroma_cens_1'   ,'chroma_cens_2'   ,'chroma_cens_3'   ,'chroma_cens_4'   ,'chroma_cens_5'   ,'chroma_cens_6'   ,'chroma_cens_7'   ,'chroma_cens_8'   ,'chroma_cens_9'   ,'chroma_cens_10'  ,
  'chroma_cens_11'  ,'chroma_cens_12'  ,'tonnetz_1'  ,'tonnetz_2' ,'tonnetz_3' ,'tonnetz_4' ,'tonnetz_5' ,'tonnetz_6']

In [15]:
df_filled = merged_df.fillna("", subset=assembler_input_cols)

In [19]:
df_filled.show(1, truncate=False)

+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Assemble Numeric Features into a Feature Vector

In [25]:
from pyspark.ml.feature import VectorAssembler

# Filter only columns that exist in merged_df
existing_cols = [col for col in assembler_input_cols if col in merged_df.columns]

numeric_cols = [field.name for field in merged_df.schema.fields
                if isinstance(field.dataType, NumericType) and field.name in existing_cols]

# Fill missing values (assuming this step is needed before assembling)
df_filled = merged_df.fillna("", subset=numeric_cols)

# Create VectorAssembler
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="features"
)

# Apply the assembler to create the 'features' column
df_vectorized = assembler.transform(df_filled)


## Cosine

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# Assuming df_vectorized has a "features" column with the assembled feature vectors


In [30]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import numpy as np

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = float(vec1.dot(vec2))
    norm1 = np.linalg.norm(vec1.toArray())
    norm2 = np.linalg.norm(vec2.toArray())
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot_product / (norm1 * norm2)

# UDF to apply cosine similarity
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Generate the pairwise cosine similarity between the features of different rows
# Self-join the dataframe for pairwise comparison
df_pairs = df_vectorized.alias("df1").join(df_vectorized.alias("df2"), col("df1.musicbrainz_id") < col("df2.musicbrainz_id"))

# Compute cosine similarity
df_similarities = df_pairs.withColumn(
    "cosine_similarity",
    cosine_similarity_udf(col("df1.features"), col("df2.features"))
)



In [32]:
# Show the top similarities
df_similarities.select("df1.musicbrainz_id", "df2.musicbrainz_id", "cosine_similarity").show()

Py4JJavaError: An error occurred while calling o340.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 67.0 failed 1 times, most recent failure: Lost task 0.0 in stage 67.0 (TID 45) (Shubh-PC executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (VectorAssembler$$Lambda$3787/0x00000001013ab840: (struct<goemotion_sadness:double,goemotion_realization:double,goemotion_neutral:double,goemotion_love:double,goemotion_grief:double,goemotion_amusement:double,goemotion_gratitude:double,goemotion_disappointment:double,goemotion_surprise:double,goemotion_nervousness:double,goemotion_embarrassment:double,goemotion_remorse:double,goemotion_joy:double,goemotion_fear:double,goemotion_excitement:double,goemotion_anger:double,goemotion_pride:double,goemotion_caring:double,goemotion_disgust:double,goemotion_confusion:double,goemotion_optimism:double,goemotion_relief:double,goemotion_desire:double,goemotion_annoyance:double,goemotion_approval:double,goemotion_admiration:double,goemotion_disapproval:double,goemotion_curiosity:double,nrc_anger:double,nrc_anticipation:double,nrc_disgust:double,nrc_fear:double,nrc_joy:double,nrc_negative:double,nrc_positive:double,nrc_sadness:double,nrc_surprise:double,nrc_trust:double,textblob_polarity:double,textblob_subjectivity:double,vader_neg:double,vader_neu:double,vader_pos:double,vader_compound:double,duration_seconds:double,sample_rate_double_VectorAssembler_b42c933ca80b:double,tempo:double,loudness:double,danceability:double,energy:double,speechiness:double,acousticness:double,instrumentalness:double,liveness:double,valence:double,spectral_centroid:double,spectral_rolloff:double,spectral_bandwidth:double,spectral_flatness:double,zero_crossing_rate:double,rms_energy:double,tempo_variability:double,f0_mean:double,mel_mean:double,dynamic_range:double,mfcc_1:double,mfcc_2:double,mfcc_3:double,mfcc_4:double,mfcc_5:double,mfcc_6:double,mfcc_7:double,mfcc_8:double,mfcc_9:double,mfcc_10:double,mfcc_11:double,mfcc_12:double,mfcc_13:double,spectral_contrast_1:double,spectral_contrast_2:double,spectral_contrast_3:double,spectral_contrast_4:double,spectral_contrast_5:double,spectral_contrast_6:double,spectral_contrast_7:double,chroma_cens_1:double,chroma_cens_2:double,chroma_cens_3:double,chroma_cens_4:double,chroma_cens_5:double,chroma_cens_6:double,chroma_cens_7:double,chroma_cens_8:double,chroma_cens_9:double,chroma_cens_10:double,chroma_cens_11:double,chroma_cens_12:double,tonnetz_1:double,tonnetz_2:double,tonnetz_3:double,tonnetz_4:double,tonnetz_5:double,tonnetz_6:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.sql.execution.joins.UnsafeCartesianRDD.compute(CartesianProductExec.scala:46)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 37 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:382)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:284)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:323)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (VectorAssembler$$Lambda$3787/0x00000001013ab840: (struct<goemotion_sadness:double,goemotion_realization:double,goemotion_neutral:double,goemotion_love:double,goemotion_grief:double,goemotion_amusement:double,goemotion_gratitude:double,goemotion_disappointment:double,goemotion_surprise:double,goemotion_nervousness:double,goemotion_embarrassment:double,goemotion_remorse:double,goemotion_joy:double,goemotion_fear:double,goemotion_excitement:double,goemotion_anger:double,goemotion_pride:double,goemotion_caring:double,goemotion_disgust:double,goemotion_confusion:double,goemotion_optimism:double,goemotion_relief:double,goemotion_desire:double,goemotion_annoyance:double,goemotion_approval:double,goemotion_admiration:double,goemotion_disapproval:double,goemotion_curiosity:double,nrc_anger:double,nrc_anticipation:double,nrc_disgust:double,nrc_fear:double,nrc_joy:double,nrc_negative:double,nrc_positive:double,nrc_sadness:double,nrc_surprise:double,nrc_trust:double,textblob_polarity:double,textblob_subjectivity:double,vader_neg:double,vader_neu:double,vader_pos:double,vader_compound:double,duration_seconds:double,sample_rate_double_VectorAssembler_b42c933ca80b:double,tempo:double,loudness:double,danceability:double,energy:double,speechiness:double,acousticness:double,instrumentalness:double,liveness:double,valence:double,spectral_centroid:double,spectral_rolloff:double,spectral_bandwidth:double,spectral_flatness:double,zero_crossing_rate:double,rms_energy:double,tempo_variability:double,f0_mean:double,mel_mean:double,dynamic_range:double,mfcc_1:double,mfcc_2:double,mfcc_3:double,mfcc_4:double,mfcc_5:double,mfcc_6:double,mfcc_7:double,mfcc_8:double,mfcc_9:double,mfcc_10:double,mfcc_11:double,mfcc_12:double,mfcc_13:double,spectral_contrast_1:double,spectral_contrast_2:double,spectral_contrast_3:double,spectral_contrast_4:double,spectral_contrast_5:double,spectral_contrast_6:double,spectral_contrast_7:double,chroma_cens_1:double,chroma_cens_2:double,chroma_cens_3:double,chroma_cens_4:double,chroma_cens_5:double,chroma_cens_6:double,chroma_cens_7:double,chroma_cens_8:double,chroma_cens_9:double,chroma_cens_10:double,chroma_cens_11:double,chroma_cens_12:double,tonnetz_1:double,tonnetz_2:double,tonnetz_3:double,tonnetz_4:double,tonnetz_5:double,tonnetz_6:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.sql.execution.joins.UnsafeCartesianRDD.compute(CartesianProductExec.scala:46)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 37 more
