# Inference 2 Consumer 1 as geting the data from producer 1 and extract features, then saved on HDFS, as a producer 2

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "./spark-3.5.3-bin-hadoop3"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

if 'spark_inf' in locals():
    spark_inf.stop()

# Initialize Spark session with Kafka support
spark_inf = SparkSession.builder \
    .appName("RealTimeModelInference") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.7.1") \
    .getOrCreate()


:: loading settings :: url = jar:file:/root/music/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cab1483e-3bd4-4e14-8c1a-21f53f66e3de;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in 

In [4]:

df_kafka = spark_inf.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "audio_files") \
    .load()

# Convert binary Kafka value to string to get the file path
audio_files_df = df_kafka.selectExpr("CAST(value AS BINARY) as content")
##
#audio_files_df.writeStream.format("console").start().awaitTermination()
df_binary_with_name = df_kafka.selectExpr("CAST(key AS STRING) as file_name", "CAST(value AS BINARY) as content")

In [5]:
# from ipynb.fs.full.Main import extract_features_from_content

import numpy as np
import librosa
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, FloatType
# Feature extraction function
def extract_features_from_content(content):

    if len(content) % 4 == 0:
        dtype = np.float32  # Assuming float32 corresponds to 4 bytes
    elif len(content) % 2 == 0:
        dtype = np.int16   # Assuming int16 corresponds to 2 bytes
    else:
        print("Content length is not compatible with int16 or float32.")
        print(f"Content length: {len(content)}")
        #return [] 
    
    audio_data = np.frombuffer(content, dtype=dtype)
    
    # Normalize the data for librosa processing
    y = librosa.util.buf_to_float(audio_data, dtype=np.float32)
    sr = 22050  # Assign a fixed sample rate

    
    # Extract features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfcc, axis=1)
    mfcc_std = np.std(mfcc, axis=1)
    
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)
    chroma_std = np.std(chroma, axis=1)
    
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_centroid_mean = np.mean(spectral_centroid)
    spectral_centroid_std = np.std(spectral_centroid)
    
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    spectral_contrast_mean = np.mean(spectral_contrast, axis=1)
    spectral_contrast_std = np.std(spectral_contrast, axis=1)
    
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    zero_crossing_rate_mean = np.mean(zero_crossing_rate)
    zero_crossing_rate_std = np.std(zero_crossing_rate)
    
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    spectral_rolloff_mean = np.mean(spectral_rolloff)
    spectral_rolloff_std = np.std(spectral_rolloff)
    
    # Combine all features into one array
    features = np.hstack([
        mfcc_mean, mfcc_std,
        chroma_mean, chroma_std,
        [spectral_centroid_mean, spectral_centroid_std],
        spectral_contrast_mean, spectral_contrast_std,
        [zero_crossing_rate_mean, zero_crossing_rate_std],
        [spectral_rolloff_mean, spectral_rolloff_std]
    ])
    
    return features.tolist()

# Define UDF to apply the feature extraction function to each row
@pandas_udf(ArrayType(FloatType()))
def extract_audio_features(content):
    return content.apply(extract_features_from_content)

In [None]:
### validate the empty and process data , save as parquet on HDFS
from pyspark.sql.functions import lit, size

# Define the process_audio_batch function
hdfs_path = "hdfs://localhost:9000/user/hadoop/Music/Inference"

def process_audio_batch(df, epoch_id):

    # Extract features using the UDF
    features_df = df.withColumn("audio_features", extract_audio_features("content"))

    # Filter to only keep non-empty audio_features before writing
    valid_features_df = features_df.filter(features_df["audio_features"].isNotNull() & (size(features_df["audio_features"]) > 0))

    if valid_features_df.count() > 0:
        # Mark records as processed and write to HDFS
        valid_features_df = valid_features_df.withColumn("processed", lit(True))
        valid_features_df.write.mode("append").parquet(hdfs_path)
        print(f"Successfully wrote valid features to {hdfs_path}")
    else:
        print("No valid features to write for this batch.")

    # Optionally, log and show any empty audio features for debugging
    empty_features = features_df.filter(features_df["audio_features"].isNull() | (size(features_df["audio_features"]) == 0))
    if empty_features.count() > 0:
        print("Warning: Found empty audio_features in the following records:")
        empty_features.show(truncate=False)

# Set up the query to process each batch and run inference
query = audio_files_df.writeStream \
    .foreachBatch(process_audio_batch) \
    .start()

query.awaitTermination()


24/10/28 01:41:05 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-03d4a562-0f66-405c-9969-7d4fae903540. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/10/28 01:41:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/10/28 01:41:06 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/10/28 01:41:06 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/10/28 01:41:06 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/10/28 01:41:06 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

No valid features to write for this batch.


24/10/28 01:42:10 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:42:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
                                                                                

Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:42:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:46:29 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:46:32 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:46:36 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:51:39 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:51:43 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:51:47 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:53:55 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:53:58 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:54:02 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:57:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:57:29 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:57:32 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:57:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:57:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:57:36 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:57:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:57:57 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:57:58 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:57:59 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:00 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:01 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:58:02 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:02 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:05 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:05 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 01:58:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 01:58:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:05:38 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:05:42 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:05:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:06:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:06:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:06:17 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:06:49 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:06:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:06:52 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:07:30 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:07:31 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:07:32 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:08:05 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:08:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:08:08 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:09:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:09:49 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:09:53 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:13:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:13:20 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:13:23 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:13:31 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:13:32 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:13:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:13:57 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:13:58 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:13:59 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:14:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:14:08 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:14:10 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


24/10/28 02:14:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:14:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/10/28 02:14:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


Successfully wrote valid features to hdfs://localhost:9000/user/hadoop/Music/Inference


                                                                                

In [6]:
spark_inf.stop()