In [8]:
from pyspark.sql.types import ArrayType, FloatType, IntegerType, StringType, StructType, StructField
from pyspark.ml.feature import SQLTransformer
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, concat, udf, col
from tensorflow.keras.models import load_model
from pyspark.ml.linalg import Vectors, VectorUDT
import numpy as np
from PIL import Image
import os
from pyspark.ml import PipelineModel

In [9]:
# Build (or reuse) the Spark session used by every cell below. All settings
# are passed as strings exactly as Spark expects them.
spark: SparkSession = (
    SparkSession.builder
    .appName("Traffic Signs Classification Streaming")
    .master("local[*]")
    .config("spark.rapids.sql.enabled", "true")
    .config("spark.executor.memory", "12g")
    .config("spark.driver.memory", "12g")
    .config("spark.python.worker.memory", "12g")
    .config("spark.executor.pyspark.memory", "12g")
    .config("spark.rpc.message.maxSize", "128")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.sql.streaming.checkpointLocation", "./tmp")
    .getOrCreate()
)

# Root folder of the dataset; the streamed CSV files and the image paths
# they reference are resolved relative to it.
dataset_path = "./dataset"

# Explicit schema for the streamed CSV files: every column is a nullable
# integer except the trailing, nullable string column "Path".
integer_columns = ["Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]
schema = StructType(
    [StructField(column, IntegerType(), True) for column in integer_columns]
    + [StructField("Path", StringType(), True)]
)

# Read the CSV files under <dataset_path>/streaming as a stream (header row
# present, explicit schema).
streaming_dir = os.path.join(dataset_path, "streaming")
raw_stream = spark.readStream.option("header", "true").schema(schema).csv(streaming_dir)

# Prefix every image path with the dataset root and drop rows without a label.
full_path = concat(lit(dataset_path + "/"), raw_stream["Path"])
input_stream = raw_stream.withColumn("Path", full_path).dropna(subset=["ClassId"])


In [10]:
def process_image(img_path, roi_x1, roi_y1, roi_x2, roi_y2):
    """Load an image, crop it to its region of interest and return scaled pixels.

    The crop is converted to RGB, resized to 32x32 with Lanczos resampling,
    flattened and divided by 255.0.

    Args:
        img_path: Path of the image file to open.
        roi_x1: Left edge of the crop box.
        roi_y1: Upper edge of the crop box.
        roi_x2: Right edge of the crop box.
        roi_y2: Lower edge of the crop box.

    Returns:
        A list of 32 * 32 * 3 floats. If anything goes wrong (missing file,
        invalid crop box, ...) the error is printed and a list of zeros of
        the same length is returned instead, so one bad row does not abort
        the whole job.
    """
    side = 32
    channels = 3
    try:
        # The context manager closes the underlying file handle; without it
        # every processed image leaks a descriptor in the Python worker.
        with Image.open(img_path) as img:
            cropped_img = img.crop((roi_x1, roi_y1, roi_x2, roi_y2))
            # Force three channels so grayscale / palette / RGBA files yield
            # the same feature length as the zero-filled fallback below.
            rgb_img = cropped_img.convert("RGB")
            resized_img = np.array(
                rgb_img.resize((side, side), resample=Image.Resampling.LANCZOS)
            )
        return (resized_img.flatten() / 255.0).tolist()
    except Exception as e:
        # Deliberate best-effort: report the problem and emit a neutral vector.
        print(f"Error processing image {img_path}: {e}")
        return [0.0] * (side * side * channels)

def image_to_vector(img_features):
    """Wrap a sequence of numeric image features in a dense ML vector.

    Args:
        img_features: Values accepted by ``Vectors.dense``.

    Returns:
        The result of ``Vectors.dense(img_features)``.
    """
    dense_vector = Vectors.dense(img_features)
    return dense_vector

# Expose both helpers to Spark SQL under their own names, declaring the
# return type of each so Spark knows the resulting column type.
spark.udf.register(name="process_image", f=process_image, returnType=ArrayType(FloatType()))
spark.udf.register(name="image_to_vector", f=image_to_vector, returnType=VectorUDT())

24/12/24 02:31:05 WARN SimpleFunctionRegistry: The function process_image replaced a previously registered function.
24/12/24 02:31:05 WARN SimpleFunctionRegistry: The function image_to_vector replaced a previously registered function.


<function __main__.image_to_vector(img_features)>

In [11]:
# Load the two previously saved pipeline models from their directories.
lrModel, rfModel = map(PipelineModel.load, ("best-model-lr", "best-model-rf"))

In [12]:
# Apply the "best-model-lr" pipeline to the stream.
lrPredictionsStream = lrModel.transform(input_stream)

# Write path, true label and prediction as CSV under output/lr, then block
# until the one-shot query has finished.
# NOTE(review): no queryName or per-query checkpointLocation is set here, so
# the checkpoint directory comes from the session-wide
# "spark.sql.streaming.checkpointLocation" setting — confirm this gives the
# intended restart behavior.
outputStream = (
    lrPredictionsStream
    .select("Path", "ClassId", "prediction")
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("path", "output/lr")
    .trigger(once=True)
    .start()
)
outputStream.awaitTermination()

In [13]:
# Apply the "best-model-rf" pipeline to the stream.
rfPredictionsStream = rfModel.transform(input_stream)

# Write path, true label and prediction as CSV under output/rf, then block
# until the one-shot query has finished.
# NOTE(review): no queryName or per-query checkpointLocation is set here, so
# the checkpoint directory comes from the session-wide
# "spark.sql.streaming.checkpointLocation" setting — confirm this gives the
# intended restart behavior.
outputStream = (
    rfPredictionsStream
    .select("Path", "ClassId", "prediction")
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("path", "output/rf")
    .trigger(once=True)
    .start()
)
outputStream.awaitTermination()

2024-12-24 02:45:53.654781: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-24 02:45:53.843070: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-24 02:45:53.843124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-24 02:45:53.871762: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-24 02:45:53.932263: I tensorflow/core/platform/cpu_feature_guar

TypeError: Error when deserializing class 'InputLayer' using config={'batch_shape': [None, 32, 32, 3], 'dtype': 'float32', 'sparse': False, 'name': 'input_layer'}.

Exception encountered: Unrecognized keyword arguments: ['batch_shape']