In [None]:
# Impor library yang diperlukan
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [None]:
# Build (or reuse) the SparkSession used by every cell below.
# The session is named "MachineFailureStreaming", points at the master URL
# spark://spark-master:7077, and requests the Kafka connector artifact through
# the `spark.jars.packages` setting.
spark = (
    SparkSession.builder
    .appName("MachineFailureStreaming")
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
    )
    .getOrCreate()
)

print("SparkSession berhasil dibuat.")

In [None]:
# Schema of the raw JSON records.
# Every field is declared nullable; the names (including spaces and bracketed
# units) are the JSON keys that from_json will look up.
raw_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("UDI", IntegerType),
        ("Product ID", StringType),
        ("Type", StringType),
        ("Air temperature [K]", DoubleType),
        ("Process temperature [K]", DoubleType),
        ("Rotational speed [rpm]", IntegerType),
        ("Torque [Nm]", DoubleType),
        ("Tool wear [min]", IntegerType),
        ("Machine failure", IntegerType),
        # Individual failure flags, collapsed later into 'failure_type'.
        ("TWF", IntegerType),
        ("HDF", IntegerType),
        ("PWF", IntegerType),
        ("OSF", IntegerType),
        ("RNF", IntegerType),
    ]
])

In [None]:
# Read the data stream from the Kafka topic 'raw_sensor_data'.
# readStream yields a streaming DataFrame backed by the topic; with
# startingOffsets set to "earliest" the read begins at the oldest available offsets.
raw_stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "raw_sensor_data")
    .option("startingOffsets", "earliest")
    .load()
)

In [None]:
# Print the schema of the raw Kafka stream to confirm which columns are
# available (notably 'value', which is parsed in the next step).
raw_stream_df.printSchema()

In [None]:
# Parse and flatten the Kafka payload.
# 1. Cast the Kafka 'value' column (which holds the JSON payload) to STRING.
# 2. Parse that string into a struct with from_json, using raw_schema.
# 3. Flatten the struct so that every JSON key becomes its own column.
# 4. Drop rows in which every parsed column is null. Such rows come from payloads
#    that could not be parsed or that carried none of the expected keys; if kept,
#    they would flow downstream, be labelled "No Failure", and be published to the
#    clean topic as if they were real readings.
parsed_df = (
    raw_stream_df
    .select(from_json(col("value").cast("string"), raw_schema).alias("data"))
    .select("data.*")
    .dropna(how="all")
)

In [None]:
# Transform the parsed records:
#   - Rename columns to snake_case names without spaces or special characters.
#   - Convert both temperatures from Kelvin to Celsius (subtract 273.15).
#   - Collapse the five failure flags into one descriptive 'failure_type' column.
#     Flags are checked in the order TWF, HDF, PWF, OSF, RNF and the first one
#     equal to 1 wins; rows where none equals 1 are labelled "No Failure".
#     The 'machine_failure' column is not consulted when building the label.
transformed_df = (
    parsed_df
    .withColumnRenamed("Product ID", "product_id")
    .withColumnRenamed("Air temperature [K]", "air_temperature_k")
    .withColumnRenamed("Process temperature [K]", "process_temperature_k")
    .withColumnRenamed("Rotational speed [rpm]", "rotational_speed_rpm")
    .withColumnRenamed("Torque [Nm]", "torque_nm")
    .withColumnRenamed("Tool wear [min]", "tool_wear_min")
    .withColumnRenamed("Machine failure", "machine_failure")
    .withColumn("air_temperature_celsius", col("air_temperature_k") - 273.15)
    .withColumn("process_temperature_celsius", col("process_temperature_k") - 273.15)
    .withColumn(
        "failure_type",
        when(col("TWF") == 1, "Tool Wear Failure")
        .when(col("HDF") == 1, "Heat Dissipation Failure")
        .when(col("PWF") == 1, "Power Failure")
        .when(col("OSF") == 1, "Overstrain Failure")
        .when(col("RNF") == 1, "Random Failure")
        .otherwise("No Failure"),
    )
)

In [None]:
# Keep only the columns that will be published downstream
# (the Kelvin temperatures and the individual failure flags are left out).
final_df = transformed_df.select(
    "UDI",
    "product_id",
    "Type",
    "air_temperature_celsius",
    "process_temperature_celsius",
    "rotational_speed_rpm",
    "torque_nm",
    "tool_wear_min",
    "machine_failure",
    "failure_type",
)

# Show the resulting schema as a quick sanity check.
final_df.printSchema()

In [None]:
# Debug sink: stream the transformed rows to the console so the transformation
# can be checked before the data is written back to Kafka.
# truncate=false keeps long cell values fully visible.
console_query = (
    final_df.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [None]:
# Prepare the cleaned stream for the Kafka topic 'clean_sensor_data':
# pack all columns of each row into a struct and serialize it as one JSON string
# in a single column named 'value'.
kafka_output_df = final_df.selectExpr("to_json(struct(*)) AS value")

In [None]:
# Start the streaming query that writes the JSON payloads to the new Kafka topic.
# NOTE(review): the checkpoint directory is under /tmp; confirm that this location
# survives restarts if the query is expected to resume from where it stopped.
kafka_query = (
    kafka_output_df.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "clean_sensor_data")
    .option("checkpointLocation", "/tmp/spark_checkpoints/kafka_writer")
    .start()
)

print("Streaming data yang sudah bersih ke topik 'clean_sensor_data' telah dimulai.")

In [None]:
# Untuk menghentikan semua query:
# console_query.stop()
# kafka_query.stop()