In [1]:
# Impor library yang diperlukan
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr, when, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [2]:
# Membuat SparkSession
spark = SparkSession.builder \
    .appName("PredictiveMaintenanceStreaming") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0") \
    .getOrCreate()

print("SparkSession berhasil dibuat.")

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-51639060-f15a-4f63-92b4-f088e49f15e4;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: reso

SparkSession berhasil dibuat.


In [3]:
# Mendefinisikan Skema
raw_schema = StructType([
    StructField("timestamp", StringType(), True),
    StructField("machine_id", StringType(), True),
    StructField("vibration", DoubleType(), True),
    StructField("acoustic", DoubleType(), True),
    StructField("temperature", DoubleType(), True),
    StructField("current", DoubleType(), True),
    StructField("IMF_1", DoubleType(), True),
    StructField("IMF_2", DoubleType(), True),
    StructField("IMF_3", DoubleType(), True),
    StructField("label", IntegerType(), True)
])

In [4]:
# Membaca dari topik Kafka
raw_stream_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "raw_sensor_data") \
    .option("startingOffsets", "earliest") \
    .load()

In [5]:
raw_stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
# Parsing dan Transformasi Data
parsed_df = raw_stream_df \
    .select(from_json(col("value").cast("string"), raw_schema).alias("data")) \
    .select("data.*")

In [None]:
# Transformasi utama
# transformed_df = parsed_df \
#     .withColumn("event_timestamp", to_timestamp(col("datetime"))) \
#     .withColumn("status",
#         when((col("volt") > 250) | (col("volt") < 190), "Voltage Anomaly")
#         .when(col("vibration") > 70, "High Vibration")
#         .otherwise("Normal")
#     )

In [7]:
transformed_df = parsed_df \
    .withColumn("event_timestamp", to_timestamp(col("timestamp"))) \
    .withColumn("status",
        when(col("label") == 1, "Failure Detected")
        .otherwise("Normal")
    )

In [8]:
# Pilih kolom final
final_df = transformed_df.select(
    "event_timestamp", "machine_id", "vibration", "acoustic", 
    "temperature", "current", "status",
    "IMF_1", "IMF_2", "IMF_3",
    "label"
)

In [9]:
# Memastikan transformasi benar sebelum menulis kembali ke Kafka.
# Menulis ke konsol untuk debugging
console_query = final_df \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

25/07/18 08:55:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1468ca93-c8e4-443d-9fc9-b81c92ccb84e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/18 08:55:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [10]:
# Menulis ke topik Kafka yang sudah bersih
kafka_output_df = final_df.select(expr("to_json(struct(*)) AS value"))
kafka_query = kafka_output_df \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("topic", "clean_sensor_data") \
    .option("checkpointLocation", "/tmp/spark_checkpoints/kafka_sensor_writer") \
    .start()

25/07/18 08:55:12 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/07/18 08:55:12 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
25/07/18 08:55:12 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
25/07/18 08:55:12 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
25/07/18 08:55:12 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
25/07/18 08:55:12 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
25/07/18 08:55:20 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) (172.18.0.8 executor 0): java.lang.NoSuchMethodError: 'boolean org.apache.spark.sql.catalyst.expressions.Cast$.apply$default$4()'
	at org.apache.spark.sql.kafka010.KafkaRowWriter.createPr

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+----------+---------+--------+-----------+-------+----------------+-----+------+-----+-----+
|event_timestamp    |machine_id|vibration|acoustic|temperature|current|status          |IMF_1|IMF_2 |IMF_3|label|
+-------------------+----------+---------+--------+-----------+-------+----------------+-----+------+-----+-----+
|2024-07-01 08:00:00|M01       |0.822    |0.645   |66.85      |13.04  |Normal          |0.196|0.033 |0.0  |0    |
|2024-07-01 08:01:00|M01       |1.398    |0.834   |76.2       |15.08  |Failure Detected|0.345|0.132 |0.001|1    |
|2024-07-01 08:02:00|M01       |0.856    |0.59    |67.03      |12.3   |Normal          |0.187|0.017 |0.002|0    |
|2024-07-01 08:03:00|M01       |0.793    |0.544   |65.04      |11.69  |Normal          |0.196|-0.06 |0.003|0    |
|2024-07-01 08:04:00|M01       |1.279    |0.721   |78.19      |14.84  |Failure Detected|0.33 |-0.115|0.00

In [None]:
# Untuk menghentikan semua query:
console_query.stop()
kafka_query.stop()