In [1]:
# SEL 1: Impor Library
import logging
import os

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Notebook-wide logger. The root logger is configured to emit INFO and above,
# formatted as "<timestamp> - <level> - <message>".
log = logging.getLogger(__name__)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# CELL 2: Create the SparkSession.
# This attaches to a Spark standalone cluster that is ALREADY RUNNING at the
# master URL below; getOrCreate() reuses an existing session if there is one.
# NOTE(review): the original heading mentioned a "correct driver configuration",
# but no driver-related option is set on the builder here -- confirm whether
# one is missing.
spark = (
    SparkSession.builder
    .appName("AnomalyModelTraining")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

log.info("SparkSession berhasil dibuat.")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/18 05:42:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/18 05:42:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2025-07-18 05:42:22,450 - INFO - SparkSession berhasil dibuat.
25/07/18 06:03:16 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/07/18 06:03:19 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBacken

In [None]:
# CELL 3: Read the data (only rows with label == 0, i.e. "Normal") from the
# already-running PostgreSQL database.
#
# Connection settings are taken from environment variables so that real
# credentials do not have to be committed with the notebook. The defaults are
# the values that were previously hard-coded, so behaviour is unchanged when
# the variables are not set.
jdbc_url = os.environ.get("SENSOR_DB_URL", "jdbc:postgresql://postgres:5432/machine_db")
jdbc_user = os.environ.get("SENSOR_DB_USER", "user")
jdbc_password = os.environ.get("SENSOR_DB_PASSWORD", "password")

log.info("Membaca data 'Normal' dari PostgreSQL...")
try:
    df_normal = (
        spark.read
        .format("jdbc")
        .option("url", jdbc_url)
        # Name the driver class explicitly so Spark loads it itself instead of
        # relying on JDBC DriverManager auto-discovery.
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", "sensor_readings")
        .option("user", jdbc_user)
        .option("password", jdbc_password)
        .load()
        .filter(col("label") == 0)
    )
    # count() is an action: it forces the lazy JDBC read to execute, so
    # connection/query failures surface inside this try block.
    log.info(f"Berhasil membaca {df_normal.count()} baris data 'Normal'.")
except Exception:
    # Deliberate best-effort: log the full traceback and mark the data as
    # unavailable; the feature-preparation cell checks df_normal before use.
    log.error("Gagal membaca data dari database.", exc_info=True)
    df_normal = None

In [None]:
# CELL 4: Feature preparation.
# The data-loading cell sets df_normal to None when the read fails, so test
# for None explicitly instead of relying on the truthiness of a DataFrame
# object.
if df_normal is not None:
    feature_columns = ['vibration', 'acoustic', 'temperature', 'current', 'imf_1', 'imf_2', 'imf_3']
    # Combine the individual sensor columns into a single vector column,
    # which is the input format the K-Means estimator is configured to read.
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    training_data = assembler.transform(df_normal).select("features")
    log.info("Fitur telah digabungkan.")
else:
    # Make the skipped step visible in the log instead of passing silently.
    log.warning("Persiapan fitur dilewati: data 'Normal' tidak tersedia.")

In [None]:
# CELL 5: Train the K-Means model.
# Runs only when the name `training_data` exists in the notebook namespace.
if "training_data" in locals():
    log.info("Memulai pelatihan model K-Means...")
    # Five clusters, fixed seed for repeatable initialisation.
    kmeans = KMeans(
        k=5,
        seed=1,
        featuresCol="features",
    )
    model_kmeans = kmeans.fit(training_data)
    log.info("Pelatihan K-Means selesai.")

In [None]:
# CELL 6: Save the model.
# Runs only when the name `model_kmeans` exists in the notebook namespace.
# NOTE(review): model_path is a plain local filesystem path; when running
# against a remote cluster, confirm this location is shared storage.
if "model_kmeans" in locals():
    model_path = '/home/jovyan/work/spark_kmeans_model'
    try:
        # overwrite() replaces any model already stored at model_path.
        model_writer = model_kmeans.write().overwrite()
        model_writer.save(model_path)
    except Exception as save_error:
        log.error(f"Gagal menyimpan model: {save_error}", exc_info=True)
    else:
        log.info(f"Model K-Means berhasil disimpan ke direktori: {model_path}")

In [None]:
# CELL 7: Stop the SparkSession (IMPORTANT!)
# The session is stopped so that it does not interfere with other streaming jobs.
spark.stop()
log.info("SparkSession untuk pelatihan telah dihentikan.")