In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import logging

In [2]:
# Konfigurasi logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
log = logging.getLogger(__name__)

In [3]:
# SEL 2: Membuat SparkSession
spark = SparkSession.builder \
    .appName("AnomalyModelTraining") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.extraClassPath", "/opt/bitnami/spark/jars/postgresql-42.6.0.jar") \
    .config("spark.executor.extraClassPath", "/opt/bitnami/spark/jars/postgresql-42.6.0.jar") \
    .getOrCreate()

log.info("SparkSession berhasil dibuat dengan driver JDBC yang ditentukan.")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/07/15 13:36:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


2025-07-15 13:36:12,114 - INFO - SparkSession berhasil dibuat.


In [4]:
# SEL 3: Membaca Data Latih dari PostgreSQL
# Membaca data 'Normal' (label=0) langsung sebagai DataFrame Spark.
log.info("Membaca data dari PostgreSQL...")
try:
    df_normal = spark.read \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://postgres:5432/machine_db") \
        .option("dbtable", "sensor_readings") \
        .option("user", "user") \
        .option("password", "password") \
        .load() \
        .filter(col("label") == 0)
    
    log.info(f"Berhasil membaca {df_normal.count()} baris data 'Normal'.")
except Exception as e:
    log.error(f"Gagal membaca data dari database: {e}", exc_info=True)
    df_normal = None

2025-07-15 13:36:42,082 - INFO - Membaca data dari PostgreSQL...
2025-07-15 13:36:45,222 - ERROR - Gagal membaca data dari database: An error occurred while calling o31.load.
: java.sql.SQLException: No suitable driver
	at java.sql/java.sql.DriverManager.getDriver(DriverManager.java:298)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$2(JDBCOptions.scala:107)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:107)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:39)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:34)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.

In [8]:
# SEL 4: Persiapan Fitur dengan VectorAssembler
# Spark MLlib memerlukan semua fitur input berada dalam satu kolom vektor.

if df_normal:
    feature_columns = ['vibration', 'acoustic', 'temperature', 'current', 'imf_1', 'imf_2', 'imf_3']
    
    # VectorAssembler menggabungkan beberapa kolom menjadi satu kolom vektor
    assembler = VectorAssembler(
        inputCols=feature_columns,
        outputCol="features"
    )
    
    # Terapkan assembler ke data kita
    training_data = assembler.transform(df_normal).select("features")
    
    log.info("Data telah diubah menjadi format vektor fitur.")
    training_data.show(5, truncate=False)

In [11]:
# SEL 5: Latih Model LocalOutlierFactor dari Spark MLlib
if 'training_data' in locals():
    log.info("Memulai pelatihan model K-Means Spark MLlib...")
    
    # --- PERBAIKAN: Inisialisasi model K-Means ---
    # Kita akan mengelompokkan data normal ke dalam 5 cluster.
    kmeans = KMeans(featuresCol="features", k=5, seed=1)
    
    # Latih model
    model_kmeans = kmeans.fit(training_data)
    
    log.info("Pelatihan model K-Means selesai.")
    # Kita juga bisa menghitung ambang batas anomali di sini,
    # tapi untuk sekarang kita simpan modelnya dulu.

In [None]:
# SEL 6: Simpan Model Spark MLlib
# Model Spark disimpan sebagai sebuah direktori, bukan satu file .pkl

if 'model_anomaly_spark' in locals():
    model_path = '/home/jovyan/work/spark_anomaly_model'
    try:
        # Hapus model lama jika ada
        model_anomaly_spark.write().overwrite().save(model_path)
        log.info(f"Model Spark MLlib berhasil disimpan ke direktori: {model_path}")
    except Exception as e:
        log.error(f"Gagal menyimpan model: {e}", exc_info=True)
