# PySpark ETL: Pipeline & Logistics Data Integration

## Objective
Load, join, and cleanse data from PostgreSQL (Inspection Logs) and MongoDB (Asset Telemetry).
In this notebook both sources are mocked with small in-memory DataFrames.
Aggregate sensor data to match inspection windows and persist the result as Parquet.

In [1]:
import os
import sys
# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel (sys.executable).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Set HADOOP_HOME to the local hadoop directory. The relative path is resolved
# against the current working directory at the time this cell runs.
os.environ['HADOOP_HOME'] = os.path.abspath('../hadoop')

# Expose HADOOP_HOME/bin on PATH. The entry is appended only when it is not
# already present, so re-running this cell does not grow PATH on every run,
# and an unset PATH no longer raises KeyError.
hadoop_bin = os.path.join(os.environ['HADOOP_HOME'], 'bin')
current_path = os.environ.get('PATH', '')
if hadoop_bin not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        current_path + os.pathsep + hadoop_bin if current_path else hadoop_bin
    )

# PySpark imports. NOTE: `max` and `min` imported below shadow the Python built-ins of the same name for the rest of the notebook.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, avg, max, min

# Create the Spark session used by every later cell (an already-running
# session is reused by getOrCreate()).
# Note: in a real environment the Postgres and MongoDB connector JARs must be
# added to the builder, for example:
# .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.postgresql:postgresql:42.2.18")
session_builder = SparkSession.builder.appName("PTT_Pipeline_Logistics_ETL")
spark = session_builder.getOrCreate()

print("Spark Session Created")

Spark Session Created


In [2]:
# 1. Inspection logs from PostgreSQL (mocked here because no live DB is available)
# In production: spark.read.format("jdbc").option("url", "jdbc:postgresql://localhost:5432/ptt_db")...

# Column names, in the same order as the fields of each record below.
columns = [
    "inspection_id",
    "pipeline_segment_id",
    "inspection_date",
    "inspector_id",
    "crack_detected",
    "corrosion_level",
    "maintenance_required",
]

# One tuple per inspection; inspection_date is kept as a plain string.
inspection_data = [
    (1, "SEG-101", "2023-10-01", "INS-01", False, 0.0, False),
    (2, "SEG-102", "2023-10-02", "INS-02", True, 15.5, True),
    (3, "SEG-103", "2023-10-03", "INS-01", False, 2.1, False),
]

df_inspections = spark.createDataFrame(inspection_data, schema=columns)
df_inspections.show()

+-------------+-------------------+---------------+------------+--------------+---------------+--------------------+
|inspection_id|pipeline_segment_id|inspection_date|inspector_id|crack_detected|corrosion_level|maintenance_required|
+-------------+-------------------+---------------+------------+--------------+---------------+--------------------+
|            1|            SEG-101|     2023-10-01|      INS-01|         false|            0.0|               false|
|            2|            SEG-102|     2023-10-02|      INS-02|          true|           15.5|                true|
|            3|            SEG-103|     2023-10-03|      INS-01|         false|            2.1|               false|
+-------------+-------------------+---------------+------------+--------------+---------------+--------------------+



In [3]:
# 2. Asset telemetry from MongoDB (mocked)
# In production: spark.read.format("mongo").option("uri", "mongodb://localhost:27017/ptt_logistics_db.asset_telemetry").load()

# Column names, in the same order as the fields of each reading below.
telemetry_cols = [
    "pipeline_segment_id",
    "timestamp",
    "pressure_psi",
    "temperature_c",
    "vibration_level",
]

# One tuple per sensor reading; timestamps are supplied as strings.
telemetry_data = [
    ("SEG-101", "2023-10-01 10:00:00", 1000.0, 30.0, 0.5),
    ("SEG-101", "2023-10-01 11:00:00", 1050.0, 32.0, 0.6),
    ("SEG-102", "2023-10-02 09:00:00", 900.0, 28.0, 2.5),
    ("SEG-102", "2023-10-02 10:00:00", 850.0, 29.0, 3.0),
]

# Build the frame and convert the string column into a real timestamp column.
df_telemetry = (
    spark.createDataFrame(telemetry_data, schema=telemetry_cols)
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
)
df_telemetry.show()

+-------------------+-------------------+------------+-------------+---------------+
|pipeline_segment_id|          timestamp|pressure_psi|temperature_c|vibration_level|
+-------------------+-------------------+------------+-------------+---------------+
|            SEG-101|2023-10-01 10:00:00|      1000.0|         30.0|            0.5|
|            SEG-101|2023-10-01 11:00:00|      1050.0|         32.0|            0.6|
|            SEG-102|2023-10-02 09:00:00|       900.0|         28.0|            2.5|
|            SEG-102|2023-10-02 10:00:00|       850.0|         29.0|            3.0|
+-------------------+-------------------+------------+-------------+---------------+



In [4]:
# 3. Aggregation & Join
# Aggregate telemetry per segment AND calendar day, so each inspection is only
# matched with readings taken on its own inspection date. Grouping by segment
# alone would blend the readings of every day into one row and attach that
# same row to every inspection of the segment.

df_telemetry_agg = (
    df_telemetry
    # Calendar day of each reading; used as the second grouping/join key.
    .withColumn("telemetry_date", col("timestamp").cast("date"))
    .groupBy("pipeline_segment_id", "telemetry_date")
    .agg(
        avg("pressure_psi").alias("avg_pressure"),
        avg("temperature_c").alias("avg_temp"),
        # `max` here is pyspark.sql.functions.max (imported above), not the built-in.
        max("vibration_level").alias("max_vibration"),
    )
)

# Join with Inspections
# A temporary date-typed key is derived from inspection_date for the join and
# dropped afterwards, so df_final keeps the same columns, in the same order,
# as a plain join on pipeline_segment_id would produce.
# The left join keeps inspections that have no telemetry for their date; their
# aggregate columns are NULL.
df_final = (
    df_inspections
    .withColumn("telemetry_date", col("inspection_date").cast("date"))
    .join(df_telemetry_agg, ["pipeline_segment_id", "telemetry_date"], "left")
    .drop("telemetry_date")
)
df_final.show()

+-------------------+-------------+---------------+------------+--------------+---------------+--------------------+------------+--------+-------------+
|pipeline_segment_id|inspection_id|inspection_date|inspector_id|crack_detected|corrosion_level|maintenance_required|avg_pressure|avg_temp|max_vibration|
+-------------------+-------------+---------------+------------+--------------+---------------+--------------------+------------+--------+-------------+
|            SEG-101|            1|     2023-10-01|      INS-01|         false|            0.0|               false|      1025.0|    31.0|          0.6|
|            SEG-102|            2|     2023-10-02|      INS-02|          true|           15.5|                true|       875.0|    28.5|          3.0|
|            SEG-103|            3|     2023-10-03|      INS-01|         false|            2.1|               false|        NULL|    NULL|         NULL|
+-------------------+-------------+---------------+------------+--------------+---------------+--------------------+------------+--------+-------------+

In [5]:
# 4. Persist to Parquet
# "overwrite" mode replaces any output already present at output_path.
output_path = "../data/processed_pipeline_data.parquet"
parquet_writer = df_final.write.mode("overwrite")
parquet_writer.parquet(output_path)
print(f"Data saved to {output_path}")

Data saved to ../data/processed_pipeline_data.parquet
