In [1]:
import os
import sys
import time
import json
import csv
import glob
import logging
import threading
import psutil
import boto3
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO
from datetime import datetime
from pathlib import Path

# PySpark & Boto3 Config
from botocore import UNSIGNED
from botocore.config import Config as BotoConfig
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

# --- WINDOWS ENVIRONMENT CONFIGURATION ---
# Set the tool locations explicitly to avoid WinError 2 (executable not found).
JAVA_HOME_RAW = r"C:\ProgramData\jdk-11.0.29.7-hotspot"
HADOOP_HOME_RAW = r"C:\hadoop"

os.environ['JAVA_HOME'] = JAVA_HOME_RAW
os.environ['HADOOP_HOME'] = HADOOP_HOME_RAW

# Make Spark start its Python workers with the interpreter that runs this kernel.
# Without PYSPARK_PYTHON, PySpark falls back to a generic "python3" lookup, which on
# Windows frequently resolves to nothing usable; the worker process then never
# connects back and the job dies with "Python worker failed to connect back".
if sys.executable:
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


def _prepend_to_path(directory):
    """Put `directory` at the front of PATH unless it already is a PATH entry.

    Entries are compared one by one (normalised for case and separators) instead of
    by substring, and the platform separator is used, so re-running this cell does
    not keep growing PATH.
    """
    entries = [entry for entry in os.environ.get('PATH', '').split(os.pathsep) if entry]
    known = {os.path.normcase(os.path.normpath(entry)) for entry in entries}
    if os.path.normcase(os.path.normpath(directory)) not in known:
        os.environ['PATH'] = os.pathsep.join([directory] + entries)


# Build the directories that hold the Java / Hadoop binaries
java_bin = os.path.join(JAVA_HOME_RAW, 'bin')
hadoop_bin = os.path.join(HADOOP_HOME_RAW, 'bin')

# Extend PATH (resulting order: hadoop_bin, java_bin, previous PATH)
_prepend_to_path(java_bin)
_prepend_to_path(hadoop_bin)

# Locate SPARK_HOME automatically from the installed pyspark package
import pyspark
spark_home_candidate = os.path.dirname(pyspark.__file__)
spark_bin = os.path.join(spark_home_candidate, "bin")
if os.path.exists(spark_bin):
    os.environ['SPARK_HOME'] = spark_home_candidate
    _prepend_to_path(spark_bin)

# Logging init
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Pipeline")

print("‚úÖ Umgebungsvariablen gesetzt. Imports vollst√§ndig.")

✅ Umgebungsvariablen gesetzt. Imports vollständig.


In [2]:
class PipelineConfig:
    """Class-level settings shared by every pipeline step (never instantiated).

    All output folders are derived from the working directory at the time this
    class body is executed.
    """

    # --- DATA VOLUME ---
    # Number of flight files loaded by a default run.
    FLIGHT_DATA_LIMIT = 25

    # --- INPUT: OpenSky S3 ---
    S3_ENDPOINT = "https://s3.opensky-network.org"
    S3_BUCKET = "data-samples"
    AIRCRAFT_CSV_NAME = "aircraft-database-complete-2025-08.csv"

    # --- OUTPUT: folder layout ---
    BASE_OUTPUT_DIR = os.path.join(os.getcwd(), "project_data_output")
    RAW_DIR = os.path.join(BASE_OUTPUT_DIR, "raw")
    PROCESSED_DIR = os.path.join(BASE_OUTPUT_DIR, "processed")
    METRICS_DIR = os.path.join(BASE_OUTPUT_DIR, "metrics")

    # --- OUTPUT: file names ---
    AIRCRAFT_PARQUET_NAME = "aircraft_database.parquet"
    FLIGHT_DATA_OUTPUT_NAME = "flight_data_batch.parquet"
    METRICS_FILE = os.path.join(BASE_OUTPUT_DIR, "metrics", "pipeline_metrics.json")
    # Relative path: resolved against the working directory (the notebook folder)
    # so the history file is easy to find for analysis.
    BENCHMARK_CSV = "benchmark_history.csv"

def init_directories():
    """Create the raw, processed and metrics output folders (no-op if present)."""
    required_dirs = (
        PipelineConfig.RAW_DIR,
        PipelineConfig.PROCESSED_DIR,
        PipelineConfig.METRICS_DIR,
    )
    for directory in required_dirs:
        # exist_ok keeps the cell safe to re-run
        os.makedirs(directory, exist_ok=True)
    print(f"‚úÖ Verzeichnisse initialisiert unter: {PipelineConfig.BASE_OUTPUT_DIR}")


init_directories()

✅ Verzeichnisse initialisiert unter: C:\Users\valim\Master\AdvancedDataEngineering\FlyBigData_v2\notebooks\Use Cases\project_data_output


In [3]:
# --- A. PERFORMANCE MONITOR ---
class PerformanceMonitor:
    """Samples system-wide CPU, RAM and network counters on a background thread.

    Usage: ``start()`` before the measured work, ``stop()`` afterwards, then
    ``get_stats()``. The counters come from psutil and cover the whole machine,
    not just this process.

    Args:
        interval: Seconds between two samples (default 1.0).
    """

    def __init__(self, interval=1.0):
        self.interval = interval
        self.running = False
        self.cpu_usage = []   # CPU utilisation in percent, one entry per sample
        self.ram_usage = []   # RAM utilisation in percent, one entry per sample
        self.net_sent = []    # MiB sent since start(), one entry per sample
        self.net_recv = []    # MiB received since start(), one entry per sample
        self.start_time = 0
        self._end_time = None
        self._thread = None
        self._stop_event = threading.Event()

    def _monitor_loop(self):
        """Thread body: append one sample per interval until stop is requested."""
        baseline = psutil.net_io_counters()
        bytes_per_mb = 1024 * 1024

        # `running` is still honoured so that code flipping the flag directly keeps working.
        while self.running and not self._stop_event.is_set():
            self.cpu_usage.append(psutil.cpu_percent(interval=None))
            self.ram_usage.append(psutil.virtual_memory().percent)

            net_now = psutil.net_io_counters()
            self.net_sent.append((net_now.bytes_sent - baseline.bytes_sent) / bytes_per_mb)
            self.net_recv.append((net_now.bytes_recv - baseline.bytes_recv) / bytes_per_mb)

            # Interruptible sleep: returns immediately once stop() sets the event.
            self._stop_event.wait(self.interval)

    def start(self):
        """Reset all samples and start sampling; does nothing if already sampling."""
        if self._thread is not None and self._thread.is_alive():
            return

        self.cpu_usage, self.ram_usage = [], []
        self.net_sent, self.net_recv = [], []
        self._end_time = None
        self._stop_event.clear()
        self.running = True
        self.start_time = time.time()
        # Priming call: with interval=None psutil reports usage since the previous call.
        psutil.cpu_percent(interval=None)
        # Daemon thread: a forgotten stop() must not keep the interpreter alive.
        self._thread = threading.Thread(
            target=self._monitor_loop, name="PerformanceMonitor", daemon=True
        )
        self._thread.start()

    def stop(self):
        """Stop sampling, wait for the thread and freeze the measured duration."""
        self.running = False
        self._stop_event.set()
        if self._thread:
            self._thread.join()
            self._thread = None
            if self._end_time is None:
                self._end_time = time.time()

    def get_stats(self):
        """Return aggregated measurements, or ``{}`` if nothing was sampled.

        Returns:
            dict with ``duration`` (seconds from start() to stop(), or to now while
            still sampling), ``avg_cpu`` and ``avg_ram`` (mean percent) and
            ``total_net_mb`` (MiB received as of the last sample).
        """
        if not self.cpu_usage:
            return {}
        end_time = self._end_time if self._end_time is not None else time.time()
        return {
            "duration": end_time - self.start_time,
            # Plain floats keep the result friendly to json/csv writers.
            "avg_cpu": float(np.mean(self.cpu_usage)),
            "avg_ram": float(np.mean(self.ram_usage)),
            "total_net_mb": self.net_recv[-1] if self.net_recv else 0
        }

# --- B. AIRCRAFT INGEST (PANDAS) ---
def ingest_aircraft_database(config):
    """Download the aircraft metadata CSV from S3 and store a slim Parquet copy.

    Reads ``metadata/<AIRCRAFT_CSV_NAME>`` anonymously from the configured bucket,
    keeps the identification columns that are present, drops duplicate ``icao24``
    rows and writes the result to ``PROCESSED_DIR/AIRCRAFT_PARQUET_NAME``.

    Args:
        config: Object exposing S3_ENDPOINT, S3_BUCKET, AIRCRAFT_CSV_NAME,
            PROCESSED_DIR and AIRCRAFT_PARQUET_NAME.

    Returns:
        Path of the written Parquet file, or None if any step failed (the error
        is logged, not raised).
    """
    logger.info("--- Start: Aircraft Database Ingest ---")
    s3 = boto3.client('s3', endpoint_url=config.S3_ENDPOINT, config=BotoConfig(signature_version=UNSIGNED))

    key = f"metadata/{config.AIRCRAFT_CSV_NAME}"
    wanted_columns = ['icao24', 'manufacturerName', 'model', 'typecode', 'operator']
    try:
        obj = s3.get_object(Bucket=config.S3_BUCKET, Key=key)
        body = obj['Body']
        try:
            df = pd.read_csv(body, dtype=str, quotechar="'", on_bad_lines='skip')
        finally:
            # pandas does not close caller-supplied streams; release the HTTP connection.
            body.close()

        # Fail with a readable message instead of a bare KeyError from drop_duplicates.
        if 'icao24' not in df.columns:
            raise ValueError(f"column 'icao24' not found in {key} (columns: {list(df.columns)[:10]})")

        # Transformation: keep the known columns that exist, one row per icao24
        present_columns = [c for c in wanted_columns if c in df.columns]
        df = df[present_columns].drop_duplicates(subset=['icao24'])

        # Do not rely on init_directories() having been run before.
        os.makedirs(config.PROCESSED_DIR, exist_ok=True)
        out_path = os.path.join(config.PROCESSED_DIR, config.AIRCRAFT_PARQUET_NAME)
        df.to_parquet(out_path, index=False)
        logger.info(f"‚úÖ Aircraft Parquet gespeichert: {out_path}")
        return out_path
    except Exception as e:
        logger.error(f"Fehler Aircraft Ingest: {e}")
        return None

# --- C. FLIGHT DATA INGEST (SPARK) ---
def _list_flight_files(s3, bucket, limit, prefix="flights/"):
    """Return up to `limit` object keys below `prefix` that end in '.csv.gz'."""
    keys = []
    if limit <= 0:
        return keys
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            if obj['Key'].endswith('.csv.gz'):
                keys.append(obj['Key'])
                if len(keys) >= limit:
                    return keys
    return keys


def _append_benchmark_row(csv_path, row):
    """Append `row` to the benchmark CSV, writing the header for a new or empty file."""
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, mode='a', newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["timestamp", "files_count", "duration_sec", "avg_cpu", "avg_ram", "net_mb"])
        writer.writerow(row)


def ingest_flight_data_spark(config, spark_session, file_limit=None):
    """Benchmark downloading and decompressing flight files through Spark.

    Lists ``.csv.gz`` objects under ``flights/``, distributes the keys over Spark
    partitions, lets every task download and fully read its files, and appends
    one row (duration, CPU, RAM, network) to ``config.BENCHMARK_CSV``.

    Args:
        config: Object exposing S3_ENDPOINT, S3_BUCKET, FLIGHT_DATA_LIMIT and
            BENCHMARK_CSV.
        spark_session: Active SparkSession used to run the job.
        file_limit: Number of files to process; a falsy value falls back to
            ``config.FLIGHT_DATA_LIMIT``.

    Returns:
        Number of processed keys, 0 if no file was found, or None if the S3
        listing failed.

    Raises:
        Whatever the Spark job raises; the performance monitor is stopped first.
    """
    logger.info("--- Start: Flight Data Spark Ingest ---")
    # Plain strings so the worker closure uses the configured target.
    endpoint = config.S3_ENDPOINT
    bucket = config.S3_BUCKET
    s3 = boto3.client('s3', endpoint_url=endpoint, config=BotoConfig(signature_version=UNSIGNED))

    # 1. List files (compressed CSVs only)
    limit = file_limit if file_limit else config.FLIGHT_DATA_LIMIT
    try:
        all_files = _list_flight_files(s3, bucket, limit)
    except Exception as e:
        logger.error(f"S3 Listing Fehler: {e}")
        return None

    if not all_files:
        # A zero-file run would only add a meaningless row to the benchmark history.
        logger.warning("Keine .csv.gz Dateien gefunden - Benchmark wird nicht gestartet.")
        return 0

    # Fewer files than requested: cycle through the list so the stress test
    # still reaches the requested size.
    file_list = [all_files[i % len(all_files)] for i in range(limit)]
    logger.info(f"Starte Verarbeitung f√ºr {len(file_list)} Dateien...")

    # Worker executed inside Spark tasks: downloads each key and reads it to the end.
    def worker(keys):
        s3_worker = boto3.client('s3', endpoint_url=endpoint, config=BotoConfig(signature_version=UNSIGNED))
        for key in keys:
            error = None
            try:
                o = s3_worker.get_object(Bucket=bucket, Key=key)
                with gzip.open(BytesIO(o['Body'].read()), 'rt', encoding='utf-8') as f:
                    for _ in f:
                        pass  # dummy read, only the I/O load matters
            except Exception as exc:
                # Report the failure to the driver instead of swallowing it.
                error = f"{type(exc).__name__}: {exc}"
            yield (key, error)

    # 2. Measurement & Spark job
    monitor = PerformanceMonitor()
    monitor.start()
    t_start = time.time()
    try:
        rdd = spark_session.sparkContext.parallelize(file_list, numSlices=max(2, len(file_list)))
        results = rdd.mapPartitions(worker).collect()
        duration = time.time() - t_start
    finally:
        # Always stop the sampler, otherwise a failed job leaves its thread running.
        monitor.stop()
    stats = monitor.get_stats()
    count = len(results)

    failures = [(key, error) for key, error in results if error is not None]
    if failures:
        logger.warning(f"{len(failures)} von {count} Dateien fehlgeschlagen, z.B. {failures[0][0]}: {failures[0][1]}")

    # 3. Append to the CSV log
    _append_benchmark_row(config.BENCHMARK_CSV, [
        datetime.now().isoformat(),
        len(file_list),
        round(duration, 2),
        round(stats.get('avg_cpu', 0), 1),
        round(stats.get('avg_ram', 0), 1),
        round(stats.get('total_net_mb', 0), 2)
    ])

    logger.info(f"‚úÖ Fertig: {len(file_list)} Files in {duration:.2f}s")
    return count

In [4]:
# 1. Start the Spark session
print("üöÄ Starte Spark Session...")
spark = (
    SparkSession.builder
    .appName("OpenSkyPipeline")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

# Both ingest functions signal failure by returning None; collect those so the
# closing message reflects what actually happened.
failed_steps = []

# 2. Aircraft master data (one-off)
print("\n--- Schritt 1: Aircraft Stammdaten ---")
aircraft_parquet_path = ingest_aircraft_database(PipelineConfig)
if aircraft_parquet_path is None:
    failed_steps.append("Aircraft Stammdaten")

# 3. Flight data (benchmark loop for the analysis)
print("\n--- Schritt 2: Flugdaten Benchmark ---")
# Increasing load (5, 10, 20 files) gives the plots several points on the x-axis.
scenarios = [5, 10, 20]

for n_files in scenarios:
    print(f"\n>> Starte Szenario mit {n_files} Dateien...")
    processed = ingest_flight_data_spark(PipelineConfig, spark, file_limit=n_files)
    if processed is None:
        failed_steps.append(f"Flugdaten Benchmark ({n_files} Dateien)")

if failed_steps:
    print("\nPipeline mit Fehlern beendet: " + ", ".join(failed_steps))
else:
    print("\nüèÜ Pipeline und Benchmark erfolgreich abgeschlossen.")

🚀 Starte Spark Session...


2026-02-03 20:47:43,531 - INFO - --- Start: Aircraft Database Ingest ---


Spark Version: 3.5.0

--- Schritt 1: Aircraft Stammdaten ---


2026-02-03 20:53:25,856 - INFO - ✅ Aircraft Parquet gespeichert: C:\Users\valim\Master\AdvancedDataEngineering\FlyBigData_v2\notebooks\Use Cases\project_data_output\processed\aircraft_database.parquet
2026-02-03 20:53:25,876 - INFO - --- Start: Flight Data Spark Ingest ---



--- Schritt 2: Flugdaten Benchmark ---

>> Starte Szenario mit 5 Dateien...


2026-02-03 20:53:26,427 - INFO - Starte Verarbeitung für 5 Dateien...


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 0.0 failed 1 times, most recent failure: Lost task 2.0 in stage 0.0 (TID 2) (Valis executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.PlainSocketImpl.socketAccept(PlainSocketImpl.java:163)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1046)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:407)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1045)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.PlainSocketImpl.socketAccept(PlainSocketImpl.java:163)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 17 more


In [None]:
# --- PERFORMANCE VISUALISATION ---
csv_file = PipelineConfig.BENCHMARK_CSV

# Raw benchmark history, sorted by file count; empty when nothing has been logged yet.
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file).sort_values(by="files_count")
else:
    df = pd.DataFrame()

if df.empty:
    print("‚ùå Keine Benchmark-Daten gefunden. Bitte Pipeline Block ausf√ºhren!")
else:
    # The CSV is append-only, so the same file count can occur once per pipeline run.
    # Plot the mean per file count; otherwise the line jumps up and down at one x.
    metric_columns = ["duration_sec", "avg_cpu", "avg_ram", "net_mb"]
    per_size = df.groupby("files_count", as_index=False)[metric_columns].mean()

    plt.style.use('ggplot')
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: scalability (runtime vs. number of files)
    ax1.plot(per_size['files_count'], per_size['duration_sec'], marker='o', linewidth=2, color='#2980b9', label='Messwerte')

    # Trend line: a linear fit needs at least two distinct x values.
    if len(per_size) > 1:
        coefficients = np.polyfit(df['files_count'], df['duration_sec'], 1)
        trend = np.poly1d(coefficients)
        ax1.plot(per_size['files_count'], trend(per_size['files_count']), "r--", alpha=0.6, label='Linearer Trend')

    ax1.set_title("Skalierbarkeit: Laufzeit")
    ax1.set_xlabel("Anzahl Dateien")
    ax1.set_ylabel("Dauer (Sekunden)")
    ax1.legend()
    ax1.grid(True)

    # Plot 2: resources
    ax2.set_title("Ressourcen-Auslastung")
    ax2.set_xlabel("Anzahl Dateien")
    ax2.set_ylabel("CPU / RAM (%)")

    cpu_lines = ax2.plot(per_size['files_count'], per_size['avg_cpu'], marker='s', color='#e74c3c', label='√ò CPU')
    ram_lines = ax2.plot(per_size['files_count'], per_size['avg_ram'], marker='d', color='#27ae60', label='√ò RAM')

    # Second y-axis for the network volume
    ax2_net = ax2.twinx()
    net_lines = ax2_net.plot(per_size['files_count'], per_size['net_mb'], marker='^', linestyle='--', color='#8e44ad', label='Netzwerk (MB)')
    ax2_net.set_ylabel("Datenvolumen (MB)")

    # One combined legend for the lines of both y-axes
    all_lines = cpu_lines + ram_lines + net_lines
    ax2.legend(all_lines, [line.get_label() for line in all_lines], loc='center right')

    plt.tight_layout()
    plt.show()

    # The summary refers to the raw rows, not to the per-file-count means.
    print("Analyse:")
    print(f"Datens√§tze analysiert: {len(df)}")
    print(f"Maximaler Traffic: {df['net_mb'].max():.2f} MB")