In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Unity Catalog name that prefixes every table reference in this notebook.
catalog_name = "opensky"

In [0]:
# Load the raw table from the configured catalog.
raw_table_name = f'{catalog_name}.default.raw'
df = spark.table(raw_table_name)


In [0]:
# Render a five-row preview of the loaded table.
preview_rows = df.limit(5)
display(preview_rows)

In [0]:
# Print the column names and types of the loaded table.
df.printSchema()

# Data dictionary (as recorded by the notebook author):
#   icao24          - Aircraft unique identifier
#   callsign        - Flight's spoken identifier; can be null
#   origin_country  - Country where the aircraft is registered or operated from
#   time_position   - Unix timestamp of last position update; can be null
#   last_contact    - Unix timestamp of last signal received from the aircraft
#   longitude       - Longitude of the aircraft's position; can be null
#   latitude        - Latitude of the aircraft's position; can be null
#   geo_altitude    - Geometric altitude in meters; can be null
#   on_ground       - Indicates if the aircraft is on the ground
#   velocity        - Ground speed in meters per second; can be null
#   heading         - True track (heading) over ground in degrees (0-359); can be null
#   vertical_rate   - Vertical rate in meters per second; can be null
#   baro_altitude   - Barometric altitude in meters; can be null
#   squawk          - Transponder squawk code (4-digit code for radar identification); can be null
#   spi             - Special Purpose Indicator (SPI) or IDENT flag; true if set, false otherwise
#   position_source - Source of position info (e.g., ADS-B, MLAT); can be null
#   ingest_time     - Timestamp when data was ingested

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Normalise the raw columns to explicit types in a single projection.
# withColumns replaces each listed column in place, so the column order and
# any column not listed here are left untouched. Every expression only reads
# its own source column, so one projection is equivalent to a withColumn chain.
plain_casts = {
    "icao24": "string",
    "callsign": "string",
    "origin_country": "string",
    # The squawk is a 4-digit transponder code. It is kept as text because an
    # integer cast silently drops leading zeros (e.g. "0363" would become 363).
    "squawk": "string",
    "longitude": "float",
    "latitude": "float",
    "geo_altitude": "float",
    "baro_altitude": "float",
    "velocity": "float",
    "heading": "float",
    "vertical_rate": "float",
    "on_ground": "boolean",
    "spi": "boolean",
    "position_source": "int",
}

# Converted with to_timestamp (no format argument), exactly as before.
timestamp_columns = ("time_position", "last_contact", "ingest_time")

typed_columns = {name: col(name).cast(dtype) for name, dtype in plain_casts.items()}
typed_columns.update({name: to_timestamp(col(name)) for name in timestamp_columns})

df = df.withColumns(typed_columns)


In [0]:
from pyspark.sql.functions import col, current_timestamp, length

# Range violations: collect every row where at least one field falls outside
# its plausible domain. A comparison against NULL yields NULL and therefore
# cannot flag a row on its own, so nullable fields pass when they are missing.

# Limits are expressed in the units of the data (m/s, metres, degrees).
# 400 m/s is kept as the ground-speed ceiling; reading it as km/h would flag
# ordinary jet cruise speeds.
MAX_VELOCITY_MPS = 400
# 15,000 ft/min converted to m/s (15000 * 0.00508). The unconverted bound of
# 15000 applied to m/s values could never be exceeded.
MAX_VERTICAL_RATE_MPS = 76.2
MAX_BARO_ALTITUDE_M = 50000
# OpenSky position_source codes: 0 = ADS-B, 1 = ASTERIX, 2 = MLAT, 3 = FLARM.
VALID_POSITION_SOURCES = [0, 1, 2, 3]

now = current_timestamp()

df_invalid = df.filter(
    # icao24: must be exactly 6 hex characters
    (~col("icao24").rlike("^[0-9A-Fa-f]{6}$"))

    # callsign: when present, at most 10 characters
    | (col("callsign").isNotNull() & (length(col("callsign")) > 10))

    # no timestamp may lie in the future
    | (col("time_position").cast("timestamp") > now)
    | (col("last_contact").cast("timestamp") > now)
    | (col("ingest_time").cast("timestamp") > now)

    # longitude / latitude out of bounds
    | (col("longitude") < -180) | (col("longitude") > 180)
    | (col("latitude") < -90) | (col("latitude") > 90)

    # geo_altitude is intentionally not range-checked (the check was disabled
    # by the notebook author).

    # velocity (ground speed, m/s) out of bounds
    | (col("velocity") < 0) | (col("velocity") > MAX_VELOCITY_MPS)

    # heading (degrees) out of bounds
    | (col("heading") < 0) | (col("heading") >= 360)

    # vertical_rate (m/s) out of bounds
    | (col("vertical_rate") < -MAX_VERTICAL_RATE_MPS)
    | (col("vertical_rate") > MAX_VERTICAL_RATE_MPS)

    # baro_altitude (metres) out of bounds
    | (col("baro_altitude") < 0) | (col("baro_altitude") > MAX_BARO_ALTITUDE_M)

    # position_source: when present, must be one of the known codes
    | (
        col("position_source").isNotNull()
        & ~col("position_source").isin(VALID_POSITION_SOURCES)
    )
)

print(f"Number of invalid rows: {df_invalid.count()}")
df_invalid.show(truncate=False)

In [0]:
# List the distinct position_source codes present in the data.
# Author's observation: the only available value is 0, which refers to the
# Automatic Dependent Surveillance-Broadcast (ADS-B) system.
df.select('position_source').distinct().show()

In [0]:
# Distinct origin_country values (the author's note records 100 countries).
# show() prints only the first 20 rows by default.
origin_countries = df.select("origin_country").distinct()
origin_countries.show()

In [0]:
%pip install ydata_profiling
# Installs the profiling package used by the report cell further down.
# No version is pinned here, so different runs may pick up different releases.

In [0]:
# Restart the Python process after the package installation.
# NOTE(review): on Databricks this presumably clears Python variables defined
# in earlier cells (e.g. df, catalog_name) - confirm, and consider moving the
# install/restart pair to the very top of the notebook.
dbutils.library.restartPython()

In [0]:
from ydata_profiling import ProfileReport

# Profile a random sample of the dataset and write the result as an HTML report.

# The %pip install / restartPython() cells earlier in the notebook reset the
# Python state, so `df` no longer exists when the notebook is run top to
# bottom. Fail with an actionable message instead of a bare NameError.
if "df" not in globals():
    raise NameError(
        "name 'df' is not defined: the Python state was reset by the "
        "%pip install / restartPython() cells. Re-run the cells that load "
        "and cast `df` before running the profiling cell."
    )

# Share of rows pulled to the driver for profiling, and the sampling seed.
PROFILE_FRACTION = 0.2
PROFILE_SEED = 42

# toPandas() collects the sample onto the driver; lower PROFILE_FRACTION if
# the sample does not fit in driver memory.
df_20 = df.sample(fraction=PROFILE_FRACTION, seed=PROFILE_SEED).toPandas()
profile = ProfileReport(df_20, title="My Dataset")
profile.to_file("report.html")