In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from datetime import datetime, timedelta
import pytz
import json
from pathlib import Path

In [4]:
BASE_DIR = Path("/home/jovyan/work")
RAW_FLIGHT_DATA_DIR = BASE_DIR / "data" / "raw" / "flight"
RAW_WEATHER_DATA_DIR = BASE_DIR / "data" / "raw" / "weather"
CLEAN_DATA_DIR = BASE_DIR / "data" / "clean"

now = datetime.now(pytz.timezone("Asia/Bangkok"))
day = now.strftime("%Y-%m-%d")
print(day)

spark = SparkSession.builder.appName("CloudJetTransform").getOrCreate()

2025-03-28


In [5]:
flight_path = RAW_FLIGHT_DATA_DIR / day / "data.json"

df_flight_raw = spark.read.option("multiline", True).json(str(flight_path))
df_flight = df_flight_raw.select(
    col("flight.iata").alias("flight_iata"),
    col("departure.iata").alias("departure_iata"),
    col("arrival.iata").alias("arrival_iata"),
    col("arrival.scheduled").alias("arrival_time"),
    col("airline.name").alias("airline"),
    col("arrival.airport").alias("arr_airport"),
)

df_flight.show(5)


+-----------+--------------+------------+--------------------+------------------+--------------------+
|flight_iata|departure_iata|arrival_iata|        arrival_time|           airline|         arr_airport|
+-----------+--------------+------------+--------------------+------------------+--------------------+
|        NQ2|           BKK|         NRT|2025-03-28T08:10:...|         Air Japan|Narita Internatio...|
|     AC6230|           BKK|         NRT|2025-03-27T15:05:...|        Air Canada|Narita Internatio...|
|       ZG52|           BKK|         NRT|2025-03-28T07:30:...|            Zipair|Narita Internatio...|
|     WS5931|           BKK|         NRT|2025-03-27T16:00:...|           WestJet|Narita Internatio...|
|     UL3340|           BKK|         NRT|2025-03-27T16:00:...|SriLankan Airlines|Narita Internatio...|
+-----------+--------------+------------+--------------------+------------------+--------------------+
only showing top 5 rows



In [6]:
def safe_get(d, keys, default=None):
    for k in keys:
        if isinstance(d, dict) and k in d:
            d = d[k]
        else:
            return default
    return d

def float_or_none(v):
    try:
        return float(v)
    except:
        return None

def get_iata_by_city(city):
    mapping = {
        "Tokyo": "NRT", "Seoul": "ICN", "Singapore": "SIN",
        "Kuala Lumpur": "KUL", "Taipei": "TPE",
        "Ho Chi Minh": "SGN", "Hong Kong": "HKG"
    }
    return mapping.get(city)

# Load raw JSON
weather_path = RAW_WEATHER_DATA_DIR / day / "data.json"
with open(weather_path, "r", encoding="utf-8") as f:
    weather_raw = json.load(f)

weather_rows = []
for w in weather_raw:
    row = {
        "city": w.get("city"),
        "iata": get_iata_by_city(w.get("city")),
        "temp": float_or_none(safe_get(w, ["data", "main", "temp"])),
        "humidity": float_or_none(safe_get(w, ["data", "main", "humidity"])),
        "weather": safe_get(w, ["data", "weather", 0, "main"]) or "Unknown",
        "wind_speed": float_or_none(safe_get(w, ["data", "wind", "speed"]))
    }
    weather_rows.append(row)

schema = StructType([
    StructField("city", StringType(), True),
    StructField("iata", StringType(), True),
    StructField("temp", FloatType(), True),
    StructField("humidity", FloatType(), True),
    StructField("weather", StringType(), True),
    StructField("wind_speed", FloatType(), True),
])

df_weather = spark.createDataFrame(weather_rows, schema=schema)
df_weather = df_weather.withColumnRenamed("iata", "weather_iata")
df_weather.show(5)


+------------+------------+-----+--------+-------+----------+
|        city|weather_iata| temp|humidity|weather|wind_speed|
+------------+------------+-----+--------+-------+----------+
|       Tokyo|         NRT|17.53|    84.0|Unknown|     11.32|
|       Seoul|         ICN| 6.76|    38.0|Unknown|      4.87|
|   Singapore|         SIN| 27.0|    87.0|Unknown|      1.54|
|Kuala Lumpur|         KUL|27.02|    87.0|Unknown|      0.65|
|      Taipei|         TPE|25.31|    59.0|Unknown|      1.34|
+------------+------------+-----+--------+-------+----------+
only showing top 5 rows



In [7]:
df_joined = df_flight.join(
    df_weather,
    df_flight.arrival_iata == df_weather.weather_iata,
    how="left"
)

df_joined.show(5)

# Save to parquet
output_path = CLEAN_DATA_DIR / day
df_joined.write.mode("overwrite").parquet(str(output_path / "flight_weather.parquet"))


+-----------+--------------+------------+--------------------+------------------+--------------------+-----+------------+-----+--------+-------+----------+
|flight_iata|departure_iata|arrival_iata|        arrival_time|           airline|         arr_airport| city|weather_iata| temp|humidity|weather|wind_speed|
+-----------+--------------+------------+--------------------+------------------+--------------------+-----+------------+-----+--------+-------+----------+
|        NQ2|           BKK|         NRT|2025-03-28T08:10:...|         Air Japan|Narita Internatio...|Tokyo|         NRT|17.53|    84.0|Unknown|     11.32|
|     AC6230|           BKK|         NRT|2025-03-27T15:05:...|        Air Canada|Narita Internatio...|Tokyo|         NRT|17.53|    84.0|Unknown|     11.32|
|       ZG52|           BKK|         NRT|2025-03-28T07:30:...|            Zipair|Narita Internatio...|Tokyo|         NRT|17.53|    84.0|Unknown|     11.32|
|     WS5931|           BKK|         NRT|2025-03-27T16:00:...|  