In [1]:
import os
print(os.environ.get("JAVA_HOME"))
!java -version

None
openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment (build 17.0.16+8-Ubuntu-0ubuntu122.04.1)
OpenJDK 64-Bit Server VM (build 17.0.16+8-Ubuntu-0ubuntu122.04.1, mixed mode, sharing)


In [2]:
!ls /usr/lib/jvm

java-1.17.0-openjdk-amd64  java-17-openjdk-amd64


In [3]:
# ---- Find Java, set JAVA_HOME, install PySpark, start Spark ----
import os, shutil, pathlib, subprocess

# 1) Locate the actual java binary (works on Linux/macOS, Anaconda, etc.)
java_bin = shutil.which("java")
if not java_bin:
    raise RuntimeError("Java not found on PATH. Install JDK 11‚Äì21 first.")

java_real = os.path.realpath(java_bin)                 # resolve symlinks
JAVA_HOME = str(pathlib.Path(java_real).parent.parent) # .../bin/java -> go up 2
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PATH"] = f"{JAVA_HOME}/bin:" + os.environ["PATH"]

print("JAVA_HOME ->", os.environ["JAVA_HOME"])
subprocess.run(["java", "-version"], check=False)

# 2) Ensure PySpark is available
import sys
!{sys.executable} -m pip install -q pyspark

# 3) Start Spark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("FAA_NOAA_Integration")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.extraJavaOptions", "-Xss4M")
    .config("spark.executor.extraJavaOptions", "-Xss4M")
    .getOrCreate()
)

spark


JAVA_HOME -> /usr/lib/jvm/java-17-openjdk-amd64


In [4]:
!pip install python-dotenv





### Creating a `.env` file

A `.env` file typically contains key-value pairs, where each line defines an environment variable. For example, if you have an API key, you would add a line like `NOAA_TOKEN="your_noaa_api_key_here"`.

In [5]:
%%writefile .env
NOAA_TOKEN="UYWGZCnhVtUKAGHQFkpTPRrxAanxYmMA"

Writing .env


In [6]:
!cat .env


NOAA_TOKEN="UYWGZCnhVtUKAGHQFkpTPRrxAanxYmMA"


Remember to replace `"YOUR_ACTUAL_NOAA_API_KEY"` with your actual NOAA API token. After creating the file, you can load these variables into your Python environment using libraries like `python-dotenv`, as you've already demonstrated in your notebook.

In [7]:
from dotenv import load_dotenv
import os

load_dotenv()
NOAA_TOKEN = os.getenv("NOAA_TOKEN")

if NOAA_TOKEN:
    print("NOAA token loaded successfully!")
else:
    print("NOAA token not found.")

NOAA token loaded successfully!


In [8]:
airport_to_station = {
    "ATL": "GHCND:USW00013874",
    "DFW": "GHCND:USW00003927",
    "DEN": "GHCND:USW00003017",
    "ORD": "GHCND:USW00094846",
    "LAX": "GHCND:USW00093134",
    "CLT": "GHCND:USW00013881",
    "LAS": "GHCND:USW00023169",
    "MCO": "GHCND:USW00012815",
    "PHX": "GHCND:USW00023183",
    "MIA": "GHCND:USW00012839",
    "SEA": "GHCND:USW00024233",
    "EWR": "GHCND:USW00014734",
    "JFK": "GHCND:USW00094789",
    "SFO": "GHCND:USW00023234",
    "BOS": "GHCND:USW00014739"
}


In [9]:
import requests
import pandas as pd

def fetch_noaa_weather(station_id, start_date, end_date, token):
    """
    Fetch daily weather data from NOAA GHCND dataset for a given station and date range.

    Returns a clean pivoted DataFrame with columns like TMAX, TMIN, PRCP.

    Args:
        station_id (str): NOAA station ID (e.g., "GHCND:USW00094728")
        start_date (str): Start date in 'YYYY-MM-DD' format
        end_date (str): End date in 'YYYY-MM-DD' format
        token (str): NOAA API token
    """
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
    headers = {"token": token}
    limit = 1000
    offset = 1
    all_data = []

    while True:
        params = {
            "datasetid": "GHCND",
            "stationid": station_id,
            "startdate": start_date,
            "enddate": end_date,
            "limit": limit,
            "offset": offset,
            "units": "standard"
        }

        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        results = response.json().get("results", [])
        if not results:
            break

        all_data.extend(results)
        offset += limit

    if not all_data:
        return pd.DataFrame()  # Return empty if no data

    df = pd.DataFrame(all_data)

    df['date'] = pd.to_datetime(df['date'])
    df['value'] = pd.to_numeric(df['value'], errors='coerce')

    df_pivot = df.pivot_table(index='date', columns='datatype', values='value').reset_index()

    return df_pivot


In [10]:
import pandas as pd
import time

def fetch_station_monthly(station_id, start_date, end_date, token):
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    all_chunks = []
    current = start

    while current < end:
        # Set next month boundary
        next_month = (current.replace(day=1) + pd.DateOffset(months=1))
        chunk_start = current.strftime("%Y-%m-%d")
        chunk_end = min(next_month, end).strftime("%Y-%m-%d")

        print(f"   ‚Üí Fetching {station_id} | {chunk_start} to {chunk_end}")

        try:
            df_chunk = fetch_noaa_weather(
                station_id=station_id,
                start_date=chunk_start,
                end_date=chunk_end,
                token=token
            )
            if not df_chunk.empty:
                all_chunks.append(df_chunk)

        except Exception as e:
            print("      ‚ö†Ô∏è Error:", e)

        # Avoid 429 rate limit
        time.sleep(0.4)

        current = next_month

    if all_chunks:
        return pd.concat(all_chunks, ignore_index=True)
    return pd.DataFrame()


In [None]:
START_DATE = "2015-01-01"
END_DATE   = "2025-01-01"   # stop point

all_weather = []

for airport, station in airport_to_station.items():
    print(f"\nüì° Fetching weather for {airport} ({station})")

    df = fetch_station_monthly(
        station_id=station,
        start_date=START_DATE,
        end_date=END_DATE,
        token=NOAA_TOKEN
    )

    if df.empty:
        print(f"   ‚ùå No data for {airport}")
        continue

    df["airport"] = airport
    df["station_id"] = station

    all_weather.append(df)
    print(f"   ‚úÖ Completed {airport}: {df.shape[0]} rows")



üì° Fetching weather for ATL (GHCND:USW00013874)
   ‚Üí Fetching GHCND:USW00013874 | 2015-01-01 to 2015-02-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-02-01 to 2015-03-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-03-01 to 2015-04-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-04-01 to 2015-05-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-05-01 to 2015-06-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-06-01 to 2015-07-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-07-01 to 2015-08-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-08-01 to 2015-09-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-09-01 to 2015-10-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-10-01 to 2015-11-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-11-01 to 2015-12-01
   ‚Üí Fetching GHCND:USW00013874 | 2015-12-01 to 2016-01-01
   ‚Üí Fetching GHCND:USW00013874 | 2016-01-01 to 2016-02-01
   ‚Üí Fetching GHCND:USW00013874 | 2016-02-01 to 2016-03-01
   ‚Üí Fetching GHCND:USW00013874 | 2016-03-01 to 2016-04-01
   ‚Üí Fetching GHCND:USW00013874 

In [None]:
df_weather_all = pd.concat(all_weather, ignore_index=True)
print("Total rows:", df_weather_all.shape)
df_weather_all.head()

Total rows: (54601, 35)


datatype,date,ADPT,ASLP,ASTP,AWBT,AWND,PRCP,RHAV,RHMN,RHMX,...,PSUN,TSUN,PGTM,WT05,airport,station_id,WT07,WT09,WESD,WT10
0,2015-01-01,-17.0,10271.0,9888.0,33.0,4.5,0.07,58.0,28.0,79.0,...,,,,,ATL,GHCND:USW00013874,,,,
1,2015-01-02,61.0,10257.0,9878.0,72.0,3.8,0.95,88.0,76.0,93.0,...,,,,,ATL,GHCND:USW00013874,,,,
2,2015-01-03,106.0,10220.0,9854.0,106.0,8.5,0.16,95.0,86.0,100.0,...,,,,,ATL,GHCND:USW00013874,,,,
3,2015-01-04,106.0,10203.0,9821.0,122.0,11.4,1.64,80.0,47.0,96.0,...,,,,,ATL,GHCND:USW00013874,,,,
4,2015-01-05,-33.0,10312.0,9925.0,17.0,12.1,0.0,55.0,32.0,74.0,...,,,,,ATL,GHCND:USW00013874,,,,


In [None]:
df_weather_all.to_csv("noaa_weather_15airports_10years.csv", index=False)

In [None]:
#df_weather = fetch_noaa_weather("GHCND:USW00094728", "2024-01-01", "2025-01-01", NOAA_TOKEN)

Error 503: <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>503 Service Unavailable</title>
</head><body>
<h1>Service Unavailable</h1>
<p>The server is temporarily unable to service your
request due to maintenance downtime or capacity
problems. Please try again later.</p>
<p>Additionally, a 503 Service Unavailable
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>



In [None]:
print(df_weather.head())
print(df_weather.info())

datatype       date   ADPT     ASLP     ASTP  AWBT  AWND  PRCP  RHAV  RHMN  \
0        2024-01-01  -11.0  10166.0  10115.0  28.0   3.4  0.03  63.0  54.0   
1        2024-01-02  -61.0  10176.0  10125.0  -6.0   4.0  0.00  54.0  41.0   
2        2024-01-03  -44.0  10159.0  10108.0   6.0   4.9  0.00  57.0  49.0   
3        2024-01-04  -61.0  10159.0  10112.0   0.0   7.6  0.00  50.0  38.0   
4        2024-01-05 -100.0  10240.0  10190.0 -33.0   7.2  0.00  48.0  39.0   

datatype  RHMX  ...  TMIN   WDF2   WDF5  WSF2  WSF5  WT01  WT02  WT03  WT06  \
0         82.0  ...  35.0  270.0  220.0   8.9  16.1   NaN   NaN   NaN   NaN   
1         69.0  ...  29.0  300.0  320.0  10.1  16.1   NaN   NaN   NaN   NaN   
2         67.0  ...  34.0  300.0  320.0  10.1  15.0   NaN   NaN   NaN   NaN   
3         65.0  ...  28.0  310.0  300.0  19.9  30.0   NaN   NaN   NaN   NaN   
4         55.0  ...  26.0  300.0  290.0  16.1  23.9   NaN   NaN   NaN   NaN   

datatype  WT08  
0          NaN  
1          NaN  
2    

In [None]:
test = fetch_station_monthly(
    station_id=airport_to_station["ATL"],
    start_date="2015-01-01",
    end_date="2015-02-01",
    token=NOAA_TOKEN
)

print(test.shape)
test.head()


   ‚Üí Fetching GHCND:USW00013874 | 2015-01-01 to 2015-02-01
(32, 23)


datatype,date,ADPT,ASLP,ASTP,AWBT,AWND,PRCP,RHAV,RHMN,RHMX,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT08
0,2015-01-01,-17.0,10271.0,9888.0,33.0,4.5,0.07,58.0,28.0,79.0,...,58.0,32.0,340.0,330.0,10.1,16.1,,,,
1,2015-01-02,61.0,10257.0,9878.0,72.0,3.8,0.95,88.0,76.0,93.0,...,49.0,43.0,70.0,40.0,13.0,16.1,1.0,,,
2,2015-01-03,106.0,10220.0,9854.0,106.0,8.5,0.16,95.0,86.0,100.0,...,62.0,48.0,110.0,110.0,14.1,18.1,1.0,1.0,,
3,2015-01-04,106.0,10203.0,9821.0,122.0,11.4,1.64,80.0,47.0,96.0,...,66.0,44.0,310.0,300.0,23.0,33.1,1.0,,1.0,
4,2015-01-05,-33.0,10312.0,9925.0,17.0,12.1,0.0,55.0,32.0,74.0,...,48.0,34.0,320.0,320.0,25.9,33.1,,,,


In [None]:
from pymongo import MongoClient

MONGO_URI = "mongodb://localhost:27017"
client = MongoClient(MONGO_URI)

db = client["flight_weather"]
weather_coll = db["noaa_weather_raw"]

weather_docs = df_weather_all.to_dict("records")
result = weather_coll.insert_many(weather_docs)
print(f"‚úÖ Inserted {len(result.inserted_ids)} documents into 'noaa_weather_raw'")


‚úÖ Inserted 54601 documents into 'noaa_weather_raw'
