In [30]:
import pandas as pd
import numpy as np

# ---------- 1. "API" endpoint for hurricane data (NOAA IBTrACS, North Atlantic) ----------
IBTRACS_NA_URL = (
    "https://www.ncei.noaa.gov/data/"
    "international-best-track-archive-for-climate-stewardship-ibtracs/"
    "v04r01/access/csv/ibtracs.NA.list.v04r01.csv"
)

In [31]:
# (Optional) save a local copy so results are reproducible
LOCAL_CSV_PATH = "ibtracs_NA_v04r01.csv"

In [32]:
# ---------- 2. Download & read into pandas ----------
# You can either read directly from the URL:
# raw_df = pd.read_csv(IBTRACS_NA_URL, low_memory=False)

# or first save then read (a bit clearer for projects):
raw_df = pd.read_csv(IBTRACS_NA_URL, low_memory=False)
raw_df.to_csv(LOCAL_CSV_PATH, index=False)

print("Raw hurricane shape:", raw_df.shape)
print("Sample columns:", list(raw_df.columns)[:20])


Raw hurricane shape: (127650, 174)
Sample columns: ['SID', 'SEASON', 'NUMBER', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'NATURE', 'LAT', 'LON', 'WMO_WIND', 'WMO_PRES', 'WMO_AGENCY', 'TRACK_TYPE', 'DIST2LAND', 'LANDFALL', 'IFLAG', 'USA_AGENCY', 'USA_ATCF_ID', 'USA_LAT']


In [34]:
# ---------- 3. Select useful columns (only keep what actually exists) ----------
desired_cols = [
    "SID",         # storm identifier
    "SEASON",      # year
    "BASIN",
    "SUBBASIN",
    "NAME",
    "ISO_TIME",    # timestamp
    "USA_LAT",     # US-best-track latitude
    "USA_LON",     # US-best-track longitude
    "USA_WIND",    # max wind (kt)
    "USA_PRES",    # min central pressure (hPa)
    "USA_SSHS"     # Saffir–Simpson category
]

existing_cols = [c for c in desired_cols if c in raw_df.columns]
hurricane_df = raw_df[existing_cols].copy()

# ---------- 4. Filter to recent years ----------
if "SEASON" in hurricane_df.columns:
    
    # convert to integer
    hurricane_df["SEASON"] = pd.to_numeric(hurricane_df["SEASON"], errors="coerce")
    
    # drop rows where SEASON is NaN (invalid)
    hurricane_df = hurricane_df.dropna(subset=["SEASON"])
    
    # filter for your relevant years
    hurricane_df = hurricane_df[hurricane_df["SEASON"] >= 2015]


# ---------- 5. Basic cleaning ----------
# Replace NaN with None so MongoDB will accept them
hurricane_df = hurricane_df.replace({np.nan: None})

print("Cleaned hurricane_df shape:", hurricane_df.shape)
hurricane_df.head()


Cleaned hurricane_df shape: (12173, 11)


Unnamed: 0,SID,SEASON,BASIN,SUBBASIN,NAME,ISO_TIME,USA_LAT,USA_LON,USA_WIND,USA_PRES,USA_SSHS
115477,2015126N27281,2015.0,,,ANA,2015-05-06 06:00:00,26.8,-79.2,25,1016,-3
115478,2015126N27281,2015.0,,,ANA,2015-05-06 09:00:00,27.5,-78.9,25,1016,-3
115479,2015126N27281,2015.0,,,ANA,2015-05-06 12:00:00,28.2,-78.5,25,1015,-3
115480,2015126N27281,2015.0,,,ANA,2015-05-06 15:00:00,29.0,-78.1,25,1015,-3
115481,2015126N27281,2015.0,,,ANA,2015-05-06 18:00:00,29.7,-77.8,25,1014,-3


In [35]:
from pymongo import MongoClient

# ---------- 1. MongoDB connection ----------
# Change this URI if you're using Atlas or a different host
MONGO_URI = "mongodb://localhost:27017"

client = MongoClient(MONGO_URI)

# Use the same DB as your other collections if you want everything together
DB_NAME = "flight_weather_project"        # <- change to your actual DB name
COLLECTION_NAME = "hurricane_ibtracs_na"  # new collection just for hurricanes

db = client[DB_NAME]
hurricane_collection = db[COLLECTION_NAME]

# ---------- 2. Convert DataFrame → list of dicts ----------
records = hurricane_df.to_dict("records")

# Optional: clear old data if you're re-running
# hurricane_collection.delete_many({})

# ---------- 3. Insert into MongoDB ----------
if records:
    result = hurricane_collection.insert_many(records)
    print(f"Inserted {len(result.inserted_ids)} hurricane documents into MongoDB.")
else:
    print("No hurricane records to insert – check hurricane_df shape / filters.")


Inserted 12173 hurricane documents into MongoDB.
