# Data Extraction

In [1]:
import os
from pathlib import Path

import earthaccess as ea
import netCDF4 as nc
import numpy as np
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import pandas as pd
import xarray as xr
from dotenv import load_dotenv, find_dotenv
from matplotlib.path import Path as PolygonPath
from src.services.utils import get_logger
from src.services.google import Google

# Notebook-wide logger provided by the project's logging helper.
logger = get_logger()

# Locate a .env file and load it; keep the boolean result so it can be
# inspected later in the session.
env_loaded = load_dotenv(find_dotenv())
if not env_loaded:
    logger.error("Failed to load environment variables.")
else:
    logger.info("Environment variables loaded successfully.")

  from .autonotebook import tqdm as notebook_tqdm


[32m2025-10-04T18:28:28.829601-0500[0m | [1mINFO[0m | Environment variables loaded successfully.


## Get data from different sources

### Earthdata login

In [2]:
# Read the project-specific credential variables (loaded from .env above).
EARTH_ACCESS_USERNAME = os.getenv("EARTH_ACCESS_USERNAME")
EARTH_ACCESS_PASSWORD = os.getenv("EARTH_ACCESS_PASSWORD")
if not EARTH_ACCESS_USERNAME or not EARTH_ACCESS_PASSWORD:
    # Fail early with an actionable message instead of an opaque error
    # from inside earthaccess.
    raise RuntimeError(
        "EARTH_ACCESS_USERNAME and EARTH_ACCESS_PASSWORD must be set "
        "(for example in the project's .env file)."
    )

# earthaccess.login() takes (strategy, persist, system), not credentials:
# passing the username/password positionally would be interpreted as the
# strategy name and the persist flag. The "environment" strategy reads the
# credentials from EARTHDATA_USERNAME / EARTHDATA_PASSWORD, so export them
# under those names before logging in.
os.environ["EARTHDATA_USERNAME"] = EARTH_ACCESS_USERNAME
os.environ["EARTHDATA_PASSWORD"] = EARTH_ACCESS_PASSWORD
auth = ea.login(strategy="environment")

In [1]:
# --- Collection to query -----------------------------------------------
short_name = "TEMPO_NO2_L3"  # collection short name to search for in Earthdata
# short_name = "OMHCHOd"
version = "V03"
# version = "003"

# --- Time window (inclusive bounds, "YYYY-MM-DD HH:MM:SS") ---------------
date_start = "2025-01-01 00:00:00"
date_end = "2025-01-01 15:59:59"

# --- Area of interest ----------------------------------------------------
# Distinct (longitude, latitude) vertices of the search polygon.
_polygon_vertices = [
    (-120.0091050, 41.9727325),
    (-124.6045661, 41.8898826),
    (-120.4462801, 33.9044735),
    (-117.1073262, 32.6184122),
    (-114.2955756, 32.6554188),
    (-114.1637748, 34.3047333),
    (-114.7349117, 35.0995465),
    (-120.0948112, 39.0254518),
]
# Close the ring by repeating the first vertex as the last point.
polygon_coords = [*_polygon_vertices, _polygon_vertices[0]]

In [2]:
from src.services.earth_data import EarthDataClient

# Bundle the search parameters defined in the configuration cell so the
# request is easy to inspect before it is sent.
earthdata_query = {
    "dataset_name": short_name,
    "dataset_version": version,
    "start_date": date_start,
    "end_date": date_end,
    "polygon": polygon_coords,
}

eac = EarthDataClient()
df = eac.get_data(**earthdata_query)

# Preview the first rows of the result.
df.head()

  from .autonotebook import tqdm as notebook_tqdm


earthdata-client | [32m2025-10-04T18:50:15.398342-0500[0m | [1mINFO[0m | Found 2 granules.


QUEUEING TASKS | : 100%|██████████| 1/1 [00:00<00:00, 1733.18it/s]
PROCESSING TASKS | : 100%|██████████| 1/1 [00:16<00:00, 16.38s/it]
COLLECTING RESULTS | : 100%|██████████| 1/1 [00:00<00:00, 18893.26it/s]

earthdata-client | [32m2025-10-04T18:50:31.807162-0500[0m | [1mINFO[0m | Opening file: /var/folders/78/l07nwp291d17m9j1z3ylv1n40000gn/T/earthdata_oxnu76kf/TEMPO_NO2_L3_V03_20250101T144826Z_S005.nc



QUEUEING TASKS | : 100%|██████████| 1/1 [00:00<00:00, 3890.82it/s]
PROCESSING TASKS | : 100%|██████████| 1/1 [00:16<00:00, 16.46s/it]
COLLECTING RESULTS | : 100%|██████████| 1/1 [00:00<00:00, 16070.13it/s]

earthdata-client | [32m2025-10-04T18:50:53.606690-0500[0m | [1mINFO[0m | Opening file: /var/folders/78/l07nwp291d17m9j1z3ylv1n40000gn/T/earthdata_oxnu76kf/TEMPO_NO2_L3_V03_20250101T144826Z_S005.nc
earthdata-client | [32m2025-10-04T18:50:53.608700-0500[0m | [1mINFO[0m | Opening file: /var/folders/78/l07nwp291d17m9j1z3ylv1n40000gn/T/earthdata_oxnu76kf/TEMPO_NO2_L3_V03_20250101T154826Z_S006.nc





Unnamed: 0,latitude,longitude,time,weight
7217793,32.630001,-117.129997,2025-01-01 14:48:44.028173568,4.162722
7217794,32.630001,-117.110001,2025-01-01 14:48:44.028173568,4.162722
7217795,32.630001,-117.089996,2025-01-01 14:48:44.028173568,4.360855
7217796,32.630001,-117.07,2025-01-01 14:48:44.028173568,4.162722
7217797,32.630001,-117.050003,2025-01-01 14:48:44.028173568,4.162722


In [3]:
# Report the row count before and after removing incomplete rows.
# `df` is rebound to the cleaned frame instead of being mutated with
# `inplace=True`; later cells keep using the name `df`.
rows_retrieved = len(df)
print(f"Total records retrieved: {rows_retrieved}")
df = df.dropna()
print(f"Total records after dropping NaNs: {len(df)}")

Total records retrieved: 214348
Total records after dropping NaNs: 214348


In [4]:
# Sanity check: count the remaining missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

latitude     0
longitude    0
time         0
weight       0
dtype: int64

## Load data into the NO2 historical table

In [None]:
import time
from src.etl.extract_load_no2 import extract_and_load_no2

# Parameters for the historical run. These rebind the names used by the
# exploratory query earlier in the notebook, so re-run the configuration
# cell before repeating that query.
short_name = "TEMPO_NO2_L3"  # collection short name to search for in Earthdata
# short_name = "OMHCHOd"
version = "V03"
# version = "003"

date_start = "2024-01-01 00:00:00"
date_end = "2024-01-31 23:59:59"
# Closed ring of (longitude, latitude) vertices; first and last points match.
polygon_coords = [
    (-120.0091050, 41.9727325),
    (-124.6045661, 41.8898826),
    (-120.4462801, 33.9044735),
    (-117.1073262, 32.6184122),
    (-114.2955756, 32.6554188),
    (-114.1637748, 34.3047333),
    (-114.7349117, 35.0995465),
    (-120.0948112, 39.0254518),
    (-120.0091050, 41.9727325),
]

# Use the monotonic performance counter for the duration: unlike
# time.time(), it cannot jump when the system clock is adjusted during a
# long-running job.
start_run_time = time.perf_counter()
_ = extract_and_load_no2(
    dataset_name=short_name,
    dataset_version=version,
    start_date=date_start,
    end_date=date_end,
    polygon=polygon_coords,
)
end_run_time = time.perf_counter()
elapsed_time = end_run_time - start_run_time
print(f"ETL process completed in {elapsed_time:.2f} seconds.")

In [None]:
# Destination of the upload inside BigQuery.
bigquery_target = {
    "dataset": "earth_data",
    "table_id": "no2_historical",
}

google = Google()
# Send the current `df` to the destination table; the return value is not used.
_ = google.bigquery.upload_data_from_dataframe(df, **bigquery_target)

[32m2025-10-04T17:12:05.948804-0500[0m | [1mINFO[0m | BigQuery client initialized (project=%s, location=%s)




[32m2025-10-04T17:12:07.074801-0500[0m | [1mINFO[0m | Dataset %s already exists (project=%s).
[32m2025-10-04T17:12:11.427748-0500[0m | [1mINFO[0m | Data uploaded successfully.
