In [7]:
from cropnet.data_downloader import DataDownloader
import os
from herbie import Herbie
from datetime import datetime

# Use the "target_dir" to specify where the data should be downloaded to
downloader = DataDownloader(target_dir="./data")

# Download 2022 USDA Soybean data
# Note that most of the 2023 USDA data are not yet available
downloader.download_USDA("Soybean", fips_codes=["10003", "22007"], years=["2022"])

# Download the 2023 (the 1st and 2nd quarters) Sentinel-2 Imagery
downloader.download_Sentinel2(fips_codes=["10003", "22007"], years=["2023"], image_type="AG")
downloader.download_Sentinel2(fips_codes=["10003", "22007"], years=["2023"], image_type="NDVI")


# --- CONFIG ---
fips_codes = {
    "10003": "DE",  # Delaware - New Castle
    "22007": "LA",  # Louisiana - Assumption
}
date = "2023-01-01"
forecast_hour = "f00"  # Initial forecast only

# --- MAIN ---
for fips, state in fips_codes.items():
    output_dir = f"data/HRRR/realtime_wrf/2023/20230101"

    # Forecast times: 00 UTC to 23 UTC
    for hour in range(24):
        run_time = datetime.strptime(f"{date} {hour:02d}", "%Y-%m-%d %H")

        # Initialize Herbie
        H = Herbie(
            date=run_time,
            model="hrrr",
            product="sfc",
            fxx=0,  # f00 forecast
            save_dir=output_dir,
            verbose=True,
        )

        # Define output filename
        filename = f"hrrr.{date.replace('-', '')}.{hour:02d}.00.grib2"
        file_path = os.path.join(output_dir, filename)

        # Skip if file exists
        if os.path.exists(file_path):
            print(f"✅ File exists, skipping: {file_path}")
            continue

        # Download the file
        try:
            # Download and get the original file path
            downloaded_file = H.download()

            if downloaded_file and os.path.exists(downloaded_file):
                final_path = os.path.join(output_dir, f"hrrr.{run_time:%Y%m%d}.{run_time:%H}.00.grib2")
                os.rename(downloaded_file, final_path)
                print(f"✅ Renamed: {downloaded_file} -> {final_path}")
            else:
                print(f"⚠️ File not found after download: {downloaded_file}")
        except Exception as e:
            print(f"❌ Failed at {file_path}: {e}")



Progress: [ 1/1 ], Downloading USDA Data, Year: 2022, Crop: Soybean
 Downloading Sentinel-2 Imagery (AG), Progress: [1 / 2], FIPS: 10003, State Name: DELAWARE, County Name: New Castle
Year Progress: [1 / 1], Downloading 2023's AG Imagery for the county 10003


dates processed: 100%|██████████| 6/6 [00:00<00:00, 10.07it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.80it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.98it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.81it/s]


 Downloading Sentinel-2 Imagery (AG), Progress: [2 / 2], FIPS: 22007, State Name: LOUISIANA, County Name: Assumption
Year Progress: [1 / 1], Downloading 2023's AG Imagery for the county 22007


dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.71it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.97it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.90it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 12.00it/s]


 Downloading Sentinel-2 Imagery (NDVI), Progress: [1 / 2], FIPS: 10003, State Name: DELAWARE, County Name: New Castle
Year Progress: [1 / 1], Downloading 2023's NDVI Imagery for the county 10003


dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.88it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.82it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 12.00it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.92it/s]


 Downloading Sentinel-2 Imagery (NDVI), Progress: [2 / 2], FIPS: 22007, State Name: LOUISIANA, County Name: Assumption
Year Progress: [1 / 1], Downloading 2023's NDVI Imagery for the county 22007


dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.94it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 12.19it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.95it/s]
dates processed: 100%|██████████| 6/6 [00:00<00:00, 11.90it/s]


✅ Found ┊ model=hrrr ┊ [3mproduct=sfc[0m ┊ [38;2;41;130;13m2023-Jan-01 00:00 UTC[92m F00[0m ┊ [38;2;255;153;0m[3mGRIB2 @ aws[0m ┊ [38;2;255;153;0m[3mIDX @ aws[0m
✅ File exists, skipping: data/HRRR/realtime_wrf/2023/20230101/hrrr.20230101.00.00.grib2
✅ Found ┊ model=hrrr ┊ [3mproduct=sfc[0m ┊ [38;2;41;130;13m2023-Jan-01 01:00 UTC[92m F00[0m ┊ [38;2;255;153;0m[3mGRIB2 @ aws[0m ┊ [38;2;255;153;0m[3mIDX @ aws[0m
✅ File exists, skipping: data/HRRR/realtime_wrf/2023/20230101/hrrr.20230101.01.00.grib2
✅ Found ┊ model=hrrr ┊ [3mproduct=sfc[0m ┊ [38;2;41;130;13m2023-Jan-01 02:00 UTC[92m F00[0m ┊ [38;2;255;153;0m[3mGRIB2 @ aws[0m ┊ [38;2;255;153;0m[3mIDX @ aws[0m
✅ File exists, skipping: data/HRRR/realtime_wrf/2023/20230101/hrrr.20230101.02.00.grib2
✅ Found ┊ model=hrrr ┊ [3mproduct=sfc[0m ┊ [38;2;41;130;13m2023-Jan-01 03:00 UTC[92m F00[0m ┊ [38;2;255;153;0m[3mGRIB2 @ aws[0m ┊ [38;2;255;153;0m[3mIDX @ aws[0m
✅ File exists, skipping: data/HRRR/realtime_

In [9]:
# Use the "base_fir" to specify where the CropNet data is stored
retriever = DataRetriever(base_dir="/mnt/data/CropNet")
   
# Retrieve the 2022 USDA Soybean data
usda_data = retriever.retrieve_USDA(crop_type="Soybean", fips_codes=["10003", "22007"], years=["2022"])
   
# Retrieve the 2022 Sentinel-2 Imagery data
sentinel2_data = retriever.retrieve_Sentinel2(fips_codes=["10003", "22007"], years=["2022"], image_type="AG")
sentinel2_data = retriever.retrieve_Sentinel2(fips_codes=["10003", "22007"], years=["2022"], image_type="NDVI")
   
# Retrieve the 2022 WRF-HRRR data
hrrr_data = retriever.retrieve_HRRR(fips_codes=["10003","22007"], years=["2022"])

NameError: name 'DataRetriever' is not defined

In [15]:
import pandas as pd

# Read the USDA Corn County 2022 data
usda_corn_path = '/Users/RavenMott1/Downloads/Cropnet/data/USDA Crop Dataset/data/Corn/2022/USDA_Corn_County_2022.csv'
usda_corn_df = pd.read_csv(usda_corn_path)

# Read the HRRR computed data for AL, March 2022
hrrr_march_path = '/Users/RavenMott1/Downloads/Cropnet/data/New Folder With Items/WRF-HRRR Computed Dataset 7/data/2022/AL/HRRR_01_AL_2022-03.csv'
hrrr_march_df = pd.read_csv(hrrr_march_path)

In [None]:
usda_corn_df.head()


Unnamed: 0,commodity_desc,reference_period_desc,year,state_ansi,state_name,county_ansi,county_name,asd_code,asd_desc,domain_desc,source_desc,agg_level_desc,"PRODUCTION, MEASURED IN BU","YIELD, MEASURED IN BU / ACRE"
0,CORN,YEAR,2022,1,ALABAMA,3,BALDWIN,50,COASTAL PLAINS & GULF COAST,TOTAL,SURVEY,COUNTY,1020000.0,139.0
1,CORN,YEAR,2022,1,ALABAMA,5,BARBOUR,60,WIREGRASS,TOTAL,SURVEY,COUNTY,262000.0,159.8
2,CORN,YEAR,2022,1,ALABAMA,9,BLOUNT,20,MOUNTAINS & EASTERN VALLEY,TOTAL,SURVEY,COUNTY,242000.0,100.4
3,CORN,YEAR,2022,1,ALABAMA,15,CALHOUN,20,MOUNTAINS & EASTERN VALLEY,TOTAL,SURVEY,COUNTY,319000.0,142.4
4,CORN,YEAR,2022,1,ALABAMA,19,CHEROKEE,20,MOUNTAINS & EASTERN VALLEY,TOTAL,SURVEY,COUNTY,633000.0,138.5


In [17]:
usda_corn_df.tail()

Unnamed: 0,commodity_desc,reference_period_desc,year,state_ansi,state_name,county_ansi,county_name,asd_code,asd_desc,domain_desc,source_desc,agg_level_desc,"PRODUCTION, MEASURED IN BU","YIELD, MEASURED IN BU / ACRE"
1511,CORN,YEAR,2022,55,WISCONSIN,133,WAUKESHA,90,SOUTHEAST,TOTAL,SURVEY,COUNTY,3493000.0,181.0
1512,CORN,YEAR,2022,55,WISCONSIN,135,WAUPACA,50,CENTRAL,TOTAL,SURVEY,COUNTY,6588000.0,175.2
1513,CORN,YEAR,2022,55,WISCONSIN,137,WAUSHARA,50,CENTRAL,TOTAL,SURVEY,COUNTY,5800000.0,174.7
1514,CORN,YEAR,2022,55,WISCONSIN,139,WINNEBAGO,60,EAST CENTRAL,TOTAL,SURVEY,COUNTY,5862000.0,171.9
1515,CORN,YEAR,2022,55,WISCONSIN,141,WOOD,50,CENTRAL,TOTAL,SURVEY,COUNTY,4312000.0,167.8


In [18]:
hrrr_march_df.head()


Unnamed: 0,Year,Month,Day,Daily/Monthly,State,County,FIPS Code,Grid Index,Lat (llcrnr),Lon (llcrnr),...,Max Temperature (K),Min Temperature (K),Precipitation (kg m**-2),Relative Humidity (%),Wind Gust (m s**-1),Wind Speed (m s**-1),U Component of Wind (m s**-1),V Component of Wind (m s**-1),Downward Shortwave Radiation Flux (W m**-2),Vapor Pressure Deficit (kPa)
0,2022,3,1.0,Daily,ALABAMA,AUTAUGA,1001,0.0,32.340803,-86.917595,...,294.298,274.949,0.0,54.8,2.544,4.196,0.27,-3.479,5848.2,0.779
1,2022,3,2.0,Daily,ALABAMA,AUTAUGA,1001,0.0,32.340803,-86.917595,...,298.036,273.7,0.0,55.8,1.832,2.132,1.848,-0.676,5893.6,0.933
2,2022,3,3.0,Daily,ALABAMA,AUTAUGA,1001,0.0,32.340803,-86.917595,...,298.46,275.565,0.0,55.7,2.034,2.748,2.565,0.303,5891.3,1.0
3,2022,3,4.0,Daily,ALABAMA,AUTAUGA,1001,0.0,32.340803,-86.917595,...,301.517,276.623,0.0,50.2,2.46,2.383,0.238,-0.048,5915.7,1.228
4,2022,3,5.0,Daily,ALABAMA,AUTAUGA,1001,0.0,32.340803,-86.917595,...,301.18,281.089,0.0,52.4,5.587,6.994,-1.547,6.306,5592.2,1.147


In [19]:
hrrr_march_df.tail()

Unnamed: 0,Year,Month,Day,Daily/Monthly,State,County,FIPS Code,Grid Index,Lat (llcrnr),Lon (llcrnr),...,Max Temperature (K),Min Temperature (K),Precipitation (kg m**-2),Relative Humidity (%),Wind Gust (m s**-1),Wind Speed (m s**-1),U Component of Wind (m s**-1),V Component of Wind (m s**-1),Downward Shortwave Radiation Flux (W m**-2),Vapor Pressure Deficit (kPa)
48794,2022,3,,Monthly,ALABAMA,TUSCALOOSA,1125,,,,...,294.599131,281.633147,3.876492,59.495776,5.155607,3.558309,0.057709,0.554891,4486.816263,0.777955
48795,2022,3,,Monthly,ALABAMA,WALKER,1127,,,,...,293.792082,281.236236,4.471177,60.890092,5.136324,2.84657,0.025378,0.561467,4404.396332,0.726105
48796,2022,3,,Monthly,ALABAMA,WASHINGTON,1129,,,,...,296.365106,282.628488,4.379315,63.770634,5.341186,5.059961,0.143978,-0.015529,4515.20185,0.772841
48797,2022,3,,Monthly,ALABAMA,WILCOX,1131,,,,...,296.035998,281.887556,4.491034,61.483455,4.969465,4.544999,0.396532,0.384566,4559.145533,0.811691
48798,2022,3,,Monthly,ALABAMA,WINSTON,1133,,,,...,292.649974,280.635458,4.835881,62.832258,5.573339,2.467787,0.068394,0.453745,4368.483452,0.650171


In [11]:
# Number of rows and columns
print("usda_corn_df shape:", usda_corn_df.shape)
print("hrrr_march_df shape:", hrrr_march_df.shape)

# Data types
print("\nusda_corn_df data types:\n", usda_corn_df.dtypes)
print("\nhrrr_march_df data types:\n", hrrr_march_df.dtypes)

# Check for missing values
print("\nusda_corn_df missing values:\n", usda_corn_df.isnull().sum())
print("\nhrrr_march_df missing values:\n", hrrr_march_df.isnull().sum())

# Quick summary of categorical and numerical columns
print("\nusda_corn_df categorical columns:", usda_corn_df.select_dtypes(include='object').columns.tolist())
print("usda_corn_df numerical columns:", usda_corn_df.select_dtypes(include=['number']).columns.tolist())

print("\nhrrr_march_df categorical columns:", hrrr_march_df.select_dtypes(include='object').columns.tolist())
print("hrrr_march_df numerical columns:", hrrr_march_df.select_dtypes(include=['number']).columns.tolist())


usda_corn_df shape: (1516, 14)
hrrr_march_df shape: (48799, 23)

usda_corn_df data types:
 commodity_desc                   object
reference_period_desc            object
year                              int64
state_ansi                        int64
state_name                       object
county_ansi                       int64
county_name                      object
asd_code                          int64
asd_desc                         object
domain_desc                      object
source_desc                      object
agg_level_desc                   object
PRODUCTION, MEASURED IN BU      float64
YIELD, MEASURED IN BU / ACRE    float64
dtype: object

hrrr_march_df data types:
 Year                                             int64
Month                                            int64
Day                                            float64
Daily/Monthly                                   object
State                                           object
County                          

In [None]:
# Data Cleaning and Standardization
# --- 1. Standardize column names (snake_case, no special characters) ---
def clean_column_names(df):
    df.columns = (
        df.columns.str.strip()
                  .str.lower()
                  .str.replace(r"[^\w\s]", "", regex=True)
                  .str.replace(r"\s+", "_", regex=True)
    )
    return df

usda_corn_df = clean_column_names(usda_corn_df)
hrrr_march_df = clean_column_names(hrrr_march_df)

# --- 2. Standardize county/state names (uppercase, no extra spaces) ---
def standardize_location_names(df, county_col="county", state_col="state"):
    if county_col in df.columns:
        df[county_col] = df[county_col].astype(str).str.strip().str.upper()
    if state_col in df.columns:
        df[state_col] = df[state_col].astype(str).str.strip().str.upper()
    return df

usda_corn_df = standardize_location_names(usda_corn_df, county_col="county_name", state_col="state_name")
hrrr_march_df = standardize_location_names(hrrr_march_df)

# --- 3. Convert temperature values from Kelvin to Celsius ---
temp_k_cols = [col for col in hrrr_march_df.columns if "temperature" in col and "(k)" in col.lower()]
for col in temp_k_cols:
    hrrr_march_df[col.replace("(k)", "(celsius)")] = hrrr_march_df[col] - 273.15
    del hrrr_march_df[col]  # remove original Kelvin column if not needed

# --- 4. Combine year, month, and day into a single date column ---
# Fill missing Day values if necessary (e.g., with 1 for monthly data)
hrrr_march_df["day"] = hrrr_march_df["day"].fillna(1).astype(int)
hrrr_march_df["date"] = pd.to_datetime(hrrr_march_df[["year", "month", "day"]], errors="coerce")
