### Importing Libraries

### Exploring GFS Data

In [2]:
# Base directory containing the 5 folders
base_dir = "GFS_Data_6to48"
subfolders = ["Prate", "Pwater", "Pressure", "Temperature", "RH"]

In [3]:
for subfolder in subfolders:
    folder_path = os.path.join(base_dir, subfolder)
    
    # Pick the first NetCDF file in the folder
    files = [f for f in os.listdir(folder_path) if f.endswith(".nc")]
    if not files:
        print(f"❌ No NetCDF files found in {subfolder}")
        continue
    
    file_path = os.path.join(folder_path, files[0])
    
    # Load the dataset
    print(f"\n🔍 Exploring: {file_path}")
    ds = xr.open_dataset(file_path)
    
    # Show basic info
    print(ds)
    
    # Print variables
    print("\n📦 Variables:")
    for var in ds.data_vars:
        print(f" - {var}")
    
    # Print dimensions
    print("\n📏 Dimensions:")
    for dim in ds.dims:
        print(f" - {dim}: {ds.dims[dim]}")
    
    # Print coordinate ranges
    print("\n🗺️ Coordinates:")
    for coord in ds.coords:
        print(f" - {coord}: {ds[coord].values[:3]} ... {ds[coord].values[-3:]}")


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'GFS_Data_6to48\\Prate'

In [None]:
# Define constants
VARIABLES = ['R_H_L100', 'TMP_L100', 'PRES_L1', 'P_WAT_L200', 'PRATE_L1']
INIT_TIMES = ['00', '06', '12', '18']
FHR_DICT = {
    '00': range(18, 43, 3),  # 18 to 42
    '06': range(12, 37, 3),
    '12': range(6, 31, 3),
    '18': range(24, 49, 3),
}

# Folder where NetCDF files are stored
BASE_DIR = "GFS_Data_6to48"  # Update to actual path
OUTPUT_DIR = "output_features"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def parse_filename(filename):
    # Example: gfs.0p25.20190101_00_f018.grib2.nc
    parts = filename.split('.')
    datetime_part = parts[2]  # '20190101_00_f018'
    dt_str = datetime_part.split('_')
    date_str, hour_str, fhr_str = dt_str[0], dt_str[1], dt_str[2]
    init_date = datetime.strptime(date_str, "%Y%m%d")
    init_hour = hour_str
    fhr = int(fhr_str[1:])
    return init_date, init_hour, fhr


# Build file index
file_index = {var: defaultdict(dict) for var in VARIABLES}

for var in VARIABLES:
    folder = os.path.join(BASE_DIR, var)
    for fname in os.listdir(folder):
        if not fname.endswith(".nc"):
            continue
        init_date, init_hour, fhr = parse_filename(fname)
        file_index[var][(init_date.strftime("%Y-%m-%d"), init_hour)][fhr] = os.path.join(folder, fname)

# Extract lat/lon grid from any one file
sample_file = next(iter(file_index['TMP_L100'].values()))[18]
ds_sample = xr.open_dataset(sample_file)
lats = ds_sample['latitude'].values
lons = ds_sample['longitude'].values
lat_lon_pairs = [(float(lat), float(lon)) for lat in lats for lon in lons]

# Initialize data holders
data_by_location = {loc: [] for loc in lat_lon_pairs}

# Define date range (update as needed)
start_date = datetime(2019, 1, 1)
end_date = datetime(2019, 1, 3)
current_date = start_date

while current_date <= end_date:
    utc_day = current_date - timedelta(hours=5, minutes=30)
    rows_by_loc = {loc: {'date': current_date.strftime("%Y-%m-%d")} for loc in lat_lon_pairs}

    valid_day = True

    for init_time in INIT_TIMES:
        if init_time == '18':
            init_date = utc_day.date() - timedelta(days=2)
        else:
            init_date = utc_day.date() - timedelta(days=1)

        for var in VARIABLES:
            for fhr in FHR_DICT[init_time]:
                init_key = (init_date.strftime("%Y-%m-%d"), init_time)
                filepath = file_index.get(var, {}).get(init_key, {}).get(fhr)

                if not filepath or not os.path.exists(filepath):
                    valid_day = False
                    continue

                try:
                    ds = xr.open_dataset(filepath)
                    data_array = ds[var].isel(time=0).values  # 2D array
                    for i, lat in enumerate(lats):
                        for j, lon in enumerate(lons):
                            key = (float(lat), float(lon))
                            colname = f"{var}_{init_time}_f{fhr:03d}"
                            rows_by_loc[key][colname] = float(data_array[i, j])
                except Exception as e:
                    print(f"Error reading {filepath}: {e}")
                    valid_day = False

    if valid_day:
        for loc, row in rows_by_loc.items():
            data_by_location[loc].append(row)

    current_date += timedelta(days=1)

# Save CSVs
for loc, records in data_by_location.items():
    lat, lon = loc
    df = pd.DataFrame(records)
    filename = f"lat_{lat:.2f}_lon_{lon:.2f}.csv"
    df.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)

"Code completed. Sample CSVs will be saved in output_features directory."



IndexError: list index out of range

#### Converting data into date-feature data for individual lat-long pairs 

In [3]:
import os
import pandas as pd
from collections import defaultdict
from datetime import datetime, timedelta
from netCDF4 import Dataset
from tqdm import tqdm
from pathlib import Path

# ------------------------ Step 0: Setup ------------------------
variables = ['R_H_L100', 'TMP_L100', 'PRES_L1', 'P_WAT_L200', 'PRATE_L1']
init_times = ['00', '06', '12', '18']
forecast_hours_needed = {
    '00': range(18, 43, 3),
    '06': range(12, 37, 3),
    '12': range(6, 31, 3),
    '18': range(24, 49, 3),
}

data_root = 'GFS_Data_6to48'
output_dir = 'output_features'
os.makedirs(output_dir, exist_ok=True)

checkpoint_file = Path("last_processed_date.txt")

# ------------------------ Step 1: Parse filename ------------------------
def parse_filename(filename):
    # gfs.0p25.2019011712.f030.grib2.nc
    parts = filename.split('.')
    init_str = parts[2]  # '2019011712'
    fhr = int(parts[3][1:])  # 'f030' -> 30
    init_date = init_str[:8]  # '20190117'
    init_hour = init_str[8:]  # '12'
    return init_date, init_hour, fhr

# ------------------------ Step 2: Build file index ------------------------
file_index = {var: defaultdict(dict) for var in variables}

for var in tqdm(variables, desc="Indexing variables"):
    var_dir = os.path.join(data_root, var)
    for file in os.listdir(var_dir):
        if not file.endswith('.nc'):
            continue
        init_date, init_hour, fhr = parse_filename(file)
        full_path = os.path.join(var_dir, file)
        file_index[var][(init_date, init_hour)][fhr] = full_path

# ------------------------ Step 3: Get lat/lon grid ------------------------
sample_path = next(iter(next(iter(file_index[variables[0]].values())).values()))
with Dataset(sample_path, 'r') as ds:
    lats = ds.variables['lat'][:]
    lons = ds.variables['lon'][:]

lat_lon_grid = [(float(lat), float(lon)) for lat in lats for lon in lons]

# ------------------------ Step 4: Date Range ------------------------
start_date = datetime(2019, 7, 1)
end_date = datetime(2025, 3, 30)
all_dates_in_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

# Resume support: skip dates already processed
if checkpoint_file.exists():
    last_date_str = checkpoint_file.read_text().strip()
    last_date = datetime.strptime(last_date_str, '%Y-%m-%d')
    all_dates_in_range = [d for d in all_dates_in_range if d > last_date]

# ------------------------ Step 5: Feature Extraction and CSV Writing ------------------------
for day_d in tqdm(all_dates_in_range, desc="Processing dates"):
    utc_d = day_d - timedelta(hours=5, minutes=30)
    feature_row_by_location = defaultdict(dict)

    for init_time in init_times:
        if init_time == '18':
            init_day = (utc_d - timedelta(days=2)).strftime('%Y%m%d')
        else:
            init_day = (utc_d - timedelta(days=1)).strftime('%Y%m%d')

        for var in variables:
            for fhr in forecast_hours_needed[init_time]:
                file = file_index[var].get((init_day, init_time), {}).get(fhr, None)
                if not file or not os.path.exists(file):
                    print(f"Missing: {var} | {init_day} {init_time} | f{fhr:03d}")
                    continue

                try:
                    with Dataset(file, 'r') as ds:
                        data = ds.variables[var][0, :, :]  # (lat, lon)
                        for i, lat in enumerate(lats):
                            for j, lon in enumerate(lons):
                                key = (float(lat), float(lon))
                                col_name = f"{var}_{init_time}_f{fhr:03d}"
                                feature_row_by_location[key][col_name] = float(data[i, j])
                except Exception as e:
                    print(f"Error reading {file}: {e}")
                    continue

    # Write to CSV immediately
    for loc, row in feature_row_by_location.items():
        row['date'] = day_d.strftime('%Y-%m-%d')
        df_row = pd.DataFrame([row])
        lat, lon = loc
        out_path = os.path.join(output_dir, f"lat_{lat}_lon_{lon}.csv")
        write_header = not os.path.exists(out_path)
        df_row.to_csv(out_path, index=False, mode='a', header=write_header)

    # Save checkpoint
    checkpoint_file.write_text(day_d.strftime('%Y-%m-%d'))


Indexing variables: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]
Processing dates:  65%|██████▌   | 1369/2100 [4:26:35<2:12:16, 10.86s/it]

Missing: R_H_L100 | 20230328 18 | f042
Missing: TMP_L100 | 20230328 18 | f042
Missing: PRES_L1 | 20230328 18 | f042
Missing: P_WAT_L200 | 20230328 18 | f042
Missing: PRATE_L1 | 20230328 18 | f042


Processing dates:  78%|███████▊  | 1641/2100 [5:17:22<1:36:55, 12.67s/it]

Missing: R_H_L100 | 20231225 18 | f024
Missing: R_H_L100 | 20231225 18 | f027
Missing: R_H_L100 | 20231225 18 | f030
Missing: R_H_L100 | 20231225 18 | f033
Missing: R_H_L100 | 20231225 18 | f036
Missing: R_H_L100 | 20231225 18 | f039
Missing: R_H_L100 | 20231225 18 | f042
Missing: R_H_L100 | 20231225 18 | f045
Missing: R_H_L100 | 20231225 18 | f048
Missing: TMP_L100 | 20231225 18 | f024
Missing: TMP_L100 | 20231225 18 | f027
Missing: TMP_L100 | 20231225 18 | f030
Missing: TMP_L100 | 20231225 18 | f033
Missing: TMP_L100 | 20231225 18 | f036
Missing: TMP_L100 | 20231225 18 | f039
Missing: TMP_L100 | 20231225 18 | f042
Missing: TMP_L100 | 20231225 18 | f045
Missing: TMP_L100 | 20231225 18 | f048
Missing: PRES_L1 | 20231225 18 | f024
Missing: PRES_L1 | 20231225 18 | f027
Missing: PRES_L1 | 20231225 18 | f030
Missing: PRES_L1 | 20231225 18 | f033
Missing: PRES_L1 | 20231225 18 | f036
Missing: PRES_L1 | 20231225 18 | f039
Missing: PRES_L1 | 20231225 18 | f042
Missing: PRES_L1 | 20231225 18 |

Processing dates:  85%|████████▌ | 1787/2100 [5:43:30<53:50, 10.32s/it]  

Missing: R_H_L100 | 20240519 18 | f024
Missing: R_H_L100 | 20240519 18 | f027
Missing: R_H_L100 | 20240519 18 | f030
Missing: R_H_L100 | 20240519 18 | f033
Missing: R_H_L100 | 20240519 18 | f036
Missing: R_H_L100 | 20240519 18 | f039
Missing: R_H_L100 | 20240519 18 | f042
Missing: R_H_L100 | 20240519 18 | f045
Missing: R_H_L100 | 20240519 18 | f048
Missing: TMP_L100 | 20240519 18 | f024
Missing: TMP_L100 | 20240519 18 | f027
Missing: TMP_L100 | 20240519 18 | f030
Missing: TMP_L100 | 20240519 18 | f033
Missing: TMP_L100 | 20240519 18 | f036
Missing: TMP_L100 | 20240519 18 | f039
Missing: TMP_L100 | 20240519 18 | f042
Missing: TMP_L100 | 20240519 18 | f045
Missing: TMP_L100 | 20240519 18 | f048
Missing: PRES_L1 | 20240519 18 | f024
Missing: PRES_L1 | 20240519 18 | f027
Missing: PRES_L1 | 20240519 18 | f030
Missing: PRES_L1 | 20240519 18 | f033
Missing: PRES_L1 | 20240519 18 | f036
Missing: PRES_L1 | 20240519 18 | f039
Missing: PRES_L1 | 20240519 18 | f042
Missing: PRES_L1 | 20240519 18 |

Processing dates:  85%|████████▌ | 1795/2100 [5:44:52<51:54, 10.21s/it]

Missing: R_H_L100 | 20240528 06 | f015
Missing: R_H_L100 | 20240528 06 | f018
Missing: R_H_L100 | 20240528 06 | f021
Missing: R_H_L100 | 20240528 06 | f024
Missing: R_H_L100 | 20240528 06 | f027
Missing: R_H_L100 | 20240528 06 | f030
Missing: R_H_L100 | 20240528 06 | f033
Missing: R_H_L100 | 20240528 06 | f036
Missing: TMP_L100 | 20240528 06 | f015
Missing: TMP_L100 | 20240528 06 | f018
Missing: TMP_L100 | 20240528 06 | f021
Missing: TMP_L100 | 20240528 06 | f024
Missing: TMP_L100 | 20240528 06 | f027
Missing: TMP_L100 | 20240528 06 | f030
Missing: TMP_L100 | 20240528 06 | f033
Missing: TMP_L100 | 20240528 06 | f036
Missing: PRES_L1 | 20240528 06 | f015
Missing: PRES_L1 | 20240528 06 | f018
Missing: PRES_L1 | 20240528 06 | f021
Missing: PRES_L1 | 20240528 06 | f024
Missing: PRES_L1 | 20240528 06 | f027
Missing: PRES_L1 | 20240528 06 | f030
Missing: PRES_L1 | 20240528 06 | f033
Missing: PRES_L1 | 20240528 06 | f036
Missing: P_WAT_L200 | 20240528 06 | f015
Missing: P_WAT_L200 | 20240528 

Processing dates:  91%|█████████ | 1905/2100 [6:23:38<1:30:43, 27.92s/it]

Missing: R_H_L100 | 20240915 00 | f039
Missing: PRES_L1 | 20240915 00 | f039
Missing: P_WAT_L200 | 20240915 00 | f039
Missing: PRATE_L1 | 20240915 00 | f039


Processing dates:  92%|█████████▏| 1941/2100 [6:40:11<1:11:52, 27.12s/it]

Missing: R_H_L100 | 20241021 12 | f006
Missing: R_H_L100 | 20241021 12 | f009
Missing: R_H_L100 | 20241021 12 | f012
Missing: R_H_L100 | 20241021 12 | f015
Missing: R_H_L100 | 20241021 12 | f018
Missing: R_H_L100 | 20241021 12 | f021
Missing: R_H_L100 | 20241021 12 | f024
Missing: R_H_L100 | 20241021 12 | f027
Missing: R_H_L100 | 20241021 12 | f030
Missing: TMP_L100 | 20241021 12 | f006
Missing: TMP_L100 | 20241021 12 | f009
Missing: TMP_L100 | 20241021 12 | f012
Missing: TMP_L100 | 20241021 12 | f015
Missing: TMP_L100 | 20241021 12 | f018
Missing: TMP_L100 | 20241021 12 | f021
Missing: TMP_L100 | 20241021 12 | f024
Missing: TMP_L100 | 20241021 12 | f027
Missing: TMP_L100 | 20241021 12 | f030
Missing: PRES_L1 | 20241021 12 | f006
Missing: PRES_L1 | 20241021 12 | f009
Missing: PRES_L1 | 20241021 12 | f012
Missing: PRES_L1 | 20241021 12 | f015
Missing: PRES_L1 | 20241021 12 | f018
Missing: PRES_L1 | 20241021 12 | f021
Missing: PRES_L1 | 20241021 12 | f024
Missing: PRES_L1 | 20241021 12 |

Processing dates:  93%|█████████▎| 1957/2100 [6:47:26<1:05:44, 27.58s/it]

Missing: R_H_L100 | 20241106 06 | f030
Missing: R_H_L100 | 20241106 06 | f033
Missing: R_H_L100 | 20241106 06 | f036
Missing: TMP_L100 | 20241106 06 | f030
Missing: TMP_L100 | 20241106 06 | f033
Missing: TMP_L100 | 20241106 06 | f036
Missing: PRES_L1 | 20241106 06 | f030
Missing: PRES_L1 | 20241106 06 | f033
Missing: PRES_L1 | 20241106 06 | f036
Missing: P_WAT_L200 | 20241106 06 | f030
Missing: P_WAT_L200 | 20241106 06 | f033
Missing: P_WAT_L200 | 20241106 06 | f036
Missing: PRATE_L1 | 20241106 06 | f030
Missing: PRATE_L1 | 20241106 06 | f033
Missing: PRATE_L1 | 20241106 06 | f036
Missing: R_H_L100 | 20241106 12 | f006
Missing: R_H_L100 | 20241106 12 | f009
Missing: R_H_L100 | 20241106 12 | f012
Missing: R_H_L100 | 20241106 12 | f015
Missing: R_H_L100 | 20241106 12 | f018
Missing: R_H_L100 | 20241106 12 | f021
Missing: R_H_L100 | 20241106 12 | f024
Missing: R_H_L100 | 20241106 12 | f027
Missing: R_H_L100 | 20241106 12 | f030
Missing: TMP_L100 | 20241106 12 | f006
Missing: TMP_L100 | 20

Processing dates: 100%|██████████| 2100/2100 [7:52:18<00:00, 13.49s/it]  
