In [1]:
import os
import h5py
import numpy as np
import re
from tqdm import tqdm
from datetime import datetime
from scipy.interpolate import griddata

In [17]:
# Folder that holds the input data; only entries whose name ends in ".h5" are read.
# NOTE(review): the three paths below are absolute and machine-specific — edit them before running elsewhere.
input_folder = r"C:\Users\Shantanoo Aher\OneDrive\Documents\Jul25_134497"
# output_X / output_Y are handed straight to np.save() as output *file* paths (not folders).
# np.save appends ".npy" when the name lacks it, so the arrays end up in
# "X_coords.npy" and "Y_coords.npy" inside the "Insat-3DR" parent folder.
output_X = r"C:\Users\Shantanoo Aher\OneDrive\Documents\Insat-3DR\X_coords"
output_Y = r"C:\Users\Shantanoo Aher\OneDrive\Documents\Insat-3DR\Y_coords"
grid_res = 0.25  # spacing of the target lat/lon grid, in degrees
input_len = 6  # number of consecutive files stacked into each input sequence
target_len = 2  # number of immediately following files stacked into each target sequence

In [18]:
# Axes of the regular target grid: latitude from 6 and longitude from 68, both
# stepping by grid_res. The stops carry a 0.01 pad because np.arange excludes its
# stop value; with a step that divides the span evenly this keeps 38 / 98 in.
lat_grid = np.arange(start=6, stop=38.01, step=grid_res)
lon_grid = np.arange(start=68, stop=98.01, step=grid_res)
# 2-D coordinate arrays, each shaped (len(lat_grid), len(lon_grid)).
grid_lon, grid_lat = np.meshgrid(lon_grid, lat_grid, indexing="xy")

In [19]:
#Extract Timestamps from the data

def extract_timestamp(file):
    """Parse the ``_DDMONYYYY_HHMM_`` stamp embedded in a file name.

    Parameters
    ----------
    file : str
        File name (or path) containing a stamp such as ``_25JUL2024_0015_``,
        where the month is an upper-case three-letter English abbreviation.

    Returns
    -------
    numpy.datetime64 or None
        The parsed timestamp, or ``None`` when the name carries no stamp or the
        stamp is not a real calendar date/time (unknown month token, day 31 in
        a shorter month, hour 24, ...). Returning ``None`` instead of raising
        lets the caller filter out odd file names without aborting the listing.
    """
    match = re.search(r'_(\d{2})([A-Z]{3})(\d{4})_(\d{4})_', file)
    if not match:
        return None
    day, mon, year, hhmm = match.groups()
    months = {'JAN':1,'FEB':2,'MAR':3,'APR':4,'MAY':5,'JUN':6,
              'JUL':7,'AUG':8,'SEP':9,'OCT':10,'NOV':11,'DEC':12}
    month = months.get(mon)
    if month is None:
        # Three capital letters that are not a month abbreviation.
        return None
    try:
        dt = datetime(int(year), month, int(day), int(hhmm[:2]), int(hhmm[2:]))
    except ValueError:
        # Digits matched the pattern but do not form a valid date/time.
        return None
    return np.datetime64(dt)

In [20]:
def process_file(file_path):
    """Read one HEM file, regrid it onto the module-level lat/lon grid and standardise it.

    Parameters
    ----------
    file_path : str
        Path of an HDF5 file providing ``Latitude``, ``Longitude`` and ``HEM``
        datasets.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``grid_lon``: the linearly interpolated HEM field,
        standardised with its own NaN-aware mean and standard deviation, then
        passed through ``np.nan_to_num`` (so NaN cells become 0).
    """
    with h5py.File(file_path, 'r') as h5:
        latitudes = h5['Latitude'][:] / 100.0   # stored values are divided by 100
        longitudes = h5['Longitude'][:] / 100.0
        raw_hem = h5['HEM'][0]  # first entry along the leading axis

    # Keep only values in (0, 200]; anything else is marked as missing.
    out_of_range = (raw_hem <= 0) | (raw_hem > 200)
    hem = np.where(out_of_range, np.nan, raw_hem)

    # Scattered (lon, lat) samples -> regular grid.
    # NOTE(review): the NaN-marked samples are passed to griddata as-is — confirm
    # that letting them affect neighbouring grid cells is intended.
    sample_xy = np.column_stack((longitudes.ravel(), latitudes.ravel()))
    sample_values = hem.ravel()
    gridded = griddata(sample_xy, sample_values, (grid_lon, grid_lat), method='linear')

    # Standardise with this file's own statistics; the small epsilon guards
    # against a zero standard deviation.
    mean = np.nanmean(gridded)
    spread = np.nanstd(gridded) + 1e-6
    standardized = (gridded - mean) / spread
    return np.nan_to_num(standardized)

In [21]:
# List the .h5 files, pair each with the timestamp parsed from its name, drop
# names without a parseable timestamp, and order the rest chronologically.
# Sorting by timestamp (not by name) matters: names embed the date as
# DDMONYYYY, so alphabetical order stops being chronological as soon as the
# files span more than one month.
h5_names = sorted(f for f in os.listdir(input_folder) if f.endswith(".h5"))
valid = [(f, t) for f, t in zip(h5_names, map(extract_timestamp, h5_names)) if t is not None]
valid.sort(key=lambda pair: pair[1])  # stable sort: equal timestamps keep name order
if not valid:
    # zip(*[]) would otherwise fail with an opaque "not enough values to unpack".
    raise ValueError(f"No .h5 files with a parseable timestamp found in {input_folder!r}")
file_list, timestamps = zip(*valid)

In [None]:
# Build sliding-window samples: each sample uses `input_len` consecutive files
# as input and the `target_len` files that follow as target.
window_len = input_len + target_len
n_windows = len(file_list) - window_len + 1

# Every file belongs to up to `window_len` overlapping windows. Keep each
# processed grid here so a file is read and interpolated once instead of once
# per window it appears in.
grid_cache = {}


def _load_grid(file_name):
    """Return the processed grid for `file_name`, reusing an earlier successful result."""
    if file_name not in grid_cache:
        grid_cache[file_name] = process_file(os.path.join(input_folder, file_name))
    return grid_cache[file_name]


X_windows = []
Y_windows = []

print("Building input-output sequences...")
for i in tqdm(range(n_windows)):
    try:
        grids = [_load_grid(name) for name in file_list[i:i + window_len]]
        x_sample = np.stack(grids[:input_len])  # shape: (input_len, H, W)
        y_sample = np.stack(grids[input_len:])  # shape: (target_len, H, W)
    except Exception as e:
        # Best effort: a window containing an unreadable file is reported and skipped.
        print(f"Failed at index {i}: {e}")
    else:
        # Append only after both halves were built so X and Y stay aligned.
        X_windows.append(x_sample)
        Y_windows.append(y_sample)
    finally:
        # File i cannot appear in any later window, so its grid can be released.
        grid_cache.pop(file_list[i], None)

if not X_windows:
    # np.stack([]) would otherwise fail with "need at least one array to stack".
    raise ValueError(
        f"No sequences could be built: {window_len} consecutive readable files are "
        f"required, {len(file_list)} candidate file(s) were available."
    )

X_seq = np.stack(X_windows)  # shape: (N, input_len, H, W)
Y_seq = np.stack(Y_windows)  # shape: (N, target_len, H, W)

# np.save does not create folders; make sure the parent of *each* output exists.
for out_path in (output_X, output_Y):
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
np.save(output_X, X_seq)
np.save(output_Y, Y_seq)
print(f"Saved X: {X_seq.shape} to {output_X}")
print(f"Saved Y: {Y_seq.shape} to {output_Y}")

Building input-output sequences...


  0%|                                                                                            | 0/9 [00:00<?, ?it/s]