In [None]:
%pip install netCDF4
%pip install numpy

In [3]:
import netCDF4
from netCDF4 import Dataset
import numpy as np
import glob
import os 
import pandas as pd

In [None]:
# Read the coordinate, precipitation-flag and scan-time variables from one file.
# The context manager closes the file handle as soon as the arrays have been
# read, instead of leaving the dataset open for the rest of the session.
with Dataset("2A.GPM.Ku.V9-20211125.20160101-S002036-E015307.010461.V07A.HDF5.nc4", mode='r') as nc:
    print(nc.variables)

    # `[:]` reads the full contents of each variable into memory.
    lon = nc.variables["FS_Longitude"][:]
    lat = nc.variables["FS_Latitude"][:]
    flag_precip = nc.variables["FS_PRE_flagPrecip"][:]
    type_precip = nc.variables["FS_CSF_typePrecip"][:]

    year = nc.variables["FS_ScanTime_Year"][:]
    doy = nc.variables["FS_ScanTime_DayOfYear"][:]
    month = nc.variables["FS_ScanTime_Month"][:]
    day = nc.variables["FS_ScanTime_DayOfMonth"][:]

def broadcast_to_grid(arr_1d, target_shape):
    """Expand a 1-D array into a 2-D grid by repeating every element along a new second axis.

    Parameters
    ----------
    arr_1d : array with an ``ndim`` attribute
        Values to expand. Anything that is not one-dimensional is returned as is.
    target_shape : sequence
        Only ``target_shape[1]`` is read; it is the number of columns to produce.

    Returns
    -------
    array
        Shape ``(len(arr_1d), target_shape[1])`` for 1-D input, otherwise
        ``arr_1d`` itself.
    """
    if arr_1d.ndim != 1:
        return arr_1d
    as_column = arr_1d[:, np.newaxis]
    n_cols = target_shape[1]
    return np.repeat(as_column, n_cols, axis=1)

# Give every time field the same 2-D layout as `lon` (1-D inputs are repeated
# across columns by broadcast_to_grid; other inputs pass through unchanged).
year_2d, doy_2d, month_2d, day_2d = (
    broadcast_to_grid(field, lon.shape) for field in (year, doy, month, day)
)

def flat(a):
    """Return ``a`` collapsed to one dimension using its own ``ravel`` method."""
    flattened = a.ravel()
    return flattened

# Output column name -> source array, listed in the order the columns appear.
fs_columns = {
    "Longitude": lon,
    "Latitude": lat,
    "FlagPrecip": flag_precip,
    "TypePrecip": type_precip,
    "Year": year_2d,
    "Month": month_2d,
    "DayOfMonth": day_2d,
    "DayOfYear": doy_2d,
}

# Flatten every array so that each array element becomes one row of the table.
df = pd.DataFrame({name: flat(values) for name, values in fs_columns.items()})

print(df.head())
print(f"DataFrame shape: {df.shape}")

# Persist the table without the pandas index column.
df.to_csv("gpm_fs_full.csv", index=False)

In [None]:
def nc_filenames(pattern="data/*.nc4"):
    """Return the paths matching ``pattern`` in sorted order.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern to search. Defaults to ``"data/*.nc4"``.

    Returns
    -------
    list of str
        Matching paths, sorted; an empty list when nothing matches.
    """
    # glob.glob makes no ordering guarantee, so sort to keep the processing
    # order reproducible between runs and machines.
    return sorted(glob.glob(pattern))

In [None]:
def broadcast_to_grid(arr_1d, target_shape):
    """Repeat a 1-D array across ``target_shape[1]`` columns to form a 2-D grid.

    Inputs whose ``ndim`` is not 1 are handed back unchanged. Only the second
    entry of ``target_shape`` is used.

    NOTE(review): a function with this name and the same logic is also defined
    in an earlier cell of this notebook; this definition replaces it when run.
    """
    is_vector = arr_1d.ndim == 1
    if not is_vector:
        return arr_1d
    return np.repeat(arr_1d[:, np.newaxis], repeats=target_shape[1], axis=1)

In [None]:
def flat(a):
    """Collapse ``a`` to a single dimension by calling ``a.ravel()``.

    NOTE(review): the same helper is also defined in an earlier cell of this
    notebook; this definition replaces it when run.
    """
    one_dim = a.ravel()
    return one_dim

In [None]:
# Collect the input files up front and stop early when there is nothing to convert.
fnames = nc_filenames()
if len(fnames) == 0:
    raise FileNotFoundError("No .nc4 files found in 'data/' folder.")

In [None]:
# Convert every input file to a CSV with one row per array element, adding a
# binary "ItRained" column derived from the precipitation flag.

# Create the output folder up front so to_csv cannot fail on a missing directory.
os.makedirs("nc4tocsv", exist_ok=True)

for f, ncfile in enumerate(fnames, start=1):
    print(f"\nProcessing file {f}/{len(fnames)}: {ncfile}")

    with Dataset(ncfile, mode="r") as nc:
        lon = nc.variables["FS_Longitude"][:]
        lat = nc.variables["FS_Latitude"][:]
        # The _FillValue attribute belongs to the netCDF variable, not to the
        # array returned by slicing it, and netCDF4 masks fill/missing values on
        # read by default. Fill masked flags with 0 here so they are stored and
        # counted as "no precipitation".
        flag_precip = np.ma.filled(nc.variables["FS_PRE_flagPrecip"][:], 0)
        air_temp = nc.variables["FS_VER_airTemperature"][:]
        precip_water = nc.variables["FS_SLV_precipWater"][:]
        precip_water_int = nc.variables["FS_SLV_precipWaterIntegrated"][:]
        year = nc.variables["FS_ScanTime_Year"][:]
        month = nc.variables["FS_ScanTime_Month"][:]

        # Repeat 1-D time fields across columns so they line up with lon.
        year_2d = broadcast_to_grid(year, lon.shape)
        month_2d = broadcast_to_grid(month, lon.shape)

        # Reduce 3-D fields to a single 2-D slice.
        # NOTE(review): air_temp and precip_water take index 0 of the FIRST axis,
        # while precip_water_int takes index 0 of the LAST axis. Confirm the
        # dimension order of these variables; a slice whose size differs from
        # lon's will make the DataFrame construction below fail on unequal
        # column lengths.
        if air_temp.ndim == 3:
            air_temp = air_temp[0, :, :]
        if precip_water.ndim == 3:
            precip_water = precip_water[0, :, :]

        if precip_water_int.ndim == 3:
            precip_water_int = precip_water_int[:, :, 0]

        # Shape summary, useful for spotting the mismatch described above.
        print({
            "lon": lon.shape,
            "lat": lat.shape,
            "flag_precip": flag_precip.shape,
            "air_temp": air_temp.shape,
            "precip_water": precip_water.shape,
            "precip_water_int": precip_water_int.shape,
        })

        df = pd.DataFrame({
            "Longitude": flat(lon),
            "Latitude": flat(lat),
            "FlagPrecip": flat(flag_precip),
            "AirTemperature": flat(air_temp),
            "PrecipWater": flat(precip_water),
            "PrecipWaterIntegrated": flat(precip_water_int),
            "Year": flat(year_2d),
            "Month": flat(month_2d),
        })

        # 1 where the flag is positive, 0 otherwise.
        df["ItRained"] = (df["FlagPrecip"] > 0).astype(int)

    # Name the CSV after the input file, dropping only its last extension.
    base = os.path.splitext(os.path.basename(ncfile))[0]
    csv_path = f"nc4tocsv/{base}.csv"
    df.to_csv(csv_path, index=False)

    rain_cells = df["ItRained"].sum()
    total_points = len(df)
    print(f"→ Rain detected in {rain_cells}/{total_points} grid cells ({rain_cells/total_points*100:.2f}%)")
    print(f"✅ Saved CSV: {csv_path}")


print("\n🎉 All .nc4 files successfully processed and converted to CSV.")

In [7]:

def combine_files(save=False):
    """Concatenate every CSV in ``nc4tocsv/`` into a single DataFrame.

    Parameters
    ----------
    save : bool, optional
        When true, also write the result to
        ``data_comb/combined_data_new.csv``; the folder is created if needed.
        When false, nothing is written or created on disk.

    Returns
    -------
    pandas.DataFrame
        Rows of all input CSVs, with files taken in sorted path order and a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``nc4tocsv/`` contains no CSV files.
    """
    # Sort because glob.glob gives no ordering guarantee; this keeps the row
    # order of the combined frame reproducible.
    csv_files = sorted(glob.glob("nc4tocsv/*.csv"))
    if not csv_files:
        # pd.concat would raise ValueError on an empty list anyway; raise the
        # same type with a message that names the actual problem.
        raise ValueError("No .csv files found in 'nc4tocsv/' folder.")

    frames = [pd.read_csv(path) for path in csv_files]
    combined = pd.concat(frames, ignore_index=True)

    if save:
        os.makedirs("data_comb", exist_ok=True)
        combined.to_csv("data_comb/combined_data_new.csv", index=False)
    return combined

# Combine the per-file CSVs in memory (save=False writes nothing), print the
# summary statistics of the rain indicator, then show the combined frame as
# the cell output. Only the last expression of a cell is displayed, so to see
# the full summary table end a separate cell with `cc.describe()`.
cc = combine_files(save=False)
print(cc["ItRained"].describe())
cc

count    25137.000000
mean         0.015913
std          0.125141
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: ItRained, dtype: float64


Unnamed: 0,Longitude,Latitude,FlagPrecip,AirTemperature,PrecipWater,PrecipWaterIntegrated,Year,Month,ItRained
0,98.069440,-66.285120,0,235.72566,0.0,0.0,2016,1,0
1,98.068860,-66.234560,0,235.60180,0.0,0.0,2016,1,0
2,98.068390,-66.184044,0,235.47795,0.0,0.0,2016,1,0
3,98.068040,-66.134200,0,235.35410,0.0,0.0,2016,1,0
4,98.067800,-66.084820,0,235.23024,0.0,0.0,2016,1,0
...,...,...,...,...,...,...,...,...,...
25132,40.927044,-65.488790,0,231.14207,0.0,0.0,2018,1,0
25133,40.924290,-65.442020,0,231.10574,0.0,0.0,2018,1,0
25134,40.921642,-65.395355,0,231.06941,0.0,0.0,2018,1,0
25135,40.919100,-65.348720,0,231.03308,0.0,0.0,2018,1,0
