In [2]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import os

### finding stations with no missing values

In [12]:

def extract_lat_lon(filename):
    base = os.path.basename(filename).replace('.csv', '')
    parts = base.split('_')
    
    try:
        lat_str = parts[-2]
        lon_str = parts[-1]

        def convert(coord_str):
            val = float(coord_str)
            degrees = int(val // 100)
            minutes = val - degrees * 100
            return round(degrees + minutes / 60, 4)

        return convert(lat_str), convert(lon_str)
    except Exception as e:
        print(f"Error parsing filename {filename}: {e}")
        return None, None


def print_files_no_missing_dates_jun_to_sep(folder_path="select-stations"):
    full_date_range = pd.date_range(start='2019-05-01', end='2024-10-31')
    full_date_range = full_date_range[full_date_range.month.isin([6,7,8,9])]
    
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    lat_lon_list = []

    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])

        df_filtered = df[df['Date'].dt.month.isin([6,7,8,9])]
        df_dates = df_filtered['Date']
        missing_dates = full_date_range.difference(df_dates)

        if len(missing_dates) <= 20:
            lat, lon = extract_lat_lon(csv_file)
            if lat is not None and lon is not None:
                lat_lon_list.append((csv_file, lat, lon))

    print("Files with NO missing dates from June to September (2019–2024):")
    for f, lat, lon in lat_lon_list:
        print(f"{f}: lat={lat}, lon={lon}")
    
    print(f"\nTotal files with no missing dates: {len(lat_lon_list)}")
    return lat_lon_list


# Run
lat_lon_list = print_files_no_missing_dates_jun_to_sep("select-stations")

#print the length of each file 
for f, lat, lon in lat_lon_list:
    file_path = os.path.join("select-stations", f)
    df = pd.read_csv(file_path)
    print(f"{f}: {len(df)} rows")

Files with NO missing dates from June to September (2019–2024):
AMBERNATH_9999.0_9999.0.csv: lat=100.65, lon=100.65
BHIWANDI_1918.0_7303.0.csv: lat=19.3, lon=73.05
CANACONA_9999.0_9999.0.csv: lat=100.65, lon=100.65
CHIPLUN_1732.0_7331.0.csv: lat=17.5333, lon=73.5167
COLABA_-_IMD_OBSY_1854.0_7249.0.csv: lat=18.9, lon=72.8167
DAHANU_-_IMD_OBSY_1958.0_7243.0.csv: lat=19.9667, lon=72.7167
DAPOLI_AGRI_1746.0_7312.0.csv: lat=17.7667, lon=73.2
DEVGAD_1623.0_7321.0.csv: lat=16.3833, lon=73.35
DODAMARG_9999.0_9999.0.csv: lat=100.65, lon=100.65
GAGANBAWADA_1633.0_7350.0.csv: lat=16.55, lon=73.8333
GUHAGARH_1728.0_7312.0.csv: lat=17.4667, lon=73.2
HARNAI_IMD_OBSY_1749.0_7306.0.csv: lat=17.8167, lon=73.1
JAWHAR_1955.0_7314.0.csv: lat=19.9167, lon=73.2333
KALYAN_1915.0_7307.0.csv: lat=19.25, lon=73.1167
KANKAVLI_1616.0_7342.0.csv: lat=16.2667, lon=73.7
KARJAT_AGRI_1855.0_7320.0.csv: lat=18.9167, lon=73.3333
KHALAPUR_1852.0_7317.0.csv: lat=18.8667, lon=73.2833
KUDAL_1601.0_7342.0.csv: lat=16.0167, l

- ### combining feature label for XGB

In [14]:
import os
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

def standardize_date_col(df):
    for col in df.columns:
        if col.lower() == 'date':
            df.rename(columns={col: 'Date'}, inplace=True)
            break
    return df

def load_gfs_features(output_features_folder):
    gfs_dfs = {}
    for file in os.listdir(output_features_folder):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(output_features_folder, file))
            df = standardize_date_col(df)
            # Extract lat, lon from filename: lat_<lat>_lon_<lon>.csv
            parts = file.replace('.csv', '').split('_')
            lat = float(parts[1])
            lon = float(parts[3])
            gfs_dfs[(lat, lon)] = df
    return gfs_dfs

def load_station_labels(per_station_folder, lat_lon_list):
    station_dfs = {}
    for filename, lat, lon in lat_lon_list:
        path = os.path.join(per_station_folder, filename)
        if os.path.exists(path):
            df = pd.read_csv(path)
            df = standardize_date_col(df)
            station_dfs[(lat, lon)] = df
        else:
            print(f"Warning: Station file {path} does not exist.")
    return station_dfs

def find_nearest_grid_points(station_coords, grid_coords, k=4):
    tree = cKDTree(grid_coords)
    dists, idxs = tree.query(station_coords, k=k)
    return dists, idxs

def interpolate_features_for_station(station_coord, grid_coords, gfs_dfs, idxs, dists):
    weights = 1 / (dists + 1e-8)  # avoid div by zero
    weights /= weights.sum()      # normalize weights to sum to 1

    combined_features = None
    for i, idx in enumerate(idxs):
        grid_point = grid_coords[idx]
        df = gfs_dfs[tuple(grid_point)].copy()
        df = standardize_date_col(df)

        # Multiply only numeric columns by weight
        numeric_cols = df.select_dtypes(include='number').columns

        if combined_features is None:
            combined_features = df.copy()
            combined_features[numeric_cols] = df[numeric_cols] * weights[i]
        else:
            combined_features[numeric_cols] += df[numeric_cols] * weights[i]

    return combined_features

def create_combined_df(lat_lon_list, per_station_folder, output_features_folder):
    gfs_dfs = load_gfs_features(output_features_folder)
    station_dfs = load_station_labels(per_station_folder, lat_lon_list)

    grid_coords = np.array(list(gfs_dfs.keys()))
    station_coords = np.array([(lat, lon) for _, lat, lon in lat_lon_list])
    dists, idxs = find_nearest_grid_points(station_coords, grid_coords, k=4)

    combined_data = []

    for i, ((filename, slat, slon), dist_list, idx_list) in enumerate(zip(lat_lon_list, dists, idxs)):

        station_df = station_dfs.get((slat, slon))
        if station_df is None:
            print(f"Skipping station {(slat, slon)} - data not found.")
            continue

        # Interpolate features for this station
        interp_features = interpolate_features_for_station(
            (slat, slon), grid_coords, gfs_dfs, idx_list, dist_list
        )

        # Merge on Date column
        merged_df = pd.merge(interp_features, station_df, on='Date', how='inner')

        merged_df['Station'] = filename.replace('.csv', '')
        combined_data.append(merged_df)

    combined_df = pd.concat(combined_data, ignore_index=True)
    return combined_df

# === Your given paths and lat_lon_list ===
per_station_folder = 'per_station_data'
output_features_folder = 'output_features'

# Example lat_lon_list format: [('station1.csv', 12.34, 56.78), ('station2.csv', 12.35, 56.79), ...]
# Make sure this list is properly defined in your code

combined_df = create_combined_df(lat_lon_list, per_station_folder, output_features_folder)
print(combined_df.head())


   R_H_L100_00_f018  R_H_L100_00_f021  R_H_L100_00_f024  R_H_L100_00_f027  \
0         97.901640         98.797032         97.718716         94.869518   
1         98.705669         98.167657         98.437773         92.388017   
2         98.489275         98.271643         97.167092         98.721086   
3         97.646624         98.851686         96.931761         98.512505   
4         99.344286         87.609273         98.603764         99.432101   

   R_H_L100_00_f030  R_H_L100_00_f033  R_H_L100_00_f036  R_H_L100_00_f039  \
0         98.090332         96.404188         99.437046         98.947604   
1         88.546134         91.951644         87.685549         83.421324   
2         96.625117         99.226096         98.921860         98.473635   
3         97.594515         88.123801         95.681743         90.930526   
4         94.408516         88.805648         93.315889         95.219386   

   R_H_L100_00_f042  TMP_L100_00_f018  ...  PRATE_L1_18_f030  \
0         

In [15]:
# put combined_df to a CSV file
output_file = 'combined_interpolated_data.csv'
combined_df.to_csv(output_file, index=False)

- ### combining feature-label for GNN

- kirging

In [4]:
import os
import pandas as pd
import numpy as np
from pykrige.ok import OrdinaryKriging
from datetime import datetime
from tqdm import tqdm
from geopy.distance import geodesic

def load_gfs_features(output_features_folder):
    gfs_dfs = {}
    for file in os.listdir(output_features_folder):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(output_features_folder, file))
            parts = file.replace('.csv', '').split('_')
            lat = float(parts[1])
            lon = float(parts[3])
            gfs_dfs[(lat, lon)] = df
    return gfs_dfs

def load_station_labels(per_station_folder, lat_lon_list):
    station_dfs = {}
    for filename, lat, lon in lat_lon_list:
        path = os.path.join(per_station_folder, filename)
        if os.path.exists(path):
            df = pd.read_csv(path)
            if 'Date' not in df.columns and 'date' in df.columns:
                df.rename(columns={'date': 'Date'}, inplace=True)
            station_dfs[(lat, lon)] = df
    return station_dfs

def standardize_date_col(df):
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
    elif 'date' in df.columns:
        df['Date'] = pd.to_datetime(df['date'])
    else:
        raise ValueError("No Date column found")
    return df

def apply_kriging_to_features(date_str, feature_name, grid_coords, gfs_dfs, station_coord, k=30):
    distances = []
    for (glat, glon) in grid_coords:
        dist = geodesic(station_coord, (glat, glon)).kilometers
        distances.append(((glat, glon), dist))

    nearest_points = sorted(distances, key=lambda x: x[1])[:k]

    values = []
    lats = []
    lons = []

    for (glat, glon), _ in nearest_points:
        df = gfs_dfs[(glat, glon)]
        df = standardize_date_col(df)
        row = df.loc[df['Date'] == date_str]
        if not row.empty:
            val = row.iloc[0][feature_name]
            if pd.isna(val):
                continue
            values.append(val)
            lats.append(glat)
            lons.append(glon)

    if len(values) < 3:
        return None

    try:
        OK = OrdinaryKriging(
            lons, lats, values,
            variogram_model='linear',
            verbose=False,
            enable_plotting=False
        )
    except Exception:
        return None

    return OK

def interpolate_features_for_station(station_coord, date_str, gfs_dfs, grid_coords, feature_names):
    interpolated = {}
    for feature in feature_names:
        kriging_model = apply_kriging_to_features(date_str, feature, grid_coords, gfs_dfs, station_coord)
        if kriging_model is None:
            interpolated[feature] = np.nan
        else:
            z, ss = kriging_model.execute('points', [station_coord[1]], [station_coord[0]])
            interpolated[feature] = z[0]
    return interpolated

def save_station_outputs(lat_lon_list, per_station_folder, output_features_folder, output_dir='station_outputs'):
    os.makedirs(output_dir, exist_ok=True)

    gfs_dfs = load_gfs_features(output_features_folder)
    station_dfs = load_station_labels(per_station_folder, lat_lon_list)
    grid_coords = np.array(list(gfs_dfs.keys()))
    feature_names = list(next(iter(gfs_dfs.values())).columns)
    feature_names = [f for f in feature_names if f != 'Date']

    for filename, slat, slon in lat_lon_list:
        print(f"Processing station {filename} at ({slat}, {slon})...")
        station_df = station_dfs[(slat, slon)]
        station_df = standardize_date_col(station_df)

        rows = []
        for idx, row in tqdm(station_df.iterrows(), total=len(station_df), desc=f"{filename}"):
            date_val = row['Date']
            interp_feats = interpolate_features_for_station((slat, slon), date_val, gfs_dfs, grid_coords, feature_names)
            if any(pd.isna(list(interp_feats.values()))):
                continue
            out_row = interp_feats.copy()
            out_row['Date'] = date_val
            out_row['Rainfall'] = row['Rainfall'] if 'Rainfall' in row else np.nan
            out_row['station_lat'] = slat
            out_row['station_lon'] = slon
            out_row['month'] = date_val.month
            out_row['dayofyear'] = date_val.dayofyear
            rows.append(out_row)

        out_df = pd.DataFrame(rows)
        out_path = os.path.join(output_dir, f"subset_{slat}_{slon}.csv")
        out_df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")

# Example usage:
# lat_lon_list = [('station1.csv', 19.07, 72.88), ('station2.csv', 18.96, 72.83)]
save_station_outputs(lat_lon_list, 'per_station_data', 'output_features')


Processing station DINDORI_2012.0_7350.0.csv at (20.2, 73.8333)...


DINDORI_2012.0_7350.0.csv: 100%|██████████| 731/731 [6:26:47<00:00, 31.75s/it]  


Saved to station_outputs/subset_20.2_73.8333.csv
Processing station MAHUR_9999.0_9999.0.csv at (100.65, 100.65)...


  return cls(*args)
MAHUR_9999.0_9999.0.csv:   0%|          | 0/729 [00:00<?, ?it/s]


ValueError: Latitude must be in the [-90; 90] range.

In [None]:
import os
import pandas as pd
import numpy as np
from pykrige.ok import OrdinaryKriging
from datetime import datetime
from sklearn.neighbors import NearestNeighbors

def load_gfs_features(output_features_folder):
    gfs_dfs = {}
    for file in os.listdir(output_features_folder):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(output_features_folder, file))
            parts = file.replace('.csv', '').split('_')
            lat = float(parts[1])
            lon = float(parts[3])
            gfs_dfs[(lat, lon)] = df
    return gfs_dfs

def load_station_labels(per_station_folder, lat_lon_list):
    station_dfs = {}
    for filename, lat, lon in lat_lon_list:
        path = os.path.join(per_station_folder, filename)
        if os.path.exists(path):
            df = pd.read_csv(path)
            if 'Date' not in df.columns and 'date' in df.columns:
                df.rename(columns={'date': 'Date'}, inplace=True)
            station_dfs[(lat, lon)] = df
    return station_dfs

def standardize_date_col(df):
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
    elif 'date' in df.columns:
        df['Date'] = pd.to_datetime(df['date'])
    else:
        raise ValueError("No Date column found")
    return df

def apply_local_kriging(date_str, feature_name, station_coord, gfs_dfs, grid_coords, k=30):
    # Collect all grid points values for the date and feature
    values = []
    lats = []
    lons = []
    for (glat, glon), df in gfs_dfs.items():
        df = standardize_date_col(df)
        row = df.loc[df['Date'] == date_str]
        if not row.empty:
            val = row.iloc[0][feature_name]
            if pd.isna(val):
                continue
            values.append(val)
            lats.append(glat)
            lons.append(glon)

    if len(values) < 3:
        # Not enough points to krige
        return np.nan

    # Convert to numpy arrays
    lats = np.array(lats)
    lons = np.array(lons)
    values = np.array(values)

    # Find nearest k neighbors to the station coordinate
    nbrs = NearestNeighbors(n_neighbors=min(k, len(lats)), algorithm='auto')
    coords = np.column_stack((lats, lons))
    nbrs.fit(coords)
    dist, idx = nbrs.kneighbors(np.array([station_coord]))

    # Select local neighborhood points
    local_lats = lats[idx[0]]
    local_lons = lons[idx[0]]
    local_vals = values[idx[0]]

    # Fit kriging on local neighbors
    try:
        OK = OrdinaryKriging(
            local_lons,
            local_lats,
            local_vals,
            variogram_model='linear',
            verbose=False,
            enable_plotting=False,
        )
        z, ss = OK.execute('points', [station_coord[1]], [station_coord[0]])
        return z[0]
    except Exception as e:
        # If kriging fails fallback to nan
        return np.nan

def interpolate_features_for_station(station_coord, date_str, gfs_dfs, grid_coords, feature_names, k=30):
    interpolated = {}
    for feature in feature_names:
        val = apply_local_kriging(date_str, feature, station_coord, gfs_dfs, grid_coords, k=k)
        interpolated[feature] = val
    return interpolated

from tqdm import tqdm  # Add this import at the top

def save_station_outputs(lat_lon_list, per_station_folder, output_features_folder, output_dir='station_outputs', k=30):
    os.makedirs(output_dir, exist_ok=True)
    
    gfs_dfs = load_gfs_features(output_features_folder)
    station_dfs = load_station_labels(per_station_folder, lat_lon_list)
    grid_coords = np.array(list(gfs_dfs.keys()))
    feature_names = list(next(iter(gfs_dfs.values())).columns)
    feature_names = [f for f in feature_names if f not in ['Date']]  # exclude Date

    # Use tqdm for progress bar
    for filename, slat, slon in tqdm(lat_lon_list, desc="Stations processed"):
        print(f"Processing station {filename} at ({slat}, {slon})...")
        station_df = station_dfs[(slat, slon)]
        station_df = standardize_date_col(station_df)
        
        rows = []
        for idx, row in station_df.iterrows():
            date_val = row['Date']
            interp_feats = interpolate_features_for_station((slat, slon), date_val, gfs_dfs, grid_coords, feature_names, k=k)
            if any(pd.isna(list(interp_feats.values()))):
                continue
            
            out_row = interp_feats.copy()
            out_row['Date'] = date_val
            out_row['Rainfall'] = row['Rainfall'] if 'Rainfall' in row else np.nan
            out_row['station_lat'] = slat
            out_row['station_lon'] = slon
            out_row['month'] = date_val.month
            out_row['dayofyear'] = date_val.dayofyear
            
            rows.append(out_row)
        
        out_df = pd.DataFrame(rows)
        out_path = os.path.join(output_dir, f"subset_{slat}_{slon}.csv")
        out_df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")

# Usage example:
# lat_lon_list = [('station1.csv', 19.07, 72.88), ('station2.csv', 18.96, 72.83)]
save_station_outputs(lat_lon_list, per_station_folder, output_features_folder, k=4)


Stations processed:   0%|          | 0/30 [00:00<?, ?it/s]

Processing station DHARMABAD_1853.0_7750.0.csv at (18.8833, 77.8333)...


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x79ba51b29250>>
Traceback (most recent call last):
  File "/home/manoj/Desktop/MH/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


- IDW 

In [15]:
import os
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

def standardize_date_col(df):
    for col in df.columns:
        if col.lower() == 'date':
            df.rename(columns={col: 'Date'}, inplace=True)
            break
    return df

def load_gfs_features(output_features_folder):
    gfs_dfs = {}
    for file in os.listdir(output_features_folder):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(output_features_folder, file))
            df = standardize_date_col(df)
            parts = file.replace('.csv', '').split('_')
            lat = float(parts[1])
            lon = float(parts[3])
            gfs_dfs[(lat, lon)] = df
    return gfs_dfs

def load_station_labels(per_station_folder, lat_lon_list):
    station_dfs = {}
    for filename, lat, lon in lat_lon_list:
        path = os.path.join(per_station_folder, filename)
        if os.path.exists(path):
            df = pd.read_csv(path)
            df = standardize_date_col(df)
            station_dfs[(lat, lon)] = df
        else:
            print(f"Warning: Station file {path} does not exist.")
    return station_dfs


def find_nearest_grid_points(station_coords, grid_coords, k=(len(lat_lon_list) - 1)):
    tree = cKDTree(grid_coords)
    dists, idxs = tree.query(station_coords, k=k)
    return dists, idxs

def interpolate_features_for_station(station_coord, grid_coords, gfs_dfs, idxs, dists):
    weights = 1 / (dists + 1e-8)
    weights /= weights.sum()

    combined_features = None
    for i, idx in enumerate(idxs):
        grid_point = grid_coords[idx]
        df = gfs_dfs[tuple(grid_point)].copy()
        df = standardize_date_col(df)
        numeric_cols = df.select_dtypes(include='number').columns

        if combined_features is None:
            combined_features = df.copy()
            combined_features[numeric_cols] = df[numeric_cols] * weights[i]
        else:
            combined_features[numeric_cols] += df[numeric_cols] * weights[i]

    return combined_features

def generate_station_wise_outputs(lat_lon_list, per_station_folder, output_features_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    gfs_dfs = load_gfs_features(output_features_folder)
    station_dfs = load_station_labels(per_station_folder, lat_lon_list)

    grid_coords = np.array(list(gfs_dfs.keys()))
    station_coords = np.array([(lat, lon) for _, lat, lon in lat_lon_list])
    dists, idxs = find_nearest_grid_points(station_coords, grid_coords, k=4)

    for (filename, slat, slon), dist_list, idx_list in zip(lat_lon_list, dists, idxs):
        station_df = station_dfs.get((slat, slon))
        if station_df is None:
            print(f"Skipping station {(slat, slon)} - data not found.")
            continue

        interp_features = interpolate_features_for_station(
            (slat, slon), grid_coords, gfs_dfs, idx_list, dist_list
        )

        merged_df = pd.merge(interp_features, station_df, on='Date', how='inner')
        merged_df['Station'] = filename.replace('.csv', '')

        output_path = os.path.join(output_folder, filename)
        merged_df.to_csv(output_path, index=False)
        print(f"Saved: {output_path}")


In [16]:
per_station_folder = 'select-stations'
output_features_folder = 'output_features'
output_folder = 'feature-label-comb'

generate_station_wise_outputs(lat_lon_list, per_station_folder, output_features_folder, output_folder)
#print length of each file in the output folder
for filename in os.listdir(output_folder):
    if filename.endswith('.csv'):
        file_path = os.path.join(output_folder, filename)
        df = pd.read_csv(file_path)
        print(f"{filename}: {len(df)} rows")

Saved: feature-label-comb\AMBERNATH_9999.0_9999.0.csv
Saved: feature-label-comb\BHIWANDI_1918.0_7303.0.csv
Saved: feature-label-comb\CANACONA_9999.0_9999.0.csv
Saved: feature-label-comb\CHIPLUN_1732.0_7331.0.csv
Saved: feature-label-comb\CANACONA_9999.0_9999.0.csv
Saved: feature-label-comb\CHIPLUN_1732.0_7331.0.csv
Saved: feature-label-comb\COLABA_-_IMD_OBSY_1854.0_7249.0.csv
Saved: feature-label-comb\DAHANU_-_IMD_OBSY_1958.0_7243.0.csv
Saved: feature-label-comb\COLABA_-_IMD_OBSY_1854.0_7249.0.csv
Saved: feature-label-comb\DAHANU_-_IMD_OBSY_1958.0_7243.0.csv
Saved: feature-label-comb\DAPOLI_AGRI_1746.0_7312.0.csv
Saved: feature-label-comb\DAPOLI_AGRI_1746.0_7312.0.csv
Saved: feature-label-comb\DEVGAD_1623.0_7321.0.csv
Saved: feature-label-comb\DODAMARG_9999.0_9999.0.csv
Saved: feature-label-comb\DEVGAD_1623.0_7321.0.csv
Saved: feature-label-comb\DODAMARG_9999.0_9999.0.csv
Saved: feature-label-comb\GAGANBAWADA_1633.0_7350.0.csv
Saved: feature-label-comb\GUHAGARH_1728.0_7312.0.csv
Saved:

## binarizing labels

- station-wise binarizing

In [23]:
import os
import pandas as pd

def binarize_and_add_features(input_folder, lat_lon_list, output_folder='binarized_outputs'):
    os.makedirs(output_folder, exist_ok=True)

    for (filename, lat, lon) in lat_lon_list:
        path = os.path.join(input_folder, filename)
        if not os.path.exists(path):
            print(f"File not found: {filename}")
            continue

        df = pd.read_csv(path)

        if 'Rainfall' not in df.columns or 'Date' not in df.columns:
            print(f"Skipping {filename}: Required columns missing.")
            continue

        # Convert Date to datetime to extract month and dayofyear
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date'])  # drop rows where date parsing failed
        df['month'] = df['Date'].dt.month
        df['dayofyear'] = df['Date'].dt.dayofyear

        # Drop 'Date' and 'Station' columns
        df = df.drop(columns=[col for col in ['Date', 'Station'] if col in df.columns])

        # Binarize Rainfall using 85th percentile
        threshold = df['Rainfall'].quantile(0.85)
        df['Rainfall'] = (df['Rainfall'] >= threshold).astype(int)

        # Add station coordinates
        df['station_lat'] = lat
        df['station_lon'] = lon

        # Save file
        out_filename = f"subset_{lat}_{lon}.csv"
        df.to_csv(os.path.join(output_folder, out_filename), index=False)
        print(f"Saved: {out_filename}")
input_folder = 'station_outputs'

binarize_and_add_features(input_folder, lat_lon_list)

Saved: subset_18.8833_77.8333.csv
Saved: subset_18.5333_73.3167.csv
Saved: subset_17.6667_75.9.csv
Saved: subset_16.05_73.4667.csv
Saved: subset_19.5167_75.9833.csv
Saved: subset_17.9833_73.4667.csv
Saved: subset_17.8333_73.7.csv
Saved: subset_21.75_74.0.csv
Saved: subset_20.2_76.0167.csv
Saved: subset_21.25_76.0333.csv
Saved: subset_16.9833_73.3333.csv
Saved: subset_16.8333_73.95.csv
Saved: subset_21.5667_74.2167.csv
Saved: subset_19.1167_72.85.csv
Saved: subset_16.0167_73.7.csv
Saved: subset_19.6_76.2167.csv
Saved: subset_19.7167_77.15.csv
Saved: subset_20.2667_75.7667.csv
Saved: subset_18.1333_73.1167.csv
Saved: subset_18.8667_73.2833.csv
Saved: subset_18.7333_73.1.csv
Saved: subset_18.2333_73.2833.csv
Saved: subset_19.65_76.3833.csv
Saved: subset_18.05_73.0167.csv
Saved: subset_20.9333_75.3333.csv
Saved: subset_18.9_72.9167.csv
Saved: subset_19.7833_71.0667.csv
Saved: subset_18.9_72.8167.csv
Saved: subset_18.3167_72.9667.csv
Saved: subset_19.5667_74.2167.csv


- global binarizing

In [17]:
import os
import pandas as pd

def binarize_global_threshold(input_folder, lat_lon_list, output_folder='overall_binarized_outputs'):
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Gather all rainfall values and collect dates from all stations
    all_rainfall = []
    date_sets = []
    for (filename, _, _) in lat_lon_list:
        path = os.path.join(input_folder, filename)
        if os.path.exists(path):
            df = pd.read_csv(path)
            if 'Rainfall' in df.columns and 'Date' in df.columns:
                all_rainfall.extend(df['Rainfall'].tolist())
                df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
                date_sets.append(set(df['Date'].dropna().dt.strftime('%Y-%m-%d')))

    # Find intersection of all date sets (dates present in every file)
    if date_sets:
        common_dates = set.intersection(*date_sets)
    else:
        common_dates = set()

    # Step 2: Compute global 85th percentile threshold
    global_threshold = pd.Series(all_rainfall).quantile(0.85)
    print(f"Global 85th percentile rainfall threshold: {global_threshold:.4f}")
    print(f"Number of common dates across all files: {len(common_dates)}")

    # Step 3: Apply binarization and feature addition per station
    for (filename, lat, lon) in lat_lon_list:
        path = os.path.join(input_folder, filename)
        if not os.path.exists(path):
            print(f"File not found: {filename}")
            continue

        df = pd.read_csv(path)

        if 'Rainfall' not in df.columns or 'Date' not in df.columns:
            print(f"Skipping {filename}: Required columns missing.")
            continue

        # Parse date and extract features
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        # Keep only rows with Date in common_dates
        df = df[df['Date'].dt.strftime('%Y-%m-%d').isin(common_dates)]
        df['month'] = df['Date'].dt.month
        df['dayofyear'] = df['Date'].dt.dayofyear

        # Drop Date and Station columns if present
        df = df.drop(columns=[col for col in ['Date', 'Station'] if col in df.columns])

        # Binarize using global threshold
        df['Rainfall'] = (df['Rainfall'] >= global_threshold).astype(int)

        # Add station lat/lon
        df['station_lat'] = lat
        df['station_lon'] = lon

        # Replace any missing values in the DataFrame with 0
        df.fillna(0, inplace=True)

        # Print the length of the DataFrame
        print(f"Length of {filename}: {len(df)} rows after filtering common dates")

        # Save the processed file
        out_filename = f"subset_{lat}_{lon}.csv"
        df.to_csv(os.path.join(output_folder, out_filename), index=False)
        print(f"Saved: {out_filename}")
input_folder = 'feature-label-comb'

binarize_global_threshold(input_folder, lat_lon_list)

Global 85th percentile rainfall threshold: 65.0000
Number of common dates across all files: 615
Length of AMBERNATH_9999.0_9999.0.csv: 615 rows after filtering common dates
Saved: subset_100.65_100.65.csv
Length of BHIWANDI_1918.0_7303.0.csv: 615 rows after filtering common dates
Saved: subset_19.3_73.05.csv
Length of CANACONA_9999.0_9999.0.csv: 615 rows after filtering common dates
Saved: subset_100.65_100.65.csv
Length of CHIPLUN_1732.0_7331.0.csv: 615 rows after filtering common dates
Length of CANACONA_9999.0_9999.0.csv: 615 rows after filtering common dates
Saved: subset_100.65_100.65.csv
Length of CHIPLUN_1732.0_7331.0.csv: 615 rows after filtering common dates
Saved: subset_17.5333_73.5167.csv
Length of COLABA_-_IMD_OBSY_1854.0_7249.0.csv: 615 rows after filtering common dates
Saved: subset_18.9_72.8167.csv
Length of DAHANU_-_IMD_OBSY_1958.0_7243.0.csv: 615 rows after filtering common dates
Saved: subset_17.5333_73.5167.csv
Length of COLABA_-_IMD_OBSY_1854.0_7249.0.csv: 615 rows