In [5]:
import os
import pandas as pd
from tqdm import tqdm

# Paths
station_data_dir = 'per_station_data'
gfs_data_dir = 'output_features'
output_file = 'combined_dataset.csv'

# Helper: Extract decimal lat/lon from station filename
def extract_station_coords(filename):
    """Parse the station latitude/longitude encoded in a station filename.

    The last two underscore-separated fields of the name (after removing
    ``.csv``) are read as numbers and divided by 100, e.g.
    ``..._1435_7656.csv`` -> ``(14.35, 76.56)``.

    Args:
        filename: Station CSV filename (no directory component expected).

    Returns:
        ``(lat, lon)`` rounded to two decimals, or ``None`` when the name has
        fewer than two fields, a field is not a finite number, or either
        field equals the 9999 placeholder.
    """
    # NOTE(review): the fields are scaled by 1/100 as-is. If the source encodes
    # degrees+minutes (DDMM) this is not a true decimal-degree conversion —
    # confirm against the station metadata.
    try:
        parts = filename.replace('.csv', '').split('_')
        lat4 = float(parts[-2])     # e.g., 1435.0
        lon4 = float(parts[-1])     # e.g., 7656.0
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing problems mean "no coordinates"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

    # float() accepts "nan"/"inf"; such values would later break round() when
    # snapping to the grid. NaN fails every comparison, so this rejects it too.
    infinity = float('inf')
    if not (abs(lat4) < infinity and abs(lon4) < infinity):
        return None

    # 9999 marks a station with unknown position.
    if lat4 == 9999.0 or lon4 == 9999.0:
        return None

    return round(lat4 / 100.0, 2), round(lon4 / 100.0, 2)  # → (14.35, 76.56)

# Helper: Snap to nearest 0.25° GFS grid point
def snap_to_grid(lat, lon):
    """Return ``(lat, lon)`` rounded to the nearest multiple of 0.25.

    Each coordinate is scaled by 4, rounded with the built-in ``round`` and
    scaled back, so e.g. 14.35 becomes 14.25 and 76.56 becomes 76.5.
    """
    cells_per_degree = 4  # 1 / 0.25
    snapped_lat = round(lat * cells_per_degree) / cells_per_degree
    snapped_lon = round(lon * cells_per_degree) / cells_per_degree
    return snapped_lat, snapped_lon

# Gather all station files
all_merged_rows = []
skipped_no_coords = []  # station files whose name yields no usable coordinates

# os.listdir() returns entries in arbitrary order; sort so the row order of
# the combined CSV is the same on every run and every machine.
station_files = sorted(f for f in os.listdir(station_data_dir) if f.endswith('.csv'))

# Several stations can snap to the same grid cell, so keep each parsed GFS
# table instead of re-reading it from disk per station. pd.merge does not
# modify its inputs, so sharing the cached frame is safe.
gfs_cache = {}

for station_file in tqdm(station_files, desc="Processing station files"):
    coords = extract_station_coords(station_file)
    if coords is None:
        # Remember the file so the skip is reported once the loop finishes.
        skipped_no_coords.append(station_file)
        continue

    station_lat, station_lon = coords
    grid_lat, grid_lon = snap_to_grid(station_lat, station_lon)

    gfs_filename = f"lat_{grid_lat}_lon_{grid_lon}.csv"
    gfs_path = os.path.join(gfs_data_dir, gfs_filename)
    station_path = os.path.join(station_data_dir, station_file)

    if not os.path.exists(gfs_path):
        print(f"GFS file not found: {gfs_filename}")
        continue

    # Best-effort: a single unreadable or malformed file is reported and
    # skipped rather than aborting the whole run.
    try:
        gfs_df = gfs_cache.get(gfs_path)
        if gfs_df is None:
            gfs_df = pd.read_csv(gfs_path)
            gfs_df['date'] = pd.to_datetime(gfs_df['date'])
            gfs_cache[gfs_path] = gfs_df

        station_df = pd.read_csv(station_path)
        # Adds a parsed lowercase 'date' key for the merge; the original
        # 'Date' column is kept and therefore also ends up in the output.
        station_df['date'] = pd.to_datetime(station_df['Date'])

        # Inner join: only dates present in both tables survive.
        merged_df = pd.merge(gfs_df, station_df, on='date', how='inner')

        merged_df['station_lat'] = station_lat
        merged_df['station_lon'] = station_lon
        merged_df['grid_lat'] = grid_lat
        merged_df['grid_lon'] = grid_lon
        merged_df['station_file'] = station_file

        all_merged_rows.append(merged_df)
    except Exception as e:
        print(f"Error merging {station_file}: {e}")

if skipped_no_coords:
    print(f"Skipped {len(skipped_no_coords)} station file(s) with no usable coordinates in the filename")

# Save final combined dataset
if all_merged_rows:
    final_df = pd.concat(all_merged_rows, ignore_index=True)
    final_df.to_csv(output_file, index=False)
    print(f"✅ Merged dataset saved to {output_file} with shape {final_df.shape}")
else:
    print("❌ No data merged. Check for matching GFS files or date mismatches.")


Processing station files: 100%|██████████| 292/292 [00:18<00:00, 15.79it/s]


✅ Merged dataset saved to combined_dataset.csv with shape (139037, 188)


In [9]:
import pandas as pd

# Load the merged station/GFS table produced by the merge step.
df = pd.read_csv('combined_dataset.csv')

# Name of the rainfall column; change it if the dataset uses a different name.
rain_col = 'Rainfall'

# Rows at or above the 85th percentile of rainfall are treated as "high".
threshold = df[rain_col].quantile(0.85)
at_or_above = df[rain_col].ge(threshold)
# NOTE(review): a missing rainfall value compares False here, so such rows
# are labelled 'low' — confirm that this is intended.

# Text label first, numeric encoding second (column order is kept this way).
df['label'] = at_or_above.map({True: 'high', False: 'low'})
df['label_num'] = df['label'].map({'low': 0, 'high': 1})

# Persist the labelled table for the feature-selection step.
df.to_csv('classified_combined_data.csv', index=False)

print(f"85th percentile threshold: {threshold:.2f} mm")
print(df['label'].value_counts())


85th percentile threshold: 26.00 mm
label
low     117904
high     21133
Name: count, dtype: int64


In [6]:
import pandas as pd
# Load the labelled dataset
df = pd.read_csv("classified_combined_data.csv")

# Parse the merge key so calendar features can be derived from it
df['date'] = pd.to_datetime(df['date'])

# Extract temporal features
df['month'] = df['date'].dt.month
df['dayofyear'] = df['date'].dt.dayofyear

# The raw timestamp is not used as a feature. Reassign instead of dropping in
# place so the step does not mutate the loaded frame.
df = df.drop(columns=['date'])

# Weather predictors are the leading columns of the table and are selected by
# position (their names are not fixed in this notebook).
# NOTE(review): the code has always taken the first 180 columns although an
# older comment said "0-159"; 180 is kept — confirm the intended count.
N_WEATHER_FEATURES = 180
weather_columns = list(df.columns[:N_WEATHER_FEATURES])

# Everything else is selected by name. Positional indices for these columns
# were only correct because dropping 'date' shifted every index by one; names
# stay correct if the upstream schema changes.
location_columns = ['station_lat', 'station_lon', 'grid_lat', 'grid_lon']
temporal_columns = ['month', 'dayofyear']
target_column = 'label_num'

required_columns = location_columns + temporal_columns + [target_column]
missing_columns = [c for c in required_columns if c not in df.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing from classified data: {missing_columns}")

# Guard against target leakage and duplicated features: none of the label,
# rainfall, location or temporal columns may fall inside the weather slice.
non_weather = set(required_columns) | {'label', 'Rainfall'}
overlap = sorted(non_weather.intersection(weather_columns))
if overlap:
    raise ValueError(f"Non-weather columns found in the weather feature slice: {overlap}")

# Define feature columns: weather + lat/lon + temporal
feature_columns = weather_columns + location_columns + temporal_columns

# Final feature matrix and target
X = df[feature_columns]
y = df[target_column]

# Final DataFrame
final_df = X.copy()
final_df['label_num'] = y

# Save final DataFrame (optional)
final_df.to_csv("final_training_data.csv", index=False)
