# Imports


In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Normalise the key column of the DEM feature table: 'file_id' becomes 'ID'.
dem_csv = 'data/dem_features.csv'
df = pd.read_csv(dem_csv).rename(columns={'file_id': 'ID'})

# Write the renamed table back over the file it was read from.
df.to_csv(dem_csv, index=False)

# Input data

In [2]:
# Folder holding the per-source feature tables, and the CSV files to combine.
folder_path = 'data'
file_names = [
    f'{stem}.csv'
    for stem in (
        'dem_features',
        'HWM_Depth_m',
        'imp_surface_features',
        'precipitation',
        'sentinel1_combined_features',
    )
]

In [4]:
# # Rename file_id to ID only for dem_features.csv
# if file == 'dem_features.csv' and 'file_id' in df.columns:
#     df = df.rename(columns={'file_id': 'ID'})

In [3]:
# Report, per input file, whether its header contains the 'ID' join key.
for file in file_names:
    # nrows=1 keeps the read cheap; only the column names are inspected.
    header = pd.read_csv(os.path.join(folder_path, file), nrows=1)
    if 'ID' in header.columns:
        print(f"✅ 'ID' column found in: {file}")
    else:
        print(f"❌ 'ID' column NOT found in: {file}")

✅ 'ID' column found in: dem_features.csv
✅ 'ID' column found in: HWM_Depth_m.csv
✅ 'ID' column found in: imp_surface_features.csv
✅ 'ID' column found in: precipitation.csv
✅ 'ID' column found in: sentinel1_combined_features.csv


In [4]:

# Join every input table onto the first one, matching rows by 'ID'.
csv_paths = [os.path.join(folder_path, name) for name in file_names]
merged_df = pd.read_csv(csv_paths[0])

for csv_path in csv_paths[1:]:
    df = pd.read_csv(csv_path)
    # Default merge is an inner join: only IDs present in both sides survive.
    merged_df = merged_df.merge(df, on='ID')

# Save the merged result
# merged_df.to_csv('combined_features.csv', index=False)



In [5]:
# Remove these columns from the merged table before building features.
unused_columns = [
    'year',
    'projection',
    'peak_date_x',
    'nlcd_year',
    'peak_date_y',
    'S1_Date',
]
combined_features = merged_df.drop(columns=unused_columns)

combined_features

Unnamed: 0,ID,dem_min,dem_max,dem_mean,dem_iqr,Year,HWMdepth_m,total_area_km2,pct_area_1,pct_area_2,...,VV_Max,VV_Mean,VV_IQR,VV_SD,VH_Min,VH_Max,VH_Mean,VH_IQR,VH_SD,VH_VV_Ratio
0,2802,75.050060,113.713745,94.023910,16.608824,2016,-66.782340,9.1809,11.243996,13.273209,...,1.043524,-11.472542,2.505715,2.261898,-35.542110,0.000000,-16.544544,2.574304,2.523780,1.442099
1,6660,-0.067546,9.529492,2.542827,5.400071,2018,1.049216,9.1809,10.155867,17.743358,...,4.302662,-10.566951,4.660270,3.647917,-46.994900,0.000000,-17.248505,5.403145,5.657322,1.632307
2,3597,11.146758,24.847448,19.206924,1.413374,2017,-0.306641,9.1809,27.860014,50.210764,...,9.830022,-9.666992,3.897693,3.487658,-39.449627,4.953662,-16.086483,3.761292,3.652782,1.664063
3,3546,107.713455,159.806640,121.656044,12.458797,2017,-1.835019,9.1809,7.214979,0.637192,...,8.432064,-10.534246,3.046627,2.487751,-43.073082,0.000000,-15.950310,3.270462,2.752164,1.514139
4,4001,47.300000,71.384770,58.280823,3.454547,2017,-0.046101,9.1809,18.606019,21.674346,...,23.717863,-11.656052,5.378590,4.823078,-48.831882,11.265720,-17.683647,5.527677,4.569455,1.517122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142,3327,16.622460,27.674389,24.896990,0.681934,2017,0.097493,9.1809,32.310558,39.113812,...,8.832932,-9.098587,3.334823,2.964290,-33.998844,0.000000,-15.620875,3.098783,2.922523,1.716846
1143,5436,-0.658639,5.620000,-0.199159,0.500000,2017,1.149144,9.1809,2.813450,0.323498,...,6.621090,-9.728526,3.016736,2.546916,-56.219555,0.000000,-22.624592,9.343960,7.230828,2.325593
1144,2739,14.098309,33.741104,22.088285,2.536531,2016,2.000187,9.1809,12.851681,2.872267,...,0.000000,-10.487715,2.927360,2.547273,-32.665400,0.000000,-15.666946,3.280381,2.899613,1.493838
1145,3642,14.856120,22.962086,18.951130,2.359785,2017,-1.722499,9.1809,9.087344,4.440741,...,9.926535,-10.677384,4.007378,3.677724,-41.812954,0.000000,-16.470260,4.314484,4.294586,1.542537


In [6]:
# Derive calendar features from the peak timestamp.
#
# The timestamp is read from `merged_df`, which shares its row index with
# `combined_features` (the latter is `merged_df` with some columns dropped).
# Reading it from the leftover loop variable `df` would align a raw,
# un-merged input file against the merged rows by index label, attaching
# wrong dates (or NaT) whenever the inner merges dropped or reordered rows.
#
# Prefer the un-suffixed column; otherwise fall back to the '_y' copy that
# pandas creates for the right-hand frame when a column name clashes.
peak_date_column = next(
    (name for name in ('peak_date', 'peak_date_y') if name in merged_df.columns),
    None,
)
if peak_date_column is None:
    raise KeyError("No 'peak_date' column found in merged_df")

# Convert to datetime format
combined_features['peak_date'] = pd.to_datetime(merged_df[peak_date_column])

# Plain calendar components of the timestamp.
peak_parts = combined_features['peak_date'].dt
combined_features['year'] = peak_parts.year
combined_features['month'] = peak_parts.month
combined_features['day'] = peak_parts.day
combined_features['hour'] = peak_parts.hour

# Sine encoding of the periodic components (periods: 12 months, 31 days, 24 hours).
combined_features['month_sin'] = np.sin(2 * np.pi * combined_features['month'] / 12)
combined_features['day_sin'] = np.sin(2 * np.pi * combined_features['day'] / 31)
combined_features['hour_sin'] = np.sin(2 * np.pi * combined_features['hour'] / 24)


  combined_features['peak_date'] = pd.to_datetime(df['peak_date'])


In [7]:
# Keep only rows with no missing value in any column, then persist them.
df = combined_features.dropna(axis=0, how='any')

# The confirmation message is built from the path actually written, so the
# two cannot drift apart.
output_path = 'data/combined_features.csv'
df.to_csv(output_path, index=False)
print(f"Merged CSV saved as '{output_path}'")


Merged CSV saved as 'merged_all_features.csv'
