# Importing python libraries


In [1]:
import os
import pandas as pd
import numpy as np
print("All libraries imported successfully")

All libraries imported successfully


# Merging all data

In [2]:
# Defining the input folder and list of CSVs
folder_path = '../data'
file_names = [
    'dem_features.csv',
    'combined_nlcd_features.csv',
    'precipitation.csv',
    'sentinel1_combined_features.csv',
    'USGS_HWM_Height.csv',
    'soil_moisture.csv',
    'weather_data.csv'
]

# Read and merge all files on 'ID' with custom suffixes
merged_df = pd.read_csv(os.path.join(folder_path, file_names[0]))

if 'file_id' in merged_df.columns:
    merged_df = merged_df.rename(columns={'file_id': 'ID'})
unnamed_cols = [col for col in merged_df.columns if col.startswith('Unnamed')]
merged_df = merged_df.drop(columns=unnamed_cols)

unique_columns = ['ID', 'peak_date', 'S1_Date']

for i, file in enumerate(file_names[1:], 1):
    df = pd.read_csv(os.path.join(folder_path, file))
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed')]
    df = df.drop(columns=unnamed_cols)
    cols_to_keep = ['ID'] + [col for col in df.columns
                            if col not in merged_df.columns or col not in unique_columns]
    df_filtered = df[cols_to_keep]
    merged_df = pd.merge(merged_df, df_filtered, on='ID')
combined_features = merged_df.drop(columns={'year','projection', 'nlcd_year'}, errors='ignore')
print("All data merged successfully")

All data merged successfully


## DateTime Cyclical Transformation

In [3]:
# Convert to datetime and extract temporal features
combined_features['peak_date'] = pd.to_datetime(combined_features['peak_date'], format='%m/%d/%Y %I:%M:%S %p')
# Set ID as index to preserve it
combined_features = combined_features.set_index('ID')

# Extract and encode temporal features in one go
temporal_features = {
    'month': (combined_features['peak_date'].dt.month, 12),
    'day': (combined_features['peak_date'].dt.day, 31),
    'hour': (combined_features['peak_date'].dt.hour, 24)
}
# Create cyclical features and add year
combined_features['year'] = combined_features['peak_date'].dt.year
for feature, (values, period) in temporal_features.items():
    combined_features[f'{feature}_sin'] = np.sin(2 * np.pi * values / period)

combined_features = combined_features.drop(columns=['peak_date', 'S1_Date']).select_dtypes(exclude=['datetime64'])
print("Cyclical features created successfully")

Cyclical features created successfully


# Handling Missing Data with KNN Imputation

In [4]:
from sklearn.impute import KNNImputer
import numpy as np
imputer = KNNImputer(n_neighbors=3)
df_imputed_array = imputer.fit_transform(combined_features)

# Convert back to DataFrame to preserve column names
df_imputed = pd.DataFrame(df_imputed_array,
                         columns=combined_features.columns,
                         index=combined_features.index)
df_imputed['year'] = df_imputed['year'].astype('Int64')
print("All missing data handled successfully")

All missing data handled successfully


 # Exporting the combined features to a file

In [7]:
# df_imputed.to_csv(os.path.join('../data/combined_features.csv'))
df_imputed.to_csv(os.path.join('data', 'combined_features.csv'), index=False)
print("Combined features saved successfully")
df_imputed

Combined features saved successfully


Unnamed: 0_level_0,dem_min,dem_max,dem_mean,dem_iqr,total_area_km2,pct_area_1,pct_area_2,area_km_1,area_km_2,cai_1,...,feelslike_c,windchill_c,heatindex_c,chance_of_rain,chance_of_snow,vis_km,year,month_sin,day_sin,hour_sin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2802,75.050060,113.713745,94.023910,16.608824,9.1809,11.243996,13.273209,1.0323,1.2186,23.266857,...,8.9,8.9,10.4,0.0,0.0,10.0,2016,-0.866025,0.651372,-0.866025
6660,-0.067546,9.529492,2.542827,5.400071,9.1809,10.155867,17.743358,0.9324,1.6290,6.375839,...,28.0,25.3,28.0,100.0,0.0,5.0,2018,-1.000000,0.299363,-0.866025
3597,11.146758,24.847448,19.206924,1.413374,9.1809,27.860014,50.210764,2.5578,4.6098,30.544090,...,25.1,22.8,25.1,100.0,0.0,5.0,2017,-0.866025,-0.394356,-0.965926
3546,107.713455,159.806640,121.656044,12.458797,9.1809,7.214979,0.637192,0.6624,0.0585,17.934003,...,24.6,22.2,24.6,0.0,0.0,10.0,2017,-0.866025,-0.394356,-0.965926
4001,47.300000,71.384770,58.280823,3.454547,9.1809,18.606019,21.674346,1.7082,1.9899,26.815981,...,24.9,22.7,24.9,0.0,0.0,10.0,2017,-0.866025,-0.201299,-0.965926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3327,16.622460,27.674389,24.896990,0.681934,9.1809,32.310558,39.113812,2.9664,3.5910,28.198032,...,25.0,22.6,25.0,100.0,0.0,5.0,2017,-0.866025,-0.394356,-0.965926
5436,-0.658639,5.620000,-0.199159,0.500000,9.1809,2.813450,0.323498,0.2583,0.0297,5.952381,...,35.6,29.2,35.6,100.0,0.0,5.0,2017,-1.000000,0.897805,0.707107
2739,14.098309,33.741104,22.088285,2.536531,9.1809,12.851681,2.872267,1.1799,0.2637,24.575514,...,11.1,11.1,12.3,0.0,0.0,10.0,2016,-0.866025,0.651372,-0.866025
3642,14.856120,22.962086,18.951130,2.359785,9.1809,9.087344,4.440741,0.8343,0.4077,0.000000,...,25.0,22.6,25.0,100.0,0.0,5.0,2017,-0.866025,-0.394356,-0.965926
