In [1]:
#Disaster Prediction & Early Warning System (ML + GIS) — Without IoT

In [3]:

!pip install numpy pandas scikit-learn matplotlib folium joblib shapely geopandas pyproj




In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, TimeSeriesSplit, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer
import joblib
import folium
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 200)
np.random.seed(42)


In [4]:
df=pd.read_csv(r"C:\Users\vedan\Downloads\synthetic_disaster_dataset.csv")

In [7]:
# ---- Synthetic Data Generator (Flood classification + Drought regression) ----
# Geography box (e.g., a region in Maharashtra)
lat_min, lat_max = 18.5, 20.0
lon_min, lon_max = 73.0, 76.0

n_points = 2500  # locations x dates
start_date = datetime(2020, 1, 1)
dates = [start_date + timedelta(days=i) for i in range(365)]

# Sample lat/lon and assign each a random date (simulate daily observations)
lats = np.random.uniform(lat_min, lat_max, n_points)
lons = np.random.uniform(lon_min, lon_max, n_points)
date_idx = np.random.randint(0, len(dates), n_points)
date_series = np.array([dates[i] for i in date_idx])

# Weather-like features
rain_mm = np.clip(np.random.gamma(2.0, 15.0, n_points) - 5 + 25*np.random.binomial(1, 0.2, n_points), 0, None)
temp_c = np.random.normal(28, 4, n_points) - 0.05*(rain_mm > 40)  # rainy days slightly cooler
wind_kph = np.random.normal(15, 6, n_points) + 0.2*(rain_mm > 60) * np.random.normal(8, 3, n_points)
humidity = np.clip(np.random.normal(60, 15, n_points) + 0.2*rain_mm, 10, 100)
river_level_m = np.clip(1.2 + 0.02*rain_mm + np.random.normal(0, 0.3, n_points), 0, None)
soil_moisture = np.clip(20 + 0.6*rain_mm - 0.3*temp_c + np.random.normal(0, 5, n_points), 5, 100)
evap_mm = np.clip(5 + 0.2*temp_c - 0.05*humidity + np.random.normal(0, 1.5, n_points), 0, None)

# Flood label (classification): high rainfall + river level + soil saturation
flood_risk_score = 0.03*rain_mm + 1.5*river_level_m + 0.02*soil_moisture + 0.01*wind_kph - 0.02*temp_c
flood_threshold = np.percentile(flood_risk_score, 80)  # top 20% risk = flood event
flood_event = (flood_risk_score >= flood_threshold).astype(int)

# Drought index (regression target): lower rainfall, high evap, low soil moisture -> higher drought index
drought_index = np.clip(80 - 0.35*rain_mm + 0.6*evap_mm - 0.3*soil_moisture + np.random.normal(0, 3, n_points), 0, 100)

df = pd.DataFrame({
    'date': date_series,
    'lat': lats,
    'lon': lons,
    'rain_mm': rain_mm,
    'temp_c': temp_c,
    'wind_kph': wind_kph,
    'humidity': humidity,
    'river_level_m': river_level_m,
    'soil_moisture': soil_moisture,
    'evap_mm': evap_mm,
    'flood_event': flood_event,
    'drought_index': drought_index
})

df.sort_values('date', inplace=True)
df.reset_index(drop=True, inplace=True)

print('Synthetic dataset shape:', df.shape)
df.head()

Synthetic dataset shape: (2500, 12)


Unnamed: 0,date,lat,lon,rain_mm,temp_c,wind_kph,humidity,river_level_m,soil_moisture,evap_mm,flood_event,drought_index
0,2020-01-01,18.708241,74.350763,27.172204,26.853746,17.415451,66.403607,1.496472,30.674063,6.343199,0,67.65048
1,2020-01-01,19.033959,74.44267,69.529848,29.004376,13.874468,76.906338,3.319376,50.550377,8.022703,1,49.862756
2,2020-01-01,18.550919,73.632379,34.844697,27.977871,17.720272,62.174314,1.714159,32.428313,8.671048,0,65.907568
3,2020-01-01,19.954868,75.271849,7.013067,27.257663,18.653639,65.679849,1.604903,10.968427,10.18876,0,79.710447
4,2020-01-02,19.769678,73.029936,31.734303,22.191026,11.732328,60.767464,1.442845,26.598245,5.338447,0,67.748704


In [9]:
# Create spatial buckets (approximate grid) to compute rolling stats by area
df['lat_bucket'] = df['lat'].round(2)
df['lon_bucket'] = df['lon'].round(2)

df = df.sort_values(['lat_bucket','lon_bucket','date']).copy()

def add_group_rolls(data, group_cols, feat, windows=(3,7,14)):
    for w in windows:
        col = f'{feat}_roll{w}'
        data[col] = data.groupby(group_cols)[feat].transform(lambda x: x.rolling(w, min_periods=1).mean())
    return data

for feat in ['rain_mm','temp_c','wind_kph','humidity','soil_moisture','evap_mm','river_level_m']:
    df = add_group_rolls(df, ['lat_bucket','lon_bucket'], feat)

# Example lags
for feat in ['rain_mm','river_level_m','soil_moisture']:
    df[f'{feat}_lag1'] = df.groupby(['lat_bucket','lon_bucket'])[feat].shift(1)

df.dropna(inplace=True)  # remove rows where lags may be NaN after shifting
df.reset_index(drop=True, inplace=True)
print('After FE:', df.shape)
df.head()

After FE: (72, 38)


Unnamed: 0,date,lat,lon,rain_mm,temp_c,wind_kph,humidity,river_level_m,soil_moisture,evap_mm,flood_event,drought_index,lat_bucket,lon_bucket,rain_mm_roll3,rain_mm_roll7,rain_mm_roll14,temp_c_roll3,temp_c_roll7,temp_c_roll14,wind_kph_roll3,wind_kph_roll7,wind_kph_roll14,humidity_roll3,humidity_roll7,humidity_roll14,soil_moisture_roll3,soil_moisture_roll7,soil_moisture_roll14,evap_mm_roll3,evap_mm_roll7,evap_mm_roll14,river_level_m_roll3,river_level_m_roll7,river_level_m_roll14,rain_mm_lag1,river_level_m_lag1,soil_moisture_lag1
0,2020-12-28,18.511496,73.961204,39.677109,30.778931,13.124881,63.761034,2.121165,34.532754,8.564068,0,60.284061,18.51,73.96,30.55177,30.55177,30.55177,30.028684,30.028684,30.028684,14.32959,14.32959,14.32959,72.465266,72.465266,72.465266,26.046601,26.046601,26.046601,8.222982,8.222982,8.222982,1.803914,1.803914,1.803914,21.42643,1.486663,17.560448
1,2020-09-10,18.521817,74.870946,77.601088,23.786065,20.001849,72.001772,2.400213,62.727514,5.64344,1,38.361202,18.52,74.87,49.572298,49.572298,49.572298,24.190584,24.190584,24.190584,17.633497,17.633497,17.633497,63.185862,63.185862,63.185862,43.782922,43.782922,43.782922,7.078621,7.078621,7.078621,2.037094,2.037094,2.037094,21.543508,1.673974,24.83833
2,2020-12-24,18.562359,75.349894,9.341835,36.709382,12.598994,49.60446,2.035854,26.243863,11.225842,0,77.406438,18.56,75.35,23.226598,23.226598,23.226598,33.885695,33.885695,33.885695,11.629014,11.629014,11.629014,62.890622,62.890622,62.890622,24.898129,24.898129,24.898129,9.520138,9.520138,9.520138,1.794377,1.794377,1.794377,37.111362,1.5529,23.552396
3,2020-12-01,18.577735,74.509397,64.959055,31.711982,16.491333,74.966287,2.73681,45.212839,7.327697,1,47.606065,18.58,74.51,36.507286,36.507286,36.507286,28.87806,28.87806,28.87806,11.16878,11.16878,11.16878,79.889455,79.889455,79.889455,34.297937,34.297937,34.297937,6.515082,6.515082,6.515082,1.984999,1.984999,1.984999,8.055517,1.233188,23.383034
4,2020-11-09,18.591617,75.216169,62.946784,29.664798,17.384745,81.342168,2.617841,44.759712,6.553126,1,47.912781,18.59,75.22,62.043967,62.043967,62.043967,26.952341,26.952341,26.952341,15.662204,15.662204,15.662204,72.715275,72.715275,72.715275,49.577281,49.577281,49.577281,7.673515,7.673515,7.673515,2.507076,2.507076,2.507076,61.14115,2.396311,54.39485


In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 38 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  72 non-null     datetime64[ns]
 1   lat                   72 non-null     float64       
 2   lon                   72 non-null     float64       
 3   rain_mm               72 non-null     float64       
 4   temp_c                72 non-null     float64       
 5   wind_kph              72 non-null     float64       
 6   humidity              72 non-null     float64       
 7   river_level_m         72 non-null     float64       
 8   soil_moisture         72 non-null     float64       
 9   evap_mm               72 non-null     float64       
 10  flood_event           72 non-null     int32         
 11  drought_index         72 non-null     float64       
 12  lat_bucket            72 non-null     float64       
 13  lon_bucket            

In [13]:
print(df.describe())

                      date        lat        lon     rain_mm     temp_c  \
count                   72  72.000000  72.000000   72.000000  72.000000   
mean   2020-09-09 02:20:00  19.252382  74.610338   36.789645  27.812760   
min    2020-03-21 00:00:00  18.511496  73.097330    0.000000  17.618174   
25%    2020-07-11 06:00:00  18.800455  73.934897   12.880511  25.139500   
50%    2020-09-22 00:00:00  19.300397  74.625349   37.295284  27.424502   
75%    2020-11-10 00:00:00  19.688545  75.320500   54.129245  30.602387   
max    2020-12-28 00:00:00  19.950690  75.974598  110.959757  38.444024   
std                    NaN   0.450166   0.816926   25.879227   3.981314   

        wind_kph    humidity  river_level_m  soil_moisture    evap_mm  \
count  72.000000   72.000000      72.000000      72.000000  72.000000   
mean   15.121790   70.012511       1.878451      33.780203   6.874076   
min     2.557305   37.586182       0.806351       8.106713   2.480139   
25%    10.865464   60.596765    

In [15]:
print(df.isnull().sum())

date                    0
lat                     0
lon                     0
rain_mm                 0
temp_c                  0
wind_kph                0
humidity                0
river_level_m           0
soil_moisture           0
evap_mm                 0
flood_event             0
drought_index           0
lat_bucket              0
lon_bucket              0
rain_mm_roll3           0
rain_mm_roll7           0
rain_mm_roll14          0
temp_c_roll3            0
temp_c_roll7            0
temp_c_roll14           0
wind_kph_roll3          0
wind_kph_roll7          0
wind_kph_roll14         0
humidity_roll3          0
humidity_roll7          0
humidity_roll14         0
soil_moisture_roll3     0
soil_moisture_roll7     0
soil_moisture_roll14    0
evap_mm_roll3           0
evap_mm_roll7           0
evap_mm_roll14          0
river_level_m_roll3     0
river_level_m_roll7     0
river_level_m_roll14    0
rain_mm_lag1            0
river_level_m_lag1      0
soil_moisture_lag1      0
dtype: int64

In [17]:
# Time-based split (last 20% dates for test)
split_time = df['date'].quantile(0.8)
train_df = df[df['date'] <= split_time].copy()
test_df  = df[df['date'] >  split_time].copy()

print('Train size:', train_df.shape, 'Test size:', test_df.shape)

# Common features
base_feats = [c for c in df.columns if c not in ['date','flood_event','drought_index','lat_bucket','lon_bucket']]
target_flood = 'flood_event'
target_drought = 'drought_index'

X_train_flood, y_train_flood = train_df[base_feats], train_df[target_flood]
X_test_flood,  y_test_flood  = test_df[base_feats],  test_df[target_flood]

X_train_drought, y_train_drought = train_df[base_feats], train_df[target_drought]
X_test_drought,  y_test_drought  = test_df[base_feats],  test_df[target_drought]

num_features = base_feats  # all are numeric here


Train size: (57, 38) Test size: (15, 38)
