In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('Cleaned_flood_risk_prediction_dataset.csv', encoding="latin1")
df.head()

Unnamed: 0,date,latitude,longitude,city,flow_rate,river_level,rain_mm,temp_c,humidity_percentage,wind_speed_2m,wind_speed_10m,wind_direction_deg,landcover_class,elevation_m,impervious_percent,city_population,city_wise_pop_density,soil_class,flood_flag,month
0,2024-01-01,16.781944,74.632222,Arjunwad,12.041433,525.445,0.0,22.68,52.07,0.290215,0.362769,89.118546,Agriculture,543,0.840733,4765.41,476.541,no data,0,1
1,2024-01-02,16.781944,74.632222,Arjunwad,14.514683,525.575,0.0,22.76,51.19,0.239648,0.29956,211.04608,Agriculture,543,0.840733,4765.41,476.541,no data,0,1
2,2024-01-03,16.781944,74.632222,Arjunwad,17.959683,525.745,0.0,23.22,57.06,0.239327,0.299159,132.66367,Agriculture,543,0.840733,4765.41,476.541,no data,0,1
3,2024-01-04,16.781944,74.632222,Arjunwad,58.816118,526.725,0.01,23.82,60.2,1.167319,1.459149,176.85283,Agriculture,543,0.840733,4765.41,476.541,no data,0,1
4,2024-01-05,16.781944,74.632222,Arjunwad,73.497979,526.875,0.03,23.2,57.59,0.451708,0.564635,304.84027,Agriculture,543,0.840733,4765.41,476.541,no data,0,1


In [2]:
df1 = df.copy()

In [3]:
df.shape

(35622, 20)

In [4]:
# === Option B — Full pipeline matched to your dataset column names ===

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# ---------- 0. Make a working copy ----------
df = df1.copy()   # use df variable internally

# ---------- 1. Normalize column names (lowercase, no spaces) and map common variants ----------
orig_cols = list(df.columns)
lc = {c: c.strip().lower().replace(" ", "_") for c in orig_cols}
df.rename(columns=lc, inplace=True)

# mapping common variants to standard names we will use
col_map = {}

# soil
for c in ['soil_texture_class','soil_texture','soil_class','soil_texture_class_']:
    if c in df.columns:
        col_map[c] = 'soil_texture_class'
        break

# Apply mapping
rename_map = {}
for a, b in col_map.items():
    if a in df.columns:
        rename_map[a] = b
df.rename(columns=rename_map, inplace=True)

# Print mapping summary and missing important columns check
print("=== Column mapping summary ===")
for k,v in rename_map.items():
    print(f"{k} -> {v}")
print("=== End mapping ===\n")

=== Column mapping summary ===
soil_class -> soil_texture_class
=== End mapping ===



In [5]:
# Check essential columns
required = ['date','city','flow_rate','river_level','rain_mm','temp_c','humidity_percentage']

# ---------- 2. Drop flood_flag if present ----------
if 'flood_flag' in df.columns:
    df = df.drop(columns=['flood_flag'])


# drop negative values in flow rate and river level
df = df[(df['flow_rate'] >= 0.5) & (df['river_level'] > 0)]



In [6]:
df = df.apply(
    lambda col: col.str.strip() if col.dtype == "object" else col
)

In [7]:
# ---------- 3. Ensure date is datetime and sort ----------
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['city','date']).reset_index(drop=True)

log feauture

In [8]:
# For flow rate, we use log1p to handle zero values (log(0) is undefined)
df['flow_rate_log'] = np.log1p(df['flow_rate'])

# For river level, we can also use log1p
df['river_level_log'] = np.log1p(df['river_level'])

## LAG & ROLLING FEAUTURES 

In [9]:
# ---------- 4. Create lag and rolling features by city ----------
# 1-day lags
df['rain_mm_lag1']     = df.groupby('city')['rain_mm'].shift(1)
df['river_level_lag1'] = df.groupby('city')['river_level_log'].shift(1)
df['flow_rate_lag1']   = df.groupby('city')['flow_rate_log'].shift(1)

In [10]:
# additional flow lags for improved flow model
df['flow_rate_lag2'] = df.groupby('city')['flow_rate_log'].shift(2)

In [11]:
df[['flow_rate_lag2',"flow_rate_log","flow_rate_lag1"]] 

Unnamed: 0,flow_rate_lag2,flow_rate_log,flow_rate_lag1
0,,6.167540,
1,,6.302483,6.167540
2,6.167540,6.233736,6.302483
3,6.302483,6.237259,6.233736
4,6.233736,6.042596,6.237259
...,...,...,...
29497,3.357550,3.357550,3.357550
29498,3.357550,3.357550,3.357550
29499,3.357550,3.357550,3.357550
29500,3.357550,3.357550,3.357550


In [12]:
# 3-day & 7-day rolling sums/means (min_periods=1 so early rows get values)
df['rain_3d_sum']   = df.groupby('city')['rain_mm'].rolling(3, min_periods=1).sum().reset_index(level=0, drop=True)
df['rain_7d_sum']   = df.groupby('city')['rain_mm'].rolling(7, min_periods=1).sum().reset_index(level=0, drop=True)
df['river_7d_mean'] = df.groupby('city')['river_level_log'].rolling(7, min_periods=1).mean().reset_index(level=0, drop=True)
df['flow_3d_mean']  = df.groupby('city')['flow_rate_log'].rolling(3, min_periods=1).mean().reset_index(level=0, drop=True)
df['flow_7d_mean']  = df.groupby('city')['flow_rate_log'].rolling(7, min_periods=1).mean().reset_index(level=0, drop=True)

In [13]:
df[['flow_7d_mean','flow_rate_log','flow_3d_mean']].head(10)

Unnamed: 0,flow_7d_mean,flow_rate_log,flow_3d_mean
0,6.16754,6.16754,6.16754
1,6.235012,6.302483,6.235012
2,6.234586,6.233736,6.234586
3,6.235255,6.237259,6.257826
4,6.196723,6.042596,6.171197
5,6.15292,5.933903,6.071253
6,6.163076,6.224012,6.066837
7,5.83661,3.882279,5.346731
8,5.437568,3.509188,4.538493
9,5.031247,3.389493,3.593653


In [14]:
df.shape

(29502, 30)

In [15]:
# trend features
df['river_diff_1']  = df['river_level_log'] - df['river_level_lag1']
df['rain_change_1'] = df['rain_mm'] - df['rain_mm_lag1']

# Drop rows that do not have essential lag history
df = df.dropna(subset=['rain_mm_lag1','river_level_lag1','flow_rate_lag1']).reset_index(drop=True)


In [16]:
df.shape

(29308, 32)

In [17]:
# ---------- 5. Encode categorical columns (train-based mapping later) ----------
df['soil_class_enc'] = LabelEncoder().fit_transform(df['soil_texture_class'].astype(str))
df['landcover_class_enc'] = LabelEncoder().fit_transform(df['landcover_class'].astype(str))
df['city_enc'] = LabelEncoder().fit_transform(df['city'].astype(str))

In [18]:
df[["city_enc","city"]].value_counts().sort_index(ascending=False)

city_enc  city            
190       Yerramsettipalem    189
189       Yenkampeta          129
188       Wadapally            89
187       Vilayur             207
186       Vennikulam          198
                             ... 
4         Badlapur            212
3         Arjunwad            212
2         Aranmanai Pudhur    174
1         Apati               211
0         Akteshwar           170
Name: count, Length: 191, dtype: int64

In [19]:
df[["landcover_class_enc","landcover_class"]].value_counts().sort_index()

landcover_class_enc  landcover_class
0                    Agriculture        17091
1                    Forest              8853
2                    Grassland            949
3                    Shrubland            212
4                    Urban               1437
5                    Water                766
Name: count, dtype: int64

In [20]:
df[["soil_texture_class","soil_class_enc"]].value_counts().sort_index()

soil_texture_class  soil_class_enc
Clay                0                 13616
Clay loam           1                 12169
Sandy clay loam     2                   212
no data             3                  3311
Name: count, dtype: int64

In [21]:
# ---------- 6. Build month column and split train/test (Jan-Jun train, July test) ----------
df['month'] = df['date'].dt.month
train = df[df['month'] <= 6].copy()
test  = df[df['month'] == 7].copy()
print(f"Train rows: {len(train)}, Test (July) rows: {len(test)}")


Train rows: 24085, Test (July) rows: 5223


In [22]:
df[['rain_mm','rain_3d_sum']].head()

Unnamed: 0,rain_mm,rain_3d_sum
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [23]:
# ---------- 7. Set feature lists for river & flow regressors ----------
base_features = [
    'rain_mm','rain_mm_lag1','rain_3d_sum','rain_7d_sum',
    'temp_c','humidity_percentage',
    'wind_speed_2m','wind_speed_10m','wind_direction_deg',
    'elevation_m','impervious_percent',
    'soil_class_enc','landcover_class_enc'
]

# extra hydrological features
river_features = base_features + [
    'river_level_lag1','river_7d_mean',
    'flow_rate_lag1','flow_3d_mean'
]

flow_features = base_features + [
    'flow_rate_lag1','flow_rate_lag2','flow_3d_mean','flow_7d_mean',
    'river_level_log','river_level_lag1','river_7d_mean'
]

# ensure features exist in df (drop missing ones gracefully)
river_features = [c for c in river_features if c in train.columns]
flow_features  = [c for c in flow_features if c in train.columns]

print("Using base features:", base_features)
print("Using river features:", river_features)
print("Using flow features:", flow_features)

Using base features: ['rain_mm', 'rain_mm_lag1', 'rain_3d_sum', 'rain_7d_sum', 'temp_c', 'humidity_percentage', 'wind_speed_2m', 'wind_speed_10m', 'wind_direction_deg', 'elevation_m', 'impervious_percent', 'soil_class_enc', 'landcover_class_enc']
Using river features: ['rain_mm', 'rain_mm_lag1', 'rain_3d_sum', 'rain_7d_sum', 'temp_c', 'humidity_percentage', 'wind_speed_2m', 'wind_speed_10m', 'wind_direction_deg', 'elevation_m', 'impervious_percent', 'soil_class_enc', 'landcover_class_enc', 'river_level_lag1', 'river_7d_mean', 'flow_rate_lag1', 'flow_3d_mean']
Using flow features: ['rain_mm', 'rain_mm_lag1', 'rain_3d_sum', 'rain_7d_sum', 'temp_c', 'humidity_percentage', 'wind_speed_2m', 'wind_speed_10m', 'wind_direction_deg', 'elevation_m', 'impervious_percent', 'soil_class_enc', 'landcover_class_enc', 'flow_rate_lag1', 'flow_rate_lag2', 'flow_3d_mean', 'flow_7d_mean', 'river_level_log', 'river_level_lag1', 'river_7d_mean']


In [24]:
# ---------- 8. Prepare training data (X,y) ----------
X_train_river = train[river_features].apply(pd.to_numeric, errors='coerce').fillna(0)
y_train_river = train['river_level_log'].astype(float)

X_test_river  = test[river_features].apply(pd.to_numeric, errors='coerce').fillna(0)
y_test_river  = test['river_level_log'].astype(float)

X_train_flow = train[flow_features].apply(pd.to_numeric, errors='coerce').fillna(0)
y_train_flow = train['flow_rate_log'].astype(float)

X_test_flow  = test[flow_features].apply(pd.to_numeric, errors='coerce').fillna(0)
y_test_flow  = test['flow_rate_log'].astype(float)

In [25]:
X_test_river.head()

Unnamed: 0,rain_mm,rain_mm_lag1,rain_3d_sum,rain_7d_sum,temp_c,humidity_percentage,wind_speed_2m,wind_speed_10m,wind_direction_deg,elevation_m,impervious_percent,soil_class_enc,landcover_class_enc,river_level_lag1,river_7d_mean,flow_rate_lag1,flow_3d_mean
146,27.6,30.76,80.42,118.79,27.51,89.09,4.32331,5.404138,197.217,44,1.941088,1,0,2.797281,2.798413,6.82515,6.829394
147,7.28,27.6,65.64,108.72,27.72,88.38,4.921418,6.151772,220.52153,44,1.941088,1,0,2.797281,2.798238,6.823358,6.821801
148,4.97,7.28,39.85,111.25,28.74,82.88,4.283816,5.35477,222.5261,44,1.941088,1,0,2.797281,2.798064,6.816896,6.824446
149,5.93,4.97,18.18,105.64,28.65,82.16,3.655227,4.569034,217.71129,44,1.941088,1,0,2.796671,2.798064,6.833083,6.830305
150,8.83,5.93,19.73,107.43,27.72,86.85,3.134889,3.918611,209.71524,44,1.941088,1,0,2.799109,2.795965,6.840936,6.708616


In [26]:
print(y_test_flow.head(5))
df[df['month'] == 7].head(5)

146    6.823358
147    6.816896
148    6.833083
149    6.840936
150    6.451829
Name: flow_rate_log, dtype: float64


Unnamed: 0,date,latitude,longitude,city,flow_rate,river_level,rain_mm,temp_c,humidity_percentage,wind_speed_2m,...,rain_3d_sum,rain_7d_sum,river_7d_mean,flow_3d_mean,flow_7d_mean,river_diff_1,rain_change_1,soil_class_enc,landcover_class_enc,city_enc
146,2024-07-01,21.89,73.65,Akteshwar,918.065716,15.4,27.6,27.51,89.09,4.32331,...,80.42,118.79,2.798413,6.829394,6.843561,0.0,-3.16,1,0,0
147,2024-07-02,21.89,73.65,Akteshwar,912.146401,15.4,7.28,27.72,88.38,4.921418,...,65.64,108.72,2.798238,6.821801,6.836361,0.0,-20.32,1,0,0
148,2024-07-03,21.89,73.65,Akteshwar,927.047503,15.39,4.97,28.74,82.88,4.283816,...,39.85,111.25,2.798064,6.824446,6.835413,-0.00061,-2.31,1,0,0
149,2024-07-04,21.89,73.65,Akteshwar,934.364471,15.43,5.93,28.65,82.16,3.655227,...,18.18,105.64,2.798064,6.830305,6.834121,0.002438,0.96,1,0,0
150,2024-07-05,21.89,73.65,Akteshwar,632.860341,15.21,8.83,27.72,86.85,3.134889,...,19.73,107.43,2.795965,6.708616,6.775847,-0.013481,2.9,1,0,0


## Scaling

In [27]:

# ---------- 9. Scaling (features) ----------
scaler_river = StandardScaler()
X_train_river_s = scaler_river.fit_transform(X_train_river)
X_test_river_s  = scaler_river.transform(X_test_river)

scaler_flow = StandardScaler()
X_train_flow_s = scaler_flow.fit_transform(X_train_flow)
X_test_flow_s  = scaler_flow.transform(X_test_flow)

In [28]:
# ---------- 10. Train XGBoost regressors ----------
print("Training river level model...")
model_river = XGBRegressor(
    n_estimators=300, max_depth=5, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0
)
model_river.fit(X_train_river_s, y_train_river)

print("Training flow rate model...")
model_flow = XGBRegressor(
    n_estimators=300, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0
)
model_flow.fit(X_train_flow_s, y_train_flow)

Training river level model...
Training flow rate model...


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [29]:

# ---------- 11. Predict July (we hide the actual river_level/flow_rate from inputs) ----------
pred_river_test = model_river.predict(X_test_river_s)
pred_flow_test  = model_flow.predict(X_test_flow_s)

In [30]:
pred_river_test

array([2.815204 , 2.8066943, 2.800273 , ..., 3.0516894, 3.0507176,
       3.0516894], shape=(5223,), dtype=float32)

In [31]:
pred_flow_test

array([6.849785 , 6.824294 , 6.827981 , ..., 3.349975 , 3.349778 ,
       3.3488476], shape=(5223,), dtype=float32)

In [32]:
# Save predictions and actuals into test
#  joining in july column that we predicted
test = test.reset_index(drop=True)
test['pred_river_level'] = pred_river_test
test['pred_flow_rate']   = pred_flow_test
test['actual_river_level'] = y_test_river.values
test['actual_flow_rate']   = y_test_flow.values

## Regression model(MSE & R square test)

In [33]:
# ---------- 12. Evaluate regressors (optional) ----------
mse_r = mean_squared_error(test['actual_river_level'], test['pred_river_level'])
r2_r  = r2_score(test['actual_river_level'], test['pred_river_level'])
mse_f = mean_squared_error(test['actual_flow_rate'], test['pred_flow_rate'])
r2_f  = r2_score(test['actual_flow_rate'], test['pred_flow_rate'])
print("\nRiver level regression: MSE = {:.3f}, R2 = {:.3f}".format(mse_r, r2_r))
print("Flow rate regression:   MSE = {:.3f}, R2 = {:.3f}".format(mse_f, r2_f))


River level regression: MSE = 0.004, R2 = 0.998
Flow rate regression:   MSE = 0.079, R2 = 0.980


In [34]:
w_r = 0.7 
w_f = 0.3

In [37]:
# ---------- Severity score for TRAIN (FINAL & CORRECT) ----------

pred_river_train_log = model_river.predict(scaler_river.transform(X_train_river))
pred_flow_train_log  = model_flow.predict(scaler_flow.transform(X_train_flow))

train = train.reset_index(drop=True)

# Log predictions
train['pred_river_level_log'] = pred_river_train_log
train['pred_flow_rate_log']   = pred_flow_train_log

# Back to real scale
train['pred_river_level_real'] = np.expm1(train['pred_river_level_log'])
train['pred_flow_rate_real']   = np.expm1(train['pred_flow_rate_log'])

train['actual_river_level_real'] = np.expm1(train['river_level_log'])
train['actual_flow_rate_real']   = np.expm1(train['flow_rate_log'])


# Percentage error (safe now)
train['river_pct_error'] = (
    (train['pred_river_level_real'] - train['actual_river_level_real']).abs()
    / train['actual_river_level_real']
)

train['flow_pct_error'] = (
    (train['pred_flow_rate_real'] - train['actual_flow_rate_real']).abs()
    / train['actual_flow_rate_real']
)


train['severity_score'] = (
    w_r * train['river_pct_error'] +
    w_f * train['flow_pct_error']
)

In [44]:
train["severity_score"].sort_values(ascending=False)

16521    4.021342
16491    3.845744
16485    2.080492
16497    1.206395
16503    1.059250
           ...   
17098    0.000043
13226    0.000043
6186     0.000030
17728    0.000030
12089    0.000029
Name: severity_score, Length: 24085, dtype: float64

In [39]:
# Convert log predictions back to original scale
test['pred_river_level_real'] = np.expm1(test['pred_river_level'])
test['pred_flow_rate_real']   = np.expm1(test['pred_flow_rate'])

test['actual_river_level_real'] = np.expm1(test['actual_river_level'])
test['actual_flow_rate_real']   = np.expm1(test['actual_flow_rate'])

In [40]:
test['river_pct_error'] = (
    (test['pred_river_level_real'] - test['actual_river_level_real']).abs()
    / test['actual_river_level_real']
)

test['flow_pct_error'] = (
    (test['pred_flow_rate_real'] - test['actual_flow_rate_real']).abs()
    / test['actual_flow_rate_real']
)

test['severity_score'] = (
    w_r * test['river_pct_error'] +
    w_f * test['flow_pct_error']
)

In [42]:
test['severity_score'].sort_values(ascending=False)

4515    1.870297
4508    1.756130
4518    1.131092
4523    1.080082
4521    1.079836
          ...   
3723    0.000233
3726    0.000221
2823    0.000198
3733    0.000185
2251    0.000135
Name: severity_score, Length: 5223, dtype: float64

In [45]:
# ---------- 14. Risk categories based on TRAIN quantiles ----------
q30 = train['severity_score'].quantile(0.30)
q50 = train['severity_score'].quantile(0.50)
q75 = train['severity_score'].quantile(0.75)
q90 = train['severity_score'].quantile(0.90)
print("\nSeverity thresholds (train quantiles):")
print("30%:", q30, "50%:", q50, "75%:", q75, "90%:", q90)




Severity thresholds (train quantiles):
30%: 0.005861666934474487 50%: 0.011517445549434836 75%: 0.024495091696205937 90%: 0.04552943619668622


In [46]:
def severity_to_label(x):
    if x < q30:
        return 'No Risk'
    elif x < q50:
        return 'Low'
    elif x < q75:
        return 'Medium'
    elif x < q90:
        return 'High'
    else:
        return 'Very High'

test['risk_category'] = test['severity_score'].apply(severity_to_label)

In [47]:
# ---------- 14. Risk categories based on TEST quantiles ----------
train['risk_category'] = train['severity_score'].apply(severity_to_label)


In [48]:
# ---------- 15. Output summaries ----------
print("\nTest set risk category counts:")
print(test['risk_category'].value_counts())


Test set risk category counts:
risk_category
Very High    2412
High         1032
Medium        920
Low           431
No Risk       428
Name: count, dtype: int64


In [49]:
print("TRAIN risk category counts:")
print(train.shape)
print(train['risk_category'].value_counts())


TRAIN risk category counts:
(24085, 45)
risk_category
No Risk      7226
Medium       6021
Low          4816
High         3613
Very High    2409
Name: count, dtype: int64


In [50]:
display_cols = ['date','city','pred_river_level','actual_river_level','pred_flow_rate','actual_flow_rate','severity_score','risk_category']
print("\nSample predictions (first 10 rows of July test):")
print(test[display_cols].shape)
print(test[display_cols].head(10).to_string(index=False))


Sample predictions (first 10 rows of July test):
(5223, 8)
      date      city  pred_river_level  actual_river_level  pred_flow_rate  actual_flow_rate  severity_score risk_category
2024-07-01 Akteshwar          2.815204            2.797281        6.849785          6.823358        0.021524        Medium
2024-07-02 Akteshwar          2.806694            2.797281        6.824294          6.816896        0.009280           Low
2024-07-03 Akteshwar          2.800273            2.796671        6.827981          6.833083        0.004218       No Risk
2024-07-04 Akteshwar          2.799989            2.799109        6.844987          6.840936        0.001875       No Risk
2024-07-05 Akteshwar          2.790237            2.785628        6.727748          6.451829        0.098919     Very High
2024-07-06 Akteshwar          2.793436            2.768832        6.285167          6.099796        0.079840     Very High
2024-07-07 Akteshwar          2.771956            2.757475        5.972147     

In [51]:
# ---------- 16. Save results ----------
outfn = "july_flood_risk_prediction_Severity_score.csv"
test.to_csv(outfn, index=False)
print(f"\nSaved test predictions and risk to '{outfn}'")


Saved test predictions and risk to 'july_flood_risk_prediction_Severity_score.csv'


In [52]:
outfn = "jan-june_flood_risk_prediction_Severity_score.csv"
train.to_csv(outfn, index=False)
print(f"\nSaved test predictions and risk to '{outfn}'")


Saved test predictions and risk to 'jan-june_flood_risk_prediction_Severity_score.csv'


In [53]:
# ---------- 17. Show feature importance for both models ----------
def show_xgb_importance(xgb_model, feature_names, topn=15):
    booster = xgb_model.get_booster()
    imp = booster.get_score(importance_type='gain')
    if not imp:
        print("No importances found.")
        return
    fi = pd.DataFrame([{'f':k,'gain':v} for k,v in imp.items()])
    mapping = {f"f{i}": name for i,name in enumerate(feature_names)}
    fi['feature'] = fi['f'].map(mapping)
    fi = fi.sort_values('gain', ascending=False).reset_index(drop=True)
    print(fi[['feature','gain']].head(topn).to_string(index=False))

In [54]:
print("\nTop features for river model:")
show_xgb_importance(model_river, X_train_river.columns.tolist())


Top features for river model:
            feature       gain
        elevation_m 236.185379
   river_level_lag1 113.950897
      river_7d_mean 101.673309
 impervious_percent   0.666327
     soil_class_enc   0.053869
     flow_rate_lag1   0.030893
       rain_mm_lag1   0.029903
        rain_3d_sum   0.025590
             temp_c   0.025187
            rain_mm   0.024274
       flow_3d_mean   0.023955
humidity_percentage   0.022509
 wind_direction_deg   0.020661
        rain_7d_sum   0.019794
      wind_speed_2m   0.019108


In [55]:
show_xgb_importance(model_flow, X_train_flow.columns.tolist())

            feature       gain
       flow_3d_mean 184.563019
       flow_7d_mean  21.679045
     flow_rate_lag1   2.289185
     flow_rate_lag2   2.211925
 impervious_percent   0.645724
        elevation_m   0.283087
        rain_3d_sum   0.262062
       rain_mm_lag1   0.244521
   river_level_lag1   0.230154
    river_level_log   0.220885
        rain_7d_sum   0.218897
landcover_class_enc   0.215964
      river_7d_mean   0.201453
     soil_class_enc   0.156375
 wind_direction_deg   0.144244


In [56]:
combined_df = pd.concat([train, test], axis=0, ignore_index=True)

## Merging 2 dataframe into 1 Cleaned dataframe

In [57]:
combined_df.shape

(29308, 49)

In [58]:
combined_data = combined_df.drop(columns= 
                                 ["river_diff_1","rain_change_1", "soil_class_enc", "landcover_class_enc", "city_enc","pred_river_level", "pred_flow_rate",
                                  	"actual_river_level",	"actual_flow_rate","pred_river_level_real", "pred_flow_rate_real", "actual_river_level_real",
                                    "actual_flow_rate_real", "river_pct_error", "flow_pct_error","severity_score"])

In [59]:
combined_data = combined_data.rename(
    columns={
    'rain_mm_lag1'     : 'yest_rain_mm',
    'river_level_lag1' : 'yest_river_lvl',
    'flow_rate_lag1'   : 'yest_flow_rate',
    'flow_rate_lag2'   : '2day_b4_flow_rate',
    'rain_3d_sum'      : '3day_of_rain_mm',
    'rain_7d_sum'      : '7day_of_rain_mm',
    'river_7d_mean'	   : '7day_of_avg_river_lvl',
    'flow_3d_mean'     : '3day_of_avg_flow_rate',	
    'flow_7d_mean'     : '7day_of_avg_flow_rate',
    })

In [60]:
combined_data.shape

(29308, 33)

In [61]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29308 entries, 0 to 29307
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   29308 non-null  datetime64[ns]
 1   latitude               29308 non-null  float64       
 2   longitude              29308 non-null  float64       
 3   city                   29308 non-null  object        
 4   flow_rate              29308 non-null  float64       
 5   river_level            29308 non-null  float64       
 6   rain_mm                29308 non-null  float64       
 7   temp_c                 29308 non-null  float64       
 8   humidity_percentage    29308 non-null  float64       
 9   wind_speed_2m          29308 non-null  float64       
 10  wind_speed_10m         29308 non-null  float64       
 11  wind_direction_deg     29308 non-null  float64       
 12  landcover_class        29308 non-null  object        
 13  e

In [62]:
outfn = "cleaned_flood_risk_prediction_risk_cat_dataset1.csv"
combined_data.to_csv(outfn, index=False)
print(f"\nSaved test predictions and risk to '{outfn}'")


Saved test predictions and risk to 'cleaned_flood_risk_prediction_risk_cat_dataset1.csv'
