# Estimation of Maximum Annual Peak Flow Discharge for the State of Alabama (MAPFD-AL) using a hybrid approach combining multiple linear regression (MLR) and XGBoost regression (XGBR)

## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle
from sklearn.inspection import permutation_importance
import time
import math
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import pearsonr
from math import sqrt

## Import Datasets

In [2]:
# Read the Alabama river-reach table that the models are applied to.
# 'site_no' is routed through the str converter so it is parsed as text.
NHD = pd.read_csv(
    './Data/NHD_AL.csv',
    converters={'site_no': str},
    low_memory=False,
)

In [3]:
# Split the Alabama river table into reach identifiers and model predictors.
# The predictor order below is kept exactly as originally listed.
id_columns = ['COMID', 'REACHCODE', 'FTYPE']
predictor_columns = [
    'width_bnk', 'depth_bnk', 'width_mean', 'depth_mean',
    'ACC_NDAMS2000', 'CAT_POPDENS00', 'StreamOrde', 'TotDASqKM',
    'NLCD_Developed_%', 'NLCD_Forest_%', 'NLCD_Agriculture_%',
    'D50_mm_', 'Mean_AI', 'CAT_SILTAVE', 'CAT_CLAYAVE', 'CAT_SANDAVE',
    'MINELEVSMO', 'SLOPE', 'QA_cms', 'QC_cms', 'QE_cms',
    'WETINDEXCAT', 'RUNOFFCAT', 'TMEAN8110CAT', 'HYDRLCONDCAT',
]

df_COMID_NHD = NHD.loc[:, id_columns]
X_NHD = NHD.loc[:, predictor_columns]

## Import Models

### 1. XGBR

In [4]:
# Load the final trained XGBR model from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a model file from a trusted source.
with open('./Data/XGBR_best_param', "rb") as model_file:
    XGBR = pickle.load(model_file)

### 2. MLR

In [5]:
# MLR coefficients obtained through JMP software.
# They are combined further down as 10**Intercept * product(variable**coefficient).
Intercept = -0.535243          # enters the formula as a power of 10
TotDASqKM_coeff = 0.5335121    # exponent applied to 'TotDASqKM'
Mean_AI_coeff = -3.659945      # exponent applied to 'Mean_AI'
CAT_SANDAVE_coeff = -0.189748  # exponent applied to 'CAT_SANDAVE'
RUNOFFCAT_coeff = 0.7921756    # exponent applied to 'RUNOFFCAT'

## Apply the MLR-XGBR model to Alabama's streams in NHDPlusV2.1 

In [6]:
# Predict MAPFD with XGBR for every row of the predictor table.
xgb_pred_NHD = XGBR.predict(X_NHD)

# Label the predictions with X_NHD's own index so each value stays attached to
# the row it was computed from, whatever index the input table carries
# (a fresh 0..n-1 index would silently drop or misalign rows otherwise).
xgb_pred_NHD0 = pd.DataFrame(xgb_pred_NHD, index=X_NHD.index, columns=['MAPFD'])

# Identifiers + predictors + prediction, aligned on the shared row index.
xgb_pred_NHD1 = df_COMID_NHD.join(X_NHD).join(xgb_pred_NHD0)

In [7]:
# Fall back to MLR for streams where XGBR produced a negative discharge
# (treated by this notebook as the sign that the stream's independent
# variables lie outside the training range).
Q_peak_mlr = xgb_pred_NHD1[xgb_pred_NHD1['MAPFD'] < 0]

# Work on an independent copy: pd.DataFrame(Q_peak_mlr) would not copy the data,
# so the assignments below would write into a filtered slice of xgb_pred_NHD1
# (SettingWithCopyWarning / unintended aliasing with Q_peak_mlr).
Final_df_mlr = Q_peak_mlr.copy()

# Power-law form of the regression: 10**Intercept * product(variable**coefficient).
Final_df_mlr['MAPFD'] = (
    (10**Intercept)
    * (Q_peak_mlr['TotDASqKM']**TotDASqKM_coeff)
    * (Q_peak_mlr['Mean_AI']**Mean_AI_coeff)
    * (Q_peak_mlr['CAT_SANDAVE']**CAT_SANDAVE_coeff)
    * (Q_peak_mlr['RUNOFFCAT']**RUNOFFCAT_coeff)
)
Final_df_mlr['XGBR_or_MLR'] = 'MLR'

In [8]:
# Keep the XGBR estimate for streams where it is non-negative
# (treated by this notebook as streams within the training range).
Q_peak_xgb = xgb_pred_NHD1[xgb_pred_NHD1['MAPFD'] >= 0]

# Independent copy so that adding the label column does not write into a
# filtered slice of xgb_pred_NHD1. The 'MAPFD' column already holds the XGBR
# prediction for these rows, so it needs no re-assignment.
Final_df_xgb = Q_peak_xgb.copy()
Final_df_xgb['XGBR_or_MLR'] = 'XGBR'

In [9]:
# Stack the MLR rows followed by the XGBR rows, then keep only the reporting columns.
output_columns = ["COMID", "REACHCODE", "FTYPE", "TotDASqKM", "StreamOrde", "MAPFD", "XGBR_or_MLR"]
Final_df = pd.concat([Final_df_mlr, Final_df_xgb])[output_columns]
Final_df.head()

Unnamed: 0,COMID,REACHCODE,FTYPE,TotDASqKM,StreamOrde,MAPFD,XGBR_or_MLR
79,3296990,3130002000419,StreamRiver,0.5121,1.0,16.421934,MLR
104,3298332,3130002000561,StreamRiver,1.5732,2.0,43.959116,MLR
132,3298302,3130002000585,StreamRiver,1.6056,1.0,38.342369,MLR
143,3298470,3130002000595,StreamRiver,1.1133,1.0,33.259024,MLR
146,3298158,3130002000597,StreamRiver,19.2978,2.0,151.709591,MLR


In [10]:
# Descriptive statistics of the estimated MAPFD values
Final_df.loc[:, 'MAPFD'].describe()

count    77421.000000
mean       570.588237
std       1015.470522
min          0.008507
25%         96.768112
50%        247.941711
75%        668.254456
max      12561.292969
Name: MAPFD, dtype: float64

In [11]:
# Write the final estimates to disk. With to_csv's defaults the file is
# comma-separated and includes the row index, despite the .txt extension.
Final_df.to_csv(path_or_buf='./Data/MAPFD-AL.txt')