In [4]:
import numpy as np
import pandas as pd
import matplotlib
import sklearn

## **Exploring the dataset**

In [5]:
# Read the training CSV into a DataFrame and preview its first five rows.
# NOTE(review): the directory name 'datastet' looks like a typo for 'dataset';
# it is kept as-is because it must match the folder on disk — confirm before renaming.
src_data = pd.read_csv(filepath_or_buffer='datastet/train_data.csv')
src_data.head(n=5)

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [6]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
src_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 22 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   id                               1117957 non-null  int64  
 1   MonsoonIntensity                 1117957 non-null  int64  
 2   TopographyDrainage               1117957 non-null  int64  
 3   RiverManagement                  1117957 non-null  int64  
 4   Deforestation                    1117957 non-null  int64  
 5   Urbanization                     1117957 non-null  int64  
 6   ClimateChange                    1117957 non-null  int64  
 7   DamsQuality                      1117957 non-null  int64  
 8   Siltation                        1117957 non-null  int64  
 9   AgriculturalPractices            1117957 non-null  int64  
 10  Encroachments                    1117957 non-null  int64  
 11  IneffectiveDisasterPreparedness  1117957 non-null 

In [7]:
# Display the full list of column labels (the head() preview elides the middle ones).
src_data.columns

Index(['id', 'MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability'],
      dtype='object')

### **Generating an EDA report on the data with ydata-profiling (formerly pandas-profiling)**

In [21]:
from ydata_profiling import ProfileReport

# Build an explorative profiling report over the full training frame and
# export it as a standalone HTML file next to the notebook.
# The recorded run spent roughly 7.5 minutes in the "Summarize dataset" step.
profile_report = ProfileReport(
    src_data,
    title="eda report",
    explorative=True,
)
profile_report.to_file("eda_Report.html")

# Confirmation message once the export has finished.
print("EDA report saved successfully as eda_Report.html!")


Summarize dataset: 100%|██████████| 515/515 [07:33<00:00,  1.13it/s, Completed]                                                               
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.92s/it]
Render HTML: 100%|██████████| 1/1 [00:20<00:00, 20.70s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  5.89it/s]

EDA report saved successfully as eda_Report.html!





In [8]:

# Separate the training frame into the regression target and the model inputs:
#   y = the 'FloodProbability' column (target)
#   X = every other column of src_data, i.e. 'id' plus the 20 factor columns
#       ('MonsoonIntensity' ... 'PoliticalFactors'); 'FloodProbability' is excluded.
# NOTE(review): 'id' is kept as a feature here; it looks like a plain row
# identifier — confirm whether it should be excluded before modelling.
y = src_data.loc[:, 'FloodProbability']
X = src_data.drop('FloodProbability', axis=1)

In [9]:
# Preview the feature frame to confirm the target column is no longer present.
X.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,0,5,8,5,8,6,4,4,3,3,...,2,5,3,3,5,4,7,5,7,3
1,1,6,7,4,4,8,8,3,5,4,...,9,7,2,0,3,5,3,3,4,3
2,2,6,5,6,7,3,7,1,5,4,...,6,7,3,7,5,6,8,2,3,3
3,3,3,4,6,5,4,8,4,7,6,...,5,2,4,7,4,4,6,5,7,5
4,4,5,3,2,6,4,4,3,3,3,...,5,2,2,6,6,4,1,2,3,5


In [10]:
# Preview the target series (FloodProbability values).
y.head()

0    0.445
1    0.450
2    0.530
3    0.535
4    0.415
Name: FloodProbability, dtype: float64

In [11]:
# Split the data into training and test sets, holding out 25% of the rows for
# evaluation. A fixed random_state makes the shuffle deterministic, so the
# split -- and every metric computed from it below -- is reproducible after a
# kernel restart instead of changing on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### *Training the baseline models (basic models that serve as a reference for comparison)*

1. **training on RandomForestRegressor**

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Training: baseline random forest with 100 trees; random_state is fixed so
# the fitted model is repeatable for a given training set.
ran_forest=RandomForestRegressor(n_estimators=100, random_state=42)
ran_forest.fit(X_train, y_train)

# Predicting on the held-out test data.
y_pred_RanForest=ran_forest.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# original call raises a TypeError on current versions.
print(f"RMSE: {mean_squared_error(y_test, y_pred_RanForest) ** 0.5}")
print(f"R^2: {r2_score(y_test, y_pred_RanForest)}")

RMSE: 0.030401061403143904
R^2: 0.6461105515264456


2. **XGBoost (better performance)**

In [14]:
import xgboost as xgb

# Training: gradient-boosted trees with a squared-error objective, 300 rounds.
xgbModel = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300)
xgbModel.fit(X_train, y_train)

# Metrics / evaluation on the held-out test data.
# `mean_squared_error` comes from the sklearn.metrics import in the
# RandomForest cell above, so that cell must have been run first.
y_predXGB=xgbModel.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
print(f"RMSE: {mean_squared_error(y_test, y_predXGB) ** 0.5}")


RMSE: 0.021334202824177608


*Since we have a total of 21 supporting variables for predicting floods in an area, it is important to decide which of them to prioritize and which not to