In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import os
# List every file available next to the dataset.
# os.walk() must be given a directory: when it is pointed at a file path the
# underlying scandir() error is ignored by default and the loop prints nothing.
dataset_dir = os.path.dirname("/content/flood_risk_dataset_india.csv")
for dirname, _, filenames in os.walk(dataset_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [7]:
# Read the flood-risk CSV into a DataFrame, then preview its first rows.
df = pd.read_csv(
    "/content/flood_risk_dataset_india.csv",
)
df.head()

Unnamed: 0,Latitude,Longitude,Rainfall (mm),Temperature (°C),Humidity (%),River Discharge (m³/s),Water Level (m),Elevation (m),Land Cover,Soil Type,Population Density,Infrastructure,Historical Floods,Flood Occurred
0,18.861663,78.835584,218.999493,34.144337,43.912963,4236.182888,7.415552,377.465433,Water Body,Clay,7276.742184,1,0,1
1,35.570715,77.654451,55.353599,28.778774,27.585422,2472.585219,8.811019,7330.608875,Forest,Peat,6897.736956,0,1,0
2,29.227824,73.108463,103.991908,43.934956,30.108738,977.328053,4.631799,2205.873488,Agricultural,Loam,4361.518494,1,1,1
3,25.361096,85.610733,198.984191,21.569354,34.45369,3683.208933,2.891787,2512.2778,Desert,Sandy,6163.069701,1,1,0
4,12.524541,81.822101,144.626803,32.635692,36.292267,2093.390678,3.188466,2001.818223,Agricultural,Loam,6167.964591,1,0,0


In [8]:
# Count missing values per column (isna is the same method as isnull).
df.isna().sum()

Unnamed: 0,0
Latitude,0
Longitude,0
Rainfall (mm),0
Temperature (°C),0
Humidity (%),0
River Discharge (m³/s),0
Water Level (m),0
Elevation (m),0
Land Cover,0
Soil Type,0


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Columns fed to the model, grouped by the preprocessing each group receives.
numeric_features = ['Latitude', 'Longitude', 'Rainfall (mm)', 'Temperature (°C)', 'Humidity (%)',
                    'River Discharge (m³/s)', 'Water Level (m)', 'Elevation (m)', 'Population Density']

categorical_features = ['Land Cover', 'Soil Type', 'Infrastructure', 'Historical Floods']

# Split features and target.
# Select the feature columns explicitly instead of "everything except the
# target": this keeps X limited to real model inputs even after this cell adds
# 'Flood Probability' to df (e.g. when the cell is re-run).
X = df[numeric_features + categorical_features]
y = df['Flood Occurred']

# Define transformers for preprocessing: scale numeric columns, one-hot encode
# categorical ones.
numeric_transformer = StandardScaler()
# handle_unknown='ignore': the encoder is fitted on X_train only, but
# predict_proba below also scores the held-out rows. Without this option a
# category that is absent from the training split would raise at predict time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build pipeline with preprocessing and classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Predict the probability of flood occurrence (class 1) for every row.
# NOTE: X contains the rows the model was fitted on (the 80% training split),
# so most of these probabilities are in-sample estimates, not held-out ones.
flood_probabilities = pipeline.predict_proba(X)[:, 1]

# Add the flood probability to the dataframe as a new column
df['Flood Probability'] = flood_probabilities

# Show the true label next to the predicted probability.
df[['Flood Occurred', 'Flood Probability']]

Unnamed: 0,Flood Occurred,Flood Probability
0,1,0.59
1,0,0.20
2,1,0.78
3,0,0.54
4,0,0.20
...,...,...
9995,0,0.26
9996,0,0.19
9997,1,0.76
9998,1,0.42


In [11]:
# Display the DataFrame, which now includes the 'Flood Probability' column.
df


Unnamed: 0,Latitude,Longitude,Rainfall (mm),Temperature (°C),Humidity (%),River Discharge (m³/s),Water Level (m),Elevation (m),Land Cover,Soil Type,Population Density,Infrastructure,Historical Floods,Flood Occurred,Flood Probability
0,18.861663,78.835584,218.999493,34.144337,43.912963,4236.182888,7.415552,377.465433,Water Body,Clay,7276.742184,1,0,1,0.59
1,35.570715,77.654451,55.353599,28.778774,27.585422,2472.585219,8.811019,7330.608875,Forest,Peat,6897.736956,0,1,0,0.20
2,29.227824,73.108463,103.991908,43.934956,30.108738,977.328053,4.631799,2205.873488,Agricultural,Loam,4361.518494,1,1,1,0.78
3,25.361096,85.610733,198.984191,21.569354,34.453690,3683.208933,2.891787,2512.277800,Desert,Sandy,6163.069701,1,1,0,0.54
4,12.524541,81.822101,144.626803,32.635692,36.292267,2093.390678,3.188466,2001.818223,Agricultural,Loam,6167.964591,1,0,0,0.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,32.872024,93.434120,285.682635,37.621017,69.795616,4830.703665,5.943965,2850.197900,Agricultural,Clay,6943.559433,0,0,0,0.26
9996,34.027756,69.357605,224.347263,37.935808,38.095486,1866.199787,9.466158,3314.692947,Forest,Clay,3011.997459,1,0,0,0.19
9997,35.454530,76.807256,5.836759,23.087083,79.919607,1523.374305,9.209185,3377.296962,Desert,Clay,7149.938303,1,0,1,0.76
9998,19.527152,80.856280,120.301453,28.029593,61.680873,2036.812638,2.004644,1146.986151,Water Body,Sandy,906.031452,1,0,1,0.42


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor # Import RandomForestRegressor here
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle # Import pickle
# Columns fed to the model, grouped by the preprocessing each group receives.
numeric_features = ['Latitude', 'Longitude', 'Rainfall (mm)', 'Temperature (°C)', 'Humidity (%)',
                    'River Discharge (m³/s)', 'Water Level (m)', 'Elevation (m)', 'Population Density']
categorical_features = ['Land Cover', 'Soil Type', 'Infrastructure', 'Historical Floods']

# Train on the feature columns only.
# Dropping just 'Flood Probability' would leave the ground-truth
# 'Flood Occurred' label inside X; selecting the features explicitly keeps the
# label out and makes the fitted input schema equal to the frame passed at
# prediction time.
X = df[numeric_features + categorical_features]
# Regression target: the 'Flood Probability' column stored in df.
y = df['Flood Probability']

# Preprocess data: scale numeric columns, one-hot encode categorical ones.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])

# Fit on the full dataset (no hold-out split is made in this cell).
pipeline.fit(X, y)

# Save the model (preprocessing + regressor) as a single pickled pipeline.
with open('flood_batata.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [18]:
import pickle
import pandas as pd

# Restore the pipeline that was pickled to disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('flood_batata.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Sanity check: the restored object should be a sklearn Pipeline.
print(type(loaded_model))  # Should be <class 'sklearn.pipeline.Pipeline'>

# One sample observation, keyed by the column names the pipeline was fitted on.
sample_record = {
    'Latitude': 18.861663,
    'Longitude': 78.835584,
    'Rainfall (mm)': 218.999493,
    'Temperature (°C)': 34.144337,
    'Humidity (%)': 43.912963,
    'River Discharge (m³/s)': 4236.182888,
    'Water Level (m)': 7.415552,
    'Elevation (m)': 377.465433,
    'Land Cover': 'Water Body',
    'Soil Type': 'Clay',
    'Population Density': 7276.742184,
    'Infrastructure': 1,
    'Historical Floods': 0,
}
# Wrap each value in a one-element list to build a single-row DataFrame.
new_data = pd.DataFrame({column: [value] for column, value in sample_record.items()})

# The pipeline applies its own preprocessing before the regressor predicts.
predicted_probabilities = loaded_model.predict(new_data)

# Output the predicted flood probability
print(f"Predicted Flood Probability: {predicted_probabilities[0]}")

<class 'sklearn.pipeline.Pipeline'>
Predicted Flood Probability: 0.5654000000000008
