<a href="https://colab.research.google.com/github/SashankKantamsetti/Flood-Hazard/blob/main/Floods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from google.colab import drive

# Attach Google Drive to the Colab runtime so files under MyDrive become readable.
drive.mount('/content/drive')

# Read the dataset spreadsheet into a DataFrame.
# Edit the path below if your copy of the file lives elsewhere on Drive.
file_path = '/content/drive/MyDrive/cleaned_merged_file.xlsx'
data = pd.read_excel(io=file_path)

# Pull the label column out first, then treat every remaining column as a feature.
# ('flood' is the target column; change it here if your dataset names it differently.)
y = data['flood']
X = data.drop(columns=['flood'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many training rows fall in each class before any resampling.
print("Initial class distribution:", y_train.value_counts(), sep="\n")

# Balance the classes with SMOTE, fitted on the training split only
# (the test split is not resampled).
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Report the per-class counts again after resampling.
print(
    "New class distribution after SMOTE:",
    pd.Series(y_resampled).value_counts(),
    sep="\n",
)

# Ratio of negative (0) to positive (1) labels in the resampled training set,
# passed to XGBoost as scale_pos_weight.
# NOTE(review): the labels were just balanced by SMOTE, so this ratio is presumably
# 1.0 and the weight has no effect — confirm whether it was meant to come from y_train.
negative_count = int((y_resampled == 0).sum())
positive_count = int((y_resampled == 1).sum())
scale_pos_weight = negative_count / positive_count

# Assemble the pipeline: standardise the features, then classify with XGBoost.
scaler_step = ('scaler', StandardScaler())
classifier_step = (
    'model',
    xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, eval_metric='logloss'),
)
model_pipeline = Pipeline(steps=[scaler_step, classifier_step])

# Fit the scaler and the classifier on the resampled training data.
model_pipeline.fit(X_resampled, y_resampled)

# Predict class labels for the held-out test rows.
y_pred = model_pipeline.predict(X_test)

# Compare the predictions with the true test labels and print both summaries.
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", test_confusion, sep="\n")

test_report = classification_report(y_test, y_pred)
print("\nClassification Report:", test_report, sep="\n")

# Score every row whose recorded label is flood=0. The rows come from the full
# original data, so they include rows that were also used for training.
flood_zero_points = data[data['flood'] == 0].drop(columns=['flood'])
predicted_probabilities = model_pipeline.predict_proba(flood_zero_points)

# One row of probabilities per scored point, in the same order as flood_zero_points.
# NOTE(review): the column labels assume predict_proba returns class 0 first and
# class 1 second — confirm against model_pipeline.classes_.
predicted_df = pd.DataFrame(
    predicted_probabilities,
    columns=['probability_no_flood', 'probability_flood'],
)

# Attach the coordinates positionally. flood_zero_points keeps the row labels of
# `data`, while predicted_df has a fresh 0..n-1 index; assigning the Series directly
# would align on those labels and pair probabilities with the wrong coordinates
# (or NaN). Converting to a plain array keeps row i of the probabilities with
# row i of the scored points.
# Adjust 'x_cordinat' / 'y_cordinat' if your coordinate columns are named differently.
predicted_df['x_coordinate'] = flood_zero_points['x_cordinat'].to_numpy()
predicted_df['y_coordinate'] = flood_zero_points['y_cordinat'].to_numpy()

# Save the predictions to a new Excel file on Google Drive (row index omitted).
output_file_path = '/content/drive/MyDrive/predicted_flood_probabilities.xlsx'  # Path to save the output
predicted_df.to_excel(output_file_path, index=False)

print(f"Predicted probabilities saved to {output_file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial class distribution:
flood
0    40215
1     9175
Name: count, dtype: int64
New class distribution after SMOTE:
flood
0    40215
1    40215
Name: count, dtype: int64
Confusion Matrix:
[[9905   84]
 [  46 2313]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9989
           1       0.96      0.98      0.97      2359

    accuracy                           0.99     12348
   macro avg       0.98      0.99      0.98     12348
weighted avg       0.99      0.99      0.99     12348

Predicted probabilities saved to /content/drive/MyDrive/predicted_flood_probabilities.xlsx
