In [1]:
!pip install kagglehub




In [2]:
import pandas as pd
import numpy as np
import kagglehub
import os
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
import kagglehub
import os

# Download dataset
path = kagglehub.dataset_download("arbethi/rainfall-dataset")

print("Dataset downloaded at:", path)

# Check files inside folder
print("Files inside dataset folder:")
print(os.listdir(path))


Using Colab cache for faster access to the 'rainfall-dataset' dataset.
Dataset downloaded at: /kaggle/input/rainfall-dataset
Files inside dataset folder:
['flood dataset.xlsx', 'rainfall in india 1901-2015.xlsx']


In [4]:
import pandas as pd

# Build correct file path
file_path = os.path.join(path, "flood dataset.xlsx")

# Read Excel file
df = pd.read_excel(file_path)

print("First 5 rows:")
display(df.head())

print("\nColumn Names:")
print(df.columns)

print("\nDataset Info:")
df.info()


First 5 rows:


Unnamed: 0,Temp,Humidity,Cloud Cover,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec,avgjune,sub,flood
0,29,70,30,3248.6,73.4,386.2,2122.8,666.1,274.866667,649.9,0
1,28,75,40,3326.6,9.3,275.7,2403.4,638.2,130.3,256.4,1
2,28,75,42,3271.2,21.7,336.3,2343.0,570.1,186.2,308.9,0
3,29,71,44,3129.7,26.7,339.4,2398.2,365.3,366.066667,862.5,0
4,31,74,40,2741.6,23.4,378.5,1881.5,458.1,283.4,586.9,0



Column Names:
Index(['Temp', 'Humidity', 'Cloud Cover', 'ANNUAL', 'Jan-Feb', 'Mar-May',
       'Jun-Sep', 'Oct-Dec', 'avgjune', 'sub', 'flood'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temp         115 non-null    int64  
 1   Humidity     115 non-null    int64  
 2   Cloud Cover  115 non-null    int64  
 3   ANNUAL       115 non-null    float64
 4   Jan-Feb      115 non-null    float64
 5   Mar-May      115 non-null    float64
 6   Jun-Sep      115 non-null    float64
 7   Oct-Dec      115 non-null    float64
 8   avgjune      115 non-null    float64
 9   sub          115 non-null    float64
 10  flood        115 non-null    int64  
dtypes: float64(7), int64(4)
memory usage: 10.0 KB


In [5]:
X = df.drop("flood", axis=1)
y = df["flood"]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [8]:
from sklearn.tree import DecisionTreeClassifier

dt_params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_params,
    cv=5,
    scoring='accuracy'
)

dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_

dt_train_acc = best_dt.score(X_train, y_train)
dt_test_acc = best_dt.score(X_test, y_test)

print("Decision Tree Train Accuracy:", dt_train_acc)
print("Decision Tree Test Accuracy:", dt_test_acc)


Decision Tree Train Accuracy: 1.0
Decision Tree Test Accuracy: 0.9565217391304348


In [9]:
from sklearn.ensemble import RandomForestClassifier

rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}

rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    scoring='accuracy'
)

rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_

rf_train_acc = best_rf.score(X_train, y_train)
rf_test_acc = best_rf.score(X_test, y_test)

print("Random Forest Train Accuracy:", rf_train_acc)
print("Random Forest Test Accuracy:", rf_test_acc)


Random Forest Train Accuracy: 1.0
Random Forest Test Accuracy: 0.9565217391304348


In [10]:
from sklearn.ensemble import GradientBoostingClassifier

gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_params,
    cv=5,
    scoring='accuracy'
)

gb_grid.fit(X_train, y_train)

best_gb = gb_grid.best_estimator_

gb_train_acc = best_gb.score(X_train, y_train)
gb_test_acc = best_gb.score(X_test, y_test)

print("Gradient Boosting Train Accuracy:", gb_train_acc)
print("Gradient Boosting Test Accuracy:", gb_test_acc)


Gradient Boosting Train Accuracy: 1.0
Gradient Boosting Test Accuracy: 0.9565217391304348


In [11]:
print("\nFINAL MODEL COMPARISON")
print("------------------------")
print("Decision Tree Test:", dt_test_acc)
print("Random Forest Test:", rf_test_acc)
print("Gradient Boosting Test:", gb_test_acc)



FINAL MODEL COMPARISON
------------------------
Decision Tree Test: 0.9565217391304348
Random Forest Test: 0.9565217391304348
Gradient Boosting Test: 0.9565217391304348


In [12]:
final_model = best_gb   # Gradient Boosting model

print("Final Model Train Accuracy:", final_model.score(X_train, y_train))
print("Final Model Test Accuracy:", final_model.score(X_test, y_test))


Final Model Train Accuracy: 1.0
Final Model Test Accuracy: 0.9565217391304348


In [13]:
cv_scores = cross_val_score(final_model, X_train, y_train, cv=5)

print("Cross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


In [14]:
print("Train Accuracy:", final_model.score(X_train, y_train))
print("Test Accuracy:", final_model.score(X_test, y_test))


Train Accuracy: 1.0
Test Accuracy: 0.9565217391304348


In [15]:
from sklearn.metrics import confusion_matrix

y_pred = final_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Confusion Matrix:
 [[20  0]
 [ 1  2]]


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.67      0.80         3

    accuracy                           0.96        23
   macro avg       0.98      0.83      0.89        23
weighted avg       0.96      0.96      0.95        23



In [17]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(final_model, X, y, cv=5)

print("Cross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross Validation Scores: [0.95652174 1.         1.         1.         1.        ]
Mean CV Accuracy: 0.9913043478260869


In [18]:
sample = [[28,75,40,3326.6,9.3,275.7,2403.4,638.2,130.3,256.4]]

sample_scaled = scaler.transform(sample)

prediction = final_model.predict(sample_scaled)

if prediction[0] == 1:
    print("Flood Predicted")
else:
    print("No Flood Predicted")


No Flood Predicted




In [19]:
prob = final_model.predict_proba(sample_scaled)
print("Prediction Probability:", prob)


Prediction Probability: [[9.99051530e-01 9.48469627e-04]]


In [20]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = final_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[20  0]
 [ 1  2]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.67      0.80         3

    accuracy                           0.96        23
   macro avg       0.98      0.83      0.89        23
weighted avg       0.96      0.96      0.95        23



In [21]:
# ================================
# 1️⃣ IMPORT LIBRARIES
# ================================

!pip install kagglehub -q

import pandas as pd
import numpy as np
import kagglehub
import os
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [22]:
# ================================
# 2️⃣ DOWNLOAD DATASET AUTOMATICALLY
# ================================

path = kagglehub.dataset_download("arbethi/rainfall-dataset")
file_path = os.path.join(path, "flood dataset.xlsx")

df = pd.read_excel(file_path)

print("Dataset Shape:", df.shape)
df.head()


Using Colab cache for faster access to the 'rainfall-dataset' dataset.
Dataset Shape: (115, 11)


Unnamed: 0,Temp,Humidity,Cloud Cover,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec,avgjune,sub,flood
0,29,70,30,3248.6,73.4,386.2,2122.8,666.1,274.866667,649.9,0
1,28,75,40,3326.6,9.3,275.7,2403.4,638.2,130.3,256.4,1
2,28,75,42,3271.2,21.7,336.3,2343.0,570.1,186.2,308.9,0
3,29,71,44,3129.7,26.7,339.4,2398.2,365.3,366.066667,862.5,0
4,31,74,40,2741.6,23.4,378.5,1881.5,458.1,283.4,586.9,0


In [23]:
# ================================
# 3️⃣ SPLIT FEATURES AND TARGET
# ================================

X = df.drop("flood", axis=1)
y = df["flood"]


In [24]:
# ================================
# 4️⃣ TRAIN TEST SPLIT (STRATIFIED)
# ================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y   # important for imbalance
)


In [25]:
# ================================
# 5️⃣ FEATURE SCALING
# ================================

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [26]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight='balanced',   # fixes imbalance
    random_state=42
)

rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)

rf_train_acc = rf_model.score(X_train, y_train)
rf_test_acc = rf_model.score(X_test, y_test)

print("Random Forest Train Accuracy:", rf_train_acc)
print("Random Forest Test Accuracy:", rf_test_acc)


Random Forest Train Accuracy: 1.0
Random Forest Test Accuracy: 0.9565217391304348


In [27]:
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gb_model.fit(X_train, y_train)

gb_pred = gb_model.predict(X_test)

gb_train_acc = gb_model.score(X_train, y_train)
gb_test_acc = gb_model.score(X_test, y_test)

print("Gradient Boosting Train Accuracy:", gb_train_acc)
print("Gradient Boosting Test Accuracy:", gb_test_acc)


Gradient Boosting Train Accuracy: 1.0
Gradient Boosting Test Accuracy: 0.9565217391304348


In [28]:
print("\n--- RANDOM FOREST ---")
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

print("\n--- GRADIENT BOOSTING ---")
print(confusion_matrix(y_test, gb_pred))
print(classification_report(y_test, gb_pred))



--- RANDOM FOREST ---
[[20  0]
 [ 1  2]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.67      0.80         3

    accuracy                           0.96        23
   macro avg       0.98      0.83      0.89        23
weighted avg       0.96      0.96      0.95        23


--- GRADIENT BOOSTING ---
[[20  0]
 [ 1  2]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.67      0.80         3

    accuracy                           0.96        23
   macro avg       0.98      0.83      0.89        23
weighted avg       0.96      0.96      0.95        23



In [29]:
cv_scores = cross_val_score(gb_model, X, y, cv=5)

print("Cross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross Validation Scores: [0.95652174 1.         1.         1.         1.        ]
Mean CV Accuracy: 0.9913043478260869


In [30]:
if gb_test_acc >= rf_test_acc:
    final_model = gb_model
    print("Selected Model: Gradient Boosting")
else:
    final_model = rf_model
    print("Selected Model: Random Forest")


Selected Model: Gradient Boosting


In [31]:
sample = [[28,75,40,3326.6,9.3,275.7,2403.4,638.2,130.3,256.4]]

sample_scaled = scaler.transform(sample)

prediction = final_model.predict(sample_scaled)
probability = final_model.predict_proba(sample_scaled)

print("Prediction:", "Flood" if prediction[0] == 1 else "No Flood")
print("Probability:", probability)


Prediction: No Flood
Probability: [[9.99999994e-01 5.52877886e-09]]




In [32]:
flood_cases = df[df["flood"] == 1]
display(flood_cases.head())


Unnamed: 0,Temp,Humidity,Cloud Cover,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec,avgjune,sub,flood
1,28,75,40,3326.6,9.3,275.7,2403.4,638.2,130.3,256.4,1
6,29,74,40,3671.1,23.7,328.0,2737.8,581.7,256.966667,669.5,1
11,28,77,40,3451.3,16.9,351.1,2453.1,630.2,316.066667,730.9,1
22,28,70,30,3484.7,25.3,202.3,2928.4,328.6,240.833333,642.5,1
23,30,71,41,4226.4,22.2,363.0,3451.3,389.9,337.233333,826.3,1


In [33]:
no_flood_cases = df[df["flood"] == 0]
display(no_flood_cases.head())


Unnamed: 0,Temp,Humidity,Cloud Cover,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec,avgjune,sub,flood
0,29,70,30,3248.6,73.4,386.2,2122.8,666.1,274.866667,649.9,0
2,28,75,42,3271.2,21.7,336.3,2343.0,570.1,186.2,308.9,0
3,29,71,44,3129.7,26.7,339.4,2398.2,365.3,366.066667,862.5,0
4,31,74,40,2741.6,23.4,378.5,1881.5,458.1,283.4,586.9,0
5,30,70,38,2708.0,34.1,230.0,1943.1,500.8,138.3,254.1,0


In [34]:
manual_df = pd.DataFrame([{
    "Temp": 28,
    "Humidity": 75,
    "Cloud Cover": 42,
    "ANNUAL": 3271.2,
    "Jan-Feb": 21.7,
    "Mar-May": 336.3,
    "Jun-Sep": 2343.0,
    "Oct-Dec": 570.1,
    "avgjune": 186.200,
    "sub": 308.9
}])

manual_scaled = scaler.transform(manual_df)

prediction = final_model.predict(manual_scaled)

print("Actual:", 1)
print("Predicted:", prediction[0])


Actual: 1
Predicted: 0


In [35]:
import joblib

joblib.dump(final_model, "floods.save")
joblib.dump(scaler, "scaler.save")

print("Model and Scaler saved successfully!")


Model and Scaler saved successfully!


In [36]:
from google.colab import files

files.download("floods.save")
files.download("scaler.save")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
print(df["flood"].value_counts())


flood
0    99
1    16
Name: count, dtype: int64


In [38]:
from sklearn.metrics import roc_auc_score

y_prob = final_model.predict_proba(X_test)[:,1]

roc_score = roc_auc_score(y_test, y_prob)
print("ROC-AUC Score:", roc_score)


ROC-AUC Score: 0.8333333333333333


In [39]:
from sklearn.metrics import roc_auc_score

y_prob = final_model.predict_proba(X_test)[:,1]

roc_score = roc_auc_score(y_test, y_prob)
print("ROC-AUC Score:", roc_score)


ROC-AUC Score: 0.8333333333333333


In [40]:
full_pred = final_model.predict(scaler.transform(X))

full_accuracy = accuracy_score(y, full_pred)

print("Full Dataset Accuracy:", full_accuracy)


Full Dataset Accuracy: 0.991304347826087


In [41]:
custom_pred = (y_prob > 0.3).astype(int)

print(confusion_matrix(y_test, custom_pred))
print(classification_report(y_test, custom_pred))


[[20  0]
 [ 1  2]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.67      0.80         3

    accuracy                           0.96        23
   macro avg       0.98      0.83      0.89        23
weighted avg       0.96      0.96      0.95        23



In [42]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import cross_val_score
import numpy as np

print("========================================")
print("        FINAL MODEL EVALUATION")
print("========================================\n")

# ================================
# 1️⃣ Train & Test Accuracy
# ================================
train_acc = final_model.score(X_train, y_train)
test_acc = final_model.score(X_test, y_test)

print("Train Accuracy:", round(train_acc*100,2), "%")
print("Test Accuracy:", round(test_acc*100,2), "%")

# ================================
# 2️⃣ Cross Validation
# ================================
cv_scores = cross_val_score(final_model, scaler.transform(X), y, cv=5)
print("\nCross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", round(cv_scores.mean()*100,2), "%")

# ================================
# 3️⃣ Predictions
# ================================
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

# ================================
# 4️⃣ Confusion Matrix
# ================================
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ================================
# 5️⃣ Classification Report
# ================================
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ================================
# 6️⃣ Additional Metrics
# ================================
roc_score = roc_auc_score(y_test, y_prob)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("ROC-AUC Score:", round(roc_score,4))
print("Precision (Flood):", round(precision,4))
print("Recall (Flood):", round(recall,4))
print("F1-Score (Flood):", round(f1,4))

# ================================
# 7️⃣ Full Dataset Accuracy
# ================================
full_pred = final_model.predict(scaler.transform(X))
full_accuracy = accuracy_score(y, full_pred)

print("\nFull Dataset Accuracy:", round(full_accuracy*100,2), "%")

print("\n========================================")
print("      MODEL EVALUATION COMPLETE")
print("========================================")


        FINAL MODEL EVALUATION

Train Accuracy: 100.0 %
Test Accuracy: 95.65 %

Cross Validation Scores: [0.95652174 1.         1.         1.         1.        ]
Mean CV Accuracy: 99.13 %

Confusion Matrix:
[[20  0]
 [ 1  2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.67      0.80         3

    accuracy                           0.96        23
   macro avg       0.98      0.83      0.89        23
weighted avg       0.96      0.96      0.95        23

ROC-AUC Score: 0.8333
Precision (Flood): 1.0
Recall (Flood): 0.6667
F1-Score (Flood): 0.8

Full Dataset Accuracy: 99.13 %

      MODEL EVALUATION COMPLETE
