In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import joblib

The final dataset kerala_flood_final_withRainfall_WITH_LABELS.csv was created by merging raw flood event records (2018) with geographic and environmental data — including average annual rainfall, distance to rivers, population density, and district-level characteristics (coastal, hilly, riverine). This unified dataset serves as the input for training the flood risk prediction model, ensuring predictions are based on physical factors rather than historical labels alone.

For reference, I added some python files for merging the datasets and creating csv files.


Refer kerala_flood_enhanced.py, kerala_flood_rainfall.py, kerala_flood_with_district.py, kerala_flood_with_population.py, kerala_flood_with_river.py, add_flooded_coulumn.py 
for the respective merged datasets.

In [3]:
# Load final data
df = pd.read_csv(r"/kaggle/input/ai-powered-flood-risk-assistant/data/preprocessed/kerala_flood_final_withRainfall_WITH_LABELS.csv")
df.head()

Unnamed: 0,occurrence,label,.geo,latitude,longitude,district,distance_to_river_km,population_density,avg_annual_rainfall_mm,flooded_2018
0,7,0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",10.709939,76.78004,Palakkad,0.327784,18.12034,6.58044,1.0
1,0,0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",10.227095,76.342112,Ernakulam,0.4077,12.373452,6.58044,1.0
2,1,0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",9.211999,76.665505,Pathanamthitta,0.202187,0.0,6.58044,1.0
3,6,0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",9.366958,76.400502,Alappuzha,0.3868,0.0,6.58044,1.0
4,9,0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",10.296714,76.586903,Thrissur,0.322344,13.890693,6.58044,1.0


In [4]:
# Define geographic context features based on domain knowledge
district_info = {
    "Thiruvananthapuram": {"coastal": True, "hilly": False, "riverine": True},
    "Kollam": {"coastal": True, "hilly": False, "riverine": True},
    "Pathanamthitta": {"coastal": False, "hilly": True, "riverine": True},
    "Alappuzha": {"coastal": True, "hilly": False, "riverine": True},
    "Kottayam": {"coastal": False, "hilly": False, "riverine": True},
    "Idukki": {"coastal": False, "hilly": True, "riverine": True},
    "Ernakulam": {"coastal": False, "hilly": False, "riverine": True},
    "Thrissur": {"coastal": False, "hilly": False, "riverine": True},
    "Palakkad": {"coastal": False, "hilly": True, "riverine": True},
    "Malappuram": {"coastal": False, "hilly": True, "riverine": True},
    "Kozhikode": {"coastal": True, "hilly": False, "riverine": True},
    "Wayanad": {"coastal": False, "hilly": True, "riverine": False},
    "Kannur": {"coastal": True, "hilly": False, "riverine": False},
    "Kasaragod": {"coastal": True, "hilly": False, "riverine": False}
}

In [5]:
# Add geographic features to dataframe
df['is_coastal'] = df['district'].map(lambda x: district_info.get(x, {}).get('coastal', False)).astype(int)
df['is_hilly'] = df['district'].map(lambda x: district_info.get(x, {}).get('hilly', False)).astype(int)
df['is_riverine'] = df['district'].map(lambda x: district_info.get(x, {}).get('riverine', False)).astype(int)

In [6]:
# FEATURES: ONLY PHYSICAL + GEOGRAPHIC — NO LATITUDE OR LONGITUDE
feature_cols = [
    'distance_to_river_km',
    'avg_annual_rainfall_mm',
    'population_density',
    'is_coastal',
    'is_hilly',
    'is_riverine'
]

In [7]:
# Prepare X, y
X = df[feature_cols].fillna(0)  # Fill any remaining NaN with 0
y = df['flooded_2018']          # REAL flood events from 2018 — this is our ground truth

# Log transform population density (helps with skew)
X['population_density_log'] = np.log1p(X['population_density'])
X = X.drop(['population_density'], axis=1)  # Drop raw version


In [8]:
# Final feature list after transformation — MUST MATCH APP EXACTLY
final_feature_names = [
    'distance_to_river_km',
    'avg_annual_rainfall_mm',
    'population_density_log',
    'is_coastal',
    'is_hilly',
    'is_riverine'
]
X.columns = final_feature_names 

In [9]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [10]:
# STRATIFIED K-FOLD CROSS-VALIDATION (5-fold) — TO AVOID OVERFITTING
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    RandomForestClassifier(
        n_estimators=100,           # Reduced to prevent overfitting
        max_depth=8,                # Shallower tree
        min_samples_split=10,       # Require at least 10 samples to split
        min_samples_leaf=5,         # Minimum 5 samples in leaf
        random_state=42,
        class_weight='balanced',
        oob_score=True              # Use out-of-bag score for internal validation
    ),
    X_scaled, y, cv=skf, scoring='accuracy'
)

In [11]:
print("Cross-Validation Results (5-Fold):")
print(f"Mean CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
print(f"CV Scores per fold: {cv_scores}")

Cross-Validation Results (5-Fold):
Mean CV Accuracy: 0.857 (+/- 0.047)
CV Scores per fold: [0.845  0.8925 0.8775 0.8425 0.83  ]


In [12]:
# Now train final model on full dataset (with best params)
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    class_weight='balanced',
    oob_score=True
)

model.fit(X_scaled, y)

# Evaluate on training set 
y_pred_train = model.predict(X_scaled)
train_acc = accuracy_score(y, y_pred_train)
print(f"\nTraining Accuracy: {train_acc:.3f}")
print(f"Out-of-Bag Score: {model.oob_score_:.3f}")

# Split for final holdout test (smaller size to simulate real-world testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"\nFinal Test Accuracy (on held-out 20%): {accuracy_score(y_test, y_pred):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["High Risk", "Low Risk"]))


Training Accuracy: 0.869
Out-of-Bag Score: 0.838

Final Test Accuracy (on held-out 20%): 0.902

Classification Report:
              precision    recall  f1-score   support

   High Risk       0.78      0.77      0.77        86
    Low Risk       0.94      0.94      0.94       314

    accuracy                           0.90       400
   macro avg       0.86      0.85      0.85       400
weighted avg       0.90      0.90      0.90       400



In [13]:
# Feature importance
importances = model.feature_importances_
print("\nTop Feature Importances (Ranked):")
for feat, imp in sorted(zip(final_feature_names, importances), key=lambda x: x[1], reverse=True):
    print(f"{feat}: {imp:.4f}")

# Save artifacts — these will be loaded by app.py
joblib.dump(model, 'flood_model_v2.pkl')
joblib.dump(scaler, 'scaler_v2.pkl')
joblib.dump(final_feature_names, 'feature_names.pkl')

print("\nArtifacts saved: flood_model_v2.pkl, scaler_v2.pkl, feature_names.pkl")



Top Feature Importances (Ranked):
is_hilly: 0.6237
is_riverine: 0.1335
distance_to_river_km: 0.1219
is_coastal: 0.0854
population_density_log: 0.0355
avg_annual_rainfall_mm: 0.0000

Artifacts saved: flood_model_v2.pkl, scaler_v2.pkl, feature_names.pkl
