In [3]:
import pandas as pd
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [16]:
# 1. Define the correct path to the dataset
file_path = 'bangalore_urban_flood_prediction.csv'

# 2. Load the data
df = pd.read_csv(file_path)

print("✅ Bangalore dataset loaded successfully.")
df.info()

✅ Bangalore dataset loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Latitude                   3000 non-null   float64
 1   Longitude                  3000 non-null   float64
 2   Altitude                   3000 non-null   float64
 3   Rainfall_Intensity         3000 non-null   float64
 4   Temperature                3000 non-null   float64
 5   Humidity                   3000 non-null   float64
 6   Atmospheric_Pressure       3000 non-null   float64
 7   River_Level                3000 non-null   float64
 8   Drainage_Capacity          3000 non-null   float64
 9   Drainage_System_Condition  3000 non-null   int64  
 10  Population_Density         3000 non-null   float64
 11  Urbanization_Level         3000 non-null   int64  
 12  flood                      3000 non-null   int64  
dtypes: floa

In [17]:
# The 'flood' column is what we want to predict (0 for no, 1 for yes)
target_name = 'flood'

# All other columns will be used as features
feature_names = df.columns.drop(target_name).tolist()

X = df[feature_names]
y = df[target_name]

print("Target variable:", target_name)
print("\nFeatures used for training:", feature_names)

Target variable: flood

Features used for training: ['Latitude', 'Longitude', 'Altitude', 'Rainfall_Intensity', 'Temperature', 'Humidity', 'Atmospheric_Pressure', 'River_Level', 'Drainage_Capacity', 'Drainage_System_Condition', 'Population_Density', 'Urbanization_Level']


In [None]:
# 1. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 2. Initialize and fit the scaler on the training data ONLY
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Use the same scaler for the test data

print("✅ Data has been split and scaled.")

: 

In [21]:
# 1. Initialize the model WITH the class_weight parameter
# This is the crucial change that will fix the bias.
model = RandomForestClassifier(
    n_estimators=100, 
    random_state=42, 
    n_jobs=-1,
    class_weight='balanced'  # <--- THE FIX IS HERE
)

# 2. Train the model
model.fit(X_train_scaled, y_train)

print("✅ Bangalore-specific model training complete with balanced class weights.")

✅ Bangalore-specific model training complete with balanced class weights.


In [22]:
# 1. Define the directory to save the model files
artifacts_dir = 'ml_artifacts'
os.makedirs(artifacts_dir, exist_ok=True)

# 2. Save the trained model
with open(os.path.join(artifacts_dir, 'model.pkl'), 'wb') as f:
    pickle.dump(model, f)
print(f"✅ Model saved to '{os.path.join(artifacts_dir, 'model.pkl')}'")

# 3. Save the fitted scaler
with open(os.path.join(artifacts_dir, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)
print(f"✅ Scaler saved to '{os.path.join(artifacts_dir, 'scaler.pkl')}'")

# 4. Save the list of feature names
with open(os.path.join(artifacts_dir, 'model_features.pkl'), 'wb') as f:
    pickle.dump(feature_names, f)
print(f"✅ Feature list saved to '{os.path.join(artifacts_dir, 'model_features.pkl')}'")

✅ Model saved to 'ml_artifacts/model.pkl'
✅ Scaler saved to 'ml_artifacts/scaler.pkl'
✅ Feature list saved to 'ml_artifacts/model_features.pkl'
