In [3]:
import pandas as pd
import random

# --- 1. Define disaster risk map ---
all_disasters = [
    "Flood", "Earthquake", "Landslide", "Drought", "Severe Storm", 
    "Cyclone", "Heatwave", "Cold Wave"
]

states_and_uts = [
    # States
    "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar", "Chhattisgarh",
    "Goa", "Gujarat", "Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka",
    "Kerala", "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram",
    "Nagaland", "Odisha", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu",
    "Telangana", "Tripura", "Uttar Pradesh", "Uttarakhand", "West Bengal",

    # Union Territories
    "Andaman and Nicobar Islands", "Chandigarh", "Dadra and Nagar Haveli and Daman and Diu", 
    "Delhi", "Jammu and Kashmir", "Ladakh", "Lakshadweep", "Puducherry"
]

disaster_risk_map = {region: all_disasters for region in states_and_uts}


# --- 2. Define checklists for each disaster ---
custom_checklists = {
    "Earthquake": [
        "Secured heavy furniture", "Learned how to turn off utilities", "Emergency contact numbers known",
        "Prepared evacuation plan", "Kept shoes and flashlight beside bed", "First aid kit ready",
        "Important documents accessible", "Practiced earthquake drill", "Know nearest safe zone",
        "Know region's seismic history"
    ],
    "Flood": [
        "Know flood evacuation routes", "Have waterproof bags for documents", "First aid kit ready",
        "Food & water supplies", "Follow flood alerts", "Elevated electrical appliances",
        "Flood insurance taken", "Practiced flood drill", "Emergency contact numbers known",
        "Nearby shelters identified"
    ],
    "Cyclone": [
        "Reinforced windows and doors", "Tree branches trimmed", "Evacuation kit ready",
        "Important documents secured", "Battery-operated radio available", "Emergency contact numbers known",
        "Cyclone alerts followed", "Family trained on safety steps", "Mock drill done", "Water storage prepared"
    ],
    "Drought": [
        "Rainwater harvesting in place", "Water usage optimized", "Stored water for drinking",
        "Drip irrigation used", "Drought-resilient crops selected", "Emergency water plan",
        "Family informed of drought coping methods", "Local drought alerts followed",
        "Check wells and pumps", "Stored food"
    ],
    "Cold Wave": [
        "Warm clothing ready", "Heaters in safe condition", "Insulated home",
        "Emergency contact numbers known", "Backup heating source", "Food & water supplies",
        "Family trained for cold exposure", "Medical needs addressed", "Pets prepared", "Followed cold alerts"
    ],
    "Heatwave": [
        "Hydration plan followed", "Access to cool areas", "Avoided outdoor work during peak hours",
        "First aid for heatstroke known", "Family educated on symptoms", "Fans and AC functional",
        "Windows shaded", "Light cotton clothes used", "Followed heatwave alerts", "Mock drill conducted"
    ],
    "Landslide": [
        "Monitored slope signs", "Retaining walls checked", "Evacuation plan ready",
        "Emergency contact numbers known", "Drainage paths cleared", "Important items secured",
        "Followed weather updates", "Nearby shelters identified", "Practiced landslide drill",
        "Avoided unstable ground"
    ],
    "Severe Storm": [
        "Trimmed trees and shrubs", "Secured outdoor objects", "Emergency kit ready",
        "Listened to storm warnings", "Safe room identified", "Power backups ready",
        "Important papers waterproofed", "Mock drill done", "First aid kit ready", "Emergency contacts updated"
    ],
    "Industrial Hazard": [
        "Know local industry risks", "Toxic leak evacuation plan", "Gas masks and filters ready",
        "Government alerts followed", "Emergency contact numbers known", "Safe routes identified",
        "Community drill participated", "Important documents safe", "Family trained", "Nearby hospitals listed"
    ]
}

# --- 3. Generate synthetic data ---
synthetic_data = []

for state, disaster_types in disaster_risk_map.items():
    for disaster_type in disaster_types:
        for _ in range(100):  # entries per state-disaster combination
            row = {
                "Disaster Type": disaster_type,
                "Location": state,
                "Checklist Type": disaster_type + " Checklist",
                "Household Size": random.randint(1, 10),
                "Disaster Kit Owned": random.randint(0, 1)
            }
            awareness_score = 0
            checklist = custom_checklists.get(disaster_type, [])
            for item in checklist:
                val = random.randint(0, 1)
                row[item] = val
                awareness_score += val
            row["Awareness Score"] = awareness_score
            synthetic_data.append(row)

df_disaster_ready = pd.DataFrame(synthetic_data)

# --- 4. Normalize checklist columns across all rows ---
all_checklist_items = set()
for checklist in custom_checklists.values():
    all_checklist_items.update(checklist)

for item in all_checklist_items:
    if item not in df_disaster_ready:
        df_disaster_ready[item] = 0  # fill missing checklist columns with 0

# --- 5. Preparedness Level Binning ---
df_disaster_ready["Preparedness Level"] = pd.cut(
    df_disaster_ready["Awareness Score"],
    bins=[-1, 3, 7, 10],
    labels=["Needs Urgent Prep", "Moderately Prepared", "Well Prepared"]
)

# --- 6. Derived Feature: Checklist Completion Percentage ---
df_disaster_ready["Checklist Completion %"] = (
    df_disaster_ready[list(all_checklist_items)].sum(axis=1) / 10 * 100
)

# --- 7. Derived Feature: Risk Tier by disaster type ---
risk_tiers = {
    "High": ["Earthquake", "Cyclone", "Flood"],
    "Medium": ["Landslide", "Severe Storm", "Industrial Hazard"],
    "Low": ["Drought", "Heatwave", "Cold Wave"]
}

df_disaster_ready["Risk Tier"] = df_disaster_ready["Disaster Type"].apply(
    lambda x: next((k for k, v in risk_tiers.items() if x in v), "Unknown")
)

# --- 8. Final Checks ---
print("\n✅ Data Generation Summary")
print(f"Total Records: {len(df_disaster_ready)}")
print("\nPreparedness Level Distribution:")
print(df_disaster_ready["Preparedness Level"].value_counts())

# --- 9. Save to CSV ---
df_disaster_ready.to_csv("disaster_preparedness_dataset.csv", index=False)
print("\n📁 Dataset exported as: disaster_preparedness_dataset.csv")



✅ Data Generation Summary
Total Records: 28800

Preparedness Level Distribution:
Preparedness Level
Moderately Prepared    22355
Needs Urgent Prep       4893
Well Prepared           1552
Name: count, dtype: int64

📁 Dataset exported as: disaster_preparedness_dataset.csv


In [4]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd

# --- 1. Prepare features and target ---
features = [
    'Household Size', 'Disaster Kit Owned', 'Checklist Completion %', 
    'Awareness Score', 'Risk Tier'
]

# Convert 'Risk Tier' into numerical categories
df_disaster_ready['Risk Tier'] = df_disaster_ready['Risk Tier'].map({'High': 2, 'Medium': 1, 'Low': 0})

# Encode the target variable
label_encoder = LabelEncoder()
X = df_disaster_ready[features]
y = label_encoder.fit_transform(df_disaster_ready['Preparedness Level'])  # Encoded to 0, 1, 2

# --- 2. Split the data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 3. Train XGBoost ---
model = XGBClassifier(
    random_state=42,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
    n_estimators=200
)
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=True
)

# --- 4. Evaluate ---
y_pred = model.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))  # Show original labels

# --- 5. Save artifacts ---
joblib.dump(model, 'disaster_preparedness_xgb_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
print("\nModel and label encoder saved.")
# Save feature list for consistent ordering in future predictions
joblib.dump(features, 'model_features.pkl')


[0]	validation_0-mlogloss:0.70235
[1]	validation_0-mlogloss:0.47979
[2]	validation_0-mlogloss:0.33756
[3]	validation_0-mlogloss:0.24151
[4]	validation_0-mlogloss:0.17463
[5]	validation_0-mlogloss:0.12717
[6]	validation_0-mlogloss:0.09306
[7]	validation_0-mlogloss:0.06834
[8]	validation_0-mlogloss:0.05031
[9]	validation_0-mlogloss:0.03711
[10]	validation_0-mlogloss:0.02741
[11]	validation_0-mlogloss:0.02027
[12]	validation_0-mlogloss:0.01501
[13]	validation_0-mlogloss:0.01112
[14]	validation_0-mlogloss:0.00824
[15]	validation_0-mlogloss:0.00612
[16]	validation_0-mlogloss:0.00455
[17]	validation_0-mlogloss:0.00338
[18]	validation_0-mlogloss:0.00252
[19]	validation_0-mlogloss:0.00188
[20]	validation_0-mlogloss:0.00141
[21]	validation_0-mlogloss:0.00106
[22]	validation_0-mlogloss:0.00080
[23]	validation_0-mlogloss:0.00061
[24]	validation_0-mlogloss:0.00046
[25]	validation_0-mlogloss:0.00036
[26]	validation_0-mlogloss:0.00029
[27]	validation_0-mlogloss:0.00023
[28]	validation_0-mlogloss:0.0

['model_features.pkl']