In [5]:
import os
import osmium
import pandas as pd
import geopandas as gpd
import joblib
import numpy as np
from shapely.geometry import Point

# 📌 Define Paths
osm_file = "/kaggle/input/central-india/central-zone-latest.osm.pbf"
processed_file = "/kaggle/working/processed_places.parquet"
model_file = "/kaggle/working/gis_risk_model.pkl"

# ✅ Check if Processed Data Exists
if os.path.exists(processed_file):
    print("✅ Processed data found! Loading...")
    df_places = pd.read_parquet(processed_file)
else:
    print("❌ Processed data missing! Extracting...")

    # 🔹 OSM Data Extraction
    class OSMHandler(osmium.SimpleHandler):
        def __init__(self):
            super().__init__()
            self.data = []

        def parse_population(self, pop):
            """Handles population values that may be ranges (e.g., '150-200') or invalid."""
            if isinstance(pop, str):
                pop = pop.replace(",", "")  # Remove commas (e.g., "1,000" → "1000")
                if "-" in pop:  # Handle ranges
                    parts = pop.split("-")
                    try:
                        return int((int(parts[0]) + int(parts[1])) / 2)  # Take average
                    except ValueError:
                        return 0  # If parsing fails, set to 0
                elif pop.isdigit():  # If it's a normal number
                    return int(pop)
            return 0  # Default if invalid

        def node(self, n):
            if 'place' in n.tags:
                self.data.append({
                    'id': n.id,
                    'latitude': n.location.lat,
                    'longitude': n.location.lon,
                    'population': self.parse_population(n.tags.get('population', '0')),
                    'place_type': n.tags.get('place', 'unknown')
                })

    # 🔹 Process OSM File
    handler = OSMHandler()
    handler.apply_file(osm_file)

    # 🔹 Convert to DataFrame & Save
    df_places = pd.DataFrame(handler.data)
    df_places.to_parquet(processed_file, index=False)
    print("✅ Data extracted and saved!")

# ✅ Load Saved Data
gdf_places = gpd.GeoDataFrame(df_places, geometry=gpd.points_from_xy(df_places.longitude, df_places.latitude))

# 🔹 Train ML Model (Only If Not Saved)
if os.path.exists(model_file):
    print("✅ Model found! Loading...")
    model = joblib.load(model_file)
else:
    print("❌ Model not found! Training new model...")

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    # Prepare ML Data
    X = df_places[['latitude', 'longitude', 'population']]
    y = df_places['population']  # Example Target Variable

    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Save Model
    joblib.dump(model, model_file)
    print("✅ Model trained and saved!")

# ✅ Predict Risk Scores
df_places['risk_score'] = model.predict(df_places[['latitude', 'longitude', 'population']])
print("✅ Risk Scores Predicted!")

# 🔹 Save Final Data
final_data_file = "/kaggle/working/final_places_with_risk.parquet"
df_places.to_parquet(final_data_file, index=False)
print("✅ Final processed data with risk scores saved!")


❌ Processed data missing! Extracting...
✅ Data extracted and saved!
❌ Model not found! Training new model...
✅ Model trained and saved!
✅ Risk Scores Predicted!
✅ Final processed data with risk scores saved!


 **** Latest Attempt


In [1]:
!pip install osmium osmnx geopandas shapely pandas xgboost folium tqdm

import osmium
import osmnx as ox
import geopandas as gpd
import pandas as pd
import numpy as np
import xgboost as xgb
import folium
from shapely.geometry import Point
from tqdm import tqdm
import json


Collecting osmium
  Downloading osmium-4.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting osmnx
  Downloading osmnx-2.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting geopandas
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Downloading osmium-4.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading osmnx-2.0.1-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.6/99.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geopandas-1.0.1-py3-none-any.whl (323 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.6/323.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: osmium, geopandas, osmnx
  Attempting uninstall: geopandas
    Found existing installation: geo

In [2]:
class OSMHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.data = []

    def node(self, n):
        if 'place' in n.tags:
            self.data.append({
                'latitude': n.location.lat,
                'longitude': n.location.lon,
                'population': int(n.tags.get('population', '0').split('-')[0]) if n.tags.get('population') else 0,
                'place_type': n.tags.get('place', 'unknown')
            })

def extract_osm_data(osm_file, output_csv):
    handler = OSMHandler()
    handler.apply_file(osm_file)
    
    df = pd.DataFrame(handler.data)
    df.to_csv(output_csv, index=False)
    print(f"✅ Extracted GIS data saved to {output_csv}")
    return df

osm_file = "/kaggle/input/central-india/central-zone-latest.osm.pbf"
output_csv = "extracted_gis_data.csv"
gis_data = extract_osm_data(osm_file, output_csv)


✅ Extracted GIS data saved to extracted_gis_data.csv


In [3]:
def preprocess_data(df):
    df.dropna(inplace=True)
    df['risk_score'] = np.log1p(df['population'])  # Log transform population
    df = pd.get_dummies(df, columns=['place_type'], drop_first=True)  # One-hot encoding
    return df

gis_data = preprocess_data(gis_data)
gis_data.to_csv("preprocessed_gis_data.csv", index=False)
print("✅ Preprocessed data saved!")


✅ Preprocessed data saved!


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = gis_data.drop(columns=['risk_score'])
y = gis_data['risk_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"✅ Model Trained! RMSE: {rmse:.4f}")

xgb_model.save_model("xgboost_risk_model.json")
print("✅ Model saved successfully!")


✅ Model Trained! RMSE: 0.0210
✅ Model saved successfully!


In [5]:
gis_data['predicted_risk_score'] = xgb_model.predict(X)

def generate_risk_map(df, output_html="risk_map.html"):
    m = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=7)

    for _, row in df.iterrows():
        color = 'green' if row['predicted_risk_score'] < 1 else 'yellow' if row['predicted_risk_score'] < 2 else 'red'
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=5,
            color=color,
            fill=True,
            fill_color=color
        ).add_to(m)

    m.save(output_html)
    print(f"✅ Risk Map generated: {output_html}")

generate_risk_map(gis_data)


✅ Risk Map generated: risk_map.html


****Final Model****

In [1]:
!pip install osmium osmnx geopandas shapely pandas xgboost folium tqdm

import osmium
import osmnx as ox
import geopandas as gpd
import pandas as pd
import numpy as np
import xgboost as xgb
import folium
from shapely.geometry import Point
from tqdm import tqdm
import json
import random


Collecting osmium
  Downloading osmium-4.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting osmnx
  Downloading osmnx-2.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting geopandas
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Downloading osmium-4.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading osmnx-2.0.1-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.6/99.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geopandas-1.0.1-py3-none-any.whl (323 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.6/323.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: osmium, geopandas, osmnx
  Attempting uninstall: geopandas
    Found existing installation: geo

In [2]:
class OSMHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.data = []

    def node(self, n):
        feature_type = None
        if 'natural' in n.tags:
            feature_type = n.tags['natural']  # Vegetation, water bodies, biodiversity
        elif 'highway' in n.tags:
            feature_type = 'transportation'
        elif 'building' in n.tags:
            feature_type = 'urban_area'
        
        if feature_type:
            self.data.append({
                'latitude': n.location.lat,
                'longitude': n.location.lon,
                'feature': feature_type
            })

def extract_osm_data(osm_file, output_csv):
    handler = OSMHandler()
    handler.apply_file(osm_file)
    
    df = pd.DataFrame(handler.data)
    df.to_csv(output_csv, index=False)
    print(f"✅ Extracted GIS data saved to {output_csv}")
    return df

osm_file = "/kaggle/input/central-india/central-zone-latest.osm.pbf"
output_csv = "extracted_gis_features.csv"
gis_data = extract_osm_data(osm_file, output_csv)


✅ Extracted GIS data saved to extracted_gis_features.csv


In [3]:
def compute_ldi(df, grid_size=0.1):
    df['grid_x'] = (df['latitude'] // grid_size) * grid_size
    df['grid_y'] = (df['longitude'] // grid_size) * grid_size
    
    grouped = df.groupby(['grid_x', 'grid_y', 'feature']).size().unstack(fill_value=0)
    grouped = grouped.div(grouped.sum(axis=1), axis=0)  # Normalize to get % contribution

    return grouped.reset_index()

ldi_data = compute_ldi(gis_data)
ldi_data.to_csv("land_development_index.csv", index=False)
print("✅ Land Development Index (LDI) computed and saved!")


✅ Land Development Index (LDI) computed and saved!


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

ldi_data.fillna(0, inplace=True)

X = ldi_data.drop(columns=['grid_x', 'grid_y'])
y = np.random.rand(len(X))  # Placeholder risk scores (can be replaced with real disaster/crime data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"✅ Model Trained! RMSE: {rmse:.4f}")

xgb_model.save_model("xgboost_risk_model.json")
print("✅ Model saved successfully!")


✅ Model Trained! RMSE: 0.2818
✅ Model saved successfully!


In [5]:
def generate_feature_risk_map(df, selected_feature, output_html="feature_risk_map.html"):
    m = folium.Map(location=[df['grid_x'].mean(), df['grid_y'].mean()], zoom_start=7)

    for _, row in df.iterrows():
        if selected_feature in row:
            risk_score = row[selected_feature]
            color = 'green' if risk_score < 0.3 else 'yellow' if risk_score < 0.6 else 'red'

            folium.CircleMarker(
                location=[row['grid_x'], row['grid_y']],
                radius=5,
                color=color,
                fill=True,
                fill_color=color
            ).add_to(m)

    m.save(output_html)
    print(f"✅ {selected_feature} Risk Map generated: {output_html}")

# Example: Generate map for "urban_area"
generate_feature_risk_map(ldi_data, "urban_area")


✅ urban_area Risk Map generated: feature_risk_map.html
