In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import classification_report, mean_squared_error

In [4]:
df = pd.read_csv("./Nepal_building_with_Intensity.csv")
df.head()

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,...,other_floor_type,position,plan_configuration,condition_post_eq,damage_grade,technical_solution_proposed,superstructure,damage_score,damage_class,intensity
0,120101000000.0,12,1207,120703,1,1,9,288,9,9,...,Not applicable,Not attached,Rectangular,Damaged-Used in risk,3.0,Major repair,has_superstructure_mud_mortar_stone,6.0,medium,5.339286
1,120101000000.0,12,1207,120703,1,1,15,364,9,9,...,Not applicable,Not attached,Rectangular,Damaged-Repaired and used,3.0,Reconstruction,has_superstructure_mud_mortar_stone,6.0,medium,5.339286
2,120101000000.0,12,1207,120703,1,1,20,384,9,9,...,Not applicable,Not attached,Rectangular,Damaged-Repaired and used,2.0,Minor repair,has_superstructure_mud_mortar_stone,4.0,medium,5.339286
3,120101000000.0,12,1207,120703,1,1,20,312,9,9,...,Not applicable,Not attached,Rectangular,Damaged-Repaired and used,2.0,Minor repair,has_superstructure_mud_mortar_stone,4.0,medium,5.339286
4,120101000000.0,12,1207,120703,1,1,30,308,9,9,...,Not applicable,Not attached,Rectangular,Damaged-Repaired and used,1.0,Minor repair,has_superstructure_mud_mortar_stone,2.0,low,5.339286


In [5]:
y_class = df["damage_class"]
y_score = df["damage_score"]

In [6]:
label_cols = ["damage_score", "damage_class", "damage_grade"]

leakage_cols = ["condition_post_eq", "technical_solution_proposed"]

drop_cols = [col for col in label_cols + leakage_cols if col in df.columns]

X = df.drop(columns=drop_cols)

print("Final number of input features:", X.shape[1])
X.head()

Final number of input features: 19


Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,superstructure,intensity
0,120101000000.0,12,1207,120703,1,1,9,288,9,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,has_superstructure_mud_mortar_stone,5.339286
1,120101000000.0,12,1207,120703,1,1,15,364,9,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,has_superstructure_mud_mortar_stone,5.339286
2,120101000000.0,12,1207,120703,1,1,20,384,9,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,has_superstructure_mud_mortar_stone,5.339286
3,120101000000.0,12,1207,120703,1,1,20,312,9,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,has_superstructure_mud_mortar_stone,5.339286
4,120101000000.0,12,1207,120703,1,1,30,308,9,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,has_superstructure_mud_mortar_stone,5.339286


In [7]:
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['building_id', 'district_id', 'vdcmun_id', 'ward_id', 'count_floors_pre_eq', 'count_floors_post_eq', 'age_building', 'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq', 'intensity']
Categorical features: ['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure']


In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [12]:
print(df.isna().sum())

building_id                     0
district_id                     0
vdcmun_id                       0
ward_id                         0
count_floors_pre_eq             0
count_floors_post_eq            0
age_building                    0
plinth_area_sq_ft               0
height_ft_pre_eq                0
height_ft_post_eq               0
land_surface_condition          0
foundation_type                 0
roof_type                       0
ground_floor_type               0
other_floor_type                0
position                        1
plan_configuration              1
condition_post_eq               0
damage_grade                   12
technical_solution_proposed    12
superstructure                  0
damage_score                   12
damage_class                   12
intensity                       0
dtype: int64


In [13]:
# Fill numeric NaN with median
for col in X.select_dtypes(include=['int64', 'float64']).columns:
    X[col] = X[col].fillna(X[col].median())

# Fill categorical NaN with mode
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].fillna(X[col].mode()[0])

In [16]:
# Drop rows with missing labels so train_test_split does not receive NaNs.
mask = y_class.notna() & y_score.notna()
if mask.sum() != len(X):
    dropped = len(X) - mask.sum()
    print(f"Dropping {dropped} rows with missing labels before train/test split.")
X_clean = X[mask].reset_index(drop=True)
y_class_clean = y_class[mask].reset_index(drop=True)
y_score_clean = y_score[mask].reset_index(drop=True)

X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
    X_clean, y_class_clean, y_score_clean, test_size=0.2, random_state=42, stratify=y_class_clean
)

Dropping 12 rows with missing labels before train/test split.


In [30]:
models = {
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "RandomForest": RandomForestClassifier(n_estimators=250, random_state=42, n_jobs= 5),
    "LinearRegression": LinearRegression()
}

trained = {}

for name, model in models.items():

    print(f"\n=== Training {name} ===")
    pipe = Pipeline([("preprocessor", preprocessor), ("model", model)])

    # classification vs regression target
    pipe.fit(X_train, y_class_train if name != "LinearRegression" else y_score_train)
    trained[name] = pipe

    if name != "LinearRegression":
        preds = pipe.predict(X_test)
        print(classification_report(y_class_test, preds))
    else:
        preds = pipe.predict(X_test)
        print("Linear Regression MSE:", mean_squared_error(y_score_test, preds))



=== Training KNN ===
              precision    recall  f1-score   support

        high       0.85      0.85      0.85     91922
         low       0.58      0.55      0.57     15763
      medium       0.58      0.59      0.59     44734

    accuracy                           0.74    152419
   macro avg       0.67      0.66      0.67    152419
weighted avg       0.74      0.74      0.74    152419


=== Training RandomForest ===
              precision    recall  f1-score   support

        high       0.89      0.88      0.89     91922
         low       0.68      0.64      0.66     15763
      medium       0.67      0.70      0.68     44734

    accuracy                           0.80    152419
   macro avg       0.75      0.74      0.74    152419
weighted avg       0.80      0.80      0.80    152419


=== Training LinearRegression ===
Linear Regression MSE: 2.5598603259808934


In [31]:
for name, model in trained.items():
    joblib.dump(model, f"{name}.pkl")

print("Models saved successfully!")

Models saved successfully!


In [19]:
# medium damage sample data
sample = pd.DataFrame([{
    "district_id": 25,
    "vdcmun_id": 1234,
    "ward_id": 5,

    "count_floors_pre_eq": 2,
    "count_floors_post_eq": 1,

    "age_building": 30,
    "plinth_area_sq_ft": 450,
    "height_ft_pre_eq": 22,
    "height_ft_post_eq": 18,

    "land_surface_condition": "Flat",
    "foundation_type": "Cement-Stone/Brick",
    "roof_type": "RCC/RB/RBC",
    "ground_floor_type": "RC",
    "other_floor_type": "Timber-Planck",
    "position": "Not attached",
    "plan_configuration": "Rectangular",
    "superstructure": "Cement-Stone/Brick",

    "Intensity": 6.8
}])


In [None]:
# prepare a copy of the sample and ensure required columns exist (names & missing cols)
s = sample.copy()

# normalize intensity column name if needed
if 'Intensity' in s.columns and 'intensity' not in s.columns:
	s = s.rename(columns={'Intensity': 'intensity'})

# required columns come from the preprocessing used during training
required_cols = numeric_features + categorical_features

for col in required_cols:
	if col not in s.columns:
		if col in numeric_features:
			# fill missing numeric feature with median from training data
			s[col] = X[col].median()
		else:
			# fill missing categorical feature with mode from training data
			s[col] = X[col].mode()[0]

# keep only the features expected by the pipeline (order does not need to match)
s = s[required_cols]

knn = joblib.load("RandomForest.pkl")
pred = knn.predict(s)[0]
print("Random Forest Prediction:", pred)

KNN Prediction: low


In [23]:
# high damage sample data
sample_high = pd.DataFrame([{
    "district_id": 11,
    "vdcmun_id": 501,
    "ward_id": 3,

    "count_floors_pre_eq": 3,
    "count_floors_post_eq": 1,

    "age_building": 45,
    "plinth_area_sq_ft": 150,
    "height_ft_pre_eq": 28,
    "height_ft_post_eq": 10,

    "land_surface_condition": "Moderate slope",
    "foundation_type": "Mud mortar-Stone/Brick",
    "roof_type": "Bamboo/Timber-Light roof",
    "ground_floor_type": "Mud",
    "other_floor_type": "Mud",
    "position": "Attached-2 side",
    "plan_configuration": "L-shape",
    "superstructure": "Mud mortar-Stone/Brick",

    "Intensity": 7.9
}])


In [25]:
# low damage sample data
sample_low = pd.DataFrame([{
    "district_id": 30,
    "vdcmun_id": 2001,
    "ward_id": 9,

    "count_floors_pre_eq": 1,
    "count_floors_post_eq": 1,

    "age_building": 5,
    "plinth_area_sq_ft": 650,
    "height_ft_pre_eq": 12,
    "height_ft_post_eq": 12,

    "land_surface_condition": "Flat",
    "foundation_type": "RCC",
    "roof_type": "RCC/RB/RBC",
    "ground_floor_type": "RC",
    "other_floor_type": "RC",
    "position": "Not attached",
    "plan_configuration": "Rectangular",
    
    "superstructure": "RC",
    "Intensity": 3.1
}])


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Load trained RandomForest pipeline
rf = trained["RandomForest"]

# Get the preprocessor and model inside the pipeline
pre = rf.named_steps['preprocessor']
model = rf.named_steps['model']

# Numeric feature names
num_features = pre.transformers_[0][2]

# Categorical original column names
cat_original = pre.transformers_[1][2]

# Get OneHotEncoder categories
ohe = pre.transformers_[1][1]
cat_expanded = ohe.get_feature_names_out(cat_original)

# Combine all feature names
feature_names = np.concatenate([num_features, cat_expanded])

# Get feature importances
importances = model.feature_importances_

# Sort by importance
idx = np.argsort(importances)[::-1]
feature_names_sorted = feature_names[idx]
importances_sorted = importances[idx]

# Plot
plt.figure(figsize=(12, 8))
plt.barh(feature_names_sorted, importances_sorted)
plt.gca().invert_yaxis()
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()