# Nepal Earthquake

From: [Kaggle Nepal Earthquake](https://www.kaggle.com/datasets/imtkaggleteam/nepal-earthquake)

### Context
The Nepal Earthquake Severity Index is designed to provide an overview of the estimated severity of impacts resulting from the earthquake of 25 April 2015. It is not a replacement for first-hand damage and needs assessment information, but it can support prioritisation during the early stages of the response. It estimates severity based on: 1) the intensity of the earthquake; 2) population; 3) vulnerability of housing and population. This index will be updated to take account of: validation against first-hand reports and improvements to the severity model; improved sources of data (quality, timeliness and scale); and changing requirements as the response continues.

In [None]:
import platform
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns
import sklearn
from sklearn import datasets, decomposition, metrics, preprocessing, utils
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from tqdm.notebook import tqdm

### Data

In [None]:
# Notebook-wide display and plotting configuration.
%config InlineBackend.figure_format="retina"  # For high DPI display

# Seaborn defaults: "darkgrid" style with the "notebook" context.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Initialise plotly's offline notebook mode.
plotly.offline.init_notebook_mode(connected=True)

# Enable tqdm's pandas integration.
tqdm.pandas()

# Print the installed scikit-learn version for reproducibility.
print(sklearn.__version__)  # Version tested on sklearn.__version__ == 1.3.x

### Utility Functions

In [None]:
def pprint_var(**kwargs):
    """Print each keyword argument on its own line as ``name = value``.

    Numeric values are rendered with four significant digits (``.4g``).
    Values that do not support that format spec (for example strings or
    ``None``) are printed using ``str(value)`` instead of raising.

    Args:
        **kwargs: Variables to display, passed as ``name=value`` pairs.
    """
    for name, value in kwargs.items():
        try:
            rendered = f"{value:.4g}"
        except (TypeError, ValueError):
            # The ".4g" spec is only defined for numbers; fall back otherwise.
            rendered = str(value)
        print(f"{name} = {rendered}")

#### Data loading

In [None]:
DATA = Path("data")    # Change to your data folder
# Fail early with an explicit error (asserts are stripped under `python -O`
# and carry no message) if the data folder is missing.
if not DATA.exists():
    raise FileNotFoundError(f"Data folder not found: {DATA.resolve()}")

In [None]:
# Load the raw severity-index CSV from the data folder.
severity_csv = DATA / "nepal-earthquake-severity-index-latest.csv"
eq_raw_df = pd.read_csv(severity_csv, low_memory=False)

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts).
eq_raw_df.info()

In [None]:
# Display the raw column labels.
eq_raw_df.columns

In [None]:
# Mapping from the raw CSV headers to uniform upper-case column names.
rename_dict = {
    "Hazard (Intensity)": "INTENSITY",
    "Exposure": "EXPOSURE",
    "Housing": "HOUSING",
    "Poverty": "POVERTY",
    "Vulnerability": "VULNERABILITY",
    "Severity category": "SEVERITY_CATEGORY",
}

# Columns kept for modelling; REGION already has its final name.
columns = [
    'REGION',
    'Hazard (Intensity)',
    'Exposure',
    'Housing',
    'Poverty',
    'Vulnerability',
    'Severity category',
]

# Work on an independent copy so the raw frame can be released afterwards.
eq_df = eq_raw_df[columns].copy()
eq_df = eq_df.rename(columns=rename_dict)

# The raw frame is no longer needed once the working copy exists.
del eq_raw_df
eq_df

In [None]:
# Discard every row that has a missing value, then rebuild a contiguous
# 0..n-1 index so later positional/index-aligned operations line up.
eq_df = eq_df.dropna(axis="index").reset_index(drop=True)
eq_df.info()

In [None]:
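# Optional pairwise scatter plots of the six numeric columns (positions 1-6; position 0 is REGION).
#sns.pairplot(eq_df.iloc[:, [1,2,3,4,5,6]])
#plt.show()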

In [None]:
# Lower-triangle heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(10, 5))
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = eq_df.corr(numeric_only=True)
# Hide the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(
    corr,
    cbar=True,
    annot=True,
    cmap="viridis",
    mask=mask,
)

In [None]:
# Summary statistics: numeric columns first, then object (string) columns.
numeric_summary = eq_df.describe()
print(numeric_summary)
object_summary = eq_df.describe(include='object')
print(object_summary)

In [None]:
# Display the column labels of the cleaned, renamed frame.
eq_df.columns

In [None]:
# One-hot encode the categorical features.
categorical_features = ['REGION']
enc = OneHotEncoder(handle_unknown='ignore')
# The encoder returns a sparse matrix by default; densify it for the DataFrame.
encoded_values = enc.fit_transform(eq_df[categorical_features]).toarray()
df_features_encoded = pd.DataFrame(
    encoded_values,
    columns=enc.get_feature_names_out(),
    # Reuse eq_df's index: pd.concat(axis=1) aligns on index labels, so a
    # mismatched index would silently introduce NaN rows.
    index=eq_df.index,
)
# Replace the original categorical columns with their one-hot encoded versions.
eq_df = pd.concat(
    [eq_df.drop(columns=categorical_features), df_features_encoded],
    axis=1,
)
eq_df

In [None]:
# Display the column labels after one-hot encoding REGION.
eq_df.columns

In [None]:
# Ordinal encoding of the severity label, from least (0) to most (5) severe.
severity_mapping = {
    "Lowest": 0,
    "Low": 1,
    "Medium-Low": 2,
    "Medium-High": 3,
    "High": 4,
    "Highest": 5,
}
severity_codes = eq_df['SEVERITY_CATEGORY'].map(severity_mapping)

# `.map` silently yields NaN for any label missing from the mapping (this also
# happens if the cell is re-run on already-encoded values). Fail here with the
# offending labels instead of passing NaN targets on to the model.
unmapped_labels = eq_df.loc[severity_codes.isna(), 'SEVERITY_CATEGORY'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unmapped SEVERITY_CATEGORY values: {list(unmapped_labels)}"
    )

eq_df['SEVERITY_CATEGORY'] = severity_codes
eq_df
    

In [None]:
# Separate the target from the features, then hold out 30% of the rows as the
# test set (fixed seed so the split is reproducible).
y = eq_df["SEVERITY_CATEGORY"]
X = eq_df.drop(columns=["SEVERITY_CATEGORY"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Carve a validation set (20% of the remaining training rows) out of the
# training portion. X_train / y_train are rebound to the smaller training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the (rows, columns) shape of the train / validation / test features.
X_train.shape, X_val.shape, X_test.shape

In [None]:
# Fit an ordinary least-squares baseline on the training set and predict the
# (continuous) severity score for the validation set.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_val)



In [None]:
# Report validation performance of the linear-regression baseline.
print("Linear Regression")

# The regression output is continuous. For the classification-style metrics,
# round to the nearest severity code and clip to the valid label range so that
# out-of-range predictions do not create spurious extra classes.
label_min = min(severity_mapping.values())
label_max = max(severity_mapping.values())
y_pred_class = np.clip(np.rint(y_pred), label_min, label_max).astype(int)

# Compute the MSE once; RMSE is derived from it.
mse = metrics.mean_squared_error(y_val, y_pred)

print(f"R2 score: {metrics.r2_score(y_val, y_pred):.4g}")
print(f"Balanced accuracy score: {metrics.balanced_accuracy_score(y_val, y_pred_class):.4g}")
print(f"MAE: {metrics.mean_absolute_error(y_val, y_pred):.4g}")
print(f"MSE: {mse:.4g}")
print(f"RMSE: {np.sqrt(mse):.4g}")

print("Classification report")
print(metrics.classification_report(y_val, y_pred_class))