# Model Building and Evaluation

## Importing Packages
In this part, we will use different models (Random Forest, Ridge, and Lasso) to predict the recovery time from a given set of parameters. The code below imports all the libraries necessary to run the models.

In [58]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

First, let's observe the data!

In [59]:
# Load the raw disaster-response records (pandas is already imported in the imports cell)
# and preview the first rows.
df = pd.read_csv("data/global_disaster_response_2018_2024.csv")
df.head()

Unnamed: 0,date,country,disaster_type,severity_index,casualties,economic_loss_usd,response_time_hours,aid_amount_usd,response_efficiency_score,recovery_days,latitude,longitude
0,2021-01-31,Brazil,Earthquake,5.99,111,7934365.71,15.62,271603.79,83.21,67,-30.613,-122.557
1,2018-12-23,Brazil,Extreme Heat,6.53,100,8307648.99,5.03,265873.81,96.18,55,10.859,-159.194
2,2020-08-10,India,Hurricane,1.55,22,765136.99,32.54,49356.49,60.4,22,0.643,-160.978
3,2022-09-15,Indonesia,Extreme Heat,4.55,94,1308251.31,7.83,237512.88,86.41,47,-33.547,30.35
4,2022-09-28,United States,Wildfire,3.8,64,2655864.36,21.9,188910.69,72.81,42,-19.17,-117.137


Let's first identify how many different disaster types and countries are in this data!

In [60]:
# Count the distinct (non-null) values of the two categorical columns.
n_disaster_types = df['disaster_type'].nunique()
n_countries = df['country'].nunique()

print("How many disaster types?", n_disaster_types)
print("How many countries?", n_countries)

How many disaster types? 10
How many countries? 20


Looking at the data, the rows are not organized, so let's first sort the data by date. Then, let's split it into train and test sets. Since the dates run from January 2018 to December 2024, we use January 2018 through December 2022 as the training data and January 2023 through December 2024 as the test data.

In [61]:
# Parse the date strings and order the records chronologically. A new sorted
# frame is bound to `df` instead of sorting with inplace=True.
df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

# A single cutoff makes the two masks exact complements: every dated row lands in
# exactly one split, even if a timestamp on 2022-12-31 carries a time of day.
TEST_START = pd.Timestamp("2023-01-01")

train = df[df['date'] < TEST_START].copy()    # everything before 2023
test  = df[df['date'] >= TEST_START].copy()   # 2023 onwards

Right now, the date data has year, month, and day. So, it would be good to split it into year and month.

In [62]:
# Add calendar features to both splits in place (train/test are independent copies).
for split in (train, test):
    dates = split["date"]
    split["year"], split["month"] = dates.dt.year, dates.dt.month

## Predicting the Recovery Time

After splitting the date, we can drop the "date" column.

In [63]:
# Remove the raw timestamp column from both splits.
train, test = (split.drop(columns=['date']) for split in (train, test))

Since we are interested in the recovery days, we can set y to be the recovery_days column and X to be everything besides recovery_days.

In [64]:
# Separate the prediction target from the feature columns for each split.
target = 'recovery_days'

X_train, y_train = train.drop(columns=[target]), train[target]
X_test, y_test = test.drop(columns=[target]), test[target]

After that is done, we can create two lists, num_features and cat_features, and use them to preprocess the data with StandardScaler and OneHotEncoder. This allows us to scale the numeric columns and encode the categorical columns automatically.

In [65]:
# Numeric columns, grouped by what they describe; the combined order matches
# the order in which they are handed to the scaler.
event_metrics = [
    "severity_index",
    "casualties",
    "economic_loss_usd",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
]
location = ["latitude", "longitude"]
calendar = ["year", "month"]

num_features = [*event_metrics, *location, *calendar]

# Categorical columns to be one-hot encoded.
cat_features = ["country", "disaster_type"]


In [66]:
# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at transform time.
numeric_scaler = StandardScaler()
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, num_features),
        ('cat', categorical_encoder, cat_features),
    ]
)

## Model Fitting
Here, we use Random Forest, Ridge, and Lasso and fit the models, then we predict the outcomes for the given X_test. 

In [67]:
def fit_model(estimator):
    """Fit `estimator` on the training split behind its own preprocessing step.

    Pipeline.fit fits its steps in place, so a single shared ColumnTransformer
    would be refit (and its state silently changed for every pipeline holding it)
    whenever any one pipeline is refit. `clone(preprocessor)` gives each pipeline
    an independent, unfitted copy with the same configuration.

    Returns the fitted Pipeline with steps 'prep' and 'model'.
    """
    pipeline = Pipeline([('prep', clone(preprocessor)), ('model', estimator)])
    return pipeline.fit(X_train, y_train)


random_forest_model = fit_model(RandomForestRegressor(random_state=42))
ridge_model = fit_model(Ridge())
lasso_model = fit_model(Lasso())

# Show the last fitted pipeline as the cell output.
lasso_model

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


After the models have been fit, we can predict the outcome for the given X_test. Once all the predictions are made, we can compute the MSE (Mean Squared Error) to identify the best model and check how close our predictions are to the actual outcomes.

In [75]:
# (Optional) MSE for reference only; main metrics are RMSE/MAE/R2 saved to outputs/ml/metrics.csv
mse_ridge = mean_squared_error(y_test, preds_ridge)
mse_ridge


24.786798609280467

In [76]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Test-set predictions from each fitted pipeline.
preds_random_forest, preds_ridge, preds_lasso = (
    fitted.predict(X_test)
    for fitted in (random_forest_model, ridge_model, lasso_model)
)

# Naive reference: predict the training-set mean for every test row.
baseline_pred = np.full(shape=len(y_test), fill_value=y_train.mean())

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def metrics_row(model_name, y_true, y_pred):
    """Build one row of the comparison table for a single model.

    Returns a dict with the keys "model", "RMSE", "MAE" and "R2" (in that order),
    computed from the true values `y_true` and the predictions `y_pred`.
    """
    row = {"model": model_name}
    row["RMSE"] = rmse(y_true, y_pred)
    row["MAE"] = mean_absolute_error(y_true, y_pred)
    row["R2"] = r2_score(y_true, y_pred)
    return row

# Score every set of predictions against the test target, best (lowest RMSE) first.
predictions_by_model = {
    "baseline_mean": baseline_pred,
    "random_forest": preds_random_forest,
    "ridge": preds_ridge,
    "lasso": preds_lasso,
}
rows = [
    metrics_row(name, y_test, predicted)
    for name, predicted in predictions_by_model.items()
]
results = pd.DataFrame(rows).sort_values("RMSE")

# Persist the comparison table, creating the output folder if needed.
output_dir = Path("outputs/ml")
output_dir.mkdir(parents=True, exist_ok=True)
results.to_csv("outputs/ml/metrics.csv", index=False)

results


Unnamed: 0,model,RMSE,MAE,R2
2,ridge,4.978634,3.970246,0.939425
1,random_forest,5.093356,4.059898,0.936602
3,lasso,5.099404,4.062804,0.936451
0,baseline_mean,20.228813,16.355676,-2.5e-05


In [72]:
from pathlib import Path

# Sanity check: does the exported metrics path exist on disk?
metrics_path = Path("outputs/ml/metrics.csv")
metrics_path.exists()


True

In [74]:
# Write the metrics table to disk (creating the folder if needed) and display it.
export_dir = Path("outputs/ml")
export_dir.mkdir(parents=True, exist_ok=True)
results.to_csv("outputs/ml/metrics.csv", index=False)

results

Unnamed: 0,model,RMSE,MAE,R2
2,ridge,4.978634,3.970246,0.939425
1,random_forest,5.093356,4.059898,0.936602
3,lasso,5.099404,4.062804,0.936451
0,baseline_mean,20.228813,16.355676,-2.5e-05
