In [27]:
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Step 1: Data Cleaning
# Read the CSV, keep only the modelling columns, and discard every row whose
# target (`Total Damage (USD, adjusted)`) is missing.
selected_columns = [
    'Year', 'Country', 'Disaster Group', 'Disaster Type', 'Disaster Subtype',
    'Total Events', 'Total Affected', 'Total Deaths',
    'Total Damage (USD, adjusted)', 'CPI',
]
data = (
    pd.read_csv("cleaned_disaster_data.csv")  # Path to your uploaded CSV
    .loc[:, selected_columns]
    .dropna(subset=['Total Damage (USD, adjusted)'])
)

# Missing death / affected counts become 0; no other column is filled here.
data = data.fillna({'Total Deaths': 0, 'Total Affected': 0})

In [28]:
# Step 2: Apply Log Transformation to the Target Variable
# log1p(x) = log(1 + x); the evaluation cell inverts it with np.expm1.
data['Log_Total_Damage'] = np.log1p(data['Total Damage (USD, adjusted)'])

# Step 3: Encoding Categorical Variables
# One-hot encode categorical variables (Country, Disaster Group, Disaster Type, Disaster Subtype).
# drop='first' removes one indicator column per feature.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back only on releases that do not know it yet.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
categorical_columns = ['Country', 'Disaster Group', 'Disaster Type', 'Disaster Subtype']
# NOTE(review): the encoder is fitted on the full dataset, i.e. before the
# train/test split below, so categories that only occur in test rows are known to it.
encoded_features = encoder.fit_transform(data[categorical_columns])

# Create a DataFrame for encoded features (it gets a fresh 0..n-1 index)
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

# Combine encoded features with numerical data.
# `data` still carries the index left over from dropna, so it is reset to
# 0..n-1 here; otherwise concat(axis=1) would align on mismatched labels.
numerical_columns = ['Year', 'Total Events', 'Total Affected', 'Total Deaths', 'CPI']
data_combined = pd.concat([data[numerical_columns].reset_index(drop=True), encoded_df], axis=1)
target = data['Log_Total_Damage'].reset_index(drop=True)

# Step 4: Train-Test Split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(data_combined, target, test_size=0.2, random_state=42)

# Step 5: Train the XGBoost Model on the log-transformed target
xgb_model = XGBRegressor(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train, y_train)



In [29]:
# Step 6: Model Evaluation
# The model predicts log1p(damage); undo the transform on both predictions and
# held-out targets so the metrics are reported on the original scale.
y_pred_log = xgb_model.predict(X_test)
y_pred, y_test_actual = np.expm1(y_pred_log), np.expm1(y_test)

# Score the predictions with each metric in turn.
mae, mse, r2 = (
    metric(y_test_actual, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Display Evaluation Metrics
for label, value in (("MAE:", mae), ("MSE:", mse), ("R2 Score:", r2)):
    print(label, value)

MAE: 583936258.9608086
MSE: 1.7131656256962744e+19
R2 Score: 0.6631903405888429


In [30]:
# Save the trained model and its preprocessing artefacts using pickle.
# No scaler is fitted anywhere in this pipeline, so the undefined `scaler`
# reference is replaced by what inference actually needs: the fitted one-hot
# encoder and the exact feature column order the model was trained on.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute arbitrary code.
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(
        {
            'model': xgb_model,
            'encoder': encoder,
            'feature_columns': list(data_combined.columns),
        },
        f,
    )