#Project Title: Wildfire Prediction Challenge
#Name: Kibon Kiprono Solomon
#Date: 25/07/2024
Problem Statement: Each year, thousands of fires blaze across the African continent. Some are natural occurrences, part of a ‘fire cycle’ that can actually benefit some dryland ecosystems. Many are started intentionally, used to clear land or to prepare fields for planting. And some are wildfires, which can rage over large areas and cause huge amounts of damage. Whatever the cause, fires pour vast amounts of CO2 into the atmosphere, along with smoke that degrades air quality for those living downwind.

Figuring out the dynamics that influence where and when these fires occur can help us to better understand their effects. And predicting how these dynamics will play out in the future, under different climatic conditions, could prove extremely useful.

Project goal: The objective of this challenge is to create a machine-learning model capable of predicting the burned area at different locations for the period 2014 to 2016.


In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs read further down
# are loaded from the 'My Drive' folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import scipy.stats as stats

# Read the three competition files (training set, test set and the sample
# submission template) from the challenge folder on the mounted Drive.
data_dir = '/content/drive/My Drive/fire-extent-prediction-challenge/'
train, test, ss = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# Derive calendar features from the date embedded in each row's ID.
def _date_from_id(row_id):
    """Return the second '_'-separated field of an ID string."""
    return row_id.split('_')[1]


# Apply the same transformation to both frames (train first, then test);
# the new columns are added in place.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['ID'].apply(_date_from_id))
    date_parts = frame['date'].dt
    frame['year'] = date_parts.year
    frame['month'] = date_parts.month
    frame['day'] = date_parts.day

# Separate predictors from the target. 'ID', 'date' and 'day' are removed
# from the feature set together with the target column itself.
X = train.drop(['burn_area', 'ID', 'date', 'day'], axis=1)
y = train['burn_area']

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Wrap the scaled arrays back into DataFrames. Besides the column names, the
# original row index is carried over so X_train / X_val keep the same index
# labels as y_train / y_val; without it the frames get a fresh 0..n-1 index
# and any index-aligned operation between features and target would silently
# pair the wrong rows.
X_train = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_val = pd.DataFrame(X_val_scaled, columns=X.columns, index=X_val.index)

# Base regressor whose hyperparameters are tuned below.
xgb_reg = XGBRegressor(random_state=42)

# Sampling distributions for the randomized search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from [low, high).
param_dist = dict(
    n_estimators=stats.randint(50, 400),
    learning_rate=stats.uniform(0.01, 0.3),
    max_depth=stats.randint(3, 10),
    subsample=stats.uniform(0.6, 0.4),
    colsample_bytree=stats.uniform(0.6, 0.4),
    reg_alpha=stats.uniform(0, 1),
    reg_lambda=stats.uniform(0, 1),
)

# Evaluate 100 sampled configurations with 5-fold cross-validation, scored
# by negative mean squared error, using all available cores.
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the winning estimator and report its hyperparameters.
best_model = random_search.best_estimator_
print("Best parameters found: ", random_search.best_params_)

# Rank the features by the importance scores of the tuned model and show
# the full ranking, most important first.
feature_importances = best_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print("Feature importances:")
print(importance_df)

# Keep the names of the 8 highest-ranked features for the reduced model.
top_features = importance_df['Feature'].head(8).values
print("Top features selected for prediction:", top_features)

# Restrict both splits to the selected feature columns.
X_train_top, X_val_top = X_train[top_features], X_val[top_features]

# Refit the tuned estimator on the reduced feature set (the same best_model
# object is refitted), then predict on the hold-out split.
best_model.fit(X_train_top, y_train)
y_val_pred = best_model.predict(X_val_top)

# Report validation error as RMSE (square root of the mean squared error).
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE) with top features:", rmse)

# Build the test feature matrix by selecting exactly the columns the scaler
# was fitted on, in the same order. Dropping 'ID'/'date'/'day' instead would
# only work if Test.csv happened to share the training column layout; an
# extra or reordered column would either fail inside the scaler or be
# scaled with another feature's statistics.
test2 = test[X.columns]
test2 = pd.DataFrame(scaler.transform(test2), columns=X.columns, index=test.index)
test2_top = test2[top_features]

# Predict the burn area for the test set using the model refitted on the
# top features.
preds = best_model.predict(test2_top)

# Add predictions to the submission dataframe.
# NOTE(review): this assigns by position, so it assumes the sample
# submission rows are in the same order as Test.csv - confirm.
ss['burn_area'] = preds

# Constrain predictions to the closed range [0, 1].
ss['burn_area'] = ss['burn_area'].clip(0, 1)

# Write the submission to Drive (without the index column), then read the
# file back to check what was actually written.
submission_path = '/content/drive/My Drive/fire-extent-prediction-challenge/starter_submissionXGB43.csv'
ss.to_csv(submission_path, index=False)
ss = pd.read_csv(submission_path)

# Show the first rows and the distribution of predicted burn_area values.
print(ss.head())
print(ss['burn_area'].value_counts())


Mounted at /content/drive
Best parameters found:  {'colsample_bytree': 0.6281664523398175, 'learning_rate': 0.07267561528460804, 'max_depth': 9, 'n_estimators': 391, 'reg_alpha': 0.3586467812961639, 'reg_lambda': 0.25416364906973876, 'subsample': 0.7181162353675755}
Feature importances:
          Feature  Importance
28          month    0.092137
3     climate_def    0.086243
0             lat    0.066755
22    landcover_5    0.063254
1             lon    0.062257
16      elevation    0.055239
13    climate_vap    0.045919
27           year    0.041125
9    climate_srad    0.040619
26  precipitation    0.036791
8    climate_soil    0.034956
21    landcover_4    0.031205
15     climate_vs    0.030798
11   climate_tmmn    0.030439
6      climate_pr    0.030236
23    landcover_6    0.029895
5     climate_pet    0.028714
2     climate_aet    0.028300
12   climate_tmmx    0.025582
19    landcover_2    0.024234
7      climate_ro    0.022332
14    climate_vpd    0.021941
4    climate_pdsi    0