# Predict Fire Area Notebook

This notebook demonstrates the process of predicting wildfire burn areas using LightGBM, with grid search for hyperparameter tuning.

In [None]:
# Import dependencies

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error

## Data Loading and Preprocessing

In [None]:
# Read the training and test CSV files from the Kaggle input directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/predict-fire/Train.csv',
        '/kaggle/input/predict-fire/Test.csv',
    )
)

In [None]:
# Preview the first 5 rows of the raw training data
train_df.head(n=5)

In [None]:
# Preprocess a DataFrame using a function
    
def wrangle(df):
    """Derive 'area_id', 'month' and 'year' from the 'ID' column.

    Each 'ID' string is parsed as follows: the text before the first
    underscore becomes 'area_id'; the string is then split on dashes, the
    second dash-separated field becomes 'month', and the part of the first
    dash-separated field that follows the underscore becomes 'year'.

    The frame is modified in place (three columns are added and 'ID' is
    dropped) and is also returned. Calling the function again on a frame
    that has already been processed is a no-op, so the notebook cell can be
    re-run without raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'ID' column, or a frame previously processed by
        this function.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    Raises
    ------
    KeyError
        If 'ID' is absent and the derived columns are not all present.
    """
    derived_columns = ['area_id', 'month', 'year']

    # 'ID' is dropped at the end of a successful run, so its absence together
    # with the derived columns means this frame was already processed.
    if 'ID' not in df.columns:
        missing = [col for col in derived_columns if col not in df.columns]
        if missing:
            raise KeyError(
                f"'ID' column not found and derived columns are missing: {missing}"
            )
        return df

    # Split on '-' once and reuse the pieces for both month and year
    dash_parts = df['ID'].str.split('-', expand=True)

    # Extract area_id, month, and year from the ID column
    df['area_id'] = df['ID'].str.split('_', expand=True)[0]
    df['month'] = dash_parts[1].astype('int')
    df['year'] = dash_parts[0].str.split('_', expand=True)[1].astype('int')

    # Remove the redundant ID column
    df.drop(columns='ID', inplace=True)
    return df


# Run the wrangling step on the training data (wrangle modifies train_df in place)
train_df.pipe(wrangle)

In [None]:
# Preview the first 5 rows of the training data after preprocessing
train_df.head(n=5)

## Prepare the data for model training

In [None]:
# Target is the burn area; predictors are every other column except the area identifier
y = train_df['burn_area']
X = train_df.drop(columns=['burn_area', 'area_id'])

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of the predictor matrices for the training and testing splits
print(
    f"Training set shape: {X_train.shape}",
    "------------------------------------",
    f"Testing set shape: {X_test.shape}",
    sep="\n",
)

## Model Training and Evaluation

In [None]:
# Create an RMSE scorer for model evaluation.
# greater_is_better=False tells scikit-learn that this is a loss: the scorer
# returns the negated RMSE, so GridSearchCV (which always maximises the score)
# selects the parameters with the LOWEST RMSE. Scores reported by the search are
# therefore negative RMSE values and must be sign-flipped before display.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_scorer = make_scorer(
    lambda y_true, y_pred: mean_squared_error(y_true, y_pred) ** 0.5,
    greater_is_better=False,
)


# Hyperparameter values to try in the grid search (3 options each, 3**5 = 243 combinations)
param_grid = dict(
    num_leaves=[31, 50, 70],
    max_depth=[-1, 10, 20],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 500],
    reg_alpha=[0, 0.1, 0.5],
)


In [None]:
# Base LightGBM regressor, seeded for repeatable results
lgbm = lgb.LGBMRegressor(random_state=42)

# 5-fold grid search over param_grid, scored with rmse_scorer, with n_jobs=-1 for parallel fits
grid_search = GridSearchCV(
    lgbm,
    param_grid,
    scoring=rmse_scorer,
    n_jobs=-1,
    cv=5,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its sign-flipped cross-validation score
print(
    f"Best parameters found: {grid_search.best_params_}",
    f"Best RMSE score: {-grid_search.best_score_}",
    sep="\n",
)

In [None]:
# Get the best model. grid_search was built without overriding refit, and
# GridSearchCV's default (refit=True) refits the best parameter combination on
# all of the data passed to fit(), i.e. X_train / y_train. best_estimator_ is
# therefore already trained on the full training split and needs no second fit.
best_lgbm = grid_search.best_estimator_

# Make predictions on the held-out test set
y_pred = best_lgbm.predict(X_test)

# Calculate and print the RMSE on the test set.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Test RMSE: {rmse}")

## Feature Importances

In [None]:
# Pair each predictor column with the fitted model's importance value, highest first
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': best_lgbm.feature_importances_}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importances')
fig.tight_layout()
plt.show()