In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import warnings 
warnings.simplefilter('ignore')
%matplotlib inline

In [2]:
# Load the forestfires CSV into a DataFrame; the relative path is resolved against the current working directory.
dataset = pd.read_csv("../datasets/forestfire_dataset/forestfires.csv")

In [3]:
# Preview the first rows to check the column names and value formats.
dataset.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [4]:
# Partition the column names: object-dtype columns are the categorical ones,
# every other column except the target 'area' counts as a numerical feature.
cat_data = dataset.select_dtypes(include='object').columns.tolist()
num_data = [name for name in dataset.columns if name not in cat_data]
num_data.remove('area')  # 'area' is the prediction target, not a feature

print('the categoraical columns in the dataset is: ', cat_data)
print('the numerical columns in the dataset is: ', num_data)

the categoraical columns in the dataset is:  ['month', 'day']
the numerical columns in the dataset is:  ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']


In [5]:
# Build a de-duplicated, independent copy so the raw `dataset` frame stays untouched.
data_no_dup = dataset.drop_duplicates().copy()
# Sanity check: displays the rows still flagged as duplicates (expected to be none).
data_no_dup.loc[data_no_dup.duplicated()]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area


In [6]:
# Target transform: base-10 log of (area + 1); the +1 keeps rows with area == 0 finite (log10(1) = 0).
data_no_dup['log-area'] = np.log10(data_no_dup['area'] + 1)

In [7]:
from scipy.stats import zscore

# Rows whose 'area' value lies three or more standard deviations away from the column mean.
y_outliers = data_no_dup.loc[np.abs(zscore(data_no_dup['area'])) >= 3]
y_outliers

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,log-area
237,1,2,sep,tue,91.0,129.5,692.6,7.0,18.8,40,2.2,0.0,212.88,2.33017
238,6,5,sep,sat,92.5,121.1,674.4,8.6,25.1,27,4.0,0.0,1090.84,3.038159
415,8,6,aug,thu,94.8,222.4,698.6,13.9,27.5,27,4.9,0.0,746.28,2.873483
479,7,4,jul,mon,89.2,103.9,431.6,6.4,22.6,57,4.9,0.0,278.53,2.446428


In [8]:
def area_cat(area):
    """Bucket a burned-area value into a damage label.

    A value equal to 0 maps to "No damage". Otherwise the label belongs to the
    first inclusive upper bound (1, 25, 100) that the value does not exceed;
    a value that fits none of the bounds is labelled "very high".
    """
    if area == 0.0:
        return "No damage"

    # (inclusive upper bound, label) pairs, checked in ascending order.
    upper_bounds = ((1, "low"), (25, "moderate"), (100, "high"))
    for limit, label in upper_bounds:
        if area <= limit:
            return label
    return "very high"

# Label every row with its damage bucket, then show a random handful of rows as a spot check.
data_no_dup['damage_category'] = data_no_dup['area'].map(area_cat)
data_no_dup.sample(n=5)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,log-area,damage_category
279,4,4,dec,mon,85.4,25.4,349.7,2.6,4.6,21,8.5,0.0,9.77,1.032216,moderate
222,4,3,mar,mon,87.6,52.2,103.8,5.0,11.0,46,5.8,0.0,36.85,1.578066,high
416,6,3,jul,tue,92.7,164.1,575.8,8.9,26.3,39,3.1,0.0,7.02,0.904174,moderate
166,6,5,aug,wed,92.1,111.2,654.1,9.6,16.6,47,0.9,0.0,2.29,0.517196,moderate
173,4,4,sep,mon,90.9,126.5,686.5,7.0,17.7,39,2.2,0.0,3.07,0.609594,moderate


In [9]:
# Columns whose distribution shape is inspected below.
out_columns = ['area','FFMC','ISI','rain']
# One-hot encode 'day' and 'month', dropping the first level of each.
df = pd.get_dummies(data_no_dup, drop_first=True, columns=['day','month'])

# Skewness and kurtosis of the log1p-transformed columns; `df` itself is not modified here.
log_preview = df[out_columns].apply(np.log1p)
print(log_preview.skew())
print('-'*20)
print(log_preview.kurtosis())

area     1.219398
FFMC   -11.630232
ISI     -0.931885
rain    14.117710
dtype: float64
--------------------
area      0.961880
FFMC    184.053886
ISI       2.546805
rain    232.413847
dtype: float64


In [10]:
# Keep only rows whose FFMC lies within 3 standard deviations of the column mean.
# The mask is a 1-D boolean Series aligned to df's index, so the row filter
# below is an ordinary boolean selection.
ffmc_z = pd.Series(zscore(df['FFMC'].to_numpy()), index=df.index)
mask = ffmc_z.abs() < 3

# Since most of the values in rain are 0.0, convert it to a binary indicator (1 = any rain, 0 = none)
df['rain'] = (df['rain'] > 0.0).astype('int64')

# Take an explicit copy so later column assignments write to an independent
# frame instead of a filtered slice of the previous one.
df = df.loc[mask].copy()
df.shape

(506, 30)

In [11]:
# NOTE: this cell mutates shared state and is not safe to re-run on its own:
# a second run raises ValueError here ('rain' is already gone from the list),
# and without that error the log1p below would be applied twice.
# 'rain' was already turned into a 0/1 flag in the previous cell, so it is left out of the transform.
out_columns.remove('rain')
# Overwrites the listed columns with log1p (natural log of 1 + x). After this line
# df['area'] is on a natural-log scale, while 'log-area' remains the separate base-10 transform.
df[out_columns] = np.log1p(df[out_columns])
# Skewness of the transformed columns.
df[out_columns].skew()

area    1.210025
FFMC   -1.794148
ISI    -0.431279
dtype: float64

In [12]:
# Modelling frame: an independent copy of df without the text label 'damage_category'.
df_ml = df.drop('damage_category', axis=1).copy()
df_ml.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,7,5,4.468204,26.2,94.3,1.808289,8.2,51,6.7,0,...,False,False,False,False,False,True,False,False,False,False
1,7,4,4.517431,35.4,669.1,2.04122,18.0,33,0.9,0,...,False,False,False,False,False,False,False,False,True,False
2,7,4,4.517431,43.7,686.9,2.04122,14.6,33,1.3,0,...,False,False,False,False,False,False,False,False,True,False
3,8,6,4.529368,33.3,77.5,2.302585,8.3,97,4.0,1,...,False,False,False,False,False,True,False,False,False,False
4,8,6,4.503137,51.3,102.2,2.360854,11.4,99,1.8,0,...,False,False,False,False,False,True,False,False,False,False


In [14]:
# List the columns present in the modelling frame.
df_ml.columns

Index(['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain',
       'area', 'log-area', 'day_mon', 'day_sat', 'day_sun', 'day_thu',
       'day_tue', 'day_wed', 'month_aug', 'month_dec', 'month_feb',
       'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep'],
      dtype='object')

In [None]:
# The model predicts 'log-area'; both area columns are removed from the
# features so the target cannot leak into the inputs.
target = df_ml['log-area']
inputs = df_ml.drop(columns=['area', 'log-area'])
inputs.shape

(506, 27)

In [None]:
# Split the data into a training set and a testing set: 25% of the rows are
# held out, and the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    inputs, target, random_state=42, test_size=0.25
)
(x_train.shape, x_test.shape)

((379, 27), (127, 27))

In [None]:
# Defining REC (regression error characteristic)
def rec(m, n, tol):
    """Return the percentage of element pairs that agree within ``tol``.

    Both inputs are treated as base-10 log values: every element is mapped
    back with ``10 ** value`` before the absolute difference of each pair is
    compared with ``tol``.

    Parameters
    ----------
    m, n : array-like of numbers
        Two equally sized sequences of values on the log10 scale. Any
        array-like (list, pandas Series, ndarray) is accepted and compared
        position by position.
    tol : float
        Largest absolute difference on the back-transformed scale that still
        counts as a hit (the bound is inclusive).

    Returns
    -------
    float
        ``100 * hits / number_of_elements``; ``0.0`` when the inputs are empty.

    Raises
    ------
    ValueError
        If ``m`` and ``n`` do not contain the same number of elements.
    """
    # np.asarray handles lists, Series and arrays uniformly; the float dtype
    # keeps ``10 ** x`` valid for negative integer inputs as well.
    m = np.asarray(m, dtype=float).ravel()
    n = np.asarray(n, dtype=float).ravel()

    if m.size != n.size:
        raise ValueError(
            "m and n must have the same number of elements, got %d and %d" % (m.size, n.size)
        )
    if m.size == 0:
        # Nothing to compare: report 0% instead of dividing by zero.
        return 0.0

    # Vectorised count of pairs whose back-transformed values differ by at most tol.
    hits = int(np.count_nonzero(np.abs(10.0 ** m - 10.0 ** n) <= tol))
    return 100 * (hits / m.size)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 5 * 3 * 3 * 3 = 135 candidates, each evaluated with 5-fold CV.
params = {
    'max_depth': [5, 10, 15, 20, 50], 'max_leaf_nodes': [2, 5, 10], 'min_samples_leaf': [2, 5, 10],
    'min_samples_split':[2, 5, 10]
}
# Seed the forest so the search result and the refitted model are repeatable
# across runs, in line with the fixed seed used for the train/test split.
grid_rf = GridSearchCV(RandomForestRegressor(random_state = 42), param_grid = params, refit = True, verbose = 0, cv = 5)
grid_rf.fit(x_train, y_train)

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'max_depth': [5, 10, ...], 'max_leaf_nodes': [2, 5, ...], 'min_samples_leaf': [2, 5, ...], 'min_samples_split': [2, 5, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,15
,min_samples_split,5
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,2
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows with the refitted search object. The target is
# the 'log-area' column, so this RMSE is in log10 units rather than area units.
a = grid_rf.predict(x_test)
test_mse = mean_squared_error(y_test, a)
print("RMSE for Random Forest:", np.sqrt(test_mse))

RMSE for Random Forest: 0.6147744343993713


In [None]:
import joblib
from pathlib import Path

# Persist the fitted GridSearchCV object (refit = True, so it predicts with the refitted best estimator).
# NOTE: only the model is stored. The feature preparation done earlier in this notebook
# (dummy encoding of day/month, log1p of FFMC/ISI, 0/1 rain flag) must be reproduced
# on new data before calling predict on the loaded object.
model_path = Path("../saved_models/forestfire_prediction_model.pkl")
# Create the target folder if it is missing so dump() does not fail on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(grid_rf, model_path)
print("✅ Model pipeline saved as 'forestfire_prediction_model.pkl'")

✅ Model pipeline saved as 'forestfire_prediction_model.pkl'
