# Forest Fire Area Prediction

# Import Library

Imported some commonly used libraries for data manipulation (NumPy, Pandas), data visualization (Matplotlib, Seaborn), and potentially statistical analysis (Seaborn). These libraries are widely used in machine learning tasks. If you have a specific task or analysis in mind, you can proceed with your code using these libraries.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Loaded a dataset

In [4]:
df= pd.read_csv("forestfires.csv")
df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [5]:
df.shape

(517, 13)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


# Data analysis

The code df.isnull().sum().sum() calculates the total number of missing values in your DataFrame (df). It sums up the count of missing values for each column and then provides the overall count.


In [7]:
df.isnull().sum().sum()

0

# Data Preprocessing

In [8]:
df.month.value_counts()

month
aug    184
sep    172
mar     54
jul     32
feb     20
jun     17
oct     15
apr      9
dec      9
jan      2
may      2
nov      1
Name: count, dtype: int64

In [9]:
df.day.value_counts()

day
sun    95
fri    85
sat    84
mon    74
tue    64
thu    61
wed    54
Name: count, dtype: int64

# Label & Orginal Encoding

The LabelEncoder is commonly used to convert categorical labels into numerical format, which is necessary for many machine learning algorithms that require numerical input.

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le=LabelEncoder()

In [12]:
l_enc= df[["month","day"]]

In [13]:
le.fit_transform(l_enc["month"])

array([ 7, 10, 10,  7,  7,  1,  1,  1, 11, 11, 11, 11,  1, 11, 11, 11,  7,
       10,  7,  0, 11, 11,  6,  1,  1,  1, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 10, 10, 10,  7,  5,  1,  1, 11, 11, 11, 11,  5,  7,  7, 11,
        1,  1,  1,  1, 11, 11, 10,  3,  3,  7,  7,  1,  1,  1,  1, 11, 11,
       11,  7,  7, 11,  7,  1, 11,  3,  3,  7,  1,  1,  1,  1,  1,  1,  1,
       11, 11, 11, 11,  7,  1,  7,  1,  1,  1, 11,  3,  7,  1,  1,  1,  1,
        1, 11,  4,  7,  7,  1, 11, 11,  7,  7, 11, 11,  7,  7,  7,  7,  7,
        1,  1,  1, 11, 11, 11, 10,  7, 11, 10, 10,  3,  7,  7, 11,  7,  1,
       11, 11,  5, 11, 11,  1,  1,  5,  1,  1,  7, 11,  1, 11,  6,  5,  5,
       11, 11,  1, 11,  1,  1, 11,  7,  1,  7, 11, 11,  7,  1,  1,  7,  1,
       11,  1,  1, 11,  1,  1,  0,  1, 11,  1, 11, 10,  3, 10,  1, 11,  7,
       11,  7,  7,  7,  1,  1, 11,  1,  1,  0, 11, 11, 11, 11,  7,  3, 10,
        7, 11,  1, 11, 11, 11, 10,  1, 11,  7,  7,  7, 11, 11, 11,  7,  1,
       11,  7,  5, 11, 11

In [14]:
l_enc["month_L_enc"] = le.fit_transform(l_enc["month"])
l_enc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l_enc["month_L_enc"] = le.fit_transform(l_enc["month"])


Unnamed: 0,month,day,month_L_enc
0,mar,fri,7
1,oct,tue,10
2,oct,sat,10
3,mar,fri,7
4,mar,sun,7
...,...,...,...
512,aug,sun,1
513,aug,sun,1
514,aug,sun,1
515,aug,sat,1


In [15]:
order_Label = {"sat":7,"fri":6,"thu":5,"wed":4,"tue":3,"mon":2,"sun":1}

In [16]:
# Now we map the the dictionary "order_Label" with the class "day" 
l_enc["day_ord_enc"] = l_enc["day"].map(order_Label)
l_enc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l_enc["day_ord_enc"] = l_enc["day"].map(order_Label)


Unnamed: 0,month,day,month_L_enc,day_ord_enc
0,mar,fri,7,6
1,oct,tue,10,3
2,oct,sat,10,7
3,mar,fri,7,6
4,mar,sun,7,1
...,...,...,...,...
512,aug,sun,1,1
513,aug,sun,1,1
514,aug,sun,1,1
515,aug,sat,1,7


# other method convert variable

In [17]:
month_map = {'jan':1, 'feb':2, 'mar':3, 
             'apr':4, 'may':5, 'jun':6, 
             'jul':7, 'aug':8, 'sep':9, 
             'oct':10, 'nov':11, 'dec':12}

day_map = {'mon':1, 'tue':2, 'wed':3,
          'thu':4, 'fri':5, 'sat':6, 'sun':7}

df.month = df.month.map(month_map)
df.day = df.day.map(day_map)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    int64  
 3   day     517 non-null    int64  
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(5)
memory usage: 52.6 KB


# Standardization & Splitting

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

importing StandardScaler for feature scaling and train_test_split for splitting your data into training and testing sets. These are common preprocessing steps in machine learning.

In [20]:
# Splitting Data
# X represents the input features (all columns except 'Grade')
X=df.drop(['area'],axis=1)
# y represents the target variable ('Grade')
y=df['area']
print('shape of X', X.shape)
print('shape of y', y.shape)

shape of X (517, 12)
shape of y (517,)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)
print('shape of the X_train= ', X_train.shape)
print('shape of the y_train= ', y_train.shape)
print('shape of the X_test= ', X_test.shape)
print('shape of the y_test= ', y_test.shape)


shape of the X_train=  (413, 12)
shape of the y_train=  (413,)
shape of the X_test=  (104, 12)
shape of the y_test=  (104,)


# Feature Scaling

Feature scaling is important for many machine learning algorithms to ensure that all features contribute equally to the model's performance. train_test_split is used to divide your dataset into training and testing sets, allowing you to assess the model's performance on unseen data.

In [22]:
sc = StandardScaler()

In [23]:
sc.fit(X_train)

In [24]:
sc.mean_

array([4.69249395e+00, 4.29782082e+00, 7.49636804e+00, 4.22760291e+00,
       9.07409201e+01, 1.08040920e+02, 5.44986925e+02, 9.05762712e+00,
       1.87878935e+01, 4.45569007e+01, 4.05060533e+00, 2.42130751e-02])

In [25]:
sc.scale_

array([  2.28053492,   1.20969454,   2.2737818 ,   2.09149397,
         4.5192562 ,  62.067655  , 247.41300684,   4.70833285,
         5.9366387 ,  16.25239136,   1.81519838,   0.32787329])

In [26]:
X_train.describe()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,4.692494,4.297821,7.496368,4.227603,90.74092,108.04092,544.986925,9.057627,18.787893,44.556901,4.050605,0.024213
std,2.283301,1.211162,2.27654,2.094031,4.524737,62.142934,247.713083,4.714043,5.943839,16.272103,1.8174,0.328271
min,1.0,2.0,1.0,1.0,50.4,2.4,9.3,0.4,2.2,15.0,0.4,0.0
25%,3.0,4.0,7.0,2.0,90.2,56.7,430.8,6.3,15.4,33.0,2.7,0.0
50%,4.0,4.0,8.0,5.0,91.6,108.0,658.2,8.4,19.2,42.0,4.0,0.0
75%,7.0,5.0,9.0,6.0,92.9,141.2,706.8,11.0,22.9,53.0,4.9,0.0
max,9.0,9.0,12.0,7.0,96.2,291.3,860.6,56.1,33.3,99.0,9.4,6.4


In [27]:
X_train_sc=sc.transform(X_train)
X_test_sc=sc.transform(X_test)

In [28]:
X_train_sc = pd.DataFrame(X_train_sc, columns= ['X','Y','month','day','FFMC','DMC','DC','ISI','temp','RH','wind','rain'])
X_test_sc = pd.DataFrame(X_test_sc, columns= ['X','Y','month','day','FFMC','DMC','D','ISI','temp','RH','wind','rain'])

In [29]:

X_train

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
390,7,4,2,1,84.7,9.5,58.3,4.1,7.5,71,6.3,0.0
296,6,4,6,7,90.4,89.5,290.8,6.4,14.3,46,1.8,0.0
249,3,4,8,3,93.1,157.3,666.7,13.5,21.7,40,0.4,0.0
126,3,5,3,1,87.6,52.2,103.8,5.0,9.0,49,2.2,0.0
315,3,4,9,3,91.2,134.7,817.5,7.2,18.5,30,2.7,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
28,6,3,9,6,93.4,145.4,721.4,8.1,30.2,24,2.7,0.0
361,4,3,9,5,92.5,122.0,789.7,10.2,17.3,45,4.0,0.0
436,8,6,8,1,92.1,207.0,672.6,8.2,26.8,35,1.3,0.0
485,2,4,8,1,95.0,135.5,596.3,21.3,30.6,28,3.6,0.0


In [30]:
X_train_sc.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
0,1.011827,-0.246195,-2.41728,-1.543204,-1.336707,-1.587637,-1.967103,-1.052947,-1.901395,1.627028,1.2392,-0.073849
1,0.573333,-0.246195,-0.658097,1.325558,-0.075437,-0.298721,-1.027379,-0.564452,-0.755965,0.088793,-1.239867,-0.073849
2,-0.742148,-0.246195,0.221495,-0.58695,0.522006,0.793635,0.491943,0.943513,0.490531,-0.280383,-2.011133,-0.073849
3,-0.742148,0.58046,-1.977484,-1.543204,-0.695008,-0.899678,-1.7832,-0.861797,-1.648726,0.273381,-1.019506,-0.073849
4,-0.742148,-0.246195,0.661291,-0.58695,0.101583,0.429516,1.10145,-0.39454,-0.048494,-0.895677,-0.744054,-0.073849


In [31]:
X_train_sc.describe()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,1.193557e-16,-2.236575e-16,-1.0752770000000001e-17,-4.301106e-17,-1.520441e-15,-4.301106e-17,-2.365608e-16,-2.505394e-16,2.580664e-16,1.161299e-16,-8.602212e-18,4.301106e-18
std,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213,1.001213
min,-1.619135,-1.899505,-2.857076,-1.543204,-8.926451,-1.702029,-2.165153,-1.838788,-2.794156,-1.818619,-2.011133,-0.07384888
25%,-0.7421478,-0.2461951,-0.2183006,-1.065077,-0.1196923,-0.8271767,-0.4615235,-0.5856908,-0.5706754,-0.7110892,-0.7440538,-0.07384888
50%,-0.3036542,-0.2461951,0.2214953,0.369304,0.1900932,-0.0006592821,0.4575874,-0.139673,0.06941749,-0.1573246,-0.02787868,-0.07384888
75%,1.011827,0.5804599,0.6612912,0.8474311,0.4777512,0.5342409,0.6540201,0.4125394,0.6926658,0.5194989,0.4679349,-0.07384888
max,1.888814,3.88708,1.980679,1.325558,1.20796,2.95257,1.275653,9.991301,2.444499,3.349852,2.947003,19.44589


In [32]:
X_train_sc.describe().round(2)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.62,-1.9,-2.86,-1.54,-8.93,-1.7,-2.17,-1.84,-2.79,-1.82,-2.01,-0.07
25%,-0.74,-0.25,-0.22,-1.07,-0.12,-0.83,-0.46,-0.59,-0.57,-0.71,-0.74,-0.07
50%,-0.3,-0.25,0.22,0.37,0.19,-0.0,0.46,-0.14,0.07,-0.16,-0.03,-0.07
75%,1.01,0.58,0.66,0.85,0.48,0.53,0.65,0.41,0.69,0.52,0.47,-0.07
max,1.89,3.89,1.98,1.33,1.21,2.95,1.28,9.99,2.44,3.35,2.95,19.45


# Implement Model

# Linear Regression

Replace X_train, y_train, X_test, and y_test with your actual training and testing sets. The LinearRegression model is now trained on your training data, and you can evaluate its performance using metrics like mean squared error.

In [33]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train, y_train)

In [34]:
regressor.score(X_test,y_test)

-0.007611241260526036

# K- NearestNeighbors Regression

The k-nearest neighbors algorithm predicts the target variable by considering the k-nearest data points in the training set. It's essential to choose an appropriate value for n_neighbors based on the characteristics of your data.
 

In [35]:
from sklearn.neighbors import KNeighborsRegressor


In [36]:
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train,y_train)

In [37]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [38]:
sc.fit(X_train)

In [39]:
regressor.score(X_test,y_test)


-1.6190085650681802

# Implement Random Forest Regression

Random Forest is an ensemble method that combines multiple decision trees to improve predictive performance and control overfitting. It can be a powerful tool for regression tasks

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [41]:
rd = RandomForestRegressor(n_estimators=40, random_state=51)


In [42]:
rd.fit(X_train, y_train)


In [43]:
y_pred = rd.predict(X_test)

In [44]:
rd.score(X_test,y_test)

-3.1148850321469856

In [45]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the Root Mean Squared Error (RMSE)
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 65.37588277865657


In [46]:
from sklearn.model_selection import cross_val_score

In [47]:
score_rf=cross_val_score(RandomForestRegressor(n_estimators=5),X_train,y_train,cv=5)
print(score_rf)
print("Avg:",np.average(score_rf))

[-0.08908886 -9.3281654   0.01846978 -0.12051604 -3.12540431]
Avg: -2.5289409657218167


In [48]:
scores1 = cross_val_score(RandomForestRegressor(n_estimators=5),X_train,y_train,cv=5)
print("Avg Score for Estimators=5 and CV=5 :",np.average(scores1))

Avg Score for Estimators=5 and CV=5 : -1.2020697726560716


# Implement Decision Tree Regression

In [49]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming X_train, X_test, y_train, y_test are already defined

# Initialize the Decision Tree Regressor
decision_tree_model = DecisionTreeRegressor(random_state=51)

# Train the model
decision_tree_model.fit(X_train, y_train)

decision_tree_model.score(X_test,y_test)

-17.19268599205353

In [50]:
# Make predictions on the test set
y_pred = decision_tree_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the Root Mean Squared Error (RMSE)
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 137.4634138301058
