# Forest fire area predictor

## Reading data

In [8]:
import pandas as pd
import numpy as np
forest_fire=pd.read_csv("forestfires.csv")   #Reading data
forest_fire.info()              #Some information about data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [9]:
col=forest_fire.columns
col#Viewing sample of data

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

In [10]:
forest_fire.describe()    #Description about data

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


In [11]:
forest_fire_mod=forest_fire.drop(["month","day"],axis=1)
forest_fire_mod.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


## One hot encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
ohe=OneHotEncoder(drop="first",sparse=False,dtype=np.int64)
forest_fire_new=ohe.fit_transform(forest_fire[['month','day']])

In [14]:
forest_fire_new.shape

(517, 17)

In [15]:
forest_fire=pd.DataFrame(np.hstack((forest_fire_mod,forest_fire_new)))

## Train-Test split

In [16]:

from sklearn.model_selection import train_test_split
train_set,test_set=train_test_split(forest_fire,test_size=0.2,random_state=42)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
329,4.0,3.0,92.2,102.3,751.5,8.4,23.5,27.0,4.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
173,4.0,4.0,90.9,126.5,686.5,7.0,17.7,39.0,2.2,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
272,2.0,5.0,92.1,152.6,658.2,14.3,20.2,47.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
497,3.0,4.0,96.1,181.1,671.2,14.3,32.3,27.0,2.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
182,5.0,4.0,86.8,15.6,48.3,3.9,12.4,53.0,2.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
test_set.shape

(104, 28)

## Feature and labels

In [20]:
train_features=train_set.drop(27,axis=1)
train_labels=train_set[27].copy()
train_features.shape

(413, 27)

In [21]:
train_labels.shape

(413,)

## Creating a pipeline

In [22]:
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [23]:
my_pipeline=Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("scalar",StandardScaler())
]
)

In [24]:
pd_train_set=my_pipeline.fit_transform(train_features)

## Selecting a model

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [63]:
model=LinearRegression()
#model=DecisionTreeRegressor()
#model=RandomForestRegressor()

In [64]:
model.fit(pd_train_set,train_labels)

LinearRegression()

In [65]:
some_data=train_features.iloc[:5]
prd_data=my_pipeline.fit_transform(some_data)
some_labels=train_labels.iloc[:5]
model.predict(prd_data)

array([0.02143157, 0.049782  , 0.18520062, 0.22886821, 0.01108564])

In [66]:
some_labels

329    0.0
173    0.0
272    0.0
497    0.0
182    0.0
Name: 27, dtype: float64

In [67]:
from sklearn.metrics import mean_squared_error
pdt=model.predict(pd_train_set)
err=mean_squared_error(train_labels,pdt)
err

0.05817562981247947

## Cross validation evaluation 

In [68]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(model,pd_train_set,train_labels,scoring="neg_mean_squared_error")
rmse_score=np.sqrt(-score)

In [69]:
def print_score(a):
    print(a)
    print("mean : ",a.mean())
    print("Standard deviation : ",a.std())

In [70]:
print_score(rmse_score)

[0.23967347 0.29828028 0.25506327 0.26844426 0.21272587]
mean :  0.25483743062235586
Standard deviation :  0.028551030652163238


## Storing data and Dumping model

In [71]:
from joblib import dump, load
dump(model,"forest_fire.joblib")

['forest_fire.joblib']

In [72]:
%store test_set

Stored 'test_set' (DataFrame)


In [73]:
%store my_pipeline

Stored 'my_pipeline' (Pipeline)


In [74]:
prd_data[0]

array([ 0.39223227, -1.58113883,  0.19466156, -0.23579877,  0.72610598,
       -0.28641452,  0.34435902, -1.10884271,  1.22474487,  0.        ,
       -0.62284825, -0.81649658,  0.        , -0.5       ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.22474487, -0.5       ,  2.        , -0.5       ,
        0.        , -0.81649658])