# Forest fire area predictor

## Reading data

In [1]:
import pandas as pd
forest_fire=pd.read_csv("forestfires.csv")   #Reading data
forest_fire.info()              #Some information about data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    int64  
 3   day     517 non-null    int64  
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(5)
memory usage: 52.6 KB


In [2]:
forest_fire.head()  #Viewing sample of data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,3,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,3,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,3,1,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
forest_fire.describe()    #Description about data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,7.475822,3.423598,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,2.27599,1.715316,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,1.0,1.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,7.0,2.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,8.0,3.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,9.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,12.0,6.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


## Train-Test split

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
train_set,test_set=train_test_split(forest_fire,test_size=0.2,random_state=42)
train_set.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
329,4,3,9,6,92.2,102.3,751.5,8.4,23.5,27,4.0,0.0,3.33
173,4,4,9,2,90.9,126.5,686.5,7.0,17.7,39,2.2,0.0,3.07
272,2,5,8,3,92.1,152.6,658.2,14.3,20.2,47,4.0,0.0,3.09
497,3,4,8,3,96.1,181.1,671.2,14.3,32.3,27,2.2,0.0,14.68
182,5,4,2,1,86.8,15.6,48.3,3.9,12.4,53,2.2,0.0,6.38


In [15]:
test_set.shape

(104, 13)

## Feature and labels

In [19]:
train_features=train_set.drop("area",axis=1)
train_labels=train_set['area'].copy()
train_features.shape

(413, 12)

In [20]:
train_labels.shape

(413,)

## Creating a pipeline

In [8]:
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [12]:
my_pipeline=Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("scalar",StandardScaler())
]
)

In [21]:
pd_train_set=my_pipeline.fit_transform(train_features)

## Selecting a model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [76]:
model=LinearRegression()
#model=DecisionTreeRegressor()
#model=RandomForestRegressor()

In [77]:
model.fit(pd_train_set,train_labels)

LinearRegression()

In [78]:
some_data=train_features.iloc[:5]
prd_data=my_pipeline.fit_transform(some_data)
some_labels=train_labels.iloc[:5]
model.predict(prd_data)

array([15.87603452, 12.31341182,  4.47678242, 20.18237419,  2.81205081])

In [79]:
some_labels

329     3.33
173     3.07
272     3.09
497    14.68
182     6.38
Name: area, dtype: float64

In [80]:
from sklearn.metrics import mean_squared_error
pdt=model.predict(pd_train_set)
err=mean_squared_error(train_labels,pdt)
err

2004.0296690814466

## Cross validation evaluation 

In [83]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(model,pd_train_set,train_labels,scoring="neg_mean_squared_error")
rmse_score=np.sqrt(-score)

In [84]:
def print_score(a):
    print(a)
    print("mean : ",a.mean())
    print("Standard deviation : ",a.std())

In [85]:
print_score(rmse_score)

[39.8957861  18.03546782 33.21892645 84.53767632 24.48542125]
mean :  40.03465558810302
Standard deviation :  23.46373905979108


## Storing data and Dumping model

In [87]:
from joblib import dump, load
dump(model,"forest_fire.joblib")

['forest_fire.joblib']

In [88]:
%store test_set

Stored 'test_set' (DataFrame)


In [90]:
%store my_pipeline

Stored 'my_pipeline' (Pipeline)


In [91]:
prd_data[0]

array([ 0.39223227, -1.58113883,  0.68228824,  1.79284291,  0.19466156,
       -0.23579877,  0.72610598, -0.28641452,  0.34435902, -1.10884271,
        1.22474487,  0.        ])