In [47]:
# Importing the libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the data set

df = pd.read_csv('forestfires.csv')
df.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,1,0,0,0,0,small


In [3]:
# Getting the information about the data set

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   month          517 non-null    object 
 1   day            517 non-null    object 
 2   FFMC           517 non-null    float64
 3   DMC            517 non-null    float64
 4   DC             517 non-null    float64
 5   ISI            517 non-null    float64
 6   temp           517 non-null    float64
 7   RH             517 non-null    int64  
 8   wind           517 non-null    float64
 9   rain           517 non-null    float64
 10  area           517 non-null    float64
 11  dayfri         517 non-null    int64  
 12  daymon         517 non-null    int64  
 13  daysat         517 non-null    int64  
 14  daysun         517 non-null    int64  
 15  daythu         517 non-null    int64  
 16  daytue         517 non-null    int64  
 17  daywed         517 non-null    int64  
 18  monthapr  

'month' and 'day' columns are not significantly related to the dependant variable. We can drop those columns.

In [4]:
df.drop(['month', 'day'], axis = 1, inplace = True)
df

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00,1,...,0,0,0,0,1,0,0,0,0,small
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,small
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,small
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00,1,...,0,0,0,0,1,0,0,0,0,small
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00,0,...,0,0,0,0,1,0,0,0,0,small
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44,0,...,0,0,0,0,0,0,0,0,0,large
513,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29,0,...,0,0,0,0,0,0,0,0,0,large
514,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16,0,...,0,0,0,0,0,0,0,0,0,large
515,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00,0,...,0,0,0,0,0,0,0,0,0,small


In [5]:
# Getting the descriptive statistics

df.describe()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,...,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292,0.16441,...,0.017408,0.038685,0.003868,0.061896,0.032882,0.104449,0.003868,0.001934,0.029014,0.332689
std,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818,0.371006,...,0.130913,0.193029,0.062137,0.241199,0.1785,0.306138,0.062137,0.04398,0.168007,0.471632
min,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Separating dependant and dependant variables

X = df.drop('size_category', axis = 1)
y = df.size_category

In [7]:
y

0      small
1      small
2      small
3      small
4      small
       ...  
512    large
513    large
514    large
515    small
516    small
Name: size_category, Length: 517, dtype: object

In [8]:
X

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00,1,...,0,0,0,0,0,1,0,0,0,0
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00,0,...,0,0,0,0,0,0,0,0,1,0
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00,0,...,0,0,0,0,0,0,0,0,1,0
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00,1,...,0,0,0,0,0,1,0,0,0,0
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44,0,...,0,0,0,0,0,0,0,0,0,0
513,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29,0,...,0,0,0,0,0,0,0,0,0,0
514,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16,0,...,0,0,0,0,0,0,0,0,0,0
515,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
y

0      small
1      small
2      small
3      small
4      small
       ...  
512    large
513    large
514    large
515    small
516    small
Name: size_category, Length: 517, dtype: object

In [11]:
y.value_counts().to_dict()

{'small': 378, 'large': 139}

In [12]:
y.replace({'small': 0, 'large': 1}, inplace = True)

In [13]:
y

0      0
1      0
2      0
3      0
4      0
      ..
512    1
513    1
514    1
515    0
516    0
Name: size_category, Length: 517, dtype: int64

In [14]:
y.value_counts(normalize = True)

0    0.731141
1    0.268859
Name: size_category, dtype: float64

##### The given data set imbalanced one. Using the balancing techniques.

In [15]:
from imblearn.over_sampling import SMOTE

In [17]:
smt = SMOTE()
X_resample, y_resample = smt.fit_resample(X,y)

In [19]:
y.value_counts()

0    378
1    139
Name: size_category, dtype: int64

In [20]:
y_resample.value_counts()

0    378
1    378
Name: size_category, dtype: int64

##### Data set has been balanced now.

In [22]:
# Splitting the data set into training and testing data sets

X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size=0.3, random_state=42)

In [29]:
# Applying the feature scaling 

scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [41]:
# Building the model

svc = SVC()
svc.fit(X_train_scaled,y_train)

SVC()

In [42]:
y_pred_test = svc.predict(X_test_scaled)

In [43]:
accuracy_score(y_pred_test,y_test)

0.7665198237885462

In [44]:
confusion_matrix(y_pred_test,y_test)

array([[114,  46],
       [  7,  60]], dtype=int64)

In [45]:
print(classification_report(y_pred_test,y_test))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81       160
           1       0.57      0.90      0.69        67

    accuracy                           0.77       227
   macro avg       0.75      0.80      0.75       227
weighted avg       0.83      0.77      0.78       227



In [46]:
svc.score(X_train_scaled,y_train)

0.8601134215500945

### Hyperparameter Tuning

In [48]:
hyp = {'C': np.arange(1,50),
        'kernel':['linear', 'poly', 'rbf', 'sigmoid']} 
grid = GridSearchCV(svc, hyp, cv = 5)

In [49]:
grid.fit(X_train_scaled,y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [50]:
grid.best_params_

{'C': 15, 'kernel': 'linear'}

In [51]:
svc = SVC(C = 15, kernel = 'linear')
svc.fit(X_train_scaled,y_train)

SVC(C=15, kernel='linear')

In [52]:
y_pred_test = svc.predict(X_test_scaled)

In [53]:
accuracy_score(y_pred_test,y_test)

0.9691629955947136

In [54]:
confusion_matrix(y_pred_test,y_test)

array([[120,   6],
       [  1, 100]], dtype=int64)

In [55]:
print(classification_report(y_pred_test,y_test))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       126
           1       0.94      0.99      0.97       101

    accuracy                           0.97       227
   macro avg       0.97      0.97      0.97       227
weighted avg       0.97      0.97      0.97       227



In [56]:
svc.score(X_train_scaled,y_train)

0.994328922495274