# **Support Vector Machine (SVM) - Forest Fires**

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Load the forest-fires data from a CSV file (path is relative to the working directory)
# and preview the first rows.
df_forest = pd.read_csv('forestfires.csv')
df_forest.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,1,0,0,0,0,small


## **Exploratory Data Analysis (EDA)**

In [None]:
# Summarise column names, non-null counts and dtypes of the raw frame.
df_forest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   month          517 non-null    object 
 1   day            517 non-null    object 
 2   FFMC           517 non-null    float64
 3   DMC            517 non-null    float64
 4   DC             517 non-null    float64
 5   ISI            517 non-null    float64
 6   temp           517 non-null    float64
 7   RH             517 non-null    int64  
 8   wind           517 non-null    float64
 9   rain           517 non-null    float64
 10  area           517 non-null    float64
 11  dayfri         517 non-null    int64  
 12  daymon         517 non-null    int64  
 13  daysat         517 non-null    int64  
 14  daysun         517 non-null    int64  
 15  daythu         517 non-null    int64  
 16  daytue         517 non-null    int64  
 17  daywed         517 non-null    int64  
 18  monthapr  

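Observation: The columns from index 11 (`dayfri`) through `monthsep` appear to be dummy (one-hot) columns that were already created from `day` and `month` for model building. Since it is unclear whether these columns are correct or were corrupted, they are dropped here; the dummy columns are recreated later from the original `month` and `day` columns.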

In [None]:
# Keep only the first 11 columns (month ... area) and re-attach the target label.
# .copy() makes df_forest1 an independent frame, so adding a column below does not
# write into a slice of df_forest (this avoids pandas' SettingWithCopyWarning).
df_forest1 = df_forest.iloc[:, 0:11].copy()
df_forest1['size_category'] = df_forest['size_category']
df_forest1.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,small


In [None]:
# Convert the text-valued (object) columns to pandas' category dtype.
# A single astype() call with a column -> dtype mapping returns a new frame, so the
# cell can be re-run safely and leaves no loop variable behind (the original loop
# variable `x` shared its name with the feature matrix built later).

columns = ['month', 'day', 'size_category']

df_forest1 = df_forest1.astype({name: 'category' for name in columns})

df_forest1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   month          517 non-null    category
 1   day            517 non-null    category
 2   FFMC           517 non-null    float64 
 3   DMC            517 non-null    float64 
 4   DC             517 non-null    float64 
 5   ISI            517 non-null    float64 
 6   temp           517 non-null    float64 
 7   RH             517 non-null    int64   
 8   wind           517 non-null    float64 
 9   rain           517 non-null    float64 
 10  area           517 non-null    float64 
 11  size_category  517 non-null    category
dtypes: category(3), float64(8), int64(1)
memory usage: 38.8 KB


In [None]:
# Separate the input features from the output label before model building.
# Inputs: every column except the label, with categorical columns one-hot encoded.
# NOTE(review): `area` stays among the inputs; if size_category was derived from
# area this leaks the target into the features - confirm.
x = pd.get_dummies(df_forest1.drop(columns='size_category'))
# Output: integer category codes of the label.
# NOTE(review): which label maps to which code is not shown here - check
# df_forest1['size_category'].cat.categories.
y = df_forest1['size_category'].cat.codes
print(x.head())
print(y.head())

   FFMC   DMC     DC  ISI  temp  RH  wind  rain  area  month_apr  ...  \
0  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0   0.0          0  ...   
1  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0   0.0          0  ...   
2  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0   0.0          0  ...   
3  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2   0.0          0  ...   
4  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0   0.0          0  ...   

   month_nov  month_oct  month_sep  day_fri  day_mon  day_sat  day_sun  \
0          0          0          0        1        0        0        0   
1          0          1          0        0        0        0        0   
2          0          1          0        0        0        1        0   
3          0          0          0        1        0        0        0   
4          0          0          0        0        0        0        1   

   day_thu  day_tue  day_wed  
0        0        0        0  
1        0        1        0  
2        0        0    

**Splitting input data into Train and Test Data**

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Sanity-check the split: print the shape of each partition, one per line,
# in the order x_train, x_test, y_train, y_test.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(387, 28)
(130, 28)
(387,)
(130,)


## **Hyperparameter Tuning**

In [None]:
# Base estimator whose hyperparameters are tuned by the grid search.
svc = SVC()

# Search space: every combination of C, gamma and kernel is evaluated.
# NOTE(review): gamma only affects the 'poly', 'rbf' and 'sigmoid' kernels, so the
# 'linear' candidates are refit once per gamma value without any change in score.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    gamma=[0.001, 0.01, 0.1, 1],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# Grid search with 10-fold cross-validation over param_grid.
svc_grid = GridSearchCV(estimator=svc, param_grid=param_grid, cv=10)

In [None]:
# Run the cross-validated grid search using the training split only.
svc_grid.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'gamma': [0.001, 0.01, 0.1, 1],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [None]:
# Mean cross-validated score of the best parameter combination found.
print(svc_grid.best_score_)

0.9844804318488528


In [None]:
# Parameter combination that achieved the best cross-validated score.
print(svc_grid.best_params_)

{'C': 0.1, 'gamma': 0.001, 'kernel': 'linear'}


## **SVM Model Creation**

In [None]:
# Build the final model with the C and kernel values printed by the grid search.
# gamma is not passed: it has no effect on the linear kernel.
# NOTE(review): these values are hard-coded, so they must be updated by hand if the
# grid search result changes (svc_grid.best_params_ holds the same information).
svc_final = SVC(C=0.1, kernel='linear', random_state=7)

In [None]:
# Fit the final model on the training split.
svc_final.fit(x_train, y_train)

SVC(C=0.1, kernel='linear', random_state=7)

In [None]:
# Predict the class codes for the held-out test rows.
preds = svc_final.predict(x_test)

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, preds)

array([[35,  1],
       [ 1, 93]])

In [None]:
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        36
           1       0.99      0.99      0.99        94

    accuracy                           0.98       130
   macro avg       0.98      0.98      0.98       130
weighted avg       0.98      0.98      0.98       130



In [None]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_test, preds)

0.9846153846153847

# Conclusion: The final SVM (C=0.1, linear kernel) reaches about 98% accuracy on the held-out test set.