In [170]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as ac
from sklearn.model_selection import GridSearchCV

In [171]:
df= pd.read_csv("forestfires.csv")

In [172]:
df.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,1,0,0,0,0,small


In [173]:
df.columns

Index(['month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind',
       'rain', 'area', 'dayfri', 'daymon', 'daysat', 'daysun', 'daythu',
       'daytue', 'daywed', 'monthapr', 'monthaug', 'monthdec', 'monthfeb',
       'monthjan', 'monthjul', 'monthjun', 'monthmar', 'monthmay', 'monthnov',
       'monthoct', 'monthsep', 'size_category'],
      dtype='object')

In [174]:
df['size_category'].value_counts()

small    378
large    139
Name: size_category, dtype: int64

In [175]:
df=df.iloc[:,2:]

In [176]:
lblen = LabelEncoder()
df['size_category']= lblen.fit_transform(df['size_category'])
df.head()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,1,...,0,0,0,0,1,0,0,0,0,1
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,1
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,1
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,1,...,0,0,0,0,1,0,0,0,0,1
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1


In [177]:
def get_standardized_val(data):
    df_norm = (data-data.min())/(data.max()-data.min())
    return df_norm

In [178]:
df_norm=get_standardized_val(df.iloc[:,:29])
df_norm.head()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,0.870968,0.086492,0.101325,0.090909,0.192926,0.423529,0.7,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.927742,0.118194,0.775419,0.11943,0.508039,0.211765,0.055556,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.927742,0.146795,0.796294,0.11943,0.398714,0.211765,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.941935,0.110958,0.081623,0.160428,0.196141,0.964706,0.4,0.03125,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.910968,0.172984,0.11059,0.171123,0.29582,0.988235,0.155556,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [179]:
df_norm.shape

(517, 29)

In [180]:
X=df_norm.iloc[:,:28]
Y=df_norm.iloc[:,28]

In [181]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3)

In [182]:
model = SVC(kernel='rbf',gamma=0.0005)
model.fit(x_train,y_train)

SVC(gamma=0.0005)

In [183]:
ac(y_test,model.predict(x_test))

0.75

In [184]:
model = SVC(kernel='rbf',gamma=500)
model.fit(x_train,y_train)

SVC(gamma=500)

In [185]:
ac(y_test,model.predict(x_test))

0.7371794871794872

From this it is clear that the data is linearly separable because the more we project data to higher dimension, the lower the accuracy.

In [197]:
model = SVC()
param = {'kernel':['linear'],'C':[10,50,100,200,500,750,1000]}
gds= GridSearchCV(model,param_grid=param,cv=5)
gds.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [10, 50, 100, 200, 500, 750, 1000],
                         'kernel': ['linear']})

In [198]:
gds.best_params_,gds.best_score_

({'C': 1000, 'kernel': 'linear'}, 0.9141552511415524)

In [202]:
model = SVC(kernel='linear',C=200)
model.fit(x_train,y_train)
ac(y_test,model.predict(x_test))

0.9038461538461539

# Even though we get high accuracy with C=1000, but the margin will be comparatively very low. Hence we go for SVM with C=200 which has an  accuracy = 90.4%