In [1]:
#Load Libraries
import pandas as pd
import numpy as np
import sqlite3 as sqlite3
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [2]:
%%time
# Read the training and testing CSVs (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/SMOTE_training_data.csv', 'data/testing_data_v2.csv')
)

Wall time: 4.75 s


In [3]:
# Categorical feature columns that are one-hot encoded.
DUMMY_COLUMNS = ['NWCG_REPORTING_AGENCY', 'SOURCE_SYSTEM_TYPE']

# Training split: label is STAT_CAUSE_DESCR; OWNER_CODE is dropped from the features.
yTrain = train_df['STAT_CAUSE_DESCR']
xTrain = train_df.drop(columns=['STAT_CAUSE_DESCR', 'OWNER_CODE'])
xTrain = pd.get_dummies(xTrain, columns=DUMMY_COLUMNS)

# Testing split: identical treatment.
yTest = test_df['STAT_CAUSE_DESCR']
xTest = test_df.drop(columns=['STAT_CAUSE_DESCR', 'OWNER_CODE'])
xTest = pd.get_dummies(xTest, columns=DUMMY_COLUMNS)

# Only one-hot columns may legitimately be absent from the test frame (a category
# level that never occurs there). A missing raw feature is a data problem, so fail
# loudly instead of silently filling it with zeros.
dummy_prefixes = tuple(col + '_' for col in DUMMY_COLUMNS)
missing_non_dummy = [
    col for col in xTrain.columns
    if col not in xTest.columns and not col.startswith(dummy_prefixes)
]
assert not missing_non_dummy, (
    "test data is missing non-dummy feature columns: %s" % missing_non_dummy
)

# Align the test columns to the training columns (same set, same order).
# Dummy levels seen only in training are added as all-zero columns; levels seen
# only in testing are dropped. This replaces a hard-coded
# `xTest['NWCG_REPORTING_AGENCY_BOR'] = 0`, which would overwrite real values if
# that level did occur in the test data and would not cover any other missing level.
xTest = xTest.reindex(columns=xTrain.columns, fill_value=0)

xTrain.shape

(4241600, 19)

In [4]:
# Number of training rows per STAT_CAUSE_DESCR class (displayed as the cell output).
counts = yTrain.value_counts()
counts

Debris Burning    385600
Arson             385600
Powerline         385600
Children          385600
Smoking           385600
Lightning         385600
Fireworks         385600
Equipment Use     385600
Structure         385600
Railroad          385600
Campfire          385600
Name: STAT_CAUSE_DESCR, dtype: int64

In [5]:
#Global variable necessary for function: per-class row counts of the training labels
counts = yTrain.value_counts()

#Change count accordingly for your model
def count_over(colname, count=5000):
    """Return how many rows of class `colname` to keep after undersampling.

    Parameters
    ----------
    colname : label of a class present in the module-level `counts` Series.
    count : upper bound on the number of rows to keep for the class.

    Returns
    -------
    `count` when the class has more than `count` rows, otherwise the class's
    actual row count (undersampling cannot create rows).

    Raises
    ------
    KeyError if `colname` is not a label in `counts`.
    """
    return min(counts[colname], count)

#Perform undersampling
# Build the per-class targets from the labels actually present in the data rather
# than from a hard-coded list of class names, so the cell keeps working if the
# set of classes changes.
UnderSampleRatio = {label: count_over(label) for label in counts.index}

newSampStrat = RandomUnderSampler(sampling_strategy=UnderSampleRatio, random_state=441) #Goal balance all classes
# NOTE: this overwrites xTrain/yTrain with the undersampled data, so the cell is
# not idempotent -- re-run the data preparation cell before running it again.
xTrain, yTrain = newSampStrat.fit_resample(xTrain, yTrain)

In [6]:
# Sanity check: per-class row counts of yTrain after the undersampling step.
yTrain.value_counts()

Children          5000
Debris Burning    5000
Smoking           5000
Fireworks         5000
Lightning         5000
Equipment Use     5000
Arson             5000
Powerline         5000
Railroad          5000
Campfire          5000
Structure         5000
Name: STAT_CAUSE_DESCR, dtype: int64

In [7]:
# Sanity check: (rows, feature columns) of xTrain after the undersampling step.
xTrain.shape

(55000, 19)

In [8]:
# RBF-kernel SVM with covariate scaling.
# The scaler is a pipeline step, so during cross-validation it is fitted on each
# training fold only.
model_rbf = Pipeline(
    steps=[("scaler", StandardScaler()),
           ("model", svm.SVC(kernel='rbf', cache_size=1600))]
)

# tuning parameter grid
# model__xyz specifies that parameter xyz is a parameter to model
param_grid = {
    "model__C": [100, 1000, 10000],
    "model__gamma": ["scale", "auto"],
}

# crossvalidation folds
cv = KFold(
    n_splits=5,        # number of folds
    shuffle=True,      # protects against data being ordered, e.g., all successes first
    random_state=441,  # fixed seed (same value as the undersampler) so folds are reproducible
)

# Grid search over the 3 x 2 parameter combinations with 5-fold CV.
# NOTE(review): despite the "onevall" in the name, no OneVsRestClassifier wrapper
# is used here; the estimator is a plain SVC inside the pipeline -- confirm the
# intended multiclass strategy.
cv_rbf_onevall = GridSearchCV(
    estimator=model_rbf,
    param_grid=param_grid,
    cv=cv
)

In [9]:
%%time
# Fit the grid search on the undersampled training data: 6 parameter combinations
# x 5 folds = 30 cross-validation fits. Expensive -- see the recorded wall time.
cv_rbf_onevall.fit(X=xTrain, y=yTrain)

Wall time: 9h 52min 25s


In [10]:
# Raw cross-validation results: a dict of per-candidate arrays (fit/score times,
# parameter values, ...), one entry per parameter combination in the grid.
cv_rbf_onevall.cv_results_

{'mean_fit_time': array([ 166.10807204,  161.87194209,  426.38098612,  407.77066183,
        2509.55397182, 2374.3309773 ]),
 'std_fit_time': array([ 2.77177536,  0.98333484,  7.4193394 ,  6.51226623, 67.5812307 ,
        45.8833533 ]),
 'mean_score_time': array([50.74177155, 49.59234047, 49.21438355, 48.8020534 , 51.20683265,
        51.82566867]),
 'std_score_time': array([1.27390225, 0.34989879, 1.06655713, 0.25508457, 2.26637805,
        1.06882547]),
 'param_model__C': masked_array(data=[100, 100, 1000, 1000, 10000, 10000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_model__gamma': masked_array(data=['scale', 'auto', 'scale', 'auto', 'scale', 'auto'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model__C': 100, 'model__gamma': 'scale'},
  {'model__C': 100, 'model__gamma': 'auto'},
  {'model__C': 1000, 'model__gamma': 'sc

In [11]:
# Keep the estimator selected by the grid search (the scaler + SVC pipeline) for
# prediction on the test data.
final_model = cv_rbf_onevall.best_estimator_

In [12]:
# Predict a class label for every row of the column-aligned test features.
yPred = final_model.predict(xTest) 

In [13]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=yTest, y_pred=yPred)
accuracy_percent = accuracy * 100.0
print("Test Accuracy: %.2f%%" % accuracy_percent)

Test Accuracy: 36.68%


In [14]:
# NOTE(review): GridSearchCV.best_score_ is the mean cross-validated score of the
# best parameter combination (scored on held-out folds), not accuracy measured on
# the training rows -- the wording of this message is misleading.
print("The best training accuracy score is ", cv_rbf_onevall.best_score_ * 100, "%")

The best training accuracy score is  42.870909090909095 %


In [15]:
%%time
filename='SVM_model_tuning'
# Persist the fitted grid-search object. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(cv_rbf_onevall, model_file) #Saving the model

Wall time: 36.6 ms


In [16]:
# Persist the test-set predictions. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open('predictions', 'wb') as predictions_file:
    pickle.dump(yPred, predictions_file)

In [17]:
# Round-trip check: read the saved predictions back and display them.
# The context manager closes the file handle once loading is done.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open('predictions', 'rb') as predictions_file:
    reloaded_predictions = pickle.load(predictions_file)
reloaded_predictions

array(['Lightning', 'Arson', 'Campfire', ..., 'Arson', 'Equipment Use',
       'Powerline'], dtype=object)

In [18]:
# Display the in-memory predictions for comparison with the reloaded copy.
yPred

array(['Lightning', 'Arson', 'Campfire', ..., 'Arson', 'Equipment Use',
       'Powerline'], dtype=object)

In [19]:
# Display the pipeline (and its parameters) selected by the grid search.
cv_rbf_onevall.best_estimator_