# Data Preparation

## Library Imports

In [1]:
# Base Imports
import sqlite3
import pandas as pd
import numpy as np 

# Pre-processing
from sklearn.preprocessing import LabelEncoder

# Metrics and Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Pipeline
from sklearn.pipeline import Pipeline
import joblib

# Estimators
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# Selects which model the training, saving and loading cells below act on.
# Values checked by those cells: "gaussian_nb", "decision_tree_classifier",
# "kneighbors_classifier".
# NOTE(review): no cell visible in this notebook trains a model for
# "kneighbors_classifier" — confirm before selecting it.
estimator = "decision_tree_classifier"

In [3]:
# Location of the wildfire SQLite database, relative to the notebook's
# working directory.
fires_db_path = '../../../../../../../data/FPA_FOD_20170508.sqlite'
conn = sqlite3.connect(fires_db_path)

In [4]:
# Pull only the columns used as features or as the target from the Fires table.
# The pieces below concatenate to exactly the original single-line query.
fires_query = (
    "SELECT LATITUDE, LONGITUDE, DISCOVERY_DATE, FIRE_SIZE, "
    "STATE,OWNER_DESCR, STAT_CAUSE_DESCR "
    "FROM 'Fires'"
)
df_fires = pd.read_sql_query(fires_query, conn)

In [5]:
# Inspect column dtypes and the memory footprint of the freshly loaded frame.
df_fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   LATITUDE          float64
 1   LONGITUDE         float64
 2   DISCOVERY_DATE    float64
 3   FIRE_SIZE         float64
 4   STATE             object 
 5   OWNER_DESCR       object 
 6   STAT_CAUSE_DESCR  object 
dtypes: float64(4), object(3)
memory usage: 100.4+ MB


In [6]:
# Per-column missing-value check: True means the column contains at least one NaN.
df_fires.isna().any()

LATITUDE            False
LONGITUDE           False
DISCOVERY_DATE      False
FIRE_SIZE           False
STATE               False
OWNER_DESCR         False
STAT_CAUSE_DESCR    False
dtype: bool

In [7]:
# DISCOVERY_DATE is parsed as a Julian day count (unit='D', origin='julian')
# and stored as a pandas timestamp column.
julian_days = df_fires["DISCOVERY_DATE"]
df_fires["DISCOVERY_DATETIME"] = pd.to_datetime(julian_days, origin='julian', unit='D')

In [8]:
# Derive the weekday name of each discovery timestamp.
discovery_times = df_fires["DISCOVERY_DATETIME"]
df_fires['DISCOVERY_DAY_OF_WEEK'] = discovery_times.dt.day_name()

In [9]:
# Categorical source column -> name of the integer-coded column derived from it.
categorical_columns = {
    "STATE": "STATE_CAT",
    "OWNER_DESCR": "OWNER_DESCR_CAT",
    "DISCOVERY_DAY_OF_WEEK": "DISCOVERY_DAY_OF_WEEK_CAT",
}

# Fit one LabelEncoder per column and keep every fitted encoder.
# Re-fitting a single shared encoder (as was done before) overwrites its
# classes_ on each call, so the STATE and OWNER_DESCR mappings were lost and
# raw values could not be encoded/decoded later (e.g. when building a row for
# prediction). The resulting *_CAT columns are unchanged.
label_encoders = {}
for source_column, encoded_column in categorical_columns.items():
    label_encoder = LabelEncoder()
    df_fires[encoded_column] = label_encoder.fit_transform(df_fires[source_column])
    label_encoders[source_column] = label_encoder

# Usage: label_encoders["STATE"].transform(["OR"]) gives the code for a state,
# and .inverse_transform([...]) maps codes back to labels.
# `label_encoder` is left bound to the last (day-of-week) encoder, as before.

In [10]:
# Re-inspect the frame to confirm the derived datetime, day-of-week and *_CAT columns are present.
df_fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 12 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   LATITUDE                   float64       
 1   LONGITUDE                  float64       
 2   DISCOVERY_DATE             float64       
 3   FIRE_SIZE                  float64       
 4   STATE                      object        
 5   OWNER_DESCR                object        
 6   STAT_CAUSE_DESCR           object        
 7   DISCOVERY_DATETIME         datetime64[ns]
 8   DISCOVERY_DAY_OF_WEEK      object        
 9   STATE_CAT                  int64         
 10  OWNER_DESCR_CAT            int64         
 11  DISCOVERY_DAY_OF_WEEK_CAT  int64         
dtypes: datetime64[ns](1), float64(4), int64(3), object(4)
memory usage: 172.2+ MB


In [11]:

# Model inputs: location, discovery date, fire size and the integer-coded categoricals.
feature_columns = [
    "LATITUDE",
    "LONGITUDE",
    "DISCOVERY_DATE",
    "FIRE_SIZE",
    "STATE_CAT",
    "OWNER_DESCR_CAT",
    "DISCOVERY_DAY_OF_WEEK_CAT",
]
# Prediction target: the recorded cause of the fire.
target_column = "STAT_CAUSE_DESCR"

X = df_fires[feature_columns]
y = df_fires[target_column]

## Train / Test Split

In [12]:
# Hold out 10% of the rows for evaluation. Stratifying on y keeps the class
# proportions of the cause label the same in both splits; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
    test_size=0.1,
)

## Gaussian Naive Bayes Classifier



In [13]:
%%time

# Train the Gaussian Naive Bayes model only when it is the selected estimator.
if estimator == "gaussian_nb":
    gaussian_nb_ovr = OneVsRestClassifier(GaussianNB())
    # fit() returns the fitted estimator itself, so clf is the trained wrapper.
    clf = gaussian_nb_ovr.fit(X_train, y_train)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


## Train Decision Tree Classifier

In [14]:
%%time

# Train the decision-tree model only when it is the selected estimator.
if estimator == "decision_tree_classifier":

    # max_features='sqrt' replaces the former 'auto': for classifiers 'auto'
    # meant sqrt(n_features), but it was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where passing it raises an error. Results are unchanged.
    decision_tree = DecisionTreeClassifier(
        random_state=1,
        splitter='best',
        min_samples_split=5,
        min_samples_leaf=4,
        max_features='sqrt',
        class_weight=None,
    )

    # One binary tree is fitted per cause class (one-vs-rest).
    clf = OneVsRestClassifier(decision_tree)

    clf.fit(X_train, y_train)

CPU times: user 1min 56s, sys: 982 ms, total: 1min 57s
Wall time: 1min 58s


In [15]:
%%time
# Predict a cause label for every row of the held-out test split.
y_pred = clf.predict(X_test)

CPU times: user 806 ms, sys: 36.6 ms, total: 842 ms
Wall time: 861 ms


In [16]:
%%time
# Fraction of test rows whose predicted cause matches the recorded cause.
test_accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', test_accuracy)

accuracy: 0.5223640898286067
CPU times: user 543 ms, sys: 8.84 ms, total: 552 ms
Wall time: 554 ms


In [17]:

# Per-class precision / recall / F1 on the test split, printed as plain text.
report_text = classification_report(y_test, y_pred)
print(report_text)

                   precision    recall  f1-score   support

            Arson       0.51      0.48      0.50     28145
         Campfire       0.39      0.28      0.33      7614
         Children       0.24      0.16      0.19      6117
   Debris Burning       0.51      0.56      0.53     42903
    Equipment Use       0.31      0.27      0.29     14761
        Fireworks       0.37      0.31      0.34      1150
        Lightning       0.70      0.75      0.72     27847
    Miscellaneous       0.47      0.49      0.48     32381
Missing/Undefined       0.88      0.89      0.88     16672
        Powerline       0.15      0.12      0.13      1445
         Railroad       0.40      0.40      0.40      3345
          Smoking       0.13      0.09      0.10      5287
        Structure       0.01      0.07      0.02       380

         accuracy                           0.52    188047
        macro avg       0.39      0.37      0.38    188047
     weighted avg       0.52      0.52      0.52    188047

In [18]:
# Where (and with which joblib options) each supported estimator is persisted.
# Only the decision-tree model is written compressed.
dump_targets = {
    "decision_tree_classifier": ('../models/decission_tree_classifier.pkl', {"compress": 3}),
    "gaussian_nb": ('../models/gaussian_nb_classifier.pkl', {}),
    "kneighbors_classifier": ('../models/knn_classifier.pkl', {}),
}

# An unrecognised estimator name writes nothing.
dump_target = dump_targets.get(estimator)
if dump_target is not None:
    model_path, dump_options = dump_target
    joblib.dump(clf, model_path, **dump_options)

In [19]:
# Show the first test row's features; it is reused below as a hand-built prediction sample.
print(X_test.iloc[:1])

         LATITUDE   LONGITUDE  DISCOVERY_DATE  FIRE_SIZE  STATE_CAT  \
161943  43.235833 -122.466944       2452859.5        0.1         37   

        OWNER_DESCR_CAT  DISCOVERY_DAY_OF_WEEK_CAT  
161943               15                          0  


In [20]:
# Show the recorded cause for that same first test row.
print(y_test.iloc[:1])

161943    Lightning
Name: STAT_CAUSE_DESCR, dtype: object


In [21]:
# Saved-model file for each supported `estimator` value (same paths the
# save cell writes to).
model_paths = {
    "decision_tree_classifier": '../models/decission_tree_classifier.pkl',
    "gaussian_nb": '../models/gaussian_nb_classifier.pkl',
    "kneighbors_classifier": '../models/knn_classifier.pkl',
}

# Fail loudly on an unknown name. Previously this case fell through silently
# and the predict call below hit an undefined (or stale, from an earlier run)
# `classifier`.
if estimator not in model_paths:
    raise ValueError(
        f"Unknown estimator {estimator!r}; expected one of {sorted(model_paths)}"
    )

# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files produced by a trusted source.
classifier = joblib.load(model_paths[estimator])

# One sample row (the same values as X_test[:1]). It is built as a DataFrame
# with the training column names, in training order, so the model receives
# named features just like it did during fit; a bare list drops the names and
# makes scikit-learn warn about missing feature names.
sample_columns = [
    "LATITUDE",
    "LONGITUDE",
    "DISCOVERY_DATE",
    "FIRE_SIZE",
    "STATE_CAT",
    "OWNER_DESCR_CAT",
    "DISCOVERY_DAY_OF_WEEK_CAT",
]
pred_test = pd.DataFrame(
    [[43.235833, -122.466944, 2452859.5, 0.1, 37, 15, 0]],
    columns=sample_columns,
)

classifier.predict(pred_test)

array(['Lightning'], dtype='<U17')

In [79]:
# Per-class probabilities for the sample row; column i corresponds to classifier.classes_[i].
pred_proba = classifier.predict_proba(pred_test)

In [96]:
# Display the predicted class probabilities for the sample row.
pred_proba

array([[0.        , 0.17494333, 0.00865444, 0.        , 0.        ,
        0.        , 0.81640223, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

In [81]:
# Despite its name, max_proba holds the column index of the largest
# probability in each row, not the probability value itself.
max_proba = pred_proba.argmax(axis=1)

In [119]:
# Highest class probability for the first sample row.
# The row is indexed with a plain 0 (the old `[0][0]` was just an obscured 0),
# and the column with the scalar max_proba[0] instead of int(max_proba):
# converting a one-element array to int is deprecated since NumPy 1.25 and is
# slated to become an error.
pred_proba[0, max_proba[0]]

0.8164022254275706

In [78]:
# Class labels known to the loaded model; position i here matches column i of predict_proba.
classifier.classes_

array(['Arson', 'Campfire', 'Children', 'Debris Burning', 'Equipment Use',
       'Fireworks', 'Lightning', 'Miscellaneous', 'Missing/Undefined',
       'Powerline', 'Railroad', 'Smoking', 'Structure'], dtype='<U17')