# Data Preparation

## Library Imports

In [1]:
# Base Imports
import pandas as pd
import numpy as np 
# import time
# from matplotlib import pyplot as plt
# from matplotlib.ticker import MaxNLocator
# import seaborn as sns
# %matplotlib inline

# Pre-Processing
# from sklearn.preprocessing import RobustScaler
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer

# Metrics and Evaluation
from sklearn import metrics
from sklearn.metrics import classification_report
# from sklearn.metrics import plot_confusion_matrix
# from sklearn.metrics import plot_roc_curve

# Train/ Test Split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline

# Imbalanced Data
# from imblearn.over_sampling import SMOTE
# from imblearn.over_sampling import BorderlineSMOTE
# from imblearn.pipeline import make_pipeline, Pipeline

# Estimators
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier

# Hyper Parameter Tuning
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the fire records CSV (path is relative to the notebook's directory).
fires_csv_path = "../data/ca_fires.csv"
df_fires = pd.read_csv(fires_csv_path)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189550 entries, 0 to 189549
Data columns (total 40 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FOD_ID                      189550 non-null  int64  
 1   FPA_ID                      189550 non-null  object 
 2   SOURCE_SYSTEM_TYPE          189550 non-null  object 
 3   SOURCE_SYSTEM               189550 non-null  object 
 4   NWCG_REPORTING_AGENCY       189550 non-null  object 
 5   NWCG_REPORTING_UNIT_ID      189550 non-null  object 
 6   NWCG_REPORTING_UNIT_NAME    189550 non-null  object 
 7   SOURCE_REPORTING_UNIT       189550 non-null  object 
 8   SOURCE_REPORTING_UNIT_NAME  189550 non-null  object 
 9   LOCAL_FIRE_REPORT_ID        61933 non-null   float64
 10  LOCAL_INCIDENT_ID           127983 non-null  object 
 11  FIRE_CODE                   55522 non-null   object 
 12  FIRE_NAME                   174555 non-null  object 
 13  ICS_209_INCIDE

In [4]:
# Class distribution of the recorded fire cause, most frequent first.
df_fires.STAT_CAUSE_DESCR.value_counts()

Miscellaneous        51943
Equipment Use        39407
Lightning            27000
Arson                19635
Debris Burning       14318
Missing/Undefined    12605
Campfire              9516
Children              6930
Smoking               5936
Powerline             1198
Railroad               720
Fireworks              219
Structure              123
Name: STAT_CAUSE_DESCR, dtype: int64

In [5]:
# Binary target column: 1 where the recorded cause is "Arson", 0 for every
# other cause.
is_arson = df_fires['STAT_CAUSE_DESCR'].eq("Arson")
df_fires['ARSON'] = is_arson.astype(int)

In [6]:
# DISCOVERY_DATE is stored as float Julian day numbers (e.g. 2453403.5), not
# as epoch timestamps. Without unit/origin, pd.to_datetime reads the floats
# as nanoseconds since 1970-01-01, so every row collapses onto 1970-01-01 and
# any month / weekday derived from it is constant. origin='julian' requires
# unit='D'.
df_fires['DISCOVERY_DATETIME'] = pd.to_datetime(
    df_fires['DISCOVERY_DATE'],
    unit='D',
    origin='julian',
)

In [7]:
# Derive calendar features from the parsed discovery timestamp:
# numeric month (1-12) and the English weekday name.
discovered = df_fires['DISCOVERY_DATETIME']
df_fires['MONTH'] = discovered.dt.month
df_fires['DAY_OF_WEEK'] = discovered.dt.day_name()

In [8]:
# Feature columns used for modelling.
# Candidates currently left out: "MONTH", "DAY_OF_WEEK", "FIRE_YEAR"
X_cols = [
    "LATITUDE",
    "LONGITUDE",
    "DISCOVERY_DATE",
    "FIRE_SIZE",
]

In [9]:
# Check the selected feature columns for missing values and confirm dtypes.
df_fires[X_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189550 entries, 0 to 189549
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LATITUDE        189550 non-null  float64
 1   LONGITUDE       189550 non-null  float64
 2   DISCOVERY_DATE  189550 non-null  float64
 3   FIRE_SIZE       189550 non-null  float64
dtypes: float64(4)
memory usage: 5.8 MB


In [10]:
# Feature matrix and target. The target is the multi-class cause label;
# the binary ARSON column is kept below as a commented-out alternative.
X = df_fires.loc[:, X_cols]
# y = df_fires["ARSON"]
y = df_fires.loc[:, 'STAT_CAUSE_DESCR']

In [11]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,LATITUDE,LONGITUDE,DISCOVERY_DATE,FIRE_SIZE
0,40.036944,-121.005833,2453403.5,0.1
1,38.933056,-120.404444,2453137.5,0.25
2,38.984167,-120.735556,2453156.5,0.1
3,38.559167,-119.913333,2453184.5,0.1
4,38.559167,-119.933056,2453184.5,0.1


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
import joblib

In [13]:
 X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.2,
    random_state=1,
    stratify=y)

In [14]:
# Gaussian naive Bayes wrapped in a one-vs-rest scheme (one binary model per
# cause label).
# NOTE(review): the evaluation output further down shows 0.05 accuracy with
# almost all recall on "Children" -- confirm the one-vs-rest wrapper is
# wanted here rather than fitting GaussianNB on the multi-class target
# directly.
clf1 = OneVsRestClassifier(estimator=GaussianNB())

In [15]:
# Fit on the training split; as the cell's last expression, the fitted
# estimator is echoed as the cell output.
clf1.fit(X_train, y_train)

OneVsRestClassifier(estimator=GaussianNB())

In [16]:
# Predict on the held-out split and print per-class precision, recall and F1.
y_pred = clf1.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

                   precision    recall  f1-score   support

            Arson       0.00      0.00      0.00      3927
         Campfire       0.15      0.00      0.00      1903
         Children       0.04      0.98      0.07      1386
   Debris Burning       0.00      0.00      0.00      2864
    Equipment Use       0.33      0.01      0.02      7881
        Fireworks       0.00      0.00      0.00        44
        Lightning       0.48      0.01      0.02      5400
    Miscellaneous       0.30      0.01      0.02     10388
Missing/Undefined       0.28      0.04      0.08      2521
        Powerline       0.00      0.00      0.00       240
         Railroad       0.00      0.00      0.00       144
          Smoking       0.00      0.00      0.00      1187
        Structure       0.00      0.00      0.00        25

         accuracy                           0.05     37910
        macro avg       0.12      0.08      0.02     37910
     weighted avg       0.24      0.05      0.02     3

In [17]:
# Persist the fitted classifier to disk; joblib.dump returns the list of
# written file names, which is shown as the cell output.
joblib.dump(value=clf1, filename='../models/gaussian_nb_classifier.pkl')

['../models/gaussian_nb_classifier.pkl']

In [18]:
# Show the first held-out feature row (reused below as a manual sample).
print(X_test.head(1))

         LATITUDE   LONGITUDE  DISCOVERY_DATE  FIRE_SIZE
100568  37.093056 -121.573056       2448787.5        0.3


In [19]:
# Show the true label that belongs to the first held-out row.
print(y_test.head(1))

100568    Equipment Use
Name: STAT_CAUSE_DESCR, dtype: object


In [20]:
# Reload the persisted model to confirm that it round-trips from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts this project
# wrote itself.
classifier = joblib.load('../models/gaussian_nb_classifier.pkl')

# One hand-written sample: the same values as the first row of X_test.
#['LATITUDE', 'LONGITUDE', 'DISCOVERY_DATE', 'FIRE_SIZE']
pred_test = [[37.093056, -121.573056, 2448787.5, 0.3]]

# The model was fitted on a DataFrame, so hand it the sample with the same
# column names. A bare nested list relies on positional order alone and makes
# recent scikit-learn versions warn about missing feature names.
pred_frame = pd.DataFrame(
    pred_test,
    columns=['LATITUDE', 'LONGITUDE', 'DISCOVERY_DATE', 'FIRE_SIZE'],
)

classifier.predict(pred_frame)

array(['Children'], dtype='<U17')