In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
wildfires = pd.read_excel('fp-historical-wildfire-data-2006-2021.xlsx')

In [3]:
wildfires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22914 entries, 0 to 22913
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   fire_year                     22914 non-null  int64         
 1   fire_number                   22914 non-null  object        
 2   fire_name                     604 non-null    object        
 3   current_size                  22914 non-null  float64       
 4   size_class                    22914 non-null  object        
 5   fire_location_latitude        22914 non-null  float64       
 6   fire_location_longitude       22914 non-null  float64       
 7   fire_origin                   22905 non-null  object        
 8   general_cause_desc            22914 non-null  object        
 9   industry_identifier_desc      408 non-null    object        
 10  responsible_group_desc        9031 non-null   object        
 11  activity_class              

In [4]:
# Observe that there are a few columns where the number of non-Null values a very similar: 
# fire_spread_rate, fire_type, fire_position_on_slope, weather_conditions_over_fire, temperature, 
# relative humidity, wind_direction, and wind speed
# So let's see if those null values are in similar rows in the data set

In [5]:
columns_with_similar_null_totals = [
    'fire_spread_rate',
    'fire_type', 
    'fire_position_on_slope', 
    'weather_conditions_over_fire', 
    'temperature',
    'relative_humidity',
    'wind_direction', 
    'wind_speed'
]

In [6]:
wildfires[wildfires[columns_with_similar_null_totals].isnull().all(axis = 1)]

Unnamed: 0,fire_year,fire_number,fire_name,current_size,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,industry_identifier_desc,...,distance_from_water_source,first_bucket_drop_date,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
11,2021,CWF012,,0.01,A,49.599117,-114.353217,Provincial Land,Recreation,,...,,NaT,2021-04-17 14:35:00,0.01,2021-04-17 14:35:00,0.01,NaT,,2021-04-17 14:36:00,0.01
24,2021,CWF025,,0.01,A,51.400830,-115.076775,Provincial Land,Recreation,,...,,NaT,2021-05-21 16:00:00,0.01,2021-05-21 16:00:00,0.01,NaT,,2021-05-21 16:15:00,0.01
25,2021,CWF026,,0.01,A,51.391783,-115.109717,Provincial Land,Recreation,,...,,NaT,2021-05-22 16:35:00,0.01,2021-05-22 16:35:00,0.01,NaT,,2021-05-22 16:50:00,0.01
26,2021,CWF027,,0.01,A,51.391797,-115.107490,Provincial Land,Recreation,,...,,NaT,2021-05-22 16:57:00,0.01,2021-05-22 16:57:00,0.01,NaT,,2021-05-22 17:05:00,0.01
27,2021,CWF028,,0.01,A,51.602402,-115.013847,Provincial Land,Recreation,,...,,NaT,2021-05-22 17:06:00,0.01,2021-05-22 17:06:00,0.01,NaT,,2021-05-22 17:10:00,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15320,2010,LWF117,,0.01,A,54.545083,-111.475700,Provincial Land,Recreation,,...,,NaT,2010-05-24 16:35:00,0.01,2010-05-24 16:35:00,0.01,NaT,,2010-05-24 18:52:00,0.01
15739,2010,RWF048,,0.01,A,52.229657,-116.449037,Provincial Land,Recreation,,...,,NaT,2010-06-28 15:41:00,0.01,2010-06-28 15:41:00,0.01,NaT,,2010-06-28 15:46:00,0.01
15748,2010,RWF057,,0.01,A,52.277024,-116.038532,Provincial Land,Recreation,,...,,NaT,2010-07-15 14:10:00,0.01,2010-07-15 14:10:00,0.01,NaT,,2010-07-15 14:30:00,0.01
15750,2010,RWF059,,0.01,A,51.858517,-115.101837,Provincial Land,Recreation,,...,,NaT,2010-07-25 12:00:00,0.01,2010-07-25 12:00:00,0.01,NaT,,2010-07-25 12:01:00,0.01


In [7]:
#So there are 2458 rows where every one of those values is null, so I will remove all of those rows

In [8]:
null_row_indices = wildfires[wildfires[columns_with_similar_null_totals].isnull().all(axis = 1)].index.tolist()

In [9]:
wildfires = wildfires.drop(index = null_row_indices).reset_index(drop=True)

In [10]:
wildfires

Unnamed: 0,fire_year,fire_number,fire_name,current_size,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,industry_identifier_desc,...,distance_from_water_source,first_bucket_drop_date,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,2021,CWF001,,0.01,A,49.410191,-114.318843,Provincial Park,Power Line Industry,,...,,NaT,2021-01-13 13:25:00,0.01,2021-01-13 13:25:00,0.01,NaT,,2021-01-13 13:35:00,0.01
1,2021,CWF002,,0.01,A,49.587753,-114.378655,Provincial Land,Recreation,,...,,NaT,2021-01-29 10:45:00,0.01,2021-01-29 10:45:00,0.01,NaT,,2021-01-29 13:15:00,0.01
2,2021,CWF003,,0.05,A,51.119500,-114.778950,Provincial Land,Resident,,...,,NaT,2021-02-23 10:42:00,0.05,2021-02-23 10:42:00,0.05,NaT,,2021-02-23 10:52:00,0.05
3,2021,CWF004,,0.15,B,51.271888,-115.025818,Provincial Land,Incendiary,,...,,NaT,2021-03-14 19:00:00,0.15,2021-03-14 19:00:00,0.15,NaT,,2021-03-15 11:05:00,0.15
4,2021,CWF005,,0.30,B,51.161000,-114.963283,Indian Reservation,Resident,,...,,NaT,2021-03-17 15:01:00,0.80,2021-03-17 16:10:00,0.80,NaT,,2021-03-18 13:00:00,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20451,2006,WWF110,,9.71,C,54.697817,-116.218083,Provincial Land,Undetermined,,...,5.0,2006-10-03 16:23:00,2006-10-03 18:11:00,5.00,2006-10-03 19:13:00,9.71,NaT,,2006-10-12 13:22:00,9.71
20452,2006,WWF111,,0.80,B,54.068434,-115.668980,Private Land,Recreation,,...,,NaT,2006-10-04 16:49:00,1.00,2006-10-04 16:49:00,1.00,2006-10-04 17:35:00,0.8,2006-10-05 11:00:00,0.80
20453,2006,WWF112,,0.10,A,54.651933,-115.460117,Provincial Land,Power Line Industry,,...,,NaT,2006-10-05 15:19:00,0.10,2006-10-05 15:19:00,0.10,NaT,,2006-10-05 15:25:00,0.10
20454,2006,WWF113,,0.05,A,54.619465,-115.516485,Provincial Land,Power Line Industry,,...,,NaT,2006-10-27 10:59:00,0.05,2006-10-27 10:59:00,0.05,NaT,,2006-10-27 11:05:00,0.05


In [11]:
wildfires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20456 entries, 0 to 20455
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   fire_year                     20456 non-null  int64         
 1   fire_number                   20456 non-null  object        
 2   fire_name                     591 non-null    object        
 3   current_size                  20456 non-null  float64       
 4   size_class                    20456 non-null  object        
 5   fire_location_latitude        20456 non-null  float64       
 6   fire_location_longitude       20456 non-null  float64       
 7   fire_origin                   20447 non-null  object        
 8   general_cause_desc            20456 non-null  object        
 9   industry_identifier_desc      408 non-null    object        
 10  responsible_group_desc        6577 non-null   object        
 11  activity_class              

### Now we have to determine which columns predict the size_class too well 

In [12]:
# I suspect current_size, bh_hectares, uc_hectares, and ex_hectares might be the culprits 

In [13]:
wildfires.select_dtypes(include='number').corr()

Unnamed: 0,fire_year,current_size,fire_location_latitude,fire_location_longitude,discovered_size,assessment_hectares,fire_spread_rate,temperature,relative_humidity,wind_speed,fire_fighting_start_size,distance_from_water_source,bh_hectares,uc_hectares,to_hectares,ex_hectares
fire_year,1.0,0.008835,0.079834,0.04413,,-0.00066,0.05343,0.023997,-0.017939,0.019096,0.024184,-0.006796,0.005634,0.0085,0.021763,0.008835
current_size,0.008835,1.0,0.024976,0.014599,,0.037687,0.071773,0.018138,-0.018491,0.050302,0.066583,0.002197,0.901745,0.997393,1.0,1.0
fire_location_latitude,0.079834,0.024976,1.0,-0.060381,,0.037698,0.123244,0.151646,0.030022,0.015118,0.049452,-0.032771,0.014228,0.022466,0.035165,0.024976
fire_location_longitude,0.04413,0.014599,-0.060381,1.0,,0.00642,0.052016,0.024582,-0.040349,0.040629,0.014862,-0.003955,0.017869,0.01486,-0.033819,0.014599
discovered_size,,,,,,,,,,,,,,,,
assessment_hectares,-0.00066,0.037687,0.037698,0.00642,,1.0,0.127932,0.019437,-0.010928,0.025223,0.447678,-0.00179,0.028499,0.032869,0.578828,0.037687
fire_spread_rate,0.05343,0.071773,0.123244,0.052016,,0.127932,1.0,0.152724,-0.111232,0.134798,0.120372,-0.009101,0.069165,0.066523,0.17618,0.071773
temperature,0.023997,0.018138,0.151646,0.024582,,0.019437,0.152724,1.0,-0.274911,-0.017498,0.025513,-0.027851,0.017865,0.017076,0.024049,0.018138
relative_humidity,-0.017939,-0.018491,0.030022,-0.040349,,-0.010928,-0.111232,-0.274911,1.0,-0.168359,-0.023402,0.019332,-0.020308,-0.01837,-0.080456,-0.018491
wind_speed,0.019096,0.050302,0.015118,0.040629,,0.025223,0.134798,-0.017498,-0.168359,1.0,0.040925,-0.023366,0.03952,0.048538,0.122354,0.050302


In [14]:
columns_that_describe_fire_size = ['current_size', 'bh_hectares', 'uc_hectares', 'ex_hectares']

In [15]:
wildfires[columns_that_describe_fire_size]

Unnamed: 0,current_size,bh_hectares,uc_hectares,ex_hectares
0,0.01,0.01,0.01,0.01
1,0.01,0.01,0.01,0.01
2,0.05,0.05,0.05,0.05
3,0.15,0.15,0.15,0.15
4,0.30,0.80,0.80,0.30
...,...,...,...,...
20451,9.71,5.00,9.71,9.71
20452,0.80,1.00,1.00,0.80
20453,0.10,0.10,0.10,0.10
20454,0.05,0.05,0.05,0.05


In [16]:
test_df = wildfires[columns_that_describe_fire_size + ['size_class']]

In [17]:
test_df

Unnamed: 0,current_size,bh_hectares,uc_hectares,ex_hectares,size_class
0,0.01,0.01,0.01,0.01,A
1,0.01,0.01,0.01,0.01,A
2,0.05,0.05,0.05,0.05,A
3,0.15,0.15,0.15,0.15,B
4,0.30,0.80,0.80,0.30,B
...,...,...,...,...,...
20451,9.71,5.00,9.71,9.71,C
20452,0.80,1.00,1.00,0.80,B
20453,0.10,0.10,0.10,0.10,A
20454,0.05,0.05,0.05,0.05,A


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [19]:
label_encoder = preprocessing.LabelEncoder()
size_labels = label_encoder.fit_transform(test_df['size_class'])
test_df = test_df.drop('size_class', axis = 1)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(test_df, size_labels, stratify=size_labels, test_size=0.2)

In [21]:
classifier = DecisionTreeClassifier(random_state = 42)

In [22]:
test_df

Unnamed: 0,current_size,bh_hectares,uc_hectares,ex_hectares
0,0.01,0.01,0.01,0.01
1,0.01,0.01,0.01,0.01
2,0.05,0.05,0.05,0.05
3,0.15,0.15,0.15,0.15
4,0.30,0.80,0.80,0.30
...,...,...,...,...
20451,9.71,5.00,9.71,9.71
20452,0.80,1.00,1.00,0.80
20453,0.10,0.10,0.10,0.10
20454,0.05,0.05,0.05,0.05


In [23]:
classifier.fit(X_train, y_train)

In [24]:
predictions = classifier.predict(X_test)

In [25]:
report = classification_report(y_test, predictions)

In [26]:
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2590
           1       1.00      1.00      1.00      1165
           2       1.00      1.00      1.00       212
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        65

    accuracy                           1.00      4092
   macro avg       1.00      1.00      1.00      4092
weighted avg       1.00      1.00      1.00      4092



In [27]:
feature_importance = classifier.feature_importances_

In [28]:
feature_importance

array([0.943911, 0.      , 0.      , 0.056089])

In [29]:
# So current_size definetly needs to go

In [30]:
wildfires = wildfires.drop('current_size', axis = 1)


In [31]:
columns_that_describe_fire_size = ['bh_hectares', 'uc_hectares', 'ex_hectares']
test_df = wildfires[columns_that_describe_fire_size + ['size_class']]
label_encoder = preprocessing.LabelEncoder()
size_labels = label_encoder.fit_transform(test_df['size_class'])
test_df = test_df.drop('size_class', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(test_df, size_labels, stratify=size_labels, test_size=0.2)
classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2590
           1       1.00      1.00      1.00      1165
           2       1.00      1.00      1.00       212
           3       1.00      0.98      0.99        60
           4       0.98      1.00      0.99        65

    accuracy                           1.00      4092
   macro avg       1.00      1.00      1.00      4092
weighted avg       1.00      1.00      1.00      4092



In [32]:
feature_importance = classifier.feature_importances_

In [33]:
feature_importance

array([0., 0., 1.])

In [34]:
# so ex_hectares also has to go 

In [35]:
wildfires = wildfires.drop('ex_hectares', axis = 1)

In [36]:
columns_that_describe_fire_size = ['bh_hectares', 'uc_hectares']
test_df = wildfires[columns_that_describe_fire_size + ['size_class']]
label_encoder = preprocessing.LabelEncoder()
size_labels = label_encoder.fit_transform(test_df['size_class'])
test_df = test_df.drop('size_class', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(test_df, size_labels, stratify=size_labels, test_size=0.2)
classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2590
           1       0.96      0.95      0.96      1165
           2       0.87      0.83      0.85       212
           3       0.83      0.75      0.79        60
           4       0.95      0.92      0.94        65

    accuracy                           0.97      4092
   macro avg       0.92      0.89      0.90      4092
weighted avg       0.97      0.97      0.97      4092



In [37]:
feature_importance = classifier.feature_importances_
feature_importance

array([0.00939485, 0.99060515])

In [38]:
# And uc_hectares has to go as well 
wildfires = wildfires.drop('uc_hectares', axis = 1)

In [39]:
columns_that_describe_fire_size = ['bh_hectares']
test_df = wildfires[columns_that_describe_fire_size + ['size_class']]
label_encoder = preprocessing.LabelEncoder()
size_labels = label_encoder.fit_transform(test_df['size_class'])
test_df = test_df.drop('size_class', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(test_df, size_labels, stratify=size_labels, test_size=0.2)
classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2590
           1       0.90      0.91      0.91      1165
           2       0.78      0.70      0.74       212
           3       0.81      0.58      0.68        60
           4       0.88      0.65      0.74        65

    accuracy                           0.93      4092
   macro avg       0.87      0.76      0.81      4092
weighted avg       0.93      0.93      0.93      4092



In [40]:
# let's remove bh_hectares as well to make the dataset simpler 
wildfires = wildfires.drop('bh_hectares', axis = 1)

In [42]:
wildfires = wildfires.drop('bh_fs_date', axis = 1)
wildfires = wildfires.drop('uc_fs_date', axis = 1)
wildfires = wildfires.drop('to_fs_date', axis = 1)
wildfires = wildfires.drop('to_hectares', axis = 1)

### I want to say that the dataset is ready but let's see 

In [43]:
wildfires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20456 entries, 0 to 20455
Data columns (total 42 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   fire_year                     20456 non-null  int64         
 1   fire_number                   20456 non-null  object        
 2   fire_name                     591 non-null    object        
 3   size_class                    20456 non-null  object        
 4   fire_location_latitude        20456 non-null  float64       
 5   fire_location_longitude       20456 non-null  float64       
 6   fire_origin                   20447 non-null  object        
 7   general_cause_desc            20456 non-null  object        
 8   industry_identifier_desc      408 non-null    object        
 9   responsible_group_desc        6577 non-null   object        
 10  activity_class                11747 non-null  object        
 11  true_cause                  

In [45]:
wildfires.to_csv('fp-historical-wildfire-data-2006-2021-modified.csv')