In [2]:
# Standard library imports
from io import StringIO
import importlib
import math
import numpy as np
import os
import pandas as pd
import pdb
import re
import regex
import sys 

# Third party imports
import dateparser
import reverse_geocoder as rg

# Local application imports
import mender_tools as mt

# Reading the csv file into a dataframe

In [3]:
df = pd.read_csv('blazes/fires.csv')

# Set options to display all rows and columns in Jupyter notebook

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Checking out the dataframe

In [5]:
df.shape

(109, 10)

In [6]:
df.dtypes

cause            object
coordinates      object
day              object
fuelsinvolved    object
incident         object
incidentype      object
location         object
perimeter        object
personnel        object
size             object
dtype: object

In [7]:
df.head()

Unnamed: 0,cause,coordinates,day,fuelsinvolved,incident,incidentype,location,perimeter,personnel,size
0,"Saturday August 15th, 2020 approx. 07:15 AM",,"Tyler Brown, Texas A&M Forest Service",,Chimney Fire,Wildfire,"32.722 latitude, -99.481 longitude",,600 Acres,40%
1,Lightning,"33.544 latitude, -110.42 longitude","Saturday August 01st, 2020 approx. 04:30 PM","Primary carrier of the fire is currently the timber litter, grass and light brush associated with the under story of the ponderosa pine forest. Oak scrub is involved on the western edge of the burn area.",Cassadore Springs Fire,Wildfire,"11 miles north of San Carlos, AZ",90%,243,"21,284 Acres"
2,Powerline,"30.863 latitude, -100.548 longitude","Thursday August 13th, 2020 approx. 03:15 PM",,Bucholtz Fire,Wildfire,5 miles east of Eldorado,,73 Acres,100%
3,,,,,,,,,,
4,,,,,,,,,,


# Renaming columns names

In [8]:
df.columns

Index(['cause', 'coordinates', 'day', 'fuelsinvolved', 'incident',
       'incidentype', 'location', 'perimeter', 'personnel', 'size'],
      dtype='object')

In [5]:
df.columns = ['Cause', 'Coordinates', 'Date', 'Fuels Involved', 'Incident', 'Incident Type', 'Location', 'Perimeter Contained (%)', 'Personnel Involved', 'Fire Size (Acres)']

# Removing rows with all values missing and striping whitespaces

In [6]:
# Checking total number of rows with all cells empty
df.isnull().all(axis=1).sum()


9

In [7]:
# Drop all the rows with all cells empty
df.dropna(how='all', inplace=True)

In [8]:
# Reseting the index of the dataframe
df.reset_index(drop = True, inplace = True)

In [9]:
# Remove trailing and leading whitespaces
df.loc[:,:] = df.applymap(lambda x: x.strip() if type(x)==str else x)

In [10]:
# Locate rows of duplicate data
dups = df.duplicated()
print(dups.any())

False


# Making a backup

In [11]:
''' In order to make a copy of the dataframe we need two different instances of the dataframe, if we don't do that any changes
made to any of the variables that reference to dataframe will modify the other one. In that case we use the method copy().'''
wildfire_df = df.copy()

# Arrange Numeric Columns

### Preparing column 'Fire Size (Acres)'

In [12]:
# Copy values of acres in column 'Personnel Involved' that pertain to column 'Fire Size (Acres)'
wildfire_df = mt.emend_values(df, wildfire_df, 'Personnel Involved', 'Fire Size (Acres)', r'.*\s*Acres')

In [13]:
# Remove the string 'Acres' that comes along with the digits
wildfire_df = mt.strip_symbol(wildfire_df, 'Fire Size (Acres)', 'Acres')

In [14]:
# Convert the string digits to integer values
wildfire_df = mt.convert_to_int(wildfire_df, 'Fire Size (Acres)')

In [15]:
 # Check for value types 
wildfire_df['Fire Size (Acres)'].apply(type)

0       <class 'int'>
1       <class 'int'>
2       <class 'int'>
3       <class 'int'>
4       <class 'int'>
5       <class 'int'>
6     <class 'float'>
7       <class 'int'>
8       <class 'int'>
9       <class 'int'>
10      <class 'int'>
11    <class 'float'>
12      <class 'int'>
13      <class 'int'>
14      <class 'int'>
15      <class 'int'>
16      <class 'int'>
17      <class 'int'>
18      <class 'int'>
19      <class 'int'>
20      <class 'int'>
21      <class 'int'>
22      <class 'int'>
23      <class 'int'>
24      <class 'int'>
25      <class 'int'>
26      <class 'int'>
27      <class 'int'>
28    <class 'float'>
29      <class 'int'>
30      <class 'int'>
31      <class 'int'>
32      <class 'int'>
33      <class 'int'>
34      <class 'int'>
35      <class 'int'>
36      <class 'int'>
37      <class 'int'>
38      <class 'int'>
39      <class 'int'>
40      <class 'int'>
41      <class 'int'>
42      <class 'int'>
43      <class 'int'>
44      <class 'int'>
45      <c

## Preparing column 'Perimeter Contained (%)'

In [16]:
# Get values in column 'Fire Size (Acres)' that belong to column 'Perimeter Contained (%)'
wildfire_df = mt.emend_values(df, wildfire_df, 'Fire Size (Acres)', 'Perimeter Contained (%)', r'^\d*[.]{0,1}\d*\s*%$')

In [17]:
# Get rid of '%' symbol
wildfire_df = mt.strip_symbol(wildfire_df, 'Perimeter Contained (%)', '%')

In [18]:
# Convert float values to integer
wildfire_df = mt.convert_to_int(wildfire_df, 'Perimeter Contained (%)')

In [20]:
# 
wildfire_df['Perimeter Contained (%)'].apply(type).value_counts()

<class 'int'>      82
<class 'float'>    14
<class 'str'>       4
Name: Perimeter Contained (%), dtype: int64

In [21]:
# Convert string values to NaN
wildfire_df = mt.str_to_nan(wildfire_df, 'Perimeter Contained (%)')

In [23]:
wildfire_df['Perimeter Contained (%)'].apply(type).value_counts()

<class 'float'>    100
Name: Perimeter Contained (%), dtype: int64

In [27]:
wildfire_df.loc[:, 'Personnel Involved']

0        600 Acres
1              243
2         73 Acres
3              757
4               93
5               80
6              NaN
7      1,700 Acres
8          2 Acres
9               30
10              38
11             NaN
12              20
13              25
14           1,563
15               3
16             249
17              90
18              25
19    24,729 Acres
20               5
21              20
22             132
23        42 Acres
24             168
25              11
26              10
27             177
28             NaN
29             288
30              19
31             309
32              79
33              16
34               1
35             304
36             185
37               3
38              16
39               1
40             150
41             272
42       180 Acres
43    71,450 Acres
44               3
45               4
46             NaN
47    23,142 Acres
48             NaN
49             134
50             193
51             215
52    14,624

In [25]:
wildfire_df[['Personnel Involved','Perimeter Contained (%)']].applymap(type)

Unnamed: 0,Personnel Involved,Perimeter Contained (%)
0,<class 'str'>,<class 'float'>
1,<class 'str'>,<class 'float'>
2,<class 'str'>,<class 'float'>
3,<class 'str'>,<class 'float'>
4,<class 'str'>,<class 'float'>
5,<class 'str'>,<class 'float'>
6,<class 'float'>,<class 'float'>
7,<class 'str'>,<class 'float'>
8,<class 'str'>,<class 'float'>
9,<class 'str'>,<class 'float'>


In [26]:
wildfire_df['Perimeter Contained (%)'].isnull().sum()

18

## Preparing column 'Personnel Involved'

In [28]:
wildfire_df['Personnel Involved'].apply(type)

0       <class 'str'>
1       <class 'str'>
2       <class 'str'>
3       <class 'str'>
4       <class 'str'>
5       <class 'str'>
6     <class 'float'>
7       <class 'str'>
8       <class 'str'>
9       <class 'str'>
10      <class 'str'>
11    <class 'float'>
12      <class 'str'>
13      <class 'str'>
14      <class 'str'>
15      <class 'str'>
16      <class 'str'>
17      <class 'str'>
18      <class 'str'>
19      <class 'str'>
20      <class 'str'>
21      <class 'str'>
22      <class 'str'>
23      <class 'str'>
24      <class 'str'>
25      <class 'str'>
26      <class 'str'>
27      <class 'str'>
28    <class 'float'>
29      <class 'str'>
30      <class 'str'>
31      <class 'str'>
32      <class 'str'>
33      <class 'str'>
34      <class 'str'>
35      <class 'str'>
36      <class 'str'>
37      <class 'str'>
38      <class 'str'>
39      <class 'str'>
40      <class 'str'>
41      <class 'str'>
42      <class 'str'>
43      <class 'str'>
44      <class 'str'>
45      <c

In [29]:
wildfire_df['Personnel Involved'].apply(type).value_counts()

<class 'str'>      92
<class 'float'>     8
Name: Personnel Involved, dtype: int64

In [30]:
wildfire_df['Personnel Involved'].isnull().sum()

8

In [31]:
wildfire_df = mt.convert_to_int(wildfire_df, 'Personnel Involved')

In [32]:
wildfire_df = mt.str_to_nan(wildfire_df, 'Personnel Involved')

In [606]:
wildfire_df.loc[:,'Personnel Involved'].head(100)

0     <NA>
1      243
2     <NA>
3      757
4       93
5       80
6     <NA>
7     <NA>
8     <NA>
9       30
10      38
11    <NA>
12      20
13      25
14    1563
15       3
16     249
17      90
18      25
19    <NA>
20       5
21      20
22     132
23    <NA>
24     168
25      11
26      10
27     177
28    <NA>
29     288
30      19
31     309
32      79
33      16
34       1
35     304
36     185
37       3
38      16
39       1
40     150
41     272
42    <NA>
43    <NA>
44       3
45       4
46    <NA>
47    <NA>
48    <NA>
49     134
50     193
51     215
52    <NA>
53    <NA>
54     255
55      44
56      30
57      71
58      67
59      96
60      18
61    <NA>
62     142
63      58
64      30
65     201
66    <NA>
67    <NA>
68     134
69       9
70     115
71     233
72       5
73       5
74      32
75      18
76    <NA>
77     149
78      22
79    <NA>
80       3
81    <NA>
82    <NA>
83     120
84     751
85    <NA>
86    <NA>
87     102
88     345
89    <NA>
90    <NA>

In [580]:
wildfire_df = mt.convert_to_string(wildfire_df, 'Personnel Involved')

In [41]:
wildfire_df['Personnel Involved'] = wildfire_df['Personnel Involved'] .astype('Int64')

In [45]:
wildfire_df[['Fire Size (Acres)','Personnel Involved','Perimeter Contained (%)']].applymap(type)

Unnamed: 0,Fire Size (Acres),Personnel Involved,Perimeter Contained (%)
0,<class 'int'>,<class 'pandas._libs.missing.NAType'>,<class 'float'>
1,<class 'int'>,<class 'int'>,<class 'float'>
2,<class 'int'>,<class 'pandas._libs.missing.NAType'>,<class 'float'>
3,<class 'int'>,<class 'int'>,<class 'float'>
4,<class 'int'>,<class 'int'>,<class 'float'>
5,<class 'int'>,<class 'int'>,<class 'float'>
6,<class 'float'>,<class 'pandas._libs.missing.NAType'>,<class 'float'>
7,<class 'int'>,<class 'pandas._libs.missing.NAType'>,<class 'float'>
8,<class 'int'>,<class 'pandas._libs.missing.NAType'>,<class 'float'>
9,<class 'int'>,<class 'int'>,<class 'float'>


In [42]:
# To finalize the numeric columns preparation we save the dataframe as a csv file
wildfire_df.to_csv('blazes/fires_v2.csv', index = False)

In [46]:
wildf[['Fire Size (Acres)','Personnel Involved','Perimeter Contained (%)']].applymap(type)

Unnamed: 0,Fire Size (Acres),Personnel Involved,Perimeter Contained (%)
0,<class 'float'>,<class 'float'>,<class 'float'>
1,<class 'float'>,<class 'float'>,<class 'float'>
2,<class 'float'>,<class 'float'>,<class 'float'>
3,<class 'float'>,<class 'float'>,<class 'float'>
4,<class 'float'>,<class 'float'>,<class 'float'>
5,<class 'float'>,<class 'float'>,<class 'float'>
6,<class 'float'>,<class 'float'>,<class 'float'>
7,<class 'float'>,<class 'float'>,<class 'float'>
8,<class 'float'>,<class 'float'>,<class 'float'>
9,<class 'float'>,<class 'float'>,<class 'float'>


In [44]:
wildf.head()

Unnamed: 0,Cause,Coordinates,Date,Fuels Involved,Incident,Incident Type,Location,Perimeter Contained (%),Personnel Involved,Fire Size (Acres)
0,"Saturday August 15th, 2020 approx. 07:15 AM",,"Tyler Brown, Texas A&M Forest Service",,Chimney Fire,Wildfire,"32.722 latitude, -99.481 longitude",40.0,,600.0
1,Lightning,"33.544 latitude, -110.42 longitude","Saturday August 01st, 2020 approx. 04:30 PM","Primary carrier of the fire is currently the timber litter, grass and light brush associated with the under story of the ponderosa pine forest. Oak scrub is involved on the western edge of the burn area.",Cassadore Springs Fire,Wildfire,"11 miles north of San Carlos, AZ",90.0,243.0,21284.0
2,Powerline,"30.863 latitude, -100.548 longitude","Thursday August 13th, 2020 approx. 03:15 PM",,Bucholtz Fire,Wildfire,5 miles east of Eldorado,100.0,,73.0
3,Lightning,Wildfire,"Friday July 31st, 2020 approx. 05:15 PM","Pinyon, juniper, oak, and sage",Pine Gulch Fire,Wildfire,"18 miles north of Grand Junction, Colorado",7.0,757.0,74807.0
4,Unknown,Wildfire,"Friday August 14th, 2020 approx. 11:43 AM","Timber,litter and understory",Williams Fork,Wildfire,"15 miles NW of Fraser, CO",,93.0,1300.0


In [47]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : None
python           : 3.7.6.final.0
python-bits      : 64
OS               : Windows
OS-release       : 7
machine          : AMD64
processor        : Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : None.None

pandas           : 1.0.1
numpy            : 1.19.1
pytz             : 2019.3
dateutil         : 2.8.1
pip              : 20.2.2
setuptools       : 45.2.0.post20200210
Cython           : 0.29.15
pytest           : 5.3.5
hypothesis       : 5.5.4
sphinx           : 2.4.0
blosc            : None
feather          : None
xlsxwriter       : 1.2.7
lxml.etree       : 4.5.0
html5lib         : 1.0.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.1
IPython          : 7.12.0
pandas_datareader: 0.9.0
bs4              : 4.8.2
bottleneck       : 1.3.2
fastparquet      : None
gcsfs            : None
lxml.etree       : 4

# Arrange Categorical Columns

## Preparing Column 'Coordinates'

In [43]:
# Read the csv file into a new dataframe
wildf = pd.read_csv('blazes/fires_v2.csv')

In [39]:
# Make a copy of the dataframe
wildf_dfv2 = wildf.copy()

In [723]:
# Copy the values from column 'Location' to the corresponding cells in column 'Coordinates'
wildf_dfv2 = mt.emend_values(df, wildf_dfv2, 'Location', 'Coordinates', r'-?\d+\.?\d+\s*latitude,?\s*-?\d+\.?\d+\s*longitude')

In [724]:
# Copy the values from column 'Cause' to the corresponding cells in column 'Coordinates'
wildf_dfv2 = mt.emend_values(df, wildf_dfv2, 'Cause', 'Coordinates', r'-?\d+\.?\d+\s*latitude,?\s*-?\d+\.?\d+\s*longitude')

In [725]:
# Create a new column 'Latitude' with the values that go along with 'latitude' in column 'Coordinates'
wildf_dfv2 = mt.create_new_col(wildf_dfv2, r'-?\d+\.?\d+\s*(?=latitude)', 'Coordinates', 'Latitude')

In [726]:
# Create a new column 'Longitude' with the values that go along with 'longitude' in column 'Coordinates'
wildf_dfv2 = mt.create_new_col(wildf_dfv2, r'-?\d+\.?\d+\s*(?=longitude)', 'Coordinates', 'Longitude')

In [727]:
# Show value types totals
wildf_dfv2['Latitude'].apply(type).value_counts()

<class 'str'>    100
Name: Latitude, dtype: int64

In [728]:
# Show empty strings totals
(wildf_dfv2['Latitude'] == '').sum()

37

In [729]:
# Fill empty strings with NaNs values in column 'Longitude'
wildf_dfv2['Longitude'].replace(r'^\s*$', np.nan, regex=True, inplace = True)

In [730]:
# Fill empty strings with NaNs values in column 'Latitude'
wildf_dfv2['Latitude'].replace(r'^\s*$', np.nan, regex=True, inplace = True)

In [731]:
# Show value types totals
wildf_dfv2['Latitude'].apply(type).value_counts()

<class 'str'>      63
<class 'float'>    37
Name: Latitude, dtype: int64

In [732]:
# Show null values totals
wildf_dfv2['Latitude'].isnull().sum()

37

In [733]:
# Convert strings to float in column 'Latitude'
wildf_dfv2['Latitude'] = pd.to_numeric(wildf_dfv2['Latitude'], downcast='float')

In [734]:
# Convert strings to float in column 'Longitude'
wildf_dfv2['Longitude'] = pd.to_numeric(wildf_dfv2['Longitude'], downcast='float')

In [735]:
# Drop column 'Coordinates'
wildf_dfv2.drop(['Coordinates'], axis=1, inplace=True)

## Preparing Column 'Date'

In [736]:
# Copy the values from column 'Cause' to the corresponding cells in column 'Date'
wildf_dfv2 = mt.emend_values(df, wildf_dfv2, 'Cause', 'Date', r'(\d{2}:\d{2}\s?(AM|PM))$')

In [737]:
# Remove string 'approx.' from the string containing the day and hour
wildf_dfv2 = mt.strip_symbol(wildf_dfv2, 'Date', 'approx.')

In [738]:
# Convert date entries from 'Date' column to ISO format
wildf_dfv2 = mt.convert_to_isodate(wildf_dfv2, 'Date')

In [739]:
# Drop column 'Date'
wildf_dfv2.drop(['Date'], axis=1, inplace=True)

In [740]:
#wildf_dfv2.loc[:,['Date', 'ISO Date']].head()

## Preparing Column 'Cause'

In [742]:
# Get rid of strings of dates
wildf_dfv2 = mt.strip_string(wildf_dfv2, 'Cause', r'-?\d+\.?\d+\s*latitude,?\s*-?\d+\.?\d+\s*longitude')


In [743]:
# Get rid of strings of coordinates
wildf_dfv2 = mt.strip_string(wildf_dfv2, 'Cause', r'(\d{2}:\d{2}\s?(AM|PM))$')


In [744]:
# Remove trailing and leading whitespaces
wildf_dfv2.loc[:, 'Cause'] = wildf_dfv2.loc[:, 'Cause'].apply(lambda x: x.strip() if type(x)==str else x)

In [745]:
# Replace empty strings with NaN values
wildf_dfv2['Cause'].replace(r'^\s*$', np.nan, regex=True, inplace = True)

In [746]:
# Create and populate set with causes of fire
causes = set()
for item in wildf_dfv2['Cause']:
    if isinstance(item, str):
        if item not in causes:
            causes.add(item)
print(causes)

{'Lightning', 'Piece From Faulty Catalytic Converter', 'Human Caused', 'Accidental Roadside Start', 'Unknown, Under Investigation', 'Human-caused - Suspected Arson', 'Debris Burning', 'Human', 'Lightning/natural', 'Natural, Lightning', 'Powerline', 'Unknown', 'Human-caused', 'Human Caused, Under Investigation', 'Under Investigation', 'Unkown'}


In [747]:
# List causes of fire from set 'causes'
i=1
for item in causes:
    print(i,'->',item)
    i += 1

1 -> Lightning
2 -> Piece From Faulty Catalytic Converter
3 -> Human Caused
4 -> Accidental Roadside Start
5 -> Unknown, Under Investigation
6 -> Human-caused - Suspected Arson
7 -> Debris Burning
8 -> Human
9 -> Lightning/natural
10 -> Natural, Lightning
11 -> Powerline
12 -> Unknown
13 -> Human-caused
14 -> Human Caused, Under Investigation
15 -> Under Investigation
16 -> Unkown


In [748]:
# Ad hoc function for replacing and fixing redundant values in column 'Cause'
result = ""
def is_match(pattern, x, word):
    global result
    if isinstance(x, str):
        match = re.search(pattern, x)
        if match:
            result = x.replace(match.string, word)
        else:
            return False
    else:
        return False
    return True    

In [749]:
# Using function 'is_match' with a lambda function
wildf_dfv2.loc[:, 'Cause'] = wildf_dfv2.loc\
[:, 'Cause'].apply(lambda x:x.replace(x, result) if\
is_match(r'Lightning', x, 'Lightning') else x)

In [750]:
# Using function 'is_match' with a lambda function
wildf_dfv2.loc[:, 'Cause'] = wildf_dfv2.loc\
[:, 'Cause'].apply(lambda x:x.replace(x, result)\
if is_match(r'Human', x, 'Human Caused') else x)

In [751]:
# Using function 'is_match' with a lambda function
wildf_dfv2.loc[:, 'Cause'] = wildf_dfv2.loc\
[:, 'Cause'].apply(lambda x:x.replace(x, result)\
                   if is_match(r'Unk[n]?own', x, 'Unknown') else x)

In [752]:
wildf_dfv2.loc[:, 'Cause'].head()

0          NaN
1    Lightning
2    Powerline
3    Lightning
4      Unknown
Name: Cause, dtype: object

## Creating column Region

In [753]:
# Get the location given by latitude and longitude coordinates
def get_location(coordinates):
    return rg.search(coordinates)

In [688]:
# Function for creating column 'Region' based on coordinates data
def create_region_col(df):
    i = 0
    for latitude, longitude in zip(wildf_dfv2['Latitude'], wildf_dfv2['Longitude']):        
        if (not math.isnan(latitude) and not math.isnan(longitude)):
           coordinates = (latitude, longitude)
           location = get_location(coordinates)
           df.at[i, 'Region'] = list(location[0].values())[3]
           i += 1            
        else:
           i += 1

In [754]:
# Call to function create_region_col
create_region_col(wildf_dfv2)

In [646]:
# Checking null values after last call to create_region_col
wildf_dfv2.loc[:,  'Region'].isnull().sum()

37

In [765]:
wildf_dfv2.loc[15:24, ['Location', 'Latitude', 'Longitude', 'Region']]

Unnamed: 0,Location,Latitude,Longitude,Region
15,"25 miles west of Cody, Wyo",44.459,-109.546997,Wyoming
16,17 miles southwest of Stevens Village,65.853996,-149.578995,Alaska
17,"Darrell Willis, DFFM",33.806999,-112.119003,Arizona
18,NE of New River,,,Nebraska
19,Superstition Mountains,33.418999,-111.301003,Arizona
20,Split Mountain,,,
21,"Up to 38 miles northeast of Mesa, AZ along Hwy 87.",,,Arizona
22,East side of Peavine Mountain,39.576,-119.913002,Nevada
23,23 miles from Cortez,,,
24,"5 miles north of Oak City, Utah",,,Utah


## Appendix Section

In [790]:
wildf_dfv21 = wildf_dfv2.copy()

In [694]:
wildf_dfv21['Region'].isnull().sum()

15

In [771]:
# Dictionary with US states names and their abbrevations
us_abbrev_dic={'Alabama': 'AL', 'Alaska': 'AK', 'Arizona':'AZ', 'Arkansas':'AR',
               'California': 'CA', 'Colorado':'CO', 'Connecticut':'CT', 'Delaware':'DE',
               'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
               'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
               'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
               'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
               'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
               'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
               'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
               'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
               'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
               'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
               'Wisconsin': 'WI', 'Wyoming': 'WY'  
              }
st = us_abbrev_dic

In [772]:
# Ad hoc function for completing states names in column 'Region' based on column 'Location' data
def foo2(df, col):
    i=0
    flag=0
    list_count = 0
    wordup = ''
    for sentence in df[col]:
        if not isinstance(df.at[i,'Region'], str):
            if isinstance(sentence, str):
                word_list = re.findall(r'\w+', sentence)
                word_list_len = len(word_list)
                for word in word_list:
                    list_count += 1
                    if len(word)>2:
                        wordup = word.upper()
                    else:
                        wordup = word
                    for st_name, st_abbrev in st.items():
                        upper_st = st_name.upper()
                        if (wordup == upper_st or wordup == st_abbrev):
                            df.at[i,'Region'] = st_name
                            i+=1
                            flag = 1
                            break
                    if (list_count == word_list_len and flag ==0):
                        list_count = 0
                        i+=1
                    elif(flag == 1):
                        list_count = 0
                        flag = 0
                        break
            else:
                i+=1
        else:
            i+=1

In [791]:
# Call to function foo2
foo2(wildf_dfv21, 'Location')

In [795]:
# Listing the rows of 'Location' that will be googled
i = 0
indexes = list()
for item, name in zip(wildf_dfv21['Region'], wildf_dfv21['Location']):
    if (isinstance(item, float) and not isinstance(name, float)):
        indexes.append(wildf_dfv21.index[wildf_dfv21['Location'] == name].tolist())
        print(indexes[i][0], '->', name)
        i += 1  

12 -> Ryan Carbajal
40 -> Wildland Fire


In [793]:
# Dictionary with the last states with keys serving as dataframe corresponding indexes
last_st = {20:'California', 23:'Colorado', 26: 'Arizona', 33: 'Arizona',\
          53: 'Oregon', 56: 'Arizona', 61: 'Idaho', 67: 'Nevada', 76: 'California',\
           78: 'California', 97: 'Arizona', 98: 'California'}

In [794]:
# Set last missing states
for key, values in last_st.items():
        wildf_dfv21.at[key, 'Region'] = values

In [799]:
wildf_dfv21.loc[[12,40], ['Incident', 'Location', 'Region']].head(41)

Unnamed: 0,Incident,Location,Region
12,Seco,Ryan Carbajal,
40,Milepost 21,Wildland Fire,


In [800]:
wildf_dfv21.at[12, 'Region'] = 'New Mexico'
wildf_dfv21.at[40, 'Region'] ='California'   

In [801]:
# Drop column 'Location'
wildf_dfv21.drop(['Location'], axis=1, inplace=True)

In [802]:
# Drop column 'Incident'
wildf_dfv21.drop(['Incident'], axis=1, inplace=True)

In [803]:
# Save dataframe to csv file
wildf_dfv21.to_csv('blazes/fires_v3.csv', index = False)

## Final Tweakings

In [821]:
# Read the csv file into a new dataframe
wildf = pd.read_csv('blazes/fires_v3.csv')

In [822]:
# Make a copy
wildf_dfv3 = wildf.copy()

In [836]:
# Show selected columns where column 'Incident Type' is 'Burned Area Emergency Response'
exclude = ['Latitude','Longitude', 'Fuels Involved', 'Region']
wildf_dfv3.loc[wildf_dfv3['Incident Type'] == 'Burned Area Emergency Response', wildf_dfv3.columns.difference(exclude, sort=False)]

Unnamed: 0,Cause,Incident Type,Perimeter Contained (%),Personnel Involved,Fire Size (Acres),ISO Date
11,,Burned Area Emergency Response,,,,
28,,Burned Area Emergency Response,,,,
46,,Burned Area Emergency Response,,,,


In [839]:
wildf_dfv3.drop(index=wildf_dfv3[wildf_dfv3['Incident Type'] == 'Burned Area Emergency Response'].index, inplace=True)

In [None]:
# Checking total number of rows with all cells empty
wildf_dfv3.isnull().all(axis=1).sum()

In [None]:
# Drop all the rows with all cells empty
wildf_dfv3.dropna(how='all', inplace=True)

In [842]:
wildf_dfv3.shape

(96, 10)

In [860]:
# Converting float values to integers. When saving dataframe to csv int values are saved as float
wildf_dfv3[['Perimeter Contained (%)','Personnel Involved','Fire Size (Acres)']] = wildf_dfv3[['Perimeter Contained (%)','Personnel Involved','Fire Size (Acres)']].astype('Int64')

In [865]:
exclude = ['Latitude','Longitude', 'Fuels Involved']
wildf_dfv3.loc[:, wildf_dfv3.columns.difference(exclude, sort=False)].head(10)

Unnamed: 0,Cause,Incident Type,Perimeter Contained (%),Personnel Involved,Fire Size (Acres),ISO Date,Region
0,,Wildfire,40.0,,600,2020-08-15T07:15:00,Texas
1,Lightning,Wildfire,90.0,243.0,21284,2020-08-01T16:30:00,Arizona
2,Powerline,Wildfire,100.0,,73,2020-08-13T15:15:00,Texas
3,Lightning,Wildfire,7.0,757.0,74807,2020-07-31T17:15:00,Colorado
4,Unknown,Wildfire,,93.0,1300,2020-08-14T11:43:00,Colorado
5,Lightning,Wildfire,30.0,80.0,2900,2020-08-10T13:45:00,Arizona
7,Unknown,Wildfire,90.0,,1700,,Wyoming
8,,Wildfire,100.0,,2,2020-08-15T12:15:00,Texas
9,Unknown,Wildfire,75.0,30.0,5,2020-08-06T00:00:00,Montana
10,,Wildfire,40.0,38.0,770,2020-08-11T00:00:00,Wyoming


In [864]:
wildf_dfv3.to_csv('blazes/fires_v4.csv', index = False)

# This section is for analyzing the data

In [855]:
wildf_dfv21[['Fire Size (Acres)','Personnel Involved','Perimeter Contained (%)']].applymap(type)

Unnamed: 0,Fire Size (Acres),Personnel Involved,Perimeter Contained (%)
0,<class 'float'>,<class 'float'>,<class 'float'>
1,<class 'float'>,<class 'float'>,<class 'float'>
2,<class 'float'>,<class 'float'>,<class 'float'>
3,<class 'float'>,<class 'float'>,<class 'float'>
4,<class 'float'>,<class 'float'>,<class 'float'>
5,<class 'float'>,<class 'float'>,<class 'float'>
6,<class 'float'>,<class 'float'>,<class 'float'>
7,<class 'float'>,<class 'float'>,<class 'float'>
8,<class 'float'>,<class 'float'>,<class 'float'>
9,<class 'float'>,<class 'float'>,<class 'float'>


In [None]:
us_capital_dic={
    'Alabama': 'Montgomery',
    'Alaska': 'Juneau',
    'Arizona':'Phoenix',
    'Arkansas':'Little Rock',
    'California': 'Sacramento',
    'Colorado':'Denver',
    'Connecticut':'Hartford',
    'Delaware':'Dover',
    'Florida': 'Tallahassee',
    'Georgia': 'Atlanta',
    'Hawaii': 'Honolulu',
    'Idaho': 'Boise',
    'Illinois': 'Springfield',
    'Indiana': 'Indianapolis',
    'Iowa': 'Des Monies',
    'Kansas': 'Topeka',
    'Kentucky': 'Frankfort',
    'Louisiana': 'Baton Rouge',
    'Maine': 'Augusta',
    'Maryland': 'Annapolis',
    'Massachusetts': 'Boston',
    'Michigan': 'Lansing',
    'Minnesota': 'St. Paul',
    'Mississippi': 'Jackson',
    'Missouri': 'Jefferson City',
    'Montana': 'Helena',
    'Nebraska': 'Lincoln',
    'Nevada': 'Carson City',
    'New Hampshire': 'Concord',
    'New Jersey': 'Trenton',
    'New Mexico': 'Santa Fe',
    'New York': 'Albany',
    'North Carolina': 'Raleigh',
    'North Dakota': 'Bismarck',
    'Ohio': 'Columbus',
    'Oklahoma': 'Oklahoma City',
    'Oregon': 'Salem',
    'Pennsylvania': 'Harrisburg',
    'Rhode Island': 'Providence',
    'South Carolina': 'Columbia',
    'South Dakoda': 'Pierre',
    'Tennessee': 'Nashville',
    'Texas': 'Austin',
    'Utah': 'Salt Lake City',
    'Vermont': 'Montpelier',
    'Virginia': 'Richmond',
    'Washington': 'Olympia',
    'West Virginia': 'Charleston',
    'Wisconsin': 'Madison',
    'Wyoming': 'Cheyenne'  
}

In [74]:
# Do not erase this. It is for slicing tips
wildf_dfv2.loc[wildf_dfv2['Region'].isnull(), ['Location', 'Region']].head(100)

Unnamed: 0,Location,Region
6,,
12,Ryan Carbajal,
20,Split Mountain,
23,23 miles from Cortez,
26,14 mi. W. Sahuarita,
33,Northeast of Tucson,
40,Wildland Fire,
53,Klamath National Forest,
56,3 miles East of Tusayan near FR 302 & 2709,
61,"Selway-Bitterroot Wilderness, Nez Perce-Clearwater National Forests",


In [192]:
wildf_dfv2.loc[:,'Region'].isnull().all().sum()

0

In [None]:
fire4.corr()

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
fire4[['Personnel Involved', 'Area Contained (Acres)']].plot(ax=ax)
fire4['Perimeter Contained (%)'].plot(secondary_y=True)
plt.legend(loc='best')
ax.set_ylabel('Personnel Involved')
ax.right_ax.set_ylabel('Perimeter')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
fire4.plot(x='Region', y=['Personnel Involved', 'Area Contained (Acres)'],ax=ax)
fire4.plot(x='Region', y='Perimeter Contained (%)', secondary_y=True, ax= ax)
fig.autofmt_xdate()
ax.set_ylabel('Personnel Involved')
ax.right_ax.set_ylabel('Perimeter')

In [None]:
fire4.axes

In [None]:
fire4.index

In [None]:
fire4.columns

In [None]:
fire4.info()

In [None]:
from sklearn.preprocessing import minmax_scale
cols = fire4['Perimeter Contained (%)']

min_max_scaler = preprocessing.MinMaxScaler()
fire4.loc[:,'Perimeter Scaled'] = minmax_scale(fire4['Perimeter Contained (%)'])
#df_normalized = pd.DataFrame(np_scaled, columns = cols)

In [None]:
from sklearn.preprocessing import MinMaxScaler



In [None]:
scaler = MinMaxScaler()
fire4.loc[:,'Perimeter Scaled']  = scaler.fit_transform(fire4.loc['Perimeter Contained (%)'])


In [None]:
unscaled = scaler.inverse_transform(fire4['Perimeter Scaled'])

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()
fire4_scaled = min_max_scaler.fit_transform(fire4['Perimeter Contained (%)'])

In [None]:
fire4_scaled = preprocessing.scale(fire4['Perimeter Contained (%)'])

In [None]:
fire4_scaled

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
new_inverse = scalery.inverse_transform(fire4['Perimeter Contained (%)'])