In [1]:
import os
import pandas as pd
import geopandas as gpd
import pygeos as pg
import numpy as np
import tensorflow as tf
import sqlalchemy as sq
import calendar
from dotenv import load_dotenv
from IPython.display import clear_output
from matplotlib import pyplot as plt
from DataService import DataService

2022-11-27 03:15:56.720191: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-27 03:15:56.830490: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Display options: print at most 10 rows, one decimal place for floats, and
# never truncate the column list.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
pd.set_option('display.max_columns', None)
os.chdir('/tf')
# load_dotenv is imported at the top of the notebook but was never called, so
# a .env file was never read and the lookups below could only see variables
# already exported into the kernel's environment. By default load_dotenv()
# does not override variables that are already set, so an existing
# environment keeps working unchanged.
load_dotenv()
# Database credentials; os.getenv returns None for any variable that is unset.
PGUSER = os.getenv('POSTGRES_USER')
PGPW = os.getenv('POSTGRES_PW')
PGDB = os.getenv('POSTGRES_DB')
# NOTE(review): presumably a sentinel for missing values; it is not used in
# the visible cells -- confirm against later cells.
NULLFLAG = -9999

In [3]:
# Open two separate database connections with the same credentials:
# db_pull_con is used for reads, db_push_con for the final write.
db_credentials = (PGDB, PGUSER, PGPW)

pullService = DataService(*db_credentials)
db_pull_con = pullService.connect()

pushService = DataService(*db_credentials)
db_push_con = pushService.connect()

In [4]:
# Read the whole daily weather aggregate table into a DataFrame.
weatherDailyAggTable = "WeatherDataHourlyAggDaily"
query = f'SELECT * FROM public."{weatherDailyAggTable}";'
dfAgg = pd.read_sql(sql=query, con=db_pull_con)

In [5]:
# Read the whole monthly weather aggregate table into a DataFrame.
weatherMonthlyAggTable = "WeatherDataHourlyAggMonthly"
query = f'SELECT * FROM public."{weatherMonthlyAggTable}";'
dfAggMonthly = pd.read_sql(sql=query, con=db_pull_con)

In [6]:
# Read the fire records from 2010 onwards ("YEAR" > 2009) as a GeoDataFrame.
fireTable = "lgFireFifty"
query = f'SELECT * FROM public."{fireTable}" WHERE "YEAR" > 2009;'
dfFire = gpd.GeoDataFrame.from_postgis(sql=query, con=db_pull_con)

In [7]:
# Read the fire centroid table as a GeoDataFrame; it is joined to the fire
# records on EntryID further down.
centroids = "lgFireFiftyCentroids"
query = f'SELECT * FROM public."{centroids}";'
dfCent = gpd.GeoDataFrame.from_postgis(sql=query, con=db_pull_con)

In [8]:
# Summary statistics of the daily weather aggregates (counts expose columns
# with missing values, min/max expose implausible ranges).
# NOTE(review): in the saved output TotalPrecip peaks at 2812.5 while its 75th
# percentile is 0.2 -- possibly an outlier or unit issue; confirm.
dfAgg.describe()

Unnamed: 0,Year,Month,Day,MeanTemp,MinTemp,MaxTemp,MeanDewPoint,MinDewPoint,MaxDewPoint,MeanHumidity,MinHumidity,MaxHumidity,MeanPressure,MinPressure,MaxPressure,MeanWindSpeed,MinWindSpeed,MaxWindSpeed,MeanWindChill,MinWindChill,MaxWindChill,TotalPrecip,MeanWindDirection
count,72790.0,72790.0,72790.0,72790.0,72790.0,72790.0,71852.0,71852.0,71852.0,71852.0,71852.0,71852.0,71487.0,71487.0,71487.0,71487.0,71487.0,71487.0,71487.0,71487.0,71487.0,72790.0,71487.0
mean,2015.7,6.5,15.7,-0.4,-5.7,4.4,-5.6,-9.1,-2.3,71.4,55.2,86.8,97.4,97.0,97.7,10.2,3.4,18.2,-18.9,-22.1,-14.6,1.0,18.5
std,3.4,3.5,8.8,14.8,14.7,15.3,13.4,14.4,12.7,13.5,19.5,9.4,1.3,1.3,1.3,4.8,3.6,7.4,6.5,7.6,7.8,17.2,7.4
min,2010.0,1.0,1.0,-42.6,-48.5,-36.2,-47.3,-54.9,-41.2,17.8,6.0,28.0,92.4,91.7,92.7,0.0,0.0,0.0,-53.3,-60.0,-49.0,0.0,0.0
25%,2013.0,4.0,8.0,-11.9,-17.0,-7.6,-15.1,-19.5,-11.1,62.5,39.0,81.0,96.5,96.1,96.8,6.8,0.0,13.0,-21.0,-24.0,-19.6,0.0,12.5
50%,2016.0,7.0,16.0,1.3,-2.8,5.3,-3.8,-6.5,-1.1,72.9,56.0,89.0,97.3,96.9,97.6,9.5,2.0,17.0,-18.1,-19.6,-17.0,0.0,18.4
75%,2019.0,10.0,23.0,12.8,6.9,17.8,6.0,2.9,8.8,81.7,71.0,94.0,98.2,97.9,98.6,12.9,5.0,22.0,-15.2,-17.5,-7.0,0.2,24.5
max,2021.0,12.0,31.0,31.5,30.2,39.9,19.5,17.3,30.0,100.0,100.0,100.0,102.2,102.0,102.4,50.3,34.0,93.0,-2.1,-4.0,0.0,2812.5,36.0


In [9]:
# Summary statistics of the numeric fire columns, including the SIZE_HA
# quartiles that the size binning is based on.
dfFire.describe()

Unnamed: 0,EntryID,YEAR,MONTH,DAY,SIZE_HA,CALC_HA
count,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0
mean,6282.6,2014.5,6.7,16.1,8200.9,8125.4
std,3983.5,2.8,1.1,9.2,32210.4,31947.1
min,1.0,2010.0,2.0,1.0,200.0,36.1
25%,3056.0,2012.0,6.0,7.0,487.0,482.5
50%,5708.0,2015.0,7.0,16.0,1375.7,1370.4
75%,9085.0,2017.0,7.0,25.0,4719.8,4664.7
max,12827.0,2020.0,12.0,31.0,596459.1,596459.1


In [10]:
# Bucket fires into four equal-frequency size classes (quartiles of SIZE_HA).
# labels=False stores the integer bin index (0-3) rather than interval labels.
fire_size_quartile = pd.qcut(x=dfFire['SIZE_HA'], q=4, labels=False)
dfFire['size_ha_bin'] = fire_size_quartile
dfFire.head()

Unnamed: 0,EntryID,FIRE_ID,FIRENAME,YEAR,MONTH,DAY,REP_DATE,OUT_DATE,DECADE,SIZE_HA,CALC_HA,CAUSE,CFS_REF_ID,geom,size_ha_bin
0,1,HWF278,Birch Complex Fire,2015,7,6,2015-07-06,,2010-2019,3329.7,3329.9,L,AB-2015-HWF278,"POLYGON ((4932200.196 2611443.456, 4932314.213...",2
1,2,HWF280,,2017,9,1,2017-09-01,,2010-2019,13628.3,13638.3,U,AB-2017-HWF280,"MULTIPOLYGON (((4834104.255 2898701.409, 48339...",3
2,33,LWF116,Cowper Complex Fire,2018,5,22,2018-05-22,,2010-2019,1456.2,1457.3,L,AB-2018-LWF116,"MULTIPOLYGON (((5036526.749 2375899.118, 50365...",2
3,6,HWF291,,2017,9,24,2017-09-24,,2010-2019,1520.7,1521.4,U,AB-2017-HWF291,"POLYGON ((4711737.650 2814312.160, 4711570.367...",2
4,7,HWF292,,2017,9,25,2017-09-25,,2010-2019,448.9,449.1,U,AB-2017-HWF292,"POLYGON ((4711219.479 2814219.324, 4711097.577...",0


In [11]:
# Summary statistics again, now including the new size_ha_bin column, to check
# that the bin indices span the expected range.
dfFire.describe()

Unnamed: 0,EntryID,YEAR,MONTH,DAY,SIZE_HA,CALC_HA,size_ha_bin
count,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0,3265.0
mean,6282.6,2014.5,6.7,16.1,8200.9,8125.4,1.5
std,3983.5,2.8,1.1,9.2,32210.4,31947.1,1.1
min,1.0,2010.0,2.0,1.0,200.0,36.1,0.0
25%,3056.0,2012.0,6.0,7.0,487.0,482.5,0.0
50%,5708.0,2015.0,7.0,16.0,1375.7,1370.4,1.0
75%,9085.0,2017.0,7.0,25.0,4719.8,4664.7,2.0
max,12827.0,2020.0,12.0,31.0,596459.1,596459.1,3.0


In [12]:
# Remove attributes that are not used further down. The frame is reassigned
# and errors='ignore' is passed so the cell can be re-run: the previous
# in-place drop raised KeyError on a second run because the columns were
# already gone.
dfFire = dfFire.drop(
    columns=['DECADE', 'CALC_HA', 'CFS_REF_ID', 'CAUSE', 'OUT_DATE'],
    errors='ignore',
)
# Number of non-null fire IDs; anything below the row count means some fires
# have no FIRE_ID.
dfFire['FIRE_ID'].count()

3263

In [13]:
# Join every fire with its centroid, matched on EntryID. Both inputs carry a
# 'geom' column, so the merge suffixes them: geom_x is the fire geometry from
# dfFire and geom_y the centroid point from dfCent.
dfMerged = dfFire.merge(dfCent, on='EntryID', how='left')
# set_geometry returns a new frame unless inplace=True. The original call
# discarded that result, so geom_y never became the active geometry before
# geom_x was dropped. Keep the returned frame and drop the polygon column
# from it.
dfMerged = dfMerged.set_geometry('geom_y').drop(columns=['geom_x'])
dfMerged.head()

Unnamed: 0,EntryID,FIRE_ID,FIRENAME,YEAR,MONTH,DAY,REP_DATE,SIZE_HA,size_ha_bin,geom_y
0,1,HWF278,Birch Complex Fire,2015,7,6,2015-07-06,3329.7,2,POINT (4934547.488 2608092.858)
1,2,HWF280,,2017,9,1,2017-09-01,13628.3,3,POINT (4820621.875 2904194.413)
2,33,LWF116,Cowper Complex Fire,2018,5,22,2018-05-22,1456.2,2,POINT (5034115.756 2378216.407)
3,6,HWF291,,2017,9,24,2017-09-24,1520.7,2,POINT (4711296.400 2816704.501)
4,7,HWF292,,2017,9,25,2017-09-25,448.9,0,POINT (4710313.249 2815071.918)


In [14]:
# use fire name where fire id is null
# The filled column is assigned back instead of calling fillna(inplace=True)
# on dfMerged['FIRE_ID']: an in-place call on a column selection is chained
# assignment, which warns on recent pandas and leaves dfMerged unchanged once
# copy-on-write is active.
dfMerged['FIRE_ID'] = dfMerged['FIRE_ID'].fillna(dfMerged['FIRENAME'])
# Non-null count per column, to confirm FIRE_ID has no gaps left.
dfMerged.count()

EntryID        3265
FIRE_ID        3265
FIRENAME        664
YEAR           3265
MONTH          3265
DAY            3265
REP_DATE       3265
SIZE_HA        3265
size_ha_bin    3265
geom_y         3265
dtype: int64

In [15]:
# use fire id where fire name is null
# Fall back to the fire ID for fires without a name. The filled column is
# assigned back instead of calling fillna(inplace=True) on
# dfMerged['FIRENAME']: an in-place call on a column selection is chained
# assignment, which warns on recent pandas and leaves dfMerged unchanged once
# copy-on-write is active.
dfMerged['FIRENAME'] = dfMerged['FIRENAME'].fillna(dfMerged['FIRE_ID'])
# Non-null count per column, to confirm FIRENAME has no gaps left.
dfMerged.count()

EntryID        3265
FIRE_ID        3265
FIRENAME       3265
YEAR           3265
MONTH          3265
DAY            3265
REP_DATE       3265
SIZE_HA        3265
size_ha_bin    3265
geom_y         3265
dtype: int64

In [16]:
# Read the weather stations; only the station identifier and its location are
# selected.
stations = "TenYrStationsHourly"
query = f'SELECT "ClimateID", "geom" FROM public."{stations}";'
dfStation = gpd.GeoDataFrame.from_postgis(sql=query, con=db_pull_con)
dfStation.count()

ClimateID    633
geom         633
dtype: int64

In [17]:
# Work on an independent deep copy so that adding the station column to
# dfFinal leaves dfMerged untouched.
dfFinal = dfMerged.copy(deep=True)

In [18]:
# Tag every fire with the ClimateID of the weather station nearest to its
# centroid. Distances are computed from all stations to one centroid at a
# time, and the station with the smallest distance wins.
# NOTE(review): assumes centroids and stations share a coordinate reference
# system -- confirm.
for fire_idx, centroid in dfMerged['geom_y'].items():
    station_distances = dfStation.distance(centroid)
    nearest_station = station_distances.idxmin()
    dfFinal.at[fire_idx, 'ClimateID'] = dfStation.at[nearest_station, 'ClimateID']
dfFinal.count()

EntryID        3265
FIRE_ID        3265
FIRENAME       3265
YEAR           3265
MONTH          3265
               ... 
REP_DATE       3265
SIZE_HA        3265
size_ha_bin    3265
geom_y         3265
ClimateID      3265
Length: 11, dtype: int64

In [23]:
# Give the centroid column the plain name 'geom' before the frame is written
# back to the database.
dfFinal.rename({'geom_y': 'geom'}, axis='columns', inplace=True)
dfFinal.head()


Unnamed: 0,EntryID,FIRE_ID,FIRENAME,YEAR,MONTH,DAY,REP_DATE,SIZE_HA,size_ha_bin,geom,ClimateID
0,1,HWF278,Birch Complex Fire,2015,7,6,2015-07-06,3329.7,2,POINT (4934547.488 2608092.858),307KPFP
1,2,HWF280,HWF280,2017,9,1,2017-09-01,13628.3,3,POINT (4820621.875 2904194.413),3073148
2,33,LWF116,Cowper Complex Fire,2018,5,22,2018-05-22,1456.2,2,POINT (5034115.756 2378216.407),3062697
3,6,HWF291,HWF291,2017,9,24,2017-09-24,1520.7,2,POINT (4711296.400 2816704.501),3073148
4,7,HWF292,HWF292,2017,9,25,2017-09-25,448.9,0,POINT (4710313.249 2815071.918),3073148


In [24]:
# Re-wrap the frame with 'geom' as the active geometry, then write it to
# PostGIS, replacing the table if it already exists.
output_table = 'FirePreWeatherMerge'
dfFinal = gpd.GeoDataFrame(dfFinal, geometry='geom')
dfFinal.to_postgis(name=output_table, con=db_push_con, if_exists='replace', index=False)