In [116]:
import geopandas as gpd
import pandas as pd
from datetime import datetime
from shapely.wkt import loads
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np

filterwarnings('ignore')

Let's first grab the fire data below and check to make sure everything is prepped for combining

In [268]:
fire_data = gpd.read_file('Data/Fire_Data/fire_date_geo.shp',crs='esri:102009')


In [118]:
fire_data.crs

<Projected CRS: PROJCS["NAD_1983_Lambert_Conformal_Conic",GEOGCS[" ...>
Name: NAD_1983_Lambert_Conformal_Conic
Axis Info [cartesian]:
- [east]: Easting (metre)
- [north]: Northing (metre)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [119]:
fire_data.shape

(26875, 4)

We checked the CRS above to make sure that it matches the CRS that was listed in our data source. A Coordinate reference system (CRS) defines, with the help of coordinates, how the two-dimensional, projected map is related to real locations on the earth.

This is necessary to ensure that when measuring the distance between fires and their corresponding stations, we are able to measure that distance accurately. 

In [234]:
fire_data.head()

Unnamed: 0,YEAR,MONTH,SRC_AGY2,geometry,point,Province,station_1,station_1 distance,station_2,station_2 distance,station_3,station_3 distance,station_4,station_4 distance,station_5,station_5 distance
0,2004,6,BC,"POLYGON Z ((-1886926.467 898021.006 0.000, -18...",POINT (-1887998.420 896224.870),NORTHWEST TERRITORIES,2100636,2089763.0,2100950,2089764.0,2100630,2089768.0,2100840,2089768.0,1206197,2089769.0
1,2004,6,BC,"POLYGON Z ((-1880308.251 892344.865 0.000, -18...",POINT (-1878788.740 890903.317),NORTHWEST TERRITORIES,2100636,2079161.0,2100950,2079163.0,2100630,2079166.0,2100840,2079166.0,1206197,2079168.0
2,2004,6,BC,"POLYGON Z ((-1965048.293 820512.199 0.000, -19...",POINT (-1965682.454 819433.502),NORTHWEST TERRITORIES,2100636,2129488.0,2100950,2129490.0,2100630,2129493.0,2100840,2129493.0,1206197,2129494.0
3,2004,6,BC,"POLYGON Z ((-1995073.527 854615.146 0.000, -19...",POINT (-1994169.415 843833.781),NORTHWEST TERRITORIES,2100636,2165201.0,2100950,2165203.0,2100630,2165206.0,2100840,2165206.0,1206197,2165207.0
4,2004,6,BC,"POLYGON Z ((-1988211.829 940418.674 0.000, -19...",POINT (-1991636.184 937300.515),NORTHWEST TERRITORIES,2100636,2201014.0,2100950,2201016.0,2100630,2201019.0,2100840,2201019.0,1206197,2201021.0


In [121]:
fire_data.shape

(26875, 4)

Let's take the list of stations that have data from 1990 to 2022 consecutively, and are within the latitude 45 and 66

In [253]:
monthly_weather= pd.read_csv('Data/Monthly_Weather_Data/monthly_weather.csv',index_col=0)

In [254]:
monthly_weather

Unnamed: 0,Climate ID,Longitude (x),Latitude (y),Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm)
0,1011500,-123.74,48.94,1991-07-31,17.983871,0.000000,1.019355
1,1011500,-123.74,48.94,1991-08-31,18.174194,0.000000,4.280645
2,1011500,-123.74,48.94,1991-09-30,15.763333,0.000000,0.100000
3,1011500,-123.74,48.94,1991-10-31,9.738710,0.438710,1.500000
4,1011500,-123.74,48.94,1991-11-30,7.113333,0.000000,8.800000
...,...,...,...,...,...,...,...
161498,709CEE9,-78.28,48.80,2022-08-31,,0.000000,1.984000
161499,709CEE9,-78.28,48.80,2022-09-30,,0.000000,2.534783
161500,709CEE9,-78.28,48.80,2022-10-31,,0.000000,2.821053
161501,709CEE9,-78.28,48.80,2022-11-30,,0.613636,2.394444


In [257]:
weather_shp=gpd.GeoDataFrame(monthly_weather,geometry=gpd.points_from_xy(monthly_weather['Longitude (x)'],monthly_weather['Latitude (y)']),crs='esri:102009')

In [259]:
weather_shp.head()

Unnamed: 0,Climate ID,Longitude (x),Latitude (y),Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm),geometry
0,1011500,-123.74,48.94,1991-07-31,17.983871,0.0,1.019355,POINT (-123.740 48.940)
1,1011500,-123.74,48.94,1991-08-31,18.174194,0.0,4.280645,POINT (-123.740 48.940)
2,1011500,-123.74,48.94,1991-09-30,15.763333,0.0,0.1,POINT (-123.740 48.940)
3,1011500,-123.74,48.94,1991-10-31,9.73871,0.43871,1.5,POINT (-123.740 48.940)
4,1011500,-123.74,48.94,1991-11-30,7.113333,0.0,8.8,POINT (-123.740 48.940)


Now I have:
station_shp
fire_geo


I need to combine them via distance and date. Let's start with distance to see if we can create a distance df with all the info:

In [126]:

station_shp.crs

<Projected CRS: ESRI:102009>
Name: North_America_Lambert_Conformal_Conic
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: North America - onshore and offshore: Canada - Alberta; British Columbia; Manitoba; New Brunswick; Newfoundland and Labrador; Northwest Territories; Nova Scotia; Nunavut; Ontario; Prince Edward Island; Quebec; Saskatchewan; Yukon. United States (USA) - Alabama; Alaska (mainland); Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming.
- bounds: (-172.54, 23.81, -47.74, 86.46)
Coordin

In [127]:
fire_data.crs

<Projected CRS: PROJCS["NAD_1983_Lambert_Conformal_Conic",GEOGCS[" ...>
Name: NAD_1983_Lambert_Conformal_Conic
Axis Info [cartesian]:
- [east]: Easting (metre)
- [north]: Northing (metre)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [128]:
station_shp

Unnamed: 0,Station Name,Province,Latitude,Longitude,Elevation,Climate ID,WMO Identifier,TC Identifier,First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year,geometry
0,100 MILE HOUSE 6NE,BRITISH COLUMBIA,51.68,-121.22,928.0,1165793,,,1987,2023,,,1987.0,2023.0,1987.0,2007.0,POINT (-121.220 51.680)
1,ABEE AGDM,ALBERTA,54.28,-112.97,664.0,3010010,71285.0,XAF,1990,2024,1990.0,2024.0,2002.0,2024.0,2002.0,2007.0,POINT (-112.970 54.280)
2,ADDENBROKE ISLAND,BRITISH COLUMBIA,51.60,-127.86,21.3,1060080,,WCZ,1978,2024,1994.0,2001.0,1978.0,2024.0,1978.0,2007.0,POINT (-127.860 51.600)
3,AGASSIZ CDA,BRITISH COLUMBIA,49.24,-121.76,15.0,1100120,,,1889,2023,,,1889.0,2023.0,1889.0,2007.0,POINT (-121.760 49.240)
4,AGASSIZ RCS,BRITISH COLUMBIA,49.24,-121.76,19.3,1100119,71113.0,WZA,1988,2024,1994.0,2024.0,1988.0,2024.0,1988.0,2006.0,POINT (-121.760 49.240)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,WHITE ROCK CAMPBELL SCIENTIFIC,BRITISH COLUMBIA,49.02,-122.78,13.0,1108910,71785.0,WWK,1929,2024,1994.0,2024.0,1929.0,2024.0,1929.0,2007.0,POINT (-122.780 49.020)
431,WINDSOR RIVERSIDE,ONTARIO,42.33,-82.93,188.4,6139520,,,1866,2024,,,1866.0,2024.0,1866.0,2006.0,POINT (-82.930 42.330)
432,WOODSTOCK,ONTARIO,43.14,-80.77,281.9,6149625,,,1870,2024,,,1870.0,2024.0,1870.0,2006.0,POINT (-80.770 43.140)
433,WRIGHT,QUEBEC,46.07,-76.05,141.7,7038975,,,1967,2024,,,1967.0,2024.0,1967.0,2018.0,POINT (-76.050 46.070)


https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.representative_point.html#geopandas.GeoSeries.representative_point

In [269]:
fire_data['point']=fire_data['geometry'].representative_point()

In [270]:
fire_data['point'].crs

<Projected CRS: PROJCS["NAD_1983_Lambert_Conformal_Conic",GEOGCS[" ...>
Name: NAD_1983_Lambert_Conformal_Conic
Axis Info [cartesian]:
- [east]: Easting (metre)
- [north]: Northing (metre)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [266]:
weather_shp.dtypes

Climate ID             object
Longitude (x)         float64
Latitude (y)          float64
Date                   object
Mean Temp (°C)        float64
Total Snow (cm)       float64
Total Precip (mm)     float64
geometry             geometry
dtype: object

In [267]:
fire_data.dtypes

YEAR                     int64
MONTH                    int64
SRC_AGY2                object
geometry              geometry
point                 geometry
Province                object
station_1               object
station_1 distance     float64
station_2               object
station_2 distance     float64
station_3               object
station_3 distance     float64
station_4               object
station_4 distance     float64
station_5               object
station_5 distance     float64
dtype: object

In [293]:
list_stations= weather_shp[['Climate ID','geometry']]

In [300]:
list_stations.drop_duplicates(inplace=True)

In [301]:
list_stations.shape

(430, 2)

In [302]:
#Working LOOP!!!!
for idx_fire, fire in fire_data.iterrows():
    fire_point = fire['point']
    station_distance_dict = dict()
    for idx_station, station in list_stations['Climate ID'].iterrows():        
        station_point = station['geometry']
        station_distance_dict[station['Climate ID']] = station_point.distance(fire_point)
    sorted_distance = sorted(station_distance_dict.items(), key=lambda x: x[1])

    for rank, pairs in enumerate(sorted_distance[:5],start=1):
        fire_data.loc[idx_fire, f'station_{rank}'] = pairs[0]
        fire_data.loc[idx_fire, f'station_{rank} distance'] = pairs[1]
        

        
            

AttributeError: 'Series' object has no attribute 'iterrows'

39667    -6.412903
39668    -7.803571
39669     0.425000
39670     5.303333
39671     8.807407
           ...    
40058    16.590323
40059    12.383333
40060     6.732258
40061    -8.023333
40062   -12.454839
Name: Mean Temp (°C), Length: 396, dtype: float64

In [140]:
station_shp.to_file('stations_shp.shp')

In [141]:
station_shp.to_csv('stations_shp.csv')

In [242]:
station_shp.shape

(435, 17)

Now we have a concatenated the stations with the fire data. Let's join the weather data with the fire data to finalize our working table. 

In [142]:
weather_data=pd.read_csv('Data/Monthly_Weather_Data/monthly_weather.csv',index_col=0)

In [225]:
weather_data.head()

Unnamed: 0,Station,Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm),MONTH,YEAR
0,1011500,1991-07-31,17.983871,0.0,1.019355,7,1991
1,1011500,1991-08-31,18.174194,0.0,4.280645,8,1991
2,1011500,1991-09-30,15.763333,0.0,0.1,9,1991
3,1011500,1991-10-31,9.73871,0.43871,1.5,10,1991
4,1011500,1991-11-30,7.113333,0.0,8.8,11,1991


In [143]:
weather_data.shape

(160667, 5)

In [176]:
df1_melted = fire_data.melt(id_vars=['MONTH', 'YEAR','point','Province'], 
                       value_vars=['station_1', 'station_2', 'station_3', 'station_4', 'station_5'], 
                       var_name='StationNumber', 
                       value_name='Station')

In [193]:
df1_melted

Unnamed: 0,MONTH,YEAR,point,Province,StationNumber,Station
0,6,2004,POINT (-1887998.420 896224.870),NORTHWEST TERRITORIES,station_1,2100636
1,6,2004,POINT (-1878788.740 890903.317),NORTHWEST TERRITORIES,station_1,2100636
2,6,2004,POINT (-1965682.454 819433.502),NORTHWEST TERRITORIES,station_1,2100636
3,6,2004,POINT (-1994169.415 843833.781),NORTHWEST TERRITORIES,station_1,2100636
4,6,2004,POINT (-1991636.184 937300.515),NORTHWEST TERRITORIES,station_1,2100636
...,...,...,...,...,...,...
134370,7,1994,POINT (-1550016.992 1795701.199),NORTHWEST TERRITORIES,station_5,2200675
134371,7,1994,POINT (-1541856.878 1774727.228),NORTHWEST TERRITORIES,station_5,2200675
134372,7,1994,POINT (-1433519.923 1707775.801),NORTHWEST TERRITORIES,station_5,2200675
134373,7,1992,POINT (-1606625.034 1912779.504),NORTHWEST TERRITORIES,station_5,2200675


In [179]:
weather_data.rename(columns={'Climate ID':'Station'},inplace=True)

In [180]:
df1_melted.dtypes

MONTH               int64
YEAR                int64
point            geometry
Province           object
StationNumber      object
Station            object
dtype: object

In [181]:
weather_data.dtypes

Station                      object
Date                 datetime64[ns]
Mean Temp (°C)              float64
Total Snow (cm)             float64
Total Precip (mm)           float64
MONTH                         int32
YEAR                          int32
dtype: object

In [182]:
weather_data['Date']=pd.to_datetime(weather_data['Date'])

In [183]:


weather_data['MONTH'] = weather_data['Date'].dt.month
weather_data['YEAR'] = weather_data['Date'].dt.year

In [184]:
merged_df = pd.merge(df1_melted, weather_data, how='outer', on=['MONTH','YEAR','Station'])

In [198]:
merged_df.sample(30)

Unnamed: 0,MONTH,YEAR,point,Province,StationNumber,Station,Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm)
123361,6,2008,POINT (25593.322 679747.598),NORTHWEST TERRITORIES,station_1,2400302,NaT,,,
283035,12,1996,,,,1021480,1996-12-31,1.174194,1.433333,9.553333
23896,2,2014,,,,6163171,2014-02-28,-12.375,1.110714,2.046429
105125,6,1996,,,,1020590,1996-06-30,15.222222,,0.493333
93329,5,2018,,,,2400570,2018-05-31,-10.236667,,
8610,1,2010,,,,6152695,2010-01-31,-5.105556,0.194444,0.194444
67831,5,1995,,,,1023462,1995-05-31,14.641935,0.0,2.103226
138522,6,2016,POINT (-233330.501 708780.842),NORTHWEST TERRITORIES,station_5,2100840,NaT,,,
24892,2,2016,POINT (-1449326.199 396979.673),NORTHWEST TERRITORIES,station_3,2100630,NaT,,,
47025,4,2000,POINT (-1026961.293 736339.259),NORTHWEST TERRITORIES,station_2,2100950,NaT,,,


In [249]:
merged_df.shape

(294176, 10)

In [246]:
merged_df.duplicated().sum()

20

In [247]:
merged_df.drop_duplicates(inplace=True)

In [250]:
for i in range(len(merged_df.columns)):
    na_index=merged_df.isna().sum().index[i]
    na_ratio=merged_df.isna().sum().iloc[i]
    print(f'{na_index} has {round((na_ratio/294176)*100,2)}% missing data')


MONTH has 0.0% missing data
YEAR has 0.0% missing data
point has 54.33% missing data
Province has 44.56% missing data
StationNumber has 54.33% missing data
Station has 0.0% missing data
Date has 40.13% missing data
Mean Temp (°C) has 48.57% missing data
Total Snow (cm) has 54.13% missing data
Total Precip (mm) has 46.13% missing data


We can see there's an issue now where we have missing information under `Province`, which is the column that tells us which province this fire took place in. The reason for this missing data is because we got this information from the fire data table, so all the `NaN` values are coming from weather readings on months that didn't have any fires. Since we have corresponding station ids to the provinces on rows where there were fires, let's use that information to impute the missing data. 

In [195]:
# Create a dictionary from the mapping dataframe
map_dict = station_shp.set_index('Climate ID')['Province'].to_dict()

# Fill missing province values in the main dataframe
merged_df['Province'] = merged_df['Province'].fillna(merged_df['Station'].map(map_dict))

In [206]:
merged_df.dtypes

MONTH                         int64
YEAR                          int64
point                      geometry
Province                     object
StationNumber                object
Station                      object
Date                 datetime64[ns]
Mean Temp (°C)              float64
Total Snow (cm)             float64
Total Precip (mm)           float64
dtype: object

In [207]:
print(set(merged_df['Station']).issubset(set(station_shp['Climate ID'])))

False


In [220]:
station_shp['Climate ID'].nunique()

435

In [217]:
merged_df['Station'].nunique()

442

In [232]:
station_shp[weather_data['Station'].isin([station_shp['Climate ID']])]

Unnamed: 0,Station Name,Province,Latitude,Longitude,Elevation,Climate ID,WMO Identifier,TC Identifier,First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year,geometry


In [187]:
len(station_province_dict)

20

In [157]:
# merged_df[merged_df['SRC_AGY2'].notnull()]

In [188]:
merged_df.sample(10)

Unnamed: 0,MONTH,YEAR,point,Province,StationNumber,Station,Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm)
228143,8,2017,POINT (-371488.446 831696.970),NORTHWEST TERRITORIES,station_3,2200675,NaT,,,
266418,10,2021,,,,4041000,2021-10-31,3.693548,0.051613,1.122581
98358,5,2021,,,,2400570,2021-05-31,-2.776471,,
49741,4,2005,POINT (-235539.828 309570.308),NORTHWEST TERRITORIES,station_3,2100630,NaT,,,
68470,5,1995,POINT (-893543.126 1026813.114),NORTHWEST TERRITORIES,station_5,2200675,NaT,,,
17394,2,1999,,,,4041000,1999-02-28,-11.242857,0.317857,0.317857
269027,11,1995,,,,1181508,1995-11-30,-8.94,2.876667,2.42
284060,12,1998,,,,5050920,1998-12-31,-16.758065,0.774194,0.774194
270081,11,1997,,,,7034365,1997-11-30,-0.74,1.216667,2.836667
243676,9,2009,,,,1064321,2009-09-30,12.622222,0.006667,8.906667


In [159]:
weather_data.columns

Index(['Station', 'Date', 'Mean Temp (°C)', 'Total Snow (cm)',
       'Total Precip (mm)', 'MONTH', 'YEAR'],
      dtype='object')

In [160]:
merged_df['Fire'] = np.where(merged_df['point']!=None,1,0)

In [161]:
merged_df.sample(20)

Unnamed: 0,MONTH,YEAR,point,SRC_AGY2,StationNumber,Station,Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm),Fire
162953,7,2005,POINT (-1384479.521 1630161.032),NT,station_5,2200675,NaT,,,,1
36250,3,2010,POINT (-1401074.911 470698.376),PC,station_4,2100840,NaT,,,,1
166317,7,2007,,,,2403602,2007-07-31,6.090323,,,0
252584,10,1992,,,,1064320,1992-10-31,6.364516,1.096774,11.096774,0
130990,6,2013,,,,6138270,2013-06-30,,0.0,5.376667,0
250945,9,2020,,,,3060L20,2020-09-30,,,,0
43859,4,1994,,,,8100468,1994-04-30,,,,0
23071,2,2012,,,,7020392,2012-02-29,-5.286667,0.315385,0.5375,0
134722,6,2015,,,,7031360,2015-06-30,17.655,0.0,3.08,0
177129,7,2011,POINT (-26540.893 644958.680),MB,station_2,2403450,NaT,,,,1


In [162]:
merged_df.shape

(294196, 11)

In [None]:
merged_df.to_csv('Data/modelling_df.csv')

In [None]:
merged_df.sample(10)

Unnamed: 0,MONTH,YEAR,point,StationNumber,Station,Date,Mean Temp (°C),Total Snow (cm),Total Precip (mm),Fire
154757,7,2000,,,1166658,2000-07-31,14.048387,0.0,2.645161,0
74454,5,2004,,,7015730,2004-05-31,12.883871,0.0,2.245161,0
202366,8,1994,POINT (-1392172.341 1772172.705),station_1,2100636,NaT,,,,1
87755,5,2014,POINT (-1784653.215 987291.433),station_1,2100636,NaT,,,,1
193358,7,2018,POINT (-1767877.660 1165936.586),station_1,2100636,NaT,,,,1
50086,4,2005,POINT (-425526.729 495988.640),station_5,2200675,NaT,,,,1
184649,7,2014,POINT (-1443030.254 264140.663),station_4,2100840,NaT,,,,1
57470,4,2013,,,6119055,2013-04-30,5.84,0.166667,3.14,0
42475,4,1991,POINT (-1729632.788 1555037.896),station_2,2100950,NaT,,,,1
5080,1,2002,,,4061570,2002-01-31,-24.177419,0.193548,0.193548,0
