In [1]:
# Import the libraries used throughout the notebook (the datasets are loaded in later cells)
import pandas as pd
import numpy as np
import os
from datetime import datetime
import plotly.express as px
import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
sns.set_theme(style="whitegrid")

from dis import dis
import geopandas
from shapely.wkt import loads

In [3]:
# Absolute path of the fire-characteristics CSV; abspath resolves the
# relative path against the current working directory.
characteristic = os.path.abspath('../Dataset/charateristics.csv')
# Raw characteristics frame, keyed by fire id, reference id and detection date.
data_char = (
    pd.read_csv(characteristic, low_memory=False)
      .set_index(['UID_Fire', 'REF_ID', 'YYYYMMDD'])
)

In [4]:
# Absolute path of the area-of-burn (AoB) CSV; abspath resolves the
# relative path against the current working directory.
area_burn = os.path.abspath('../Dataset/AoB.csv')
# Raw AoB frame; UID_Fire is forced to str before it becomes an index level.
data_ab = (
    pd.read_csv(area_burn, dtype={'UID_Fire': str})
      .set_index(['UID_Fire', 'REF_ID', 'Map_Date'])
)

In [5]:
# Preview the first rows of the raw characteristics frame
# (indexed by UID_Fire, REF_ID, YYYYMMDD).
data_char.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HHMM,sat,lat,lon,T21,T31,sample,FRP,conf,type,Status,FD_Agency,geometry,dn
UID_Fire,REF_ID,YYYYMMDD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
547,BC-2010-V90985,20100816,604,T,50.388,-127.29,307.3,293.0,918,9.3,71,0,primary,CA,POINT (-2143720.639756499 1647561.2310603699),
541,BC-2010-V70506,20100721,936,A,49.199,-124.104,317.4,286.1,44,110.9,95,0,primary,CA,POINT (-1992828.7808720134 1429941.6404347),
541,BC-2010-V70506,20100721,1114,A,49.201,-124.111,305.8,285.8,1229,32.3,66,0,primary,CA,POINT (-1993205.6183386864 1430352.169513478),
538,BC-2010-V60945,20100814,615,T,48.492,-124.029,301.9,290.2,548,5.1,45,0,primary,CA,POINT (-2019595.6404366868 1356043.6818672968),
538,BC-2010-V60945,20100814,615,T,48.491,-124.043,357.2,294.9,549,84.1,100,0,primary,CA,POINT (-2020590.0362250712 1356363.409522294),


In [6]:
# Preview the first rows of the raw area-of-burn frame
# (indexed by UID_Fire, REF_ID, Map_Date).
data_ab.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FD_Agency,date_src,Year,JD,geometry
UID_Fire,REF_ID,Map_Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
193,BC-2010-C10060,2010-06-01,CA,1AF,2010,152,POLYGON Z ((-1688364.4292310965 1766761.124217...
208,BC-2010-C10258,2010-07-31,CA,1AF,2010,212,POLYGON Z ((-1765477.675062592 1801382.7590815...
215,BC-2010-C10320,2010-08-02,CA,1AF,2010,214,POLYGON Z ((-1637226.5542412489 1746616.851343...
222,BC-2010-C20018,2010-07-06,CA,1AF,2010,187,POLYGON Z ((-1764660.3624268812 1676377.400509...
245,BC-2010-C20293,2010-07-31,CA,1AF,2010,212,POLYGON Z ((-1755143.0002803158 1702585.418622...


In [7]:
# Tally the index keys of the area-of-burn frame: `duplicated()` flags each
# key that repeats an earlier one, and np.unique counts the False/True flags.
print(
    'Duplicated area of fire burn records',
    np.unique(data_ab.index.duplicated(), return_counts=True),
)

Duplicated area of fire burn records (array([False,  True]), array([ 14891, 425632]))


In [8]:
# Tally the index keys of the characteristics frame: `duplicated()` flags each
# key that repeats an earlier one, and np.unique counts the False/True flags.
# The label names the characteristics data (it previously repeated the
# area-of-burn label, which made the two printouts indistinguishable).
print('Duplicated fire characteristics records', np.unique(data_char.index.duplicated(), return_counts = True))

Duplicated area of fire burn records (array([False,  True]), array([  8268, 155972]))


In [23]:
def make_dataframes(aob_path, characteristics_path):
    """Load the Area-of-Burn and fire-characteristics CSVs and aggregate both.

    Parameters
    ----------
    aob_path : str or path-like
        CSV holding the columns ``UID_Fire``, ``REF_ID``, ``Map_Date``,
        ``FD_Agency``, ``JD``, ``date_src``, ``Year`` and a WKT ``geometry``.
    characteristics_path : str or path-like
        CSV holding at least ``UID_Fire``, ``REF_ID``, ``YYYYMMDD``, ``sat``,
        ``Status``, ``lat``, ``lon``, ``T21`` and ``T31``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_aob`` – indexed by (UID_Fire, REF_ID, Date_of_Burn) with a single
        ``Total_AoB`` column: the summed polygon areas as returned by
        ``get_area_of_polygon``.
        ``df_characteristics`` – indexed by (Date, sat, UID_Fire, Status,
        REF_ID, rounded_lat, rounded_lon) with ``<column>_mean`` and
        ``<column>_std`` for every numeric measurement column.
    """
    # --- Area of Burn (AoB) -------------------------------------------------
    aob_keys = ['UID_Fire', 'REF_ID', 'Date_of_Burn']
    df_aob = (
        pd.read_csv(aob_path, dtype={'UID_Fire': str})
          .drop(['FD_Agency', 'JD', 'date_src', 'Year'], axis=1)
          .rename(columns={'Map_Date': 'Date_of_Burn'})
    )
    df_aob['area'] = get_area_of_polygon(df_aob['geometry'])
    # Sum only the numeric `area` column. Summing the whole frame would either
    # rely on pandas silently dropping the WKT `geometry` strings (old
    # versions) or concatenate them (pandas >= 2.0).
    df_aob = (
        df_aob.groupby(aob_keys)[['area']]
              .sum()
              .rename(columns={'area': 'Total_AoB'})
    )

    # --- Fire characteristics -----------------------------------------------
    # NOTE(review): UID_Fire is read as str for AoB but left to dtype inference
    # here — confirm the two are compatible before joining the outputs.
    df_characteristics = pd.read_csv(characteristics_path, low_memory=False)
    df_characteristics['rounded_lat'], df_characteristics['rounded_lon'] = \
        get_rounded_locations(df_characteristics)
    df_characteristics['Date'] = get_formatted_date(df_characteristics['YYYYMMDD'])
    df_characteristics = df_characteristics.drop(columns=['YYYYMMDD'])

    # Brightness temperatures of a fire pixel in Band 21 / Band 31 are measured
    # in Kelvin; convert them to Celsius.
    kelvin_offset = 273.15
    df_characteristics['T21'] = df_characteristics['T21'] - kelvin_offset
    df_characteristics['T31'] = df_characteristics['T31'] - kelvin_offset

    grp_list = ['Date', 'sat', 'UID_Fire', 'Status', 'REF_ID', 'rounded_lat', 'rounded_lon']
    # Aggregate numeric measurement columns only: mean/std of text columns
    # (e.g. geometry, FD_Agency) is undefined and raises on pandas >= 2.0,
    # whereas older versions dropped them with a warning.
    numeric_cols = df_characteristics.select_dtypes(include='number').columns
    value_cols = [col for col in numeric_cols if col not in grp_list]
    df_characteristics = (
        df_characteristics.groupby(grp_list)[value_cols]
                          .agg(['mean', 'std'])
                          # std is NaN for single-row groups; note this also
                          # zeroes any NaN mean (groups with no valid values).
                          .fillna(0)
    )
    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    df_characteristics.columns = ['_'.join(item) for item in df_characteristics.columns]
    return df_aob, df_characteristics

In [None]:
def get_rounded_locations(df):
    """Snap the ``lat`` and ``lon`` columns of ``df`` to the nearest 0.25.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``lat`` and ``lon`` columns.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Rounded latitudes and rounded longitudes, aligned with ``df``'s index.
        Missing coordinates stay NaN instead of raising, which the previous
        per-element ``round()`` did.
    """
    steps_per_unit = 4  # 1 / 0.25

    def _snap(coords):
        # Series.round() rounds halves to even, exactly like Python's round().
        # Adding 0.0 turns the -0.0 that rounding a small negative produces
        # into a plain 0.0, matching the integer arithmetic of round().
        return (coords * steps_per_unit).round() / steps_per_unit + 0.0

    return _snap(df['lat']), _snap(df['lon'])

def get_area_of_polygon(df_geometry):
    """Return the area of every WKT geometry in ``df_geometry`` divided by 10**6.

    Each string is parsed with ``shapely.wkt.loads`` and the resulting shape's
    planar ``area`` is scaled down by one million.
    NOTE(review): the result is in square kilometres only if the coordinates
    are expressed in metres — confirm the CRS of the source data.
    """
    shapes = df_geometry.apply(loads)
    return shapes.apply(lambda shape: shape.area / 10**6)

#Get YYYY-MM-DD

def get_formatted_date(df_date):
    """Turn each ``YYYYMMDD`` value of ``df_date`` into a ``YYYY-MM-DD`` string.

    Every element is converted with ``str()`` and then sliced at fixed
    positions (0-4, 4-6, 6-8); no calendar validation is performed.
    """
    def _hyphenate(raw):
        digits = str(raw)
        return '-'.join((digits[:4], digits[4:6], digits[6:8]))

    return df_date.map(_hyphenate)

In [25]:
# Run the cleaning/aggregation pipeline on both CSVs.
# NOTE: this rebinds data_ab and data_char — from here on they hold the
# aggregated frames, not the raw ones loaded earlier in the notebook.
data_ab, data_char = make_dataframes(area_burn, characteristic)

  df_characteristics = df_characteristics.groupby(grp_list, axis = 0) \


In [26]:
# Preview the aggregated area-of-burn frame returned by make_dataframes.
data_ab.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total_AoB
UID_Fire,REF_ID,Date_of_Burn,Unnamed: 3_level_1
100,BC-2011-V30040,2011-05-20,0.021933
100,BC-2014-G80090,2014-05-31,0.107307
1000,AB-2014-HWF124,2014-06-29,0.255955
1000,AB-2015-SWF061,2015-05-22,0.022832
1000,AB-2016-EWF008,2016-04-08,0.01035


In [27]:
# Preview the aggregated characteristics frame returned by make_dataframes.
data_char.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,HHMM_mean,HHMM_std,lat_mean,lat_std,lon_mean,lon_std,T21_mean,T21_std,T31_mean,T31_std,sample_mean,sample_std,FRP_mean,FRP_std,conf_mean,conf_std,type_mean,type_std
Date,sat,UID_Fire,Status,REF_ID,rounded_lat,rounded_lon,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2010-01-12,A,313,removed,BC-2010-G40151,53.75,-124.25,2036.0,0.0,53.829,0.0,-124.332,0.0,69.75,0.0,2.45,0.0,1111.0,0.0,110.7,0.0,93.0,0.0,0.0,0.0
2010-01-12,T,313,removed,BC-2010-G40151,53.75,-124.25,2024.0,0.0,53.832,0.0,-124.335,0.0,44.35,0.0,0.15,0.0,1252.0,0.0,82.7,0.0,64.0,0.0,0.0,0.0
2010-01-13,A,313,removed,BC-2010-G40151,53.75,-124.25,2119.0,0.0,53.838,0.001414,-124.3265,0.010607,71.9,8.980256,-3.55,0.424264,522.5,0.707107,63.8,18.526198,81.5,3.535534,0.0,0.0
2010-01-18,A,313,removed,BC-2010-G40151,53.75,-124.25,2138.0,0.0,53.845,0.0,-124.302,0.0,37.45,0.0,-2.75,0.0,280.0,0.0,25.8,0.0,63.0,0.0,0.0,0.0
2010-01-22,A,211,removed,BC-2010-C10299,53.25,-123.75,2113.0,0.0,53.316,0.0,-123.856,0.0,72.55,0.0,-3.45,0.0,603.0,0.0,60.5,0.0,84.0,0.0,0.0,0.0


In [28]:
# Save the aggregated area-of-burn frame; the path is relative, so the file
# lands in the current working directory.
data_ab.to_csv('area_burnt.csv')
# Save the aggregated characteristics frame next to it.
data_char.to_csv('CharacteristicDataset.csv')