# Dataset creation

The datasets provided by INPE contain a lot of data that is not pertinent to this analysis. In this notebook
I create functions that reduce each dataset to only the relevant information.

In [39]:
import pandas as pd

In [40]:
# Directory that holds the raw yearly CSV files, relative to the notebook's
# working directory. Change it here to relocate every input at once.
DATASET_DIR = '../../dataset'

# One raw CSV per year, 2014 through 2020.
file2014 = DATASET_DIR + '/data2014.csv'
file2015 = DATASET_DIR + '/data2015.csv'
file2016 = DATASET_DIR + '/data2016.csv'
file2017 = DATASET_DIR + '/data2017.csv'
file2018 = DATASET_DIR + '/data2018.csv'
file2019 = DATASET_DIR + '/data2019.csv'
file2020 = DATASET_DIR + '/data2020.csv'

In [35]:
def fire_prec_creator(filename):
    """Build a per-biome daily table of fire counts and mean precipitation.

    Reads the ``datahora``, ``bioma``, ``riscofogo`` and ``precipitacao``
    columns of the CSV at ``filename`` and, for each biome:

    * sums ``riscofogo`` per calendar day over the rows where
      ``riscofogo == 1`` (days without such rows that lie between the first
      and the last one get 0);
    * averages ``precipitacao`` per calendar day over all rows (days without
      rows get NaN);
    * keeps only the days present in both daily series (inner join).

    Parameters
    ----------
    filename : str or path-like
        CSV file to read; ``datahora`` is parsed as the datetime index.

    Returns
    -------
    pandas.DataFrame
        Columns ``bioma``, ``riscofogo``, ``precipitacao`` indexed by day
        (unnamed index), biomes in sorted order. A biome without any
        ``riscofogo == 1`` row contributes no rows.
    """
    cols = ['datahora', 'bioma', 'riscofogo', 'precipitacao']
    out_cols = ['bioma', 'riscofogo', 'precipitacao']
    dataframe = pd.read_csv(filename, usecols=cols, parse_dates=True, index_col='datahora')

    frames = []
    for biome, df_biome in dataframe.groupby('bioma'):
        fire_rows = df_biome.loc[df_biome.riscofogo == 1, 'riscofogo']
        if fire_rows.empty:
            # Nothing to join against, so this biome yields no rows.
            continue

        # Aggregate the numeric columns only: resampling the whole frame would
        # also try to average the string 'bioma' column, which raises
        # TypeError on pandas >= 2.0.
        daily_fire = fire_rows.resample('D').sum()
        daily_prec = df_biome['precipitacao'].resample('D').mean()

        # NOTE(review): the inner join drops days outside the first/last fire
        # day of the biome even when precipitation was recorded - confirm
        # that this is intended before switching to an outer join.
        daily = daily_fire.to_frame().join(daily_prec, how='inner')
        daily.insert(0, 'bioma', biome)
        frames.append(daily)

    if not frames:
        return pd.DataFrame(columns=out_cols)

    # Concatenate once instead of growing a frame inside the loop.
    result = pd.concat(frames)[out_cols]
    # Keep the index unnamed so the header written by to_csv stays the same
    # as with the previous loop-and-concat implementation.
    result.index.name = None
    return result

In [45]:
def fire_loc_creator(filename):
    """Count fire records per rounded location and biome.

    Reads ``latitude``, ``longitude``, ``riscofogo`` and ``bioma`` from the
    CSV at ``filename``, keeps the rows where ``riscofogo == 1``, rounds the
    numeric columns to one decimal and counts the rows in every
    (latitude, longitude, bioma) group.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``latitude``, ``longitude``, ``riscofogo`` and ``bioma``,
        one row per group, where ``riscofogo`` holds the number of rows in
        the group.
    """
    wanted = ['latitude', 'longitude', 'riscofogo', 'bioma']
    fires = pd.read_csv(filename, usecols=wanted)
    fires = fires[fires.riscofogo == 1].round(1)

    # One record per group: the group key supplies the location and biome,
    # the group length supplies the event count.
    records = [
        (key[0], key[1], len(group), key[2])
        for key, group in fires.groupby(['latitude', 'longitude', 'bioma'])
    ]

    return pd.DataFrame(records, columns=['latitude', 'longitude', 'riscofogo', 'bioma'])

In [46]:
# Daily fire-count / mean-precipitation table for each year, 2014-2020.
# map() is consumed by the unpacking, so the files are processed in order.
df2014, df2015, df2016, df2017, df2018, df2019, df2020 = map(
    fire_prec_creator,
    (file2014, file2015, file2016, file2017, file2018, file2019, file2020),
)

# Per-location fire-count table for each year, built from the same files.
(df2014_loc, df2015_loc, df2016_loc, df2017_loc,
 df2018_loc, df2019_loc, df2020_loc) = map(
    fire_loc_creator,
    (file2014, file2015, file2016, file2017, file2018, file2019, file2020),
)

In [49]:
# Destination file for every yearly table, in the order they are written.
# The webapp/datasets directory is not created here, so it has to exist
# before this cell runs.
exports = [
    ('webapp/datasets/df2014.csv', df2014),
    ('webapp/datasets/df2014_loc.csv', df2014_loc),
    ('webapp/datasets/df2015.csv', df2015),
    ('webapp/datasets/df2015_loc.csv', df2015_loc),
    ('webapp/datasets/df2016.csv', df2016),
    ('webapp/datasets/df2016_loc.csv', df2016_loc),
    ('webapp/datasets/df2017.csv', df2017),
    ('webapp/datasets/df2017_loc.csv', df2017_loc),
    ('webapp/datasets/df2018.csv', df2018),
    ('webapp/datasets/df2018_loc.csv', df2018_loc),
    ('webapp/datasets/df2019.csv', df2019),
    ('webapp/datasets/df2019_loc.csv', df2019_loc),
    ('webapp/datasets/df2020.csv', df2020),
    ('webapp/datasets/df2020_loc.csv', df2020_loc),
]

for csv_path, table in exports:
    table.to_csv(csv_path)

In [32]:
## Drawing the fire-event map

def fire_biome(df, biome, ax, dfmap, year=None):
    """Plot the fire locations in ``dfmap`` on ``ax``, one series per biome.

    Every biome found in ``dfmap`` except ``biome`` is drawn as star markers
    at (longitude, latitude) in its fixed colour. The axes then receive
    labels, a title naming ``biome`` and a lower-left legend.

    Parameters
    ----------
    df : object
        Unused; kept so that existing positional calls keep working.
    biome : str
        Biome named in the title and left out of the plotted series.
    ax : matplotlib Axes-like
        Must provide ``plot``, ``set_xlabel``, ``set_ylabel``, ``set_title``
        and ``legend``.
    dfmap : pandas.DataFrame
        Must have ``bioma``, ``longitude`` and ``latitude`` columns.
    year : optional
        Appended to the title as "during <year>" when given. The previous
        code read ``self.year``, which does not exist in a plain function.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``dfmap`` holds a biome name that has no entry in the colour table.
    """
    biomes = ['Amazonia', 'Mata Atlantica', 'Cerrado', 'Pampa', 'Caatinga', 'Pantanal']
    colors = ['indianred', 'yellowgreen', 'cadetblue', 'mediumpurple', 'lightsteelblue', 'bisque']
    color_dict = dict(zip(biomes, colors))

    # groupby does not modify dfmap, so no defensive copy is needed.
    for biome_name, df_group in dfmap.groupby('bioma'):
        # NOTE(review): this skips the biome named in the title and plots all
        # the others - confirm that `!=` (rather than `==`) is intended.
        if biome_name != biome:
            ax.plot(df_group.longitude, df_group.latitude, marker = '*',
                    label = biome_name, linestyle = '', c = color_dict[biome_name])

    # x carries longitude and y carries latitude, so label them accordingly.
    ax.set_xlabel('Longitude', fontsize = 14)
    ax.set_ylabel('Latitude', fontsize = 14)

    title = 'Fire events distribution in {}'.format(biome)
    if year is not None:
        title = 'Fire events distribution in {} \nduring {}'.format(biome, year)
    ax.set_title(title, fontsize = 16)
    ax.legend(loc = 'lower left')