# Dataset creation

The datasets provided by INPE contain a lot of data that is not pertinent to this analysis. In this notebook
I create functions that simplify the datasets so that only the relevant information is left.

In [1]:
import pandas as pd

In [60]:
# Relative paths to the yearly CSV files (2014-2020); all of them live in the same folder.
dataset_dir = '../../dataset'
(
    file2014,
    file2015,
    file2016,
    file2017,
    file2018,
    file2019,
    file2020,
) = (f'{dataset_dir}/data{year}.csv' for year in range(2014, 2021))

In [62]:
def fire_prec_creator(filename):
    """Build a weekly, per-biome table of fire counts and mean precipitation.

    The CSV at ``filename`` is read with ``datahora`` parsed as the index.
    For every value of ``bioma``:

    * rows whose ``riscofogo`` equals 1 are summed per week (``'W'`` bins),
      which yields the number of such rows in each week;
    * ``precipitacao`` of *all* rows is averaged per week;
    * the two weekly series are inner-joined on the week, so only weeks
      present in both series are kept.

    Parameters
    ----------
    filename : str or path-like
        CSV file containing at least the columns ``datahora``, ``bioma``,
        ``riscofogo``, ``latitude``, ``longitude`` and ``precipitacao``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['bioma', 'riscofogo', 'precipitacao']`` indexed by the
        weekly bin label, with one group of rows per biome stacked in sorted
        biome order. Biomes without any ``riscofogo == 1`` row do not appear.
    """
    out_cols = ['bioma', 'riscofogo', 'precipitacao']
    cols = ['datahora', 'bioma', 'riscofogo', 'latitude', 'longitude', 'precipitacao']
    dataframe = pd.read_csv(filename, usecols = cols, parse_dates = True, index_col = 'datahora')

    # Only rows flagged with riscofogo == 1 contribute to the weekly fire count.
    fires = dataframe[dataframe.riscofogo == 1]

    weekly_frames = []
    for biome, biome_rows in dataframe.groupby('bioma'):
        biome_fires = fires[fires.bioma == biome]
        if biome_fires.empty:
            # No fire rows means nothing to join against for this biome.
            continue

        # Resample the numeric column only: aggregating the whole frame would
        # also try to aggregate the string column 'bioma', which raises on
        # mean() (and concatenates strings on sum()) with pandas >= 2.0.
        weekly_fire = biome_fires['riscofogo'].resample('W').sum()
        weekly_prec = biome_rows['precipitacao'].resample('W').mean()

        weekly = weekly_fire.to_frame().join(weekly_prec, how = 'inner').sort_index()
        weekly.insert(0, 'bioma', biome)
        weekly_frames.append(weekly)

    if not weekly_frames:
        return pd.DataFrame(columns = out_cols)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(weekly_frames)

In [63]:
# Build the weekly fire / precipitation table from the 2014 file.
df2014 = fire_prec_creator(file2014)


Unnamed: 0,bioma,riscofogo,precipitacao
2014-01-05,Amazonia,1462.0,1.367618
2014-01-12,Amazonia,528.0,2.994492
2014-01-19,Amazonia,357.0,2.270912
2014-01-26,Amazonia,1347.0,0.987918
2014-02-02,Amazonia,423.0,1.589529
...,...,...,...
2014-12-07,Pantanal,10.0,1.925424
2014-12-14,Pantanal,6.0,2.913043
2014-12-21,Pantanal,16.0,4.351948
2014-12-28,Pantanal,8.0,0.271875


In [80]:
## Work on the localization

# Keep only the rows with riscofogo == 1 and round the numeric columns
# (coordinates included) to one decimal place.
cols = ['bioma', 'riscofogo', 'latitude', 'longitude']
dataframe = (
    pd.read_csv(file2014, usecols = cols)
    .loc[lambda frame: frame.riscofogo == 1]
    .round(1)
)

In [88]:
def fire_loc_creator(filename):
    """Count fire rows per rounded location, separately for every biome.

    The CSV at ``filename`` is filtered to rows whose ``riscofogo`` equals 1,
    the numeric columns are rounded to one decimal place, and the remaining
    rows are counted per ``(bioma, latitude, longitude)`` combination.

    Parameters
    ----------
    filename : str or path-like
        CSV file containing at least the columns ``bioma``, ``riscofogo``,
        ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['latitude', 'longitude', 'riscofogo', 'bioma']`` where
        ``riscofogo`` holds the number of rows found at that rounded
        location within that biome. Rows are ordered by biome, then latitude,
        then longitude, and carry a fresh ``RangeIndex``.
    """
    out_cols = ['latitude', 'longitude', 'riscofogo', 'bioma']
    cols = ['bioma', 'riscofogo', 'latitude', 'longitude']
    dataframe = pd.read_csv(filename, usecols = cols)
    dataframe = dataframe[dataframe.riscofogo == 1]
    dataframe = dataframe.round(1)

    if dataframe.empty:
        return pd.DataFrame(columns = out_cols)

    # Grouping by biome *and* location keeps each biome's counts separate.
    # The previous implementation grouped the whole frame by location inside
    # the biome loop and returned after the first biome, so every location
    # was attributed to that single biome.
    counts = (
        dataframe
        .groupby(['bioma', 'latitude', 'longitude'])
        .size()
        .reset_index(name = 'riscofogo')
    )
    return counts[out_cols]

In [87]:
# Run both dataset builders over every yearly file: first the weekly
# fire / precipitation tables, then the per-location fire counts.
yearly_files = (file2014, file2015, file2016, file2017, file2018, file2019, file2020)

(
    df2014,
    df2015,
    df2016,
    df2017,
    df2018,
    df2019,
    df2020,
) = map(fire_prec_creator, yearly_files)

(
    df2014_loc,
    df2015_loc,
    df2016_loc,
    df2017_loc,
    df2018_loc,
    df2019_loc,
    df2020_loc,
) = map(fire_loc_creator, yearly_files)

2572

In [89]:
# Rebuild the weekly fire / precipitation table from the 2014 file.
# NOTE(review): the same assignment already appears in earlier cells; this
# re-run looks redundant unless fire_prec_creator was redefined in between - confirm.
df2014 = fire_prec_creator(file2014)