In [None]:
# data manipulation
import pandas as pd
import numpy as np
from datetime import timezone, timedelta, time, datetime
from math import sqrt

# stats libs
from scipy import stats
from scipy.stats import pearsonr
from matplotlib.projections import process_projection_requirements

# data viz
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

# apply some cool styling
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*";
# the old "seaborn-darkgrid" name no longer resolves on recent releases.
# Pick whichever spelling this matplotlib knows so the cell survives a fresh kernel.
_darkgrid_style = next(
    (name for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
     if name in plt.style.available),
    None,
)
if _darkgrid_style is not None:
    plt.style.use(_darkgrid_style)
rcParams['figure.figsize'] = (12,  6)
sns.set(font_scale=1.5)

# make reproducible
np.random.seed(42)

# Ordered categorical dtypes so months and weekdays sort in calendar order
from pandas.api.types import CategoricalDtype

MONTH_ORDER = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
DAY_ORDER = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

month_type = CategoricalDtype(categories=MONTH_ORDER, ordered=True)
day_type = CategoricalDtype(categories=DAY_ORDER, ordered=True)

# dataset: shortened link to the CSV that is read with pd.read_csv further down
wildfire = "https://bit.ly/3fVhoGZ"

# Portugal districts grouped by region (reference only, not used by the code)
# Center   = ['Castelo Branco', 'Viseu', 'Guarda', 'Aveiro', 'Coimbra', 'Leiria']
# Algarve  = ['Faro']
# Alentejo = ['Évora', 'Beja', 'Portalegre', 'Santarém']
# Lisbon   = ['Lisboa', 'Setúbal']
# Northern = ['Viana Do Castelo', 'Porto', 'Braga', 'Vila Real', 'Bragança']

# The two districts compared in this study
North = ["Porto", "Vila Real"]

# Six-Step Statistical Investigation Method

* Define|Ask a Research Question
* Design a study and collect data (this step is skipped in this notebook)
* Explore the Data
* Draw inferences
* Formulate Conclusions
* Look back & ahead

## 1. Define|Ask a Research Question

### Do the Porto and Vila Real Districts have the same characteristics regarding Fire Typology, Fire Causes, Temporal Incidence (Weekday and Month), Fire Duration and Burnt Area?

### Variables to be used: 

    * Qualitative Variables (Categorical data): 
        - Tipo: wildfire type: “Florestal, Agrícola, Falso Alarme, Queimada” 
        - Distrito: Administrative areas 
        - TipoCausa: Source – “Intencional, Natural, Negligente, Desconhecida, Reacendimento, NULL” 

    * Quantitative Variables (Numerical data) 
        - DataAlerta: Communication date 
        - HoraAlerta: Communication hour 
        - DataExtincao: Extinction date 
        - HoraExtincao: Extinction hour 
        - AA_Total (pov+mato+agric) (ha): Total burned area 



## 2. Explore the Data (Data Cleaning & Filtering)

In [None]:
# Read the semicolon-delimited wildfire CSV from the configured URL
df = pd.read_csv(wildfire, delimiter=';')
df.head(5)

In [None]:
# Inspecting the dataset's column names
df.columns

In [None]:
# Null count per column, restricted to columns holding at least one null
null_counts = df.isnull().sum(axis=0)
display(null_counts[null_counts > 0])

In [None]:
# Report the size of the raw dataset
n_rows, n_cols = df.shape
print(f'The dataset has {n_rows} rows and {n_cols} columns. ')

In [None]:
# Dtypes of the raw date/hour columns before they are combined into timestamps
raw_date_cols = ['DataAlerta', 'HoraAlerta', 'DataExtincao', 'HoraExtincao']
df[raw_date_cols].dtypes

In [None]:
# Build full timestamps: first 10 characters of the date column + the hour column,
# parsed with errors='coerce' so unparseable values become NaT instead of raising.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the layout —
# confirm the inferred day/month order against the raw strings.
timestamp_sources = {
    'DAlert': ('DataAlerta', 'HoraAlerta'),
    'DExtiction': ('DataExtincao', 'HoraExtincao'),
}
for target_col, (date_col, hour_col) in timestamp_sources.items():
    stamp_text = df[date_col].str[:10].astype(str) + ' ' + df[hour_col].astype(str)
    df[target_col] = pd.to_datetime(stamp_text, errors='coerce')

# Inspecting the resulting datatypes
print(df[['DAlert', 'DExtiction']].dtypes)

# Inspecting first rows
df[['DAlert', 'DExtiction']].head()

In [None]:
# Source columns kept for the analysis and the English names they are given
columns = ['Distrito','Tipo', 'TipoCausa', 'DAlert','DExtiction','AA_Total (pov+mato+agric) (ha)']
names   = ['District','Type', 'TypeCause', 'DateAlert', 'DateExtinction','AreaTotal']

# Keep only the rows of the studied districts, renumber them from 0
# and switch to the English column names in a single chain
filtered = (
    df.loc[df['Distrito'].isin(North), columns]
      .reset_index(drop=True)
      .rename(columns=dict(zip(columns, names)))
)
filtered.head()

In [None]:
# Distinct fire types present in the filtered data
filtered['Type'].unique()

In [None]:
# Relative frequency of each fire type
type_share = filtered['Type'].value_counts(normalize=True)
display(type_share)

In [None]:
# Size of the filtered frame and its share of the raw dataset
kept_rows, kept_cols = filtered.shape
share_of_source = round(kept_rows / df.shape[0] * 100, 2)
print(f'The dataset has {kept_rows} rows and {kept_cols} columns. ')
print(f'Filtered dataframe accounts for {share_of_source}% of source')

In [None]:
# Null count per column of the filtered frame, only for columns that contain nulls
filtered_nulls = filtered.isnull().sum(axis=0)
display(filtered_nulls[filtered_nulls > 0])

In [None]:
# Rows whose alert timestamp is later than the extinction timestamp
alert_after_extinction = filtered['DateAlert'] > filtered['DateExtinction']
display(filtered[alert_after_extinction])

In [None]:
# Drop rows whose alert timestamp is later than the extinction timestamp.
# Rows where the comparison is False are kept — presumably this includes rows with a
# missing (NaT) timestamp, since comparisons against NaT evaluate to False.
inverted = filtered['DateAlert'] > filtered['DateExtinction']
filtered = filtered[~inverted].copy(deep=True)

# Sanity check: this should now display an empty frame
display(filtered[filtered['DateAlert'] > filtered['DateExtinction']])

In [None]:
# Derive fire duration and the calendar features used in the temporal analysis.

# Duration as a Timedelta and as a float number of hours
filtered['FireDuration']   = filtered['DateExtinction'] - filtered['DateAlert']
filtered['HourTotal']      = filtered['FireDuration'] / pd.Timedelta(1, 'h')

# Every calendar feature is taken from the alert timestamp so that weekday, month,
# day-of-year and hour all describe the same moment (when the fire was reported).
# Month was previously read from the extinction timestamp, which put fires that
# crossed a month boundary in the wrong month and left it empty when the
# extinction timestamp was missing.
filtered['DayAlert']       = filtered['DateAlert'].dt.day_name().astype(day_type)
filtered['Month']          = filtered['DateAlert'].dt.month_name().str[:3].astype(month_type)
filtered['DayOfYear']      = filtered['DateAlert'].dt.day_of_year
filtered['HourOfDay']      = filtered['DateAlert'].dt.hour

# Burnt area is written with a decimal comma; normalise it to a float.
# Casting to str first keeps the cell re-runnable: on a second run the column is
# already float and the bare `.str` accessor would raise.
filtered['AreaTotal'] = (
    filtered['AreaTotal']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype('float')
)



In [None]:
# Preview the first rows of the filtered frame
filtered.head()

In [None]:
# Relative frequencies (missing values included) of fire cause and fire type
for heading, column in (("TypeCause  Numbers", "TypeCause"), ("Type Numbers", "Type")):
    print(heading)
    display(filtered[column].value_counts(normalize=True, ascending=False, dropna=False))


In [None]:
# Fire-type counts, one panel per district, bars coloured by type
g = sns.FacetGrid(data=filtered, col="District", hue="Type", height=12)
g.map_dataframe(sns.histplot, x="Type")
g.set_titles(col_template="{col_name} District", row_template="{row_name}")
g.set_axis_labels("", "Frequency")
g.fig.suptitle("Type of Fire By District")
g.tight_layout();

In [None]:
# Fire-cause counts, one panel per district, bars coloured by cause
g = sns.FacetGrid(data=filtered, col="District", hue="TypeCause", height=12)
g.map_dataframe(sns.histplot, x="TypeCause")
g.set_titles(col_template="{col_name} District", row_template="{row_name}")
g.set_axis_labels("", "Frequency")
g.fig.suptitle("TypeCause of Fire By District")
g.tight_layout();

In [None]:
# Alerts per hour of day, one panel per district; one tick per hour 0..23
hour_ticks = np.arange(0, 24, 1)

g = sns.FacetGrid(data=filtered, col="District", hue="HourOfDay", height=12)
g.map_dataframe(sns.histplot, x="HourOfDay")
g.set_titles(col_template="{col_name} District", row_template="{row_name}")
g.set_axis_labels("", "Frequency")
g.fig.suptitle("Hour of Fire By District")
g.set(xticks=hour_ticks)
g.set_xticklabels(hour_ticks)
g.tight_layout();

In [None]:
# Alerts per weekday, one panel per district, bars coloured by weekday
g = sns.FacetGrid(data=filtered, col="District", hue="DayAlert", height=12)
g.map_dataframe(sns.histplot, x="DayAlert")
g.set_titles(col_template="{col_name} District", row_template="{row_name}")
g.set_axis_labels("", "Frequency")
g.fig.suptitle("DayAlert of Fire By District")

g.tight_layout();

In [None]:
# Fires per month, one panel per district, bars coloured by month
g = sns.FacetGrid(filtered, col="District", height= 12, hue="Month")
g.map_dataframe(sns.histplot, x="Month")
g.set_axis_labels("", "Frequency")
g.set_titles(col_template="{col_name} District", row_template="{row_name}")
# The figure title previously read "DayAlert ...", copied from the weekday plot
g.fig.suptitle("Month of Fire By District")

g.tight_layout();

In [None]:
# Alerts per day of year, one panel per district; ticks every 90 days
doy_ticks = np.arange(0, 366, 90)

g = sns.FacetGrid(data=filtered, col="District", hue="DayOfYear", height=12)
g.map_dataframe(sns.histplot, x="DayOfYear")
g.set_titles(col_template="{col_name} District", row_template="{row_name}")
g.set_axis_labels("", "Frequency")
g.fig.suptitle("DayOfYear of Fire By District")
g.set(xticks=doy_ticks)
g.set_xticklabels(doy_ticks)
g.tight_layout();

In [None]:
# Total burnt area per district (Series indexed by District)
area_by_district = filtered.groupby('District')['AreaTotal']
burnArea = area_by_district.sum()

burnArea

In [None]:
# Total burnt area per district and fire type, counting only fires that burned something
burned_rows = filtered[filtered['AreaTotal'] > 0]
burnAreaType = (
    burned_rows
    .groupby(['District', 'Type'])['AreaTotal']
    .sum()
    .reset_index(name='totalArea')
)
burnAreaType

In [None]:
# Total burnt area per district as a two-column frame, then drawn as a bar chart.
# Note: this rebinds `burnArea`, which an earlier cell defined as a Series.
burned_only = filtered[filtered['AreaTotal'] > 0]
burnArea = (
    burned_only
    .groupby('District')['AreaTotal']
    .sum()
    .reset_index(name='totalArea')
)

sns.barplot(data=burnArea, x="District", y='totalArea');

In [None]:
# Fires with a strictly positive burnt area
filtered[filtered['AreaTotal'] > 0]

In [None]:
# Preview the first rows of the filtered frame
filtered.head()

In [None]:
# Fires with a strictly positive burnt area.
# Kept under its own name: binding this frame to `stats` would overwrite the
# `scipy.stats` module imported at the top of the notebook, so any later
# `stats.<function>(...)` call would fail.
burned_fires = filtered.query('AreaTotal > 0')
burned_fires.head()

In [None]:
# Total burnt area for every district / fire-type combination
filtered.groupby(['District', 'Type'])['AreaTotal'].agg('sum')

In [None]:
# Summary statistics of the number of burned-area fires per district
fires_per_district = (
    filtered[filtered['AreaTotal'] > 0]
    .groupby('District')['District']
    .size()
)
fires_per_district.describe()

In [None]:
# Number of burned-area fires in each district
filtered[filtered['AreaTotal'] > 0].groupby('District')['Type'].size()

In [None]:
# Burned-area fires ordered from longest to shortest duration
has_burnt_area = filtered['AreaTotal'] > 0
aux = filtered[has_burnt_area].sort_values('HourTotal', ascending=False)

aux['Type'].unique()

In [None]:
# Relation between total burnt area and fire duration, coloured by district
sns.scatterplot(data=aux, x='AreaTotal', y='HourTotal', hue='District')
# set labels (HourTotal is the fire duration expressed in hours)
plt.xlabel("Area Burned (ha)", size=15)
plt.ylabel("Wildfire Duration (hours)", size=15)
# The title previously described a sorted bar plot, copied from another chart
plt.title("Burnt Area vs. Wildfire Duration by District", size=18)
plt.tight_layout()

In [None]:
# Preview the first rows of aux
aux.head()

In [None]:
# Burned-area fires whose duration could not be computed (HourTotal is missing)
duration_missing = aux['HourTotal'].isna()
aux[duration_missing]

In [None]:
# Burned-area fires that lasted less than one hour
under_one_hour = aux['HourTotal'] < 1
aux[under_one_hour]
