# Introduction

## Data Pre-processing (Importing libs, etc...)

In [None]:
# Importing required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

%matplotlib inline

# Let pandas display up to 300 rows and 100 columns before truncating output.
pd.set_option("display.max_rows",300)
pd.set_option("display.max_columns",100)


Cleaning the data - the data seems to be split into 2 regions, so we combine the data of both regions and create a new column called "region" that holds the region name. The day, month and year columns will be combined into a single date-time column. The classes column will be renamed as "fire" and will have True and False values rather than the "fire"/"not fire" labels.

In [None]:
# Load the raw CSV; the column names are taken from the second line of the file.
df = pd.read_csv(
    "./Algerian_forest_fires_dataset_UPDATE.csv",
    header=[1],
    skipinitialspace=True,
)

# The file is treated as two equally sized halves, one per region:
# label the first half Bejaia and the second half Sidi-Bel Abbes, then restack.
splt_index = len(df)//2
dataset_a = df.iloc[:splt_index].assign(region="Bejaia")
dataset_b = df.iloc[splt_index:].assign(region="Sidi-Bel Abbes")
df = pd.concat([dataset_a,dataset_b],ignore_index=True)

# Build one datetime column from the day/month/year parts (unparsable rows
# become NaT because of errors="coerce"), then discard the three source columns.
df = df.assign(
    date=pd.to_datetime(df[["day","month","year"]],errors="coerce",dayfirst=True)
).drop(columns=["day","month","year"])
df.head()

Let's rename the columns to something a bit more descriptive.

In [None]:
# Trim leading/trailing whitespace from every column name.
df = df.rename(columns=str.strip)

In [None]:
# Give the weather columns shorter names and turn the class label into a flag.
column_aliases = {"Temperature":"temp","RH":"humid","Ws":"wind","Classes":"fire"}
df = df.rename(columns=column_aliases)
# Trim the labels first; every value other than the exact text "not fire"
# (a missing label included) is stored as True.
df["fire"] = df["fire"].str.strip().ne("not fire")
df.head()

In [None]:
# Keep only the rows whose date was parsed successfully (NaT dates are discarded).
df = df.loc[df["date"].notna()].copy()

In [None]:
# Print column dtypes and non-null counts to see which columns still need converting.
df.info()

In [None]:
# Converting all numeric columns with correct dtype.
# Every text column except the region label is expected to hold numbers.
# "region" is excluded by name so the selection does not depend on column order.
headers = [col for col in df.select_dtypes(include="object").columns if col != "region"]

# Parse FWI as numbers; anything that is not a valid number (e.g. stray letters)
# becomes NaN. astype(str) keeps this cell safe to re-run after FWI is already float.
fwi_numeric = pd.to_numeric(
    df["FWI"].astype(str).str.replace(" ","",regex=False),
    errors="coerce",
)
# Rows whose FWI value could not be parsed.
incorrect_col = df[fwi_numeric.isna()]

# Replace the unparsable entries with the most common valid FWI value.
# mode() returns a Series (several entries when there are ties), so a single
# scalar is picked explicitly instead of passing the whole Series on.
fwi_mode = fwi_numeric.mode()
if not fwi_mode.empty:
    fwi_numeric = fwi_numeric.fillna(fwi_mode.iloc[0])
df["FWI"] = fwi_numeric.astype(np.float32)

# Converting all remaining digit columns to float32 (embedded spaces are removed first).
other_cols = [col for col in headers if col != "FWI"]
if other_cols:
    df[other_cols] = df[other_cols].apply(lambda x: x.str.replace(" ","").astype(dtype=np.float32))

In [None]:
# Print the dtypes again to confirm the result of the numeric conversion.
df.info()


## Univariate Analysis

In this section we are answering the following questions:
+ The frequencies/counts of details of fire in 2 main areas.
+ Distribution of the details using a boxplot to better understand the complexity.


In [None]:
def plot_columns(df, group_col="region", n_rows=2):
    """Draw one bar chart per numeric column of ``df``, grouped by ``group_col``.

    Each subplot is produced with ``sns.barplot``, which by default shows the
    mean of the column for every category of ``group_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain ``group_col``.
    group_col : str, default "region"
        Categorical column placed on the x-axis of every subplot.
    n_rows : int, default 2
        Maximum number of subplot rows; fewer rows are used when there are
        fewer numeric columns than rows.

    Returns
    -------
    None
        The figure is shown and then closed. Nothing is drawn when ``df`` has
        no numeric columns.
    """
    name_cols = df.select_dtypes(include=np.number).columns
    num_cols = len(name_cols)
    if num_cols == 0:
        # A grid with zero columns cannot be created, so there is nothing to draw.
        return None

    n_rows = max(1, min(n_rows, num_cols))
    n_cols = int(np.ceil(num_cols/n_rows))
    # squeeze=False always returns a 2-D array of Axes, so flattening is valid
    # for every grid shape (including a single subplot).
    f,axes = plt.subplots(nrows=n_rows, ncols=n_cols, sharex=True, sharey=False,
                          figsize=(3*n_cols,5), squeeze=False)
    axes = axes.flatten()

    for ax,col in zip(axes,name_cols):
        sns.barplot(x=df[group_col],y=df[col].values,ax=ax)
        ax.set_title(f"Mean {col} by {group_col}")
        ax.set_xlabel(f"{group_col}")
        ax.set_ylabel(f"Mean {col}")

    # Hide unused subplots (if num_cols isn't evenly divisible by n_rows)
    for ax in axes[num_cols:]:
        f.delaxes(ax)
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.5,hspace=0.2)
    plt.show()
    plt.close(f)
    
# Draw the per-region bar charts for every numeric column in df.
plot_columns(df)

- temp – maximum temperature in degrees Celsius
- humid – relative humidity as a percentage
- region – location: Bejaia in the northeast of Algeria or Sidi-Bel Abbes in the northwest of Algeria
- FFMC – Fine Fuel Moisture Code: measure of forest litter fuel moisture that incorporates temperature, humidity, wind, and rain
- ISI – Initial Spread Index: estimates spread potential of fire
- BUI – Buildup Index: estimates potential release of heat
- FWI – Fire Weather Index: measure of general fire intensity potential that incorporates ISI and BUI

In [None]:
# Tag every row with its year-month, then average the temperature per month and region.
df["year_month"] = df["date"].dt.strftime("%Y-%m")
_df = df[["temp","year_month","region"]].copy()
_df.groupby(["year_month","region"])["temp"].mean()

## Multivariate Analysis

Checking the multicollinearity assumption

In [None]:
# Check multicollinearity with a heatmap of the pairwise correlations
# between the numeric columns.
corr_grid = df.select_dtypes(include=np.number).corr()
f,ax = plt.subplots(figsize=(10,6))
# Draw on the explicitly created Axes instead of relying on the current-axes state.
sns.heatmap(corr_grid, xticklabels=corr_grid.columns, yticklabels=corr_grid.columns, annot=True, ax=ax)
ax.set_title("Correlation between numeric columns")
plt.show()
# Close the figure (plt.clf() would only clear it and leave it open).
plt.close(f)

Let's compare the relationship between humidity and temperature

In [None]:
# Scatter plot of humidity against temperature, coloured by region.
f,ax = plt.subplots(figsize=(10,6))
sns.scatterplot(data=df,x="temp",y="humid",hue="region",legend="auto",ax=ax)
ax.set_title("Relationship between humidity and temperature")
ax.set_xlabel("Temperature")
ax.set_ylabel("Humidity")
plt.show()
plt.close(f)

# Machine learning Regression Model

Running the multiple linear regression model

Methodology: 
+ Standard scaler / normalisation
+ categorical values - convert true/false value to 1 and 0
+ prediction score
+ draw a correlation map 

# Conclusions