#### Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
import os

#### Load Dataset

In [None]:
# Show the directory the kernel started in, then make /workspace/ the working
# directory; the relative paths used by the later cells resolve against it.
print(os.getcwd())
os.chdir("/workspace/")

In [None]:
!wget "https://raw.githubusercontent.com/Call-for-Code/Spot-Challenge-Wildfires/main/data/Jan_30-with_historical_weather_forecasts_refreshed_again_on Jan_31.zip"
zip = zipfile.ZipFile("Jan_30-with_historical_weather_forecasts_refreshed_again_on Jan_31.zip")
zip.extractall()

#### Dataframe Analysis

In [None]:
# Base folder of the CSV files (presumably created by extracting the archive).
main_path = "Jan_30"
file_wildfires = f"{main_path}/Historical_Wildfires.csv"

# Read the wildfire records and parse the "Date" column into timestamps.
wildfires_df = pd.read_csv(file_wildfires).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

# Preview the first rows.
wildfires_df.head()

In [None]:
# Summary statistics, one row per numeric column.
wildfires_df.describe().T

In [None]:
file_weather = f"{main_path}/HistoricalWeather.csv"

# Read the weather table, give the aggregate columns short labels and parse
# the "Date" column into timestamps, all in one pass.
weather_df = (
    pd.read_csv(file_weather)
    .rename(
        columns={
            "count()[unit: km^2]": "Area",
            "min()": "Min",
            "max()": "Max",
            "mean()": "Mean",
            "variance()": "Variance",
        }
    )
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)

# Preview the first rows.
weather_df.head()

In [None]:
# Reshape the long weather table into a wide one: one row per (Date, Region)
# and one column per statistic/parameter pair.  pivot_table aggregates
# duplicate (Date, Region, Parameter) rows with its default aggregation (mean).
index_cols = ["Date", "Region"]
df_pivot = weather_df.pivot_table(
    values=["Min", "Max", "Mean", "Variance"],
    index=index_cols,
    columns=["Parameter"],
)
# Turn the (Date, Region) index back into ordinary columns.
df_pivot.reset_index(inplace=True)

# Flatten the two-level column labels.  The key columns have an empty second
# level and keep their name; value columns become "<Parameter>_<Statistic>".
df_pivot.columns = [
    col[0] if not (col[1]) else "{1}_{0}".format(*col)
    for col in df_pivot.columns.values
]

# Key columns first, then every value column in alphabetical order.
# The key columns are selected by name instead of a hard-coded position:
# there are two of them, so slicing at 3 left the first value column out of
# the sort and the result was only ordered if that column happened to sort
# first.
params = sorted(col for col in df_pivot.columns if col not in index_cols)
weather_data = df_pivot[index_cols + params].copy()
weather_data.head()

In [None]:
# Path of the vegetation index table.  It gets its own variable (matching
# file_weather / file_forecasts) so that file_wildfires keeps pointing at the
# wildfire CSV instead of being silently rebound to a different file.
file_ndvi = f"{main_path}/VegetationIndex.csv"
ndvi_df = pd.read_csv(file_ndvi)

# convert to datetime format
ndvi_df["Date"] = pd.to_datetime(ndvi_df["Date"])

# Show the column dtypes (confirms the Date conversion) and preview the rows.
print(ndvi_df.dtypes)
ndvi_df.head()

In [None]:
file_forecasts = f"{main_path}/HistoricalWeatherForecasts.csv"

# Read the forecast table and parse the "Date" column into timestamps.
forecasts_df = pd.read_csv(file_forecasts)
forecasts_df = forecasts_df.assign(Date=pd.to_datetime(forecasts_df["Date"]))

# Preview the first rows.
forecasts_df.head()

In [None]:
# Attach the wide weather columns to every wildfire record.  The left join
# keeps all wildfire rows, including those without a weather match.
df_all = pd.merge(
    wildfires_df,
    weather_data,
    on=["Date", "Region"],
    how="left",
)
# Summary statistics, one row per numeric column.
df_all.describe().T

In [None]:
# List the column labels of the merged wildfire/weather frame.
df_all.columns

In [None]:
# Number of missing values in each column of the merged frame.
df_all.isnull().sum()

In [None]:
# Correlation of every numeric column with the target, strongest first.
# The frame still holds non-numeric columns (e.g. "Region" and "Date"), and
# DataFrame.corr() no longer drops those silently on pandas >= 2.0, so the
# frame is restricted to numeric columns explicitly before correlating.
(
    df_all.select_dtypes(include="number")
    .corr()["Estimated_fire_area"]
    .sort_values(ascending=False)
)

In [None]:
# Persist the merged frame as UTF-8 CSV, without the row index.
df_all.to_csv("dataset.csv", encoding="utf-8", index=False)

In [None]:
# (rows, columns) of the merged wildfire/weather frame.
df_all.shape

In [None]:
# (rows, columns) of the vegetation index frame.
ndvi_df.shape

In [None]:
# Inner join: keep only the (Date, Region) rows that also have a vegetation
# index record.
df_temp = df_all.merge(ndvi_df, how="inner", on=["Date", "Region"])
df_temp.to_csv("dataset1.csv", index=False, encoding="utf-8")

# The summary goes last so the notebook actually renders it; a bare
# expression in the middle of a cell is evaluated and then discarded.
df_temp.describe().transpose()

In [None]:
# Keep only the columns that should appear in the correlation matrix by
# dropping the identifier, confidence and bookkeeping columns.
df_corr = df_all.drop(
    columns=[
        "Region",
        "Date",
        "Mean_confidence",
        "Std_confidence",
        "Var_confidence",
        "Count",
        "Replaced",
    ]
).copy()

# Annotated heatmap of the pairwise correlations on a large canvas.
# NOTE(review): vmin=0 puts every negative correlation at the bottom of the
# colour scale -- confirm this is intended.
plt.figure(figsize=(20, 12))
sns.heatmap(
    data=df_corr.corr(),
    vmin=0,
    vmax=1,
    cmap="coolwarm",
    annot=True,
)

In [None]:
# Build the NSW-only modelling table in a single chain:
#   1. keep the rows whose Region is "NSW",
#   2. remove exact duplicate rows,
#   3. renumber the index from zero,
#   4. discard rows with any missing value,
#   5. drop the identifier, confidence and bookkeeping columns.
# The index is renumbered before the missing-value rows are removed, so the
# resulting index may contain gaps.
df_all2 = (
    df_all.loc[df_all["Region"] == "NSW"]
    .drop_duplicates()
    .reset_index(drop=True)
    .dropna(how="any")
    .drop(
        columns=[
            "Date",
            "Region",
            "Mean_confidence",
            "Std_confidence",
            "Var_confidence",
            "Count",
            "Replaced",
        ]
    )
    .copy()
)