In [None]:
# Importing essential libraries
import pandas as pd  # Pandas for data manipulation and analysis
import matplotlib.pyplot as plt  # Matplotlib for plotting graphs
import seaborn as sns  # Seaborn for statistical data visualization
import zipfile  # Zipfile for handling ZIP archive files
from autogluon.tabular import TabularPredictor  # AutoGluon for automated machine learning
from sklearn.model_selection import train_test_split  # Scikit-learn for splitting datasets
import os  # OS module for operating system interaction

In [None]:
# Show where the kernel started, then switch to the directory used for all
# downloads and outputs in the cells below.
starting_dir = os.getcwd()
print(starting_dir)
os.chdir("/workspace/")

In [None]:
# Download a ZIP file containing dataset
!wget "https://raw.githubusercontent.com/Call-for-Code/Spot-Challenge-Wildfires/main/data/Jan_30-with_historical_weather_forecasts_refreshed_again_on Jan_31.zip"

# Open and extract the contents of the ZIP file
zip = zipfile.ZipFile("Jan_30-with_historical_weather_forecasts_refreshed_again_on Jan_31.zip")
zip.extractall()

In [None]:
# Folder created by extracting the archive, and the wildfire history CSV inside it
main_path = "Jan_30"
file_wildfires = f"{main_path}/Historical_Wildfires.csv"

# Read the CSV and convert 'Date' to datetime in one chained expression
wildfires_df = pd.read_csv(file_wildfires).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)
wildfires_df.head()  # Preview the first rows

In [None]:
# Summary statistics of the numeric wildfire columns, one row per column
wildfires_df.describe().T

In [None]:
# Path of the historical weather CSV inside the extracted folder
file_weather = f"{main_path}/HistoricalWeather.csv"

# Load, shorten the aggregate column names, and parse 'Date' in a single chain
weather_df = (
    pd.read_csv(file_weather)
    .rename(
        columns={
            "count()[unit: km^2]": "Area",
            "min()": "Min",
            "max()": "Max",
            "mean()": "Mean",
            "variance()": "Variance",
        }
    )
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)

weather_df.head()  # Preview the first rows

In [None]:
# Reshape the long weather table to one row per (Date, Region), with one
# column per (statistic, Parameter) combination.
index_keys = ["Date", "Region"]
df_pivot = weather_df.pivot_table(
    values=["Min", "Max", "Mean", "Variance"],
    index=index_keys,
    columns=["Parameter"],
).reset_index()

# Flatten the two-level column index: the key columns carry an empty second
# level and keep their name; value columns become "<Parameter>_<statistic>".
df_pivot.columns = [
    stat if not parameter else f"{parameter}_{stat}"
    for stat, parameter in df_pivot.columns.values
]

# Key columns first, then every value column in alphabetical order.
# The split is driven by `index_keys` rather than a hard-coded position, so no
# value column is left out of the sort.
params = sorted(col for col in df_pivot.columns if col not in index_keys)
weather_data = df_pivot[index_keys + params].copy()
weather_data.head()

In [None]:
# Path of the vegetation index CSV. It gets its own variable so the wildfire
# CSV path held in `file_wildfires` is not overwritten.
file_ndvi = f"{main_path}/VegetationIndex.csv"
ndvi_df = pd.read_csv(file_ndvi)  # Load the vegetation index dataset

# Convert 'Date' to datetime so it can be joined against the other frames
ndvi_df["Date"] = pd.to_datetime(ndvi_df["Date"])

print(ndvi_df.dtypes)  # Column dtypes (printed because head() below is the cell's displayed value)
ndvi_df.head()  # Preview the first rows

In [None]:
# Path of the historical weather forecasts CSV inside the extracted folder
file_forecasts = f"{main_path}/HistoricalWeatherForecasts.csv"

# Read the CSV and parse 'Date' into datetime in one chained expression
forecasts_df = pd.read_csv(file_forecasts).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

forecasts_df.head()  # Preview the first rows

In [None]:
# Left-join the pivoted weather features onto the wildfire records by
# (Date, Region); wildfire rows without a weather match are kept with NaNs.
df_all = pd.merge(wildfires_df, weather_data, how="left", on=["Date", "Region"])
df_all.describe().T  # Summary statistics, one row per numeric column

In [None]:
df_all.columns  # Column labels of df_all (wildfire columns plus the joined weather features); shown as the cell's output

In [None]:
# Number of missing values per column of the merged frame
df_all.isnull().sum()

In [None]:
# Correlation of every numeric column with 'Estimated_fire_area', strongest
# positive first. numeric_only=True is required because df_all still contains
# non-numeric columns (e.g. 'Region'); pandas >= 2.0 no longer drops them
# silently and raises instead.
df_all.corr(numeric_only=True)["Estimated_fire_area"].sort_values(ascending=False)

In [None]:
# Persist the wildfire + weather merge as UTF-8 CSV, without the row index
df_all.to_csv(path_or_buf="dataset.csv", encoding="utf-8", index=False)

In [None]:
df_all.shape  # (n_rows, n_columns) of the merged wildfire + weather frame

In [None]:
ndvi_df.shape  # (n_rows, n_columns) of the vegetation index frame

In [None]:
# Inner-join the vegetation index onto the wildfire + weather frame; only
# (Date, Region) pairs present in both frames survive.
df_temp = df_all.merge(ndvi_df, how="inner", on=["Date", "Region"])

# Save first, so the summary below is the cell's last expression and is
# actually rendered by the notebook.
df_temp.to_csv("dataset1.csv", index=False, encoding="utf-8")

df_temp.describe().transpose()  # Summary statistics of the newly merged frame

In [None]:
# Keep only the columns of interest for the correlation study: drop the join
# keys and the confidence / count / 'Replaced' columns.
df_corr = df_all.drop(
    columns=[
        "Region",
        "Date",
        "Mean_confidence",
        "Std_confidence",
        "Var_confidence",
        "Count",
        "Replaced",
    ]
).copy()

# Annotated correlation matrix. Correlations range over [-1, 1], so the colour
# scale spans that full range and the diverging colormap is centred on 0;
# otherwise every negative correlation would be drawn in the same colour.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    df_corr.corr(),
    cmap="coolwarm",
    annot=True,
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title("Correlation matrix of wildfire and weather features");

In [None]:
# Build the modelling table for a single region ('NSW'):
#   1. keep only NSW rows,
#   2. remove exact duplicate rows,
#   3. remove rows with any missing value,
#   4. drop the key and confidence / count / 'Replaced' columns,
#   5. renumber the rows last, so the index is contiguous (0..n-1) even after
#      rows were removed in steps 2 and 3.
# Every step returns a new frame, so df_all itself is left untouched.
df_all2 = (
    df_all[df_all["Region"] == "NSW"]
    .drop_duplicates()
    .dropna(how="any")
    .drop(
        columns=[
            "Date",
            "Region",
            "Mean_confidence",
            "Std_confidence",
            "Var_confidence",
            "Count",
            "Replaced",
        ]
    )
    .reset_index(drop=True)
)