In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# Data Cleaning

In [None]:
# Load the merged, not-yet-cleaned dataset (path is relative to the working directory)
df = pd.read_csv("merged_dataset_not_cleaned.csv")

In [None]:
# Preview the first rows of the freshly loaded data
df.head()

In [None]:
# Remove the county, start/end timestamp and duration columns from the working frame
df = df.drop(
    ['County', 'Start Date and Time', 'End Date and Time', 'Duration'],
    axis=1,
)

In [None]:
# Check dtypes and non-null counts after the column drop
df.info()

In [None]:
# Store 'Land Cover' as a pandas categorical column
df = df.astype({'Land Cover': 'category'})

In [None]:
# Number of rows per land-cover category
df['Land Cover'].value_counts()

In [None]:
# List the remaining column names
df.columns

In [None]:
# Summary statistics of the columns before any rows are removed
df.describe()

In [None]:
# Discard rows with no 'Maximum Temperature' value, then renumber the index from 0
df = df.dropna(subset=['Maximum Temperature']).reset_index(drop=True)

In [None]:
# Re-check row and non-null counts after removing rows without 'Maximum Temperature'
df.info()

In [None]:
# Count the rows whose 'Acres Burned' value is missing and report the total
missing_values_count = df['Acres Burned'].isna().sum()
print(f"Number of missing values in the 'Acres Burned' column: {missing_values_count}")

In [None]:
# A row is kept only if every one of these columns has a value
required_columns = [
    'Acres Burned',
    'Dew Point',
    'Cloud Cover',
    'Relative Humidity',
    'Solar Radiation',
    'Solar Energy',
    'PDSI',
    'NDVI',
]

# Drop incomplete rows and renumber the index from 0
df = df.dropna(subset=required_columns).reset_index(drop=True)

In [None]:
# Non-null counts after dropping the rows with missing values
df.info()

In [None]:
# Summary statistics, used to eyeball the value ranges for obvious data errors
df.describe()

# Judging by the minimum and maximum values of each variable, there do not appear to be
# any errors in the data, given the nature of the data (weather, drought index, etc.)

In [None]:
# Non-null counts again (df is unchanged since the previous df.info() call)
df.info()

In [None]:
# Missing wind gust values are replaced by the wind speed recorded in the same row.
# Rationale: according to the Weather Crossing documentation, a missing gust means the
# gust was not significantly greater than the wind speed.
# NOTE(review): the provider name is presumably "Visual Crossing" — confirm.
wind_speed = df['Wind Speed']
df['Wind Gust'] = df['Wind Gust'].fillna(value=wind_speed)

In [None]:
# Re-check non-null counts after filling 'Wind Gust'
df.info()

# Data Transformation

In [None]:
# Wildfire severity scales derived from 'Acres Burned' (this cell builds several alternatives)

# Min-max scale the burned acreage onto [0, 1]
acres_burned = df['Acres Burned']
acres_min, acres_max = acres_burned.min(), acres_burned.max()
df['Acres_Burned_Normalized'] = (acres_burned - acres_min) / (acres_max - acres_min)

severity_levels = [1, 2, 3, 4, 5]

# Five equal-width bins over the normalized acreage (the lowest edge, 0, is included)
df['Wildfire_Scale_Acres_Normalized'] = pd.cut(
    df['Acres_Burned_Normalized'],
    bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
    labels=severity_levels,
    include_lowest=True,
)

# Quintiles of the raw acreage
df['Wildfire_Scale_Acres_Quantiles'] = pd.qcut(acres_burned, 5, labels=severity_levels)

# Defining a scale similar to the NWCG size-class scale (https://www.nwcg.gov/term/glossary/size-class-of-fire).
# The original NWCG Class A (1/4 acre or less) is not included, so the letters are shifted by one:
# NWCG Class B is Class "A" here, NWCG Class C is Class "B", and so forth.

def assign_class(acres):
    """Map a burned area in acres to a size-class letter from 'A' to 'F'.

    Classes by acreage:
        'A': more than 1/4 and less than 10
        'B': 10 to less than 100
        'C': 100 to less than 300
        'D': 300 to less than 1000
        'E': 1000 to less than 5000
        'F': 5000 or more

    Returns None when ``acres`` is 1/4 or less, or NaN.
    """
    # Anything that is not strictly above 1/4 acre (including NaN) stays unclassified
    if not acres > (1/4):
        return None

    # Exclusive upper bounds in ascending order; the first one above `acres` wins
    upper_bounds = ((10, 'A'), (100, 'B'), (300, 'C'), (1000, 'D'), (5000, 'E'))
    for upper_bound, label in upper_bounds:
        if acres < upper_bound:
            return label

    # 5000 acres or more
    return 'F'

# Label each fire with its size class via assign_class
# NOTE(review): assign_class has no branch for values of 1/4 acre or less, so such rows
# would get a missing Severity_Class — confirm the data contains none before modelling
df['Severity_Class'] = df['Acres Burned'].apply(assign_class)

# Drop the 'Acres_Burned_Normalized' column
df = df.drop(columns=['Acres_Burned_Normalized'])


# Binary scale: 0 when acres burned is at or below the median, 1 when above it
median_acres = df['Acres Burned'].median()

df['Wildfire_Scale_Acres_Binary'] = np.where(df['Acres Burned'] <= median_acres, 0, 1)

In [None]:
# Remove the raw 'Acres Burned' column from the working frame
df = df.drop('Acres Burned', axis=1)

In [None]:
# Inspect the columns and dtypes after removing 'Acres Burned'
df.info()

In [None]:
# Calculating the variance inflation factor of the predictor variables (NOTE: the cells below perform recursive feature elimination instead)

In [None]:
# List the current columns, including the severity scales
df.columns

In [None]:
# Remove the three alternative severity scales; 'Severity_Class' stays in the frame
df = df.drop(
    columns=[
        'Wildfire_Scale_Acres_Quantiles',
        'Wildfire_Scale_Acres_Binary',
        'Wildfire_Scale_Acres_Normalized',
    ]
)

In [None]:
# Target: the size-class label
y = df['Severity_Class']

# Predictors: everything except the name, date, coordinate and target columns
non_predictor_columns = ['Name', 'Date', 'Latitude', 'Longitude', 'Severity_Class']
X = df.drop(columns=non_predictor_columns)

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Names of every feature discarded by recursive feature elimination. The function
# below appends to this list, and later cells read it to drop the same columns
# from `df` and to filter the list of columns that get scaled.
variables_to_remove_list = []

def recursive_feature_elimination_rf(data, target, n_features_to_select, random_state=42):
    """Select features with recursive feature elimination driven by a random forest.

    Fits ``RFE`` around a ``RandomForestClassifier`` on ``data`` / ``target`` and
    keeps ``n_features_to_select`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns.
    target : array-like
        Class labels, passed unchanged to ``RFE.fit``.
    n_features_to_select : int
        Number of columns to keep.
    random_state : int or None, default 42
        Seed for the random forest. A fixed seed makes the selected features (and
        therefore the exported datasets) reproducible across runs; pass ``None``
        to get the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the selected columns.

    Side effects
    ------------
    Appends the dropped column names to the module-level
    ``variables_to_remove_list`` and prints them.
    """
    estimator = RandomForestClassifier(random_state=random_state)
    selector = RFE(estimator, n_features_to_select=n_features_to_select)
    selector.fit(data, target)

    # Boolean mask over data.columns: True for the features RFE kept
    keep_mask = selector.get_support()
    selected_features = data.columns[keep_mask]
    dropped_features = data.columns[~keep_mask]

    # In-place extend of the module-level list (no `global` needed: the name is not rebound)
    variables_to_remove_list.extend(dropped_features)

    print("Dropped features:", list(dropped_features))

    return data.loc[:, selected_features]

# Keep 10 features; the function also records the eliminated ones in variables_to_remove_list
n_features_to_select = 10  
reduced_data = recursive_feature_elimination_rf(X, y, n_features_to_select)

# Remove the eliminated features from the main dataframe as well
df = df.drop(columns = variables_to_remove_list)

In [None]:
# Preview the frame after feature elimination
df.head()

In [None]:
# Exporting the dataframe before normalizing the variables (the row index is not written)

df.to_csv("transformed_dataset_pre-normalization.csv", index = False)

In [None]:
# Normalize the predictor variables using min-max scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# All candidate predictor columns, as they were before feature elimination
original_X_variable_list = [
    'Maximum Temperature', 'Minimum Temperature', 'Temperature',
    'Dew Point', 'Precipitation',
    'Wind Speed', 'Wind Gust', 'Wind Direction',
    'Cloud Cover', 'Relative Humidity',
    'Solar Radiation', 'Solar Energy',
    'PDSI', 'NDVI', 'Land Cover',
    'Elevation', 'Slope',
]

# Keep only the predictors that were not eliminated (original order preserved)
new_X_variable_list = [
    column for column in original_X_variable_list
    if column not in variables_to_remove_list
]
print(new_X_variable_list)

# Min-max scale every remaining predictor and write the result back into df
scaler = MinMaxScaler()
scaled_predictors = scaler.fit_transform(df[new_X_variable_list])
df[new_X_variable_list] = scaled_predictors

df.head()

In [None]:
# Summary statistics after scaling
df.describe()

In [None]:
# List the columns that will be exported
df.columns

In [None]:
# Export the min-max-scaled dataframe (the row index is not written)
df.to_csv("transformed_dataset.csv", index = False)

In [None]:
# Preview the exported frame
df.head()

In [None]:
# Column names of the exported frame
df.columns