## ABOUT
__Author__: Pat McCornack

__Date__: 2/22/24

__Purpose__: Train model on sample points to be applied to full raster extent. 

----

In [10]:
import os
from os import path, listdir, mkdir
import datetime as dt

import pandas as pd
import geopandas as gpd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

from joblib import dump

## Define Paths
Use active_root_dir to switch between working off of the local machine or the PNNL drive. 

Update the paths_dict with appropriate directories and files.

In [20]:
# Root directories for the two supported storage locations
local_root_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\F40_modeling"
pnnl_root_dir = r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\F40_modeling"

# Define which data directory to work off of
active_root_dir = local_root_dir

# Timestamp of when this cell was run; used to give the saved model a unique file name
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Path fragments containing backslashes are raw strings: "\L", "\_" and "\s" are not valid
# escape sequences, so writing them in ordinary string literals triggers a warning.
paths_dict = {
    # NOTE(review): out_base_dir is built from local_root_dir, not active_root_dir - confirm
    # that outputs are meant to stay on the local machine even when working off the PNNL drive.
    "out_base_dir" : os.path.join(local_root_dir, 'model_outputs/tabular'),  # Where to save outputs
    "new_dir_name" : "F40_model_results",
    "ref_data_dir" : os.path.join(active_root_dir, r"..\LF_raster_data\_tables"),  # Location of LANDFIRE csvs (e.g. LF22_FVT_230.csv)
    "model_dir" : os.path.join(active_root_dir, 'models'),  # Where to save model to
    "sample_points_dir" : os.path.join(active_root_dir, r"data\sample_points"),  # Shapefile of sample points to train on
    "sample_points_fname" : "LF22_sample_points_2024-05-03.shp",
    "model_fname" : f"LF22_F40_model_{datetime}"
}

# __Functions__

----

### __Create a directory to output modeling results__
Names the output directory using the datetime that the script was run. 
Returns the name of the directory. The returned directory is used to output the trained model and/or results. 

In [12]:
def make_dir(base_dir, new_dir_name='model_results'):
    """
    Create a timestamped output directory and return its path.

    base_dir: path to the directory in which the new directory is created.
    new_dir_name: prefix of the new directory's name. The current datetime,
        formatted as YYYY-MM-DD_HH-MM-SS, is appended after an underscore.

    Returns the path to the newly created directory as a string.
    """
    timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    dir_name = "_".join((new_dir_name, timestamp))
    output_dir = os.path.join(base_dir, dir_name)

    # makedirs also creates any missing parent directories of output_dir
    os.makedirs(output_dir)
    return output_dir

## __Pre-Process the Data__

### Create Data Dictionaries to Append Features
Some features are separate attributes of the LANDFIRE dataset (e.g. BPS Fire Regime) and others are useful for results analysis (e.g. FDst attributes). These can be mapped to points using LANDFIRE CSVs. The below creates dictionaries to perform that mapping. 

This function is called by join_features.

In [13]:
def read_ref_data(ref_data_dir=None):
    """
    Returns a dictionary of dictionaries of mappings between LANDFIRE raster values and other attributes associated with those values.

    One example use case is using the BPS_NAME dictionary to map from the BPS layer value to the BPS_NAME attribute.

    ref_data_dir: directory containing the LANDFIRE reference CSVs (LF20_BPS_220.csv and LF22_FDST_230.csv).
        If None (the default), paths_dict["ref_data_dir"] is looked up when the function is called,
        so edits made to paths_dict after this function was defined are respected.

    Returns a dict with keys 'BPS_NAME', 'BPS_FRG_NE', 'FDST_TYPE', 'FDST_SEV' and 'FDST_TSD'. Each entry
    maps the VALUE column of the source CSV to the corresponding attribute column.
    """
    # Resolve the default at call time rather than at definition time
    if ref_data_dir is None:
        ref_data_dir = paths_dict["ref_data_dir"]

    BPS_fname = "LF20_BPS_220.csv"
    LF22_FDST_fname = "LF22_FDST_230.csv"

    # For each reference CSV: {name of output dictionary: CSV column holding the attribute}
    ref_tables = [
        # The BPS_FRG_NEW column is stored under the key 'BPS_FRG_NE'
        (BPS_fname, {"BPS_NAME": "BPS_NAME", "BPS_FRG_NE": "BPS_FRG_NEW"}),
        (LF22_FDST_fname, {"FDST_TYPE": "D_TYPE", "FDST_SEV": "D_SEVERITY", "FDST_TSD": "D_TIME"}),
    ]

    LF_ref_dicts = {}
    for fname, columns in ref_tables:
        ref_df = pd.read_csv(os.path.join(ref_data_dir, fname))
        for dict_name, column in columns.items():
            # Pair the columns element-wise so neither column's dtype is coerced to match the other
            LF_ref_dicts[dict_name] = dict(zip(ref_df['VALUE'], ref_df[column]))

    return LF_ref_dicts
                         
        

### Append Features using Data Dictionaries
Append in selected features using the LANDFIRE data dictionaries.

__Note:__ Items in feature_list must be in the source_layers dictionary. 

In [14]:
# ! This is used to analyze results
def join_features(sample_points, feature_list = ['BPS_FRG_NE', 'FDST_TYPE', 'FDST_SEV', 'FDST_TSD']):
    """
    Append the features named in feature_list to sample_points and return it.

    Each feature is derived by mapping the values of its source column through the
    matching lookup dictionary returned by read_ref_data(). The new columns are
    written onto the sample_points object that was passed in.

    Every item in feature_list must be a key of the source_layers dictionary below.
    """
    # Column of sample_points that each derived feature is looked up from
    source_layers = dict(
        BPS_NAME='BPS',
        BPS_FRG_NE='BPS',
        FDST_TYPE='LF22_FDST',
        FDST_SEV='LF22_FDST',
        FDST_TSD='LF22_FDST',
    )

    # Lookup tables keyed by feature name
    lookup_tables = read_ref_data()

    for feature in feature_list:
        layer_values = sample_points[source_layers[feature]]
        sample_points[feature] = layer_values.map(lookup_tables[feature]).copy()

    return sample_points

### Prepare the Sample Points
Before training the model, the sample points need to be filtered. Filtering steps are:
- Remove Null points (-9999/-1111 values) - these will not be updated in the final raster. 
- Remove non-burnable F40 classes - We assume that these are static and will not be updated in the final raster. 
- Optionally, join in additional features. feature_list may be modified to select which features to add in. 

In [15]:
def data_prep(sample_point_fpath, join_features=False):
    """
    Reads in sample points and prepares the data for the model.

    sample_point_fpath: Path to the sample points to be used to train the model.
    join_features: Set to True to append the BPS_NAME, BPS_FRG_NE, FDST_TYPE, FDST_SEV and
        FDST_TSD features using the module-level join_features() function. Only the boolean
        True triggers the join; any other value (for example a list of predictors passed
        positionally) leaves the data unchanged.

    Returns processed sample points with the following rows removed:
      - rows containing -9999 or -1111 in any field
      - rows whose LF22_F40 value is a non-burnable class (91, 92, 93, 98, 99)
    """
    # Read in gdf
    sample_points = gpd.read_file(sample_point_fpath)

    # Drop unneeded columns if present
    sample_points = sample_points.drop(['Classified', 'GrndTruth', 'NEAR_FID', 'NEAR_DIST'], axis=1,
                                       errors='ignore')

    # Remove observations with -9999/-1111 in any field
    has_null = sample_points.isin([-1111, -9999]).any(axis=1)  # True for rows with -1111/-9999 in any column
    sample_points = sample_points.loc[~has_null]

    # Remove Non-Burnable Classes
    F40_NB = [91, 92, 93, 98, 99]  # Nonburnable F40 Classes
    sample_points = sample_points.loc[~sample_points['LF22_F40'].isin(F40_NB)].copy()  # copy so later column writes hit an independent frame

    # Join in additional features if specified
    if join_features is True:
        # The join_features parameter shadows the module-level function of the same name,
        # so the function has to be fetched from the module namespace explicitly.
        join_features_func = globals()["join_features"]
        sample_points = join_features_func(
            sample_points,
            feature_list=['BPS_NAME', 'BPS_FRG_NE', 'FDST_TYPE', 'FDST_SEV', 'FDST_TSD'])

    return sample_points


## __Train Model__
Trains and returns HGBC model provided data. Subsets the training data to a provided list of predictors and predicts the specified target feature.

__Note__: Aspect, Elevation, and Slope are the only continuous LANDFIRE datasets, therefore any feature not in that list is assumed to be categorical.

In [16]:
def train_model(train_data, predictors, target='LF22_F40', seed=1234):
    """
    Fit a HistGradientBoostingClassifier on train_data and return it.

    train_data: table holding both the predictor columns and the target column.
    predictors: list of column names to train on. Any name other than 'ASPECT',
        'ELEVATION' or 'SLOPE' is passed to the classifier as a categorical feature.
    target: name of the column to predict.
    seed: random_state handed to the classifier.
    """
    print(f"Predictors: {predictors}")
    print(f"Target: {target}")

    # Everything that is not one of the continuous layers is treated as categorical
    continuous_layers = ('ASPECT', 'ELEVATION', 'SLOPE')
    cat_variables = [name for name in predictors if name not in continuous_layers]

    # Split the training data into response and predictors
    y_train = train_data[target].copy()
    X_train = train_data[predictors].copy()

    # Classifier settings
    hgbc_params = {
        "categorical_features": cat_variables,
        "class_weight": "balanced",
        "learning_rate": 0.1,
        "max_iter": 100,
        "random_state": seed,
    }
    model = HistGradientBoostingClassifier(**hgbc_params)

    # Fit on the training data and hand back the trained model
    model.fit(X_train, y_train)
    return model
    

## __Evaluate Model__
Provided a trained model, test data, a set of predictors, and a target, return a dictionary containing prediction-accuracy metrics and the model's predictions.

In [17]:
def eval_model(model, test_data, predictors, target='LF22_F40'):
    """
    Score a trained model on test_data and print the results.

    Returns a dictionary with two entries:
      'metrics'     : [accuracy, balanced accuracy, recall, precision, f1, predictors], with each
                      score rounded to 3 decimals and recall/precision/f1 macro-averaged.
      'predictions' : the model's prediction for each observation in test_data.
    """
    # Split the test data into the true labels and the model inputs
    actual = test_data[target].copy()
    features = test_data[predictors].copy()

    # Predict the target for every test observation
    predicted = model.predict(features)

    # Overall scores
    accuracy = round(accuracy_score(actual, predicted), 3)
    bal_acc = round(balanced_accuracy_score(actual, predicted), 3)

    # Macro-averaged scores
    recall, precision, f1 = (
        round(scorer(actual, predicted, average='macro'), 3)
        for scorer in (recall_score, precision_score, f1_score)
    )

    # Report the configuration and the scores
    report = [
        ("Predictors", predictors),
        ("Target", target),
        ("Accuracy", accuracy),
        ("Balanced Accuracy", bal_acc),
        ("Recall", recall),
        ("Precision", precision),
        ("F1", f1),
    ]
    for label, value in report:
        print(f"{label}: {value}")

    return {
        "metrics" : [accuracy, bal_acc, recall, precision, f1, predictors],
        "predictions" : predicted
    }


# __Main__
----

## __Train Model__
Update the paths in paths_dict, if necessary, then run to train and save model.

In [None]:
seed = 1234

# Define path to sample points to train on
# (directory and file name are two separate paths_dict lookups passed to os.path.join)
sample_point_fpath = os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

# Define target and predictors
target = 'LF22_F40'
predictors = ['LF22_FVT', 'LF22_FVH', 'LF22_FVC', 'LF22_FDST', 'ZONE', 'BPS_FRG_NE']

# Read in and prepare the data
# data_prep's second parameter is the join_features flag, not a predictor list, so it is left at its default
sample_points = data_prep(sample_point_fpath)

# Train the model on all prepared sample points
model = train_model(sample_points, predictors, target=target, seed=seed)

# Save the model, creating the model directory first if it does not exist yet
os.makedirs(paths_dict['model_dir'], exist_ok=True)
dump(model, os.path.join(paths_dict['model_dir'], paths_dict['model_fname']))


# Assessing using FVT Predictions for F40 Model
----

## __Baseline__
First check the overall performance of the F40 model using LANDFIRE data. 

In [18]:
seed = 1234
model_type = 'HGBC'

# Define train/test split proportion
train_frac = 0.7
test_frac = 0.3

# Define set of predictors and target
predictors = ['LF22_FVT', 'LF22_FVH', 'LF22_FVC', 'LF22_FDST', 'ZONE', 'BPS_FRG_NE']
target = 'LF22_F40'

# Read in and prepare the data
# data_prep's second parameter is the join_features flag, not a predictor list, so it is left at its default
sample_points = data_prep(os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname']))

# Perform train/test split, stratified on the target so both sets keep the same class proportions
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac,
                               random_state=seed, shuffle=True, stratify=sample_points[target])

# Train model using the target and seed defined above
model = train_model(train_points, predictors, target=target, seed=seed)

# Get the model score
print("Score on the test set: ")
results = eval_model(model=model, test_data=test_points, predictors=predictors, target=target)

Predictors: ['LF22_FVT', 'LF22_FVH', 'LF22_FVC', 'LF22_FDST', 'ZONE', 'BPS_FRG_NE']
Target: LF22_F40
Score on the test set: 
Predictors: ['LF22_FVT', 'LF22_FVH', 'LF22_FVC', 'LF22_FDST', 'ZONE', 'BPS_FRG_NE']
Target: LF22_F40
Accuracy: 0.985
Balanced Accuracy: 0.953
Recall: 0.953
Precision: 0.938
F1: 0.94


In [21]:
# Save out results
## Add the model's predictions to the test points as a new column
test_points[f'PRED_{target}'] = results['predictions']

## Build a one-row table of metrics; model_type fills the final column
metric_columns = ['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type']
metrics_df = pd.DataFrame(columns=metric_columns)
metrics_df.loc[0] = results['metrics'] + [model_type]

## Create the timestamped results directory, then write both tables into it
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=paths_dict['new_dir_name'])
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
test_points.to_csv(os.path.join(out_dir, preds_out_fname))
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Store the trained model next to the results, with its own timestamp in the file name
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


['C:\\Users\\mcco573\\OneDrive - PNNL\\Documents\\_Projects\\BPA Wildfire\\F40_modeling\\model_outputs/tabular\\F40_model_results_2024-05-10_11-17-36\\LF22_F40_HGBC_model_2024-05-10_11-17-37']

##  __Baseline -- Disturbed Areas Only__
Check how the F40 model performs on disturbed points only, using official LANDFIRE data. 
Note that this uses the same model trained in 'Baseline' above. 

In [None]:
# Define path to geodatabase and layer of sample points  - this is the test set
# Define directory and file name of the disturbed sample points - this is the test set
data_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model\data\sample_points"
test_sample_point_fpath = 'sample_points_4-17-24_200k_Disturbed.shp'

# Define set of predictors
# These must be the columns the Baseline model was trained on; the target (LF22_F40) is not a predictor
predictors = ['LF22_FVT', 'LF22_FVH', 'LF22_FVC', 'LF22_FDST', 'ZONE', 'BPS_FRG_NE']

# Read in and prepare the data
# data_prep's second parameter is the join_features flag, not a predictor list, so it is left at its default
test = data_prep(os.path.join(data_dir, test_sample_point_fpath))

target = 'LF22_F40'
model_type = 'HGBC'

# Get the model score
# eval_model accepts model, test_data, predictors and target only
print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target)

## __Using Predicted LF22 FVT__
A raster of predicted LF22 FVT values for all disturbed points has been generated using LF20 data. This raster has been uploaded to the sample points as PRED_FVT. The goal here is to assess whether the accuracy changes when using the predicted FVT data rather than the LANDFIRE FVT data. 

----- 

In [None]:
# Define path to geodatabase and layer of sample points  - this is the test set
# Define directory and file name of the disturbed sample points - this is the test set
data_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model\data\sample_points"
test_sample_point_fpath = 'sample_points_4-17-24_200k_Disturbed.shp'


# Define set of predictors
# These must be the columns the Baseline model was trained on; the target (LF22_F40) is not a predictor
predictors = ['LF22_FVT', 'LF22_FVH', 'LF22_FVC', 'LF22_FDST', 'ZONE', 'BPS_FRG_NE']

# Read in and prepare the data
# data_prep's second parameter is the join_features flag, not a predictor list, so it is left at its default
test = data_prep(os.path.join(data_dir, test_sample_point_fpath))
test = test.rename(columns = {
    'LF22_FVT' : 'Original_LF22_FVT',  # Keep the LANDFIRE values under a new name
    'PRED_FVT' : 'LF22_FVT'  # Rename prediction column in order to run model.
})

target = 'LF22_F40'
model_type = 'HGBC'


# Get the model score
# eval_model accepts model, test_data, predictors and target only
print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target)

In [None]:
# Save out results
## Add the model's predictions to the test set as a new column
test[f'PRED_{target}'] = results['predictions']

## Build a one-row table of metrics; model_type fills the final column
metric_columns = ['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type']
metrics_df = pd.DataFrame(columns=metric_columns)
metrics_df.loc[0] = results['metrics'] + [model_type]

## Create the timestamped results directory (default name prefix), then write both tables into it
out_dir = make_dir(paths_dict['out_base_dir'])
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
test.to_csv(os.path.join(out_dir, preds_out_fname))
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Store the model next to the results, with its own timestamp in the file name
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))

In [None]:
# Save the model
# The file name reuses the `target` and `datetime` values left behind by earlier cells
model_fname = f"{target}_model_{datetime}"
model_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\F40 Random Forest Model\models"
model_out_fpath = os.path.join(model_dir, model_fname)
dump(model, model_out_fpath)