## ABOUT
__Author__: Pat McCornack

__Date__: 2/21/24

__Purpose__: Model Development and Evaluation for FBFM40 classification. Allows the user to iterate through different model types, parameters, and sets of predictors to evaluate which yields the best results. Can also output a model trained on sample points to be used elsewhere. 

The best performing model to date (3/25/24) is the following:
- Model: HGBC (Histogram Based Gradient Boosting Classifier)
- Weight: Balanced - This is used to account for class imbalance
- Predictors: ['FVT', 'FVC', 'FVH', 'FDST', 'ZONE', 'BPS_FRG_NEW']
    - Note that BPS_FRG_NEW is a feature in the LANDFIRE "Biophysical Setting" (BpS) layer that was extracted. 
- F40_GROUP: False - Note that while setting this to true may increase accuracy, we are not convinced that using the F40 "parent classes" (e.g. Grass-Shrub) is a valid approach. 

----

In [2]:
import os
from os import path, listdir, mkdir
import datetime as dt
from IPython.display import clear_output

import pandas as pd
import geopandas as gpd
import fiona

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, classification_report
from sklearn.preprocessing import TargetEncoder

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from joblib import dump, load

In [None]:
# Define filepaths
# Single lookup table for the data locations and layer names used by the cells below.
# NOTE(review): later cells must index this dict with exactly these key names.

paths_dict = {
    'LF23_gdb' : r'\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\F40_modeling\data\geodatabases\LF23_data.gdb',  # geodatabase holding the sample point layers
    'LF23_sample_points_fname' : 'LF23_sample_points_100k_3_12_24',  # LF23 point layer name (a layer name, not a path)
    'LF22_sample_points_fname' : 'LF22_sample_points_100k_3_8_24',  # LF22 point layer name (a layer name, not a path)
    'runs_fpath' : r'\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\F40_modeling\data\model_run_dicts\FRDB_analog_runs.csv',  # reference csv of predictor sets ("runs")
    'model_out_dir' : r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\F40_modeling\models\models_3-18-24",  # directory for saved (joblib) models
    'model_output_dir' : r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\F40_modeling\model_outputs\tabular",  # directory for tabular predictions/metrics

}

## Functions

----

### Fetch categorical features
Selects categorical features from a list of predictors. Used for categorical variable encoding. 

The current implementation simply removes the continuous features from the list of columns and treats the rest as categorical.

In [3]:
def get_cat_features(predictors):
    """
    Pick out the categorical feature names from an iterable of column names.

    Anything that is not one of the known continuous columns (ASPECT, SLOPE,
    ELEVATION), the geometry column, or the F40 target is treated as categorical.
    Order and duplicates of the input are preserved.
    """
    # Names that must never be treated as categorical predictors
    excluded = ('ASPECT', 'SLOPE', 'ELEVATION', 'geometry', 'F40')

    categorical = []
    for name in predictors:
        if name in excluded:
            continue
        categorical.append(name)

    return categorical

### Categorical Variable Encoder
Categorical variables must be encoded before being used for the Random Forest. Note that due to HGBC's native encoding support this function is not needed when using that model type. 

In [4]:
def targetEncoder(sample_points, target):
    """
    Swap the categorical columns of sample_points for their target encodings.

    sample_points - dataframe holding both the predictors and the target column.
    target - target variable feature name (e.g. "F40").

    The encoder is fit on, and then applied to, the same rows that are passed in.
    Returns a dataframe with the non-categorical columns followed by the encoded columns.
    """
    # Every column except the continuous/geometry/target ones is encoded
    categorical_cols = get_cat_features(sample_points)
    categorical_frame = sample_points[categorical_cols]

    # Multiclass target encoder that hands back a pandas dataframe
    encoder = TargetEncoder(target_type="multiclass", random_state=1234)
    encoder = encoder.set_output(transform="pandas")
    encoder.fit(categorical_frame, sample_points[target])
    encoded = encoder.transform(categorical_frame)

    # Keep the untouched columns and attach the encoded ones alongside them
    untouched = sample_points.drop(columns=categorical_cols)
    return pd.concat([untouched, encoded], axis=1)

### Random Over-Sampling Function
The class representation of the target variable (FBFM40) is heavily imbalanced in the BPA service territory. Random oversampling is one method to address class imbalance - This function will resample minority classes with replacement to create a more balanced class distribution.

__Note:__ This increases the size of the dataset and therefore the classifier will require more computational resources/time. While testing we found that over sampling did not provide any significant advantage over using weights to account for class imbalance. 

In [5]:
def overSampler(predictor_df, target_df, seed):
    """
    Randomly over-sample the data to correct target class imbalance.

    predictor_df - dataframe of features used as predictors.
    target_df - dataframe of the target feature (e.g. F40).
    seed - random state handed to the sampler.

    Returns a (predictors, target) pair of resampled data.
    """
    sampler = RandomOverSampler(random_state=seed)
    x_balanced, y_balanced = sampler.fit_resample(predictor_df, target_df)

    return x_balanced, y_balanced

### SMOTE - Synthetic Minority Oversampling Technique
Another method of oversampling to address target class imbalance in the dataset. Generates synthetic data points of the minority class by interpolating between the minority class instances using k nearest neighbors.

__Note:__ This increases the size of the dataset and therefore the classifier will require more computational resources/time. Over sampling was not found to significantly improve performance during testing and SMOTE is not currently implemented. 

In [6]:
def smote_resample(predictor_df, response_df, seed):
    """
    Over-sample the data with SMOTE (Synthetic Minority Oversampling Technique) to correct
    target class imbalance.

    predictor_df - dataframe of features used as predictors.
    response_df - dataframe of the target feature (e.g. F40).
    seed - random state handed to the sampler.

    Returns a (predictors, response) pair of resampled data.
    """
    sampler = SMOTE(
        sampling_strategy='auto',
        random_state=seed,
        k_neighbors=5,
    )
    x_balanced, y_balanced = sampler.fit_resample(predictor_df, response_df)

    return x_balanced, y_balanced

### Create a directory to output modeling results
Names the output directory using the datetime that the script was run. 
Returns the name of the directory. The returned directory is used to output the trained model and/or results. 

In [7]:
def make_dir(base_dir):
    """
    Create a timestamped results directory and return its path.

    Input: base_dir - path to the directory where the new directory will be created.
    The new directory is named "model_results_<YYYY-MM-DD_HH-MM-SS>" using the current time.
    """
    timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    new_dir = os.path.join(base_dir, f"model_results_{timestamp}")

    # makedirs also creates any missing parents; it raises if new_dir already exists
    os.makedirs(new_dir)

    return new_dir

### Data Pre-Processing 
Used to prepare the dataset for model development. The user must specify the path to the geodatabase and name of the point layer in that geodatabase to be processed.

The function expects a point layer with a minimum of the following features: [FVT, FDST, FVC, FVH, F40]

Note that we found that the best performing set of predictors was ['FVT', 'FVC', 'FVH', 'FDST', 'ZONE', 'BPS_FRG_NEW'] and the engineered features did not provide value in improving modeling results. The only modification needed for the above set of predictors is to filter out the NULL values (-9999/-1111) and the non-burnable F40 classes. 

__Steps:__
1. Drops unneeded columns
2. Creates EVT_* (e.g. EVT_CLASS) features by mapping FVT to each EVT hierarchy. 
3. Reclasses FDST by grouping by disturbance type
4. Splits FVC/FVH into 4 separate columns: Tree/Forest, Shrub, Herb, Other. This is done to reduce dimensionality of the predictors.
5. Creates a feature that aggregates F40 classes into their "parent class" - (i.e. "Grass-Shrub").
6. Removes rows with -9999/-1111 in any field -- these are Null values.
7. Removes Non-Burnable classes from the dataset using NB F40 values.

In [8]:
def _reclass_by_range(values, range_map):
    """
    Reclassify a Series using a {range: new_value} lookup.

    Each element is replaced by the value of the first range (in dictionary order) that
    contains it. Elements that fall in none of the ranges are returned unchanged.
    """
    def lookup(x):
        for value_range, new_value in range_map.items():
            if x in value_range:
                return new_value
        return x

    return values.apply(lookup)


def data_prep(gdb_path, gdb_layer, evt_csv_path=None):
    """
    Reads in layer of sample points from specified geodatabase and prepares the data for the model.

    gdb_path - Path to the geodatabase containing the data.
    gdb_layer - name of the point layer in the geodatabase.
    evt_csv_path - optional path to the csv that maps EVT_FUEL codes to the EVT hierarchy
                   columns (EVT_ORDER, EVT_CLASS, EVT_PHYS, EVT_SBCLS). Defaults to the
                   project's LC22_EVT_230_bpa.csv location.

    Returns the sample points with the engineered columns added, rows containing
    -9999/-1111 removed, and the non-burnable F40 classes removed.
    """
    if evt_csv_path is None:
        evt_csv_path = r"\\pnl\projects\BPAWildfire\data\Landfire\F40_Modeling\model_development\LC22_EVT_230_bpa.csv"

    # Read in gdf
    sample_points = gpd.read_file(gdb_path, layer=gdb_layer)

    # Drop unneeded columns if present
    sample_points = sample_points.drop(
        ['Classified', 'GrndTruth', 'NEAR_FID', 'NEAR_DIST'], axis=1, errors='ignore')

    # Add in EVT columns based on FVT
    EVT_df = pd.read_csv(evt_csv_path)

    ## new column name -> column of the EVT csv it is taken from.
    ## NOTE: the output column is spelled 'EVT_SBLCS' (not 'EVT_SBCLS'); the spelling is kept
    ## because the column name shows up in previously written outputs.
    evt_columns = {
        'EVT_ORDER': 'EVT_ORDER',
        'EVT_CLASS': 'EVT_CLASS',
        'EVT_PHYS': 'EVT_PHYS',
        'EVT_SBLCS': 'EVT_SBCLS',
    }
    for new_column, evt_column in evt_columns.items():
        fvt_lookup = dict(zip(EVT_df['EVT_FUEL'], EVT_df[evt_column]))
        sample_points[new_column] = sample_points['FVT'].map(fvt_lookup)

    # Reclass FDST to group by disturbance type
    fdst_reclass_dict = {
        range(111, 134): 1,  # Fire
        range(211, 234): 2,  # Mechanical Add
        range(311, 334): 3,  # Mechanical Remove
        range(411, 434): 4,  # Windthrow
        range(511, 534): 5,  # Insects - Disease
        range(611, 634): 6,  # Mechanical Unknown
        range(711, 734): 7,  # Mastication
    }

    # Split FVC into four features (0 is the fill value)
    fvc_tree_dict = {
        range(11, 101): 0,
        range(111, 173): 0,
    }
    fvc_shrub_dict = {
        range(11, 110): 0,
        range(121, 173): 0,
    }
    fvc_herb_dict = {
        range(11, 120): 0,
        range(150, 173): 0,
    }
    ## 0:Fill, 1:Non-burnable Other, 2:Burnable Other, 3:Sparse Vegetation
    fvc_other_dict = {
        range(11, 13): 1,
        range(13, 18): 2,
        range(18, 96): 1,
        range(100, 101): 3,
        range(101, 130): 0,
        range(150, 151): 3,
        range(151, 173): 0,
    }

    # Split FVH into four features (0 is the fill value)
    fvh_tree_dict = {
        range(11, 531): 0,
    }
    fvh_shrub_dict = {
        range(11, 500): 0,
        range(603, 652): 0,
    }
    fvh_herb_dict = {
        range(11, 101): 0,
        range(502, 651): 0,
    }
    ## 0:Fill, 1:Non-burnable Other, 2:Burnable Other, 3:Sparse Vegetation
    fvh_other_dict = {
        range(11, 13): 1,
        range(13, 18): 2,
        range(18, 96): 1,
        range(100, 101): 3,
        range(101, 130): 0,
        range(425, 652): 0,
    }

    # Aggregate F40 models into their parent groups.
    # The Slash-Blowdown range ends at 205 so that SB4 (204) is included; the previous
    # range(201, 204) left 204 as a raw integer in an otherwise string-valued column.
    F40_groups = {
        range(91, 100): 'NonBurnable',
        range(101, 110): 'Grass',
        range(121, 125): 'Grass-Shrub',
        range(141, 150): 'Shrub',
        range(161, 166): 'Timber-Understory',
        range(181, 190): 'Timber Litter',
        range(201, 205): 'Slash-Blowdown',
    }

    # (new column, source column, lookup) - applied in this order so the output column
    # order matches earlier versions of this function
    reclass_steps = [
        ('FDST_RECLASS', 'FDST', fdst_reclass_dict),
        ('FVC_TREE', 'FVC', fvc_tree_dict),
        ('FVC_SHRUB', 'FVC', fvc_shrub_dict),
        ('FVC_HERB', 'FVC', fvc_herb_dict),
        ('FVC_OTHER', 'FVC', fvc_other_dict),
        ('FVH_TREE', 'FVH', fvh_tree_dict),
        ('FVH_SHRUB', 'FVH', fvh_shrub_dict),
        ('FVH_HERB', 'FVH', fvh_herb_dict),
        ('FVH_OTHER', 'FVH', fvh_other_dict),
        ('F40_GROUP', 'F40', F40_groups),
    ]
    for new_column, source_column, range_map in reclass_steps:
        sample_points[new_column] = _reclass_by_range(sample_points[source_column], range_map)

    # Remove rows with -9999/-1111 (Null values) in any column
    has_null_code = sample_points.isin([-1111, -9999]).any(axis=1)
    sample_points = sample_points.loc[~has_null_code]

    # Remove Non-Burnable Classes
    F40_NB = [91, 92, 93, 98, 99]  # Nonburnable F40 Classes
    sample_points = sample_points.loc[~sample_points['F40'].isin(F40_NB)]

    return sample_points


### Random Forest Classifier


#### Instantiate RF Classifier

In [9]:
def randomForestClassifier(n_est=100, min_samples_leaf=50, bootstrap=True, oob_score=True,
                           n_jobs=-1, random_state=1234, max_features='sqrt', class_weight=None):
    """
    Instantiate a sklearn.ensemble.RandomForestClassifier.

    n_est - number of trees (passed as n_estimators).
    All other arguments are passed straight through to RandomForestClassifier under the
    same name.

    The default for max_features is 'sqrt'. It used to be 'auto', which scikit-learn no
    longer accepts in the versions that provide TargetEncoder (imported by this notebook);
    for classifiers 'auto' meant 'sqrt', so the intended default behaviour is unchanged.

    Returns the unfitted classifier.
    """
    return RandomForestClassifier(
        n_estimators=n_est,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        max_features=max_features,
        class_weight=class_weight,
    )


### Histogram-Based Gradient Boosting Classifier 

#### Instantiate HGBC

In [10]:
def histGradientBoostingClassifier(categorical_feature_list, class_weight=None, random_state=None):
    """
    Instantiate a sklearn.ensemble.HistGradientBoostingClassifier.

    categorical_feature_list - list of categorical features to be handled by the
                               classifier's native categorical support.
    class_weight - passed through to the classifier (e.g. "balanced" or None).
    random_state - optional seed passed through to the classifier so that fits can be made
                   reproducible. Defaults to None, which leaves the classifier unseeded
                   exactly as before this argument existed.

    All other parameters are left at the scikit-learn defaults.
    Returns the unfitted classifier.
    """
    return HistGradientBoostingClassifier(
        categorical_features=categorical_feature_list,  # Natively handle categorical variables
        class_weight=class_weight,
        random_state=random_state,
    )

## Train Model
Trains and returns a model given a dataset to train on and model specifications. Model specifications include:
- Model_type: Random Forest or Histogram-based Boosting Classifier
- Class_weight: Which method to use to correct class imbalance. "balanced" will specify that the classifier assign higher weights to minority classes. "oversampled" will apply random oversampling to the data to increase the representation of minority classes in the dataset. 
- Seed: Specifies the random state to ensure reproducibility
- Run: A subset of predictors specified using a reference csv.
- F40_GROUP: Specify whether to append the F40_GROUP feature to the set of predictors.

In [11]:
def train_model(train_data, model_type, class_weight, seed, run, F40_GROUP=False):
    """
    Trains specified model.

    train_data = dataframe holding the predictors and the "F40" target.
    model_type = "RF" / "HGBC"
    class_weight = "balanced" / "oversampled" / None (the string "none" is treated as None)
    seed = random state used for oversampling and for the Random Forest.
    run = row of the reference csv; columns set to 1 are the predictors for the run.
    F40_GROUP = whether to add the F40_GROUP feature to the predictors.

    Returns the fitted model.
    Raises ValueError if model_type is not "RF" or "HGBC".
    """
    target = "F40"

    # Fail early and clearly instead of hitting an unbound `model` after all the prep work
    if model_type not in ("RF", "HGBC"):
        raise ValueError(f'model_type must be "RF" or "HGBC", got {model_type!r}')

    # The docstring has historically advertised "none"; scikit-learn expects None
    if isinstance(class_weight, str) and class_weight.lower() == "none":
        class_weight = None

    # Encode the data if running a Random Forest Model
    if model_type == "RF":
        train_data = targetEncoder(train_data, target)

    # Get list of predictors for run - drops "Run" and "F40"
    predictors = [x for x in run.loc[run == 1].index.tolist() if x not in ('F40', 'Run')]
    print(predictors)

    # Append F40_GROUP if requested (and not already selected by the run)
    if F40_GROUP and 'F40_GROUP' not in predictors:
        predictors.append('F40_GROUP')

    if model_type == "RF":
        # Swap each predictor for its encoded columns. A multiclass TargetEncoder names its
        # outputs "<feature>_<class>", so a column belongs to a predictor only when the text
        # after "<predictor>_" is one of the target classes. Plain substring matching would
        # also pull in unrelated columns (e.g. 'FVC' matching 'FVC_TREE_...').
        class_suffixes = {str(label) for label in train_data[target].unique()}
        predictors = [
            col
            for name in predictors
            for col in train_data.columns.tolist()
            if col == name
            or (col.startswith(f"{name}_") and col[len(name) + 1:] in class_suffixes)
        ]
    else:
        # HGBC encodes categorical variables natively and only needs their names
        cat_variables = get_cat_features(predictors)

    # Separate training data
    y_train = train_data[target].copy()
    x_train = train_data[predictors].copy()

    # Oversample the training data if requested
    if class_weight == "oversampled":
        x_train, y_train = overSampler(x_train, y_train, seed)
        class_weight = None  # For instantiating the classifier

    # Instantiate the specified classifier
    if model_type == "RF":
        model = randomForestClassifier(n_est=100, min_samples_leaf=50, bootstrap=True, oob_score=True,
                                       n_jobs=-1, random_state=seed, max_features='sqrt',
                                       class_weight=class_weight)
    else:
        # Use default parameters for now.
        # NOTE(review): `seed` is not forwarded to the HGBC model, so HGBC fits are not seeded here.
        model = histGradientBoostingClassifier(cat_variables, class_weight=class_weight)

    # Fit the model with the training data
    model.fit(x_train, y_train)

    return model
    

## Predict using trained model
Return F40 class predictions and evaluation metrics given a trained model and a test dataset. 
- model - a trained sklearn model produced using train_model() above.  
- model_type: Random Forest or Histogram-based Boosting Classifier.
- test_data: Test dataset that includes target labels.
- run_index: The index of the run (i.e. set of predictors) used to train the model.
- run: A subset of predictors specified using a reference csv.
- F40_GROUP: Specify whether to append the F40_GROUP feature to the set of predictors.




In [12]:
def predict_F40(model, model_type, test_data, run_index, run, F40_GROUP=False):
    """
    Predicts F40 class of test data given a trained model.

    model = fitted model (e.g. from train_model).
    model_type = 'RF' / 'HGBC'
    test_data = dataframe holding the predictors and the "F40" labels.
    run_index = integer corresponding to index of run in reference csv.
    run = set of predictors from reference csv.
    F40_GROUP = whether to add the F40_GROUP feature to the predictors.

    returns dict with
        'metrics' : A one-row dataframe of metrics to evaluate performance
        'predictions' : A dataframe of the test_dataset predictors with both predictions and actual labels attached.
    """
    # Create results file
    results = pd.DataFrame(columns=['Run', 'Accuracy', 'Balanced_Accuracy', 'Precision', 'Recall', 'F1_score', 'Predictors'])

    target = "F40"

    # Encode the data if running a Random Forest Model.
    # NOTE(review): this fits a new encoder on the test rows and their labels rather than
    # reusing the encoder fit on the training data; confirm that is intended, since it lets
    # test labels shape the features and requires the test set to contain the same classes.
    if model_type == "RF":
        test_data = targetEncoder(test_data, target)

    # Get list of predictors for run - drops "Run" and "F40"
    predictors = [x for x in run.loc[run == 1].index.tolist() if x not in ('F40', 'Run')]

    # Append F40_GROUP if requested (and not already selected by the run)
    if F40_GROUP and 'F40_GROUP' not in predictors:
        predictors.append('F40_GROUP')

    if model_type == "RF":
        # Swap each predictor for its encoded columns. A multiclass TargetEncoder names its
        # outputs "<feature>_<class>", so a column belongs to a predictor only when the text
        # after "<predictor>_" is one of the target classes. Plain substring matching would
        # also pull in unrelated columns (e.g. 'FVC' matching 'FVC_TREE_...').
        class_suffixes = {str(label) for label in test_data[target].unique()}
        predictors = [
            col
            for name in predictors
            for col in test_data.columns.tolist()
            if col == name
            or (col.startswith(f"{name}_") and col[len(name) + 1:] in class_suffixes)
        ]

    # Separate the labels from the predictors
    y_test = test_data[target].copy()
    x_test = test_data[predictors].copy()

    # Perform prediction
    y_pred = model.predict(x_test)

    # Get metrics (precision/recall/F1 are averaged weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Append metrics to results dataframe
    results.loc[len(results)] = [run_index, accuracy, bal_acc, precision, recall, f1, x_test.columns.tolist()]

    # Create dataframe with test data, original F40, and predicted F40
    model_df = pd.concat([x_test, y_test], axis=1)
    model_df["F40-predicted"] = y_pred

    # Return metrics and model results
    return {'metrics': results,
            'predictions': model_df}



## Run Model Iterations
Allows a single function to iterate through different models/parameters in order to identify what provides the best performance. 
Currently specified to loop through both RF and HGBC models with different methods to address class imbalance and whether to include F40_GROUP as a predictor. Each "iteration" will run each set of predictors in the reference csv provided through runs_path and create a file named with the model specification and containing metrics for each run (i.e. set of predictors) for that iteration.

These files can then be used to evaluate model performance in terms of which predictors and which model types/parameters provide the best performance.

__Note:__ This function can be very time intensive - especially if oversampling is used. 

In [13]:
def run_iterations(sample_points, runs_path, seed, models, weights, F40_GROUPS, output_base_dir=None):
    """
    Runs various iterations of models using supplied runs (i.e. sets of predictors and parameters)

    sample_points = prepared dataframe holding the predictors and the "F40" target.
    runs_path = path to the reference csv; each row is one set of predictors.
    seed = random state used for the train/test split and passed on to train_model.
    models = list of models - options 'HGBC' / 'RF'
    weights = list of methods to try to correct class imbalance - options: 'balanced', None, 'oversampled'
    F40_GROUPS = whether to include F40_GROUP as predictor as list - i.e. [False, True], [True], or [False]
    output_base_dir = optional directory in which the timestamped results directory is
                      created. Defaults to the project's tabular model output directory.

    Writes one csv of metrics per (weight, model, F40_GROUP) combination; returns nothing.
    """
    if output_base_dir is None:
        output_base_dir = r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\F40_modeling\model_outputs\tabular"

    # Get run iterations
    runs = pd.read_csv(runs_path)
    n_runs = len(runs)

    # Make directory to output results to
    output_dir = make_dir(output_base_dir)

    # Define constants
    train_frac = 0.8
    test_frac = 0.2
    target = "F40"

    # Perform train/test split once so every combination is scored on the same test rows
    train, test = train_test_split(sample_points, train_size=train_frac, test_size=test_frac,
                                   random_state=seed, shuffle=True, stratify=sample_points[target])

    result_columns = ['Run', 'Accuracy', 'Balanced_Accuracy', 'Precision', 'Recall', 'F1_score', 'Predictors']

    for weight in weights:
        for model_type in models:
            for GROUP in F40_GROUPS:
                results = pd.DataFrame(columns=result_columns)
                output_file = os.path.join(output_dir, f"results_{model_type}_F40_{GROUP}_Weight_{weight}.csv")
                for i in range(n_runs):
                    run = runs.iloc[i]

                    # Track which run is processing
                    clear_output(wait=True)
                    print(f"{weight}, {model_type}, {GROUP}, Run {i+1} of {n_runs}")

                    # Use the caller's seed rather than a hard-coded value
                    model = train_model(train_data=train, model_type=model_type, class_weight=weight,
                                        seed=seed, run=run, F40_GROUP=GROUP)
                    model_results = predict_F40(model=model, model_type=model_type, test_data=test,
                                                run_index=i, run=run, F40_GROUP=GROUP)
                    results.loc[len(results)] = model_results['metrics'].iloc[0]
                results.to_csv(output_file)

# Main
-----

## Data Preparation
Load in the data.

In [14]:
seed = 1234

# Read in data
## Specify paths to geodatabase and layer from geodatabase

## Path to geodatabase
gdb_path_pnnl = paths_dict['LF23_gdb']

## GDB layer - paths_dict stores the layer name under the '*_fname' key
## (the previous '*_fpath' key does not exist and raised a KeyError)
gdb_layer = paths_dict['LF23_sample_points_fname']

## Read in and process data from GDB
sample_points = data_prep(gdb_path_pnnl, gdb_layer)

# Runs reference csv path - each set of predictors specified here will be run
runs_path = paths_dict['runs_fpath']

# Display the available columns
sample_points.columns


Index(['F40', 'FDST', 'FVC', 'FVH', 'ZONE', 'PYROME', 'ECOREGION', 'FVT',
       'BPS_GROUPVEG', 'BPS_FRG_NEW', 'ASPECT', 'ELEVATION', 'SLOPE', 'BPS',
       'geometry', 'EVT_ORDER', 'EVT_CLASS', 'EVT_PHYS', 'EVT_SBLCS',
       'FDST_RECLASS', 'FVC_TREE', 'FVC_SHRUB', 'FVC_HERB', 'FVC_OTHER',
       'FVH_TREE', 'FVH_SHRUB', 'FVH_HERB', 'FVH_OTHER', 'F40_GROUP'],
      dtype='object')

## Run Model Iterations
Runs all sets of predictors specified in the runs_path csv. Used to identify the best set of parameters and predictors.
Parameters in question are model_type, imbalance correction method, and whether to include F40_GROUP. Creates a directory and outputs csv files with the results for each combination to that directory. 

In [76]:
# Sweep every predictor set in the runs csv across each model type, imbalance
# correction method, and F40_GROUP setting to identify the best combination
run_iterations(
    sample_points=sample_points,
    runs_path=runs_path,
    seed=1234,
    models=['HGBC', 'RF'],
    weights=['balanced', None, 'oversampled'],
    F40_GROUPS=[False, True],
)

None, RF, True, Run 14 of 14
['FVC', 'FVH', 'BPS_GROUPVEG', 'ASPECT', 'ELEVATION', 'SLOPE']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train and Save Model 
Train and save the model that has been found to perform the best.
- Model: Histogram-based Gradient Boosting Classifier with class-weight='balanced'
- Predictors: ['FVT', 'FVC', 'FVH', 'FDST', 'ZONE', 'BPS_FRG_NEW']

Note that including F40_GROUP improves performance but we are unsure if this is valid.

__NOTE__: This is currently set to train on _ALL_ of the sample points with the idea that the model will be applied to the full BPA service territory raster. 

In [21]:
# Get run iterations - will train using run 15
runs = pd.read_csv(runs_path)

# Define Parameters
model_type = "HGBC"
GROUP = False
weight = "balanced"
i = 15  # run_index

# Train Model on all of the sample points
model = train_model(train_data=sample_points, model_type=model_type, class_weight=weight, seed=1234, run=runs.iloc[i],
                    F40_GROUP=GROUP)

# Save the model out
# paths_dict defines the key 'model_out_dir' (the previous 'models_out_dir' raised a KeyError)
out_dir = paths_dict['model_out_dir']
out_file = os.path.join(out_dir, f"{model_type}_Preds_run-{i}_GROUP-{GROUP}_weight-{weight}_{dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.joblib")

dump(model, out_file)
print(f"Saved to: {out_file}")

['FVT', 'FVC', 'FVH', 'FDST', 'ZONE', 'BPS_FRG_NEW']
Saved to: C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\F40 Random Forest Model\models\models_3-18-24\HGBC_Preds_run-15_GROUP-False_weight-balanced_2024-03-18_12-29-34.joblib


## Run single model and return predictions
Run a specified model and get the prediction results and metrics.

In [None]:
# Model specification for this run
model_type = "HGBC"
GROUP = False
weight = "balanced"
i = 15  # run_index

# Split constants
train_frac = 0.8
test_frac = 0.2
target = "F40"

# Load the reference table of predictor sets
runs = pd.read_csv(runs_path)

# Stratified train/test split on the target
train, test = train_test_split(
    sample_points,
    train_size=train_frac,
    test_size=test_frac,
    random_state=seed,
    shuffle=True,
    stratify=sample_points[target],
)

# Fit on the training split
model = train_model(
    train_data=train,
    model_type=model_type,
    class_weight=weight,
    seed=1234,
    run=runs.iloc[i],
    F40_GROUP=GROUP,
)

# Score the held-out split
model_results = predict_F40(
    model=model,
    model_type=model_type,
    test_data=test,
    run_index=i,
    run=runs.iloc[i],
    F40_GROUP=GROUP,
)


IndexError: single positional indexer is out-of-bounds

In [None]:
# Save the predictions for analysis
# Build the output location; the file name records the model specification used
output_dir = paths_dict['model_output_dir']
output_file = f"{model_type}_Preds_run-{i}_GROUP-{GROUP}_weight-{weight}.csv"

# Write and check the predictions
# (the write is commented out - uncomment the next line to save the csv)
#model_results['predictions'].to_csv(os.path.join(output_dir, output_file))
model_results['predictions'].head()


Unnamed: 0,FVC_TREE,FVC_SHRUB,FVC_HERB,FVC_OTHER,FVH_TREE,FVH_SHRUB,FVH_HERB,FVH_OTHER,PYROME,EVT_SBLCS,BPS_FRG_NEW,ASPECT,ELEVATION,SLOPE,F40_GROUP,F40,F40-predicted
88987,104,0,0,0,619,0,0,0,7.0,,7,159,1304,9,Timber Litter,188,188
29427,0,113,0,0,0,507,0,0,13.0,Mixed evergreen-deciduous shrubland,8,-1,1909,2,Grass-Shrub,122,122
37797,0,112,0,0,0,520,0,0,19.0,Evergreen shrubland,8,-1,1316,0,Shrub,142,142
74236,106,0,0,0,619,0,0,0,9.0,Evergreen closed tree canopy,2,58,1393,30,Shrub,142,142
22174,102,0,0,0,607,0,0,0,20.0,Evergreen open tree canopy,2,271,1708,37,Grass-Shrub,122,122


In [None]:
# Check the model results - displays the one-row metrics dataframe returned by predict_F40
model_results['metrics']

Unnamed: 0,Run,Accuracy,Balanced_Accuracy,Precision,Recall,F1_score,Predictors
0,179,0.844907,0.553852,0.861451,0.844907,0.849339,"[FVC_TREE, FVC_SHRUB, FVC_HERB, FVC_OTHER, FVH..."


## Run Model using LF22 and LF23 data
Evaluate model performance using LF22 data as the training dataset and LF23 data as the test (and vice versa). The goal is to evaluate the validity of the model applied between years - which is the ultimate goal. 

----

### Read in and Prepare Data

In [79]:
seed = 1234

# Read in data
## Specify paths to geodatabase and layer from geodatabase

## Path to geodatabase
gdb_path_pnnl = paths_dict['LF23_gdb']
## GDB layers - paths_dict stores the layer names under the '*_fname' keys
## (the previous '*_fpath' keys do not exist and raised a KeyError)
LF22_gdb_layer = paths_dict['LF22_sample_points_fname']
LF23_gdb_layer = paths_dict['LF23_sample_points_fname']

## Read in and process data from GDB
LF22_sample_points = data_prep(gdb_path_pnnl, LF22_gdb_layer)
LF23_sample_points = data_prep(gdb_path_pnnl, LF23_gdb_layer)

# Reference csv of runs
runs_path = paths_dict['runs_fpath']


### Train on LF22 and Predict on LF23

In [91]:
# Model specification for this run
model_type = "HGBC"
GROUP = True
weight = "balanced"
i = 15  # run_index

# Constants
train_frac = 0.8
test_frac = 0.2
target = "F40"

# Load the reference table of predictor sets
runs = pd.read_csv(runs_path)

# Train on one year, test on the other (no random split)
train = LF22_sample_points
test = LF23_sample_points

# Fit on the LF22 points
model = train_model(
    train_data=train,
    model_type=model_type,
    class_weight=weight,
    seed=1234,
    run=runs.iloc[i],
    F40_GROUP=GROUP,
)

# Score against the LF23 points
model_results = predict_F40(
    model=model,
    model_type=model_type,
    test_data=test,
    run_index=i,
    run=runs.iloc[i],
    F40_GROUP=GROUP,
)



['FVT', 'FVC', 'FVH', 'FDST', 'ZONE', 'BPS_FRG_NEW']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [92]:
# Optionally write out model predictions
# Build the output location; the file name records the train/predict years and model specification
output_dir = paths_dict['model_output_dir']
output_file = f"Train-LF22_Predict-LF23_{model_type}_Preds_run-{i}_GROUP-{GROUP}_weight-{weight}.csv"

# (the write is commented out - uncomment the next line to save the csv)
#model_results['predictions'].to_csv(os.path.join(output_dir, output_file))

# Show Results - displays the metrics dataframe returned by predict_F40
model_results['metrics']

Unnamed: 0,Run,Accuracy,Balanced_Accuracy,Precision,Recall,F1_score,Predictors
0,15,0.960472,0.906817,0.969159,0.960472,0.962203,"[FVT, FVC, FVH, FDST, ZONE, BPS_FRG_NEW, F40_G..."


### Train on LF23 and Predict on LF22

In [86]:
# Model specification for this run
model_type = "HGBC"
GROUP = False
weight = None
i = 179  # run_index

# Constants
train_frac = 0.8
test_frac = 0.2
target = "F40"

# Load the reference table of predictor sets
runs = pd.read_csv(runs_path)

# Train on one year, test on the other (no random split)
train = LF23_sample_points
test = LF22_sample_points

# Fit on the LF23 points
model = train_model(
    train_data=train,
    model_type=model_type,
    class_weight=weight,
    seed=1234,
    run=runs.iloc[i],
    F40_GROUP=GROUP,
)

# Score against the LF22 points
model_results = predict_F40(
    model=model,
    model_type=model_type,
    test_data=test,
    run_index=i,
    run=runs.iloc[i],
    F40_GROUP=GROUP,
)


IndexError: single positional indexer is out-of-bounds

In [None]:
# Optionally write out model predictions
# Build the output location; the file name records the train/predict years and model specification
output_dir = paths_dict['model_output_dir']
output_file = f"Train-LF23_Predict-LF22_{model_type}_Preds_run-{i}_GROUP-{GROUP}_weight-{weight}.csv"

# NOTE: unlike the LF22 -> LF23 cell, this write is active and will create/overwrite the csv
model_results['predictions'].to_csv(os.path.join(output_dir, output_file))

# Show Results - displays the metrics dataframe returned by predict_F40
model_results['metrics']

Unnamed: 0,Run,Accuracy,Balanced_Accuracy,Precision,Recall,F1_score,Predictors
0,179,0.735776,0.502878,0.75063,0.735776,0.735398,"[FVC_TREE, FVC_SHRUB, FVC_HERB, FVC_OTHER, FVH..."
