# ABOUT

__Author__: Pat McCornack

__Date__: 4/9/24

__Purpose__: Develop model to update FVT using most recent FDist data. 

------

In [1]:
import os
import datetime as dt
from joblib import dump
from itertools import combinations

import numpy as np
import pandas as pd
import geopandas as gpd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import TargetEncoder, StandardScaler
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier


import matplotlib.pyplot as plt




## Define Paths
Define set of filepaths to conveniently switch between working off local files or the PNNL drive. Set active_data_dir to either local_data_dir or pnnl_data_dir depending on which you're working off of. 

In [2]:
local_root_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model"
pnnl_root_dir = r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\Fuel Attributes Model"

# Define which data directory to work off of
active_root_dir = local_root_dir

paths_dict = {
    "out_base_dir" : os.path.join(active_root_dir, r"model_outputs\tabular"),  # Where to save outputs 
    "ref_data_dir" : os.path.join(active_root_dir, r"..\LF_raster_data\_tables"),  # Location of LF csvs (e.g. LF22_FVT_230.csv)
    "sample_points_dir" : os.path.join(active_root_dir, r"data\sample_points"),  # Location of shapefile to train model on
    "sample_points_fname" : "sample_points_4-17-24_200k_Disturbed.shp",  # Name of shapefile to train model on
    "runs_dict_fpath" : os.path.join(active_root_dir, r"data\runs_dict.csv"),  # Used to test different sets of predictors 
    "model_dir" : os.path.join(active_root_dir, "models")
}




# __Functions__
----


## __Create a Directory to Output Modeling Results__
Names the output directory using the datetime that the script was run. 
Returns the name of the directory. The returned directory is used to output the trained model and/or results. 

In [3]:
def make_dir(base_dir, new_dir_name='model_results'):
        """
        Returns path to a directory created at the specified base_dir location. 

        The name of the created directory can optionally be specified using the dir_name argument. 
        """

        datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        output_dir = os.path.join(base_dir, new_dir_name + "_" + datetime)

        os.makedirs(output_dir)
        return output_dir

## __Categorical Variable Encoder__
Categorical variables must be encoded before being used in some model types. Note that due to HGBC's native encoding support this function is not needed when using that model type.

__Note:__ Aspect, elevation, and slope are the only LANDFIRE variables of interest that are not categorical, which is why they're removed from the predictor list before the data is encoded. 

In [4]:
def targetEncoder(train, test, predictors, target):
    """
    Returns the train and test sets with the categorical predictors encoded. The predictors argument is a list of features being used as 
    predictors in the model, while target is the name of target feature. 
    """
    # Subset to attributes of interest
    attributes = predictors + [target]
    train = train[attributes]
    test = test[attributes]

    # Separate out the predictors from the target and remove continuous variables
    cat_predictors = [x for x in predictors if x not in ['ASPECT', 'ELEVATION', 'SLOPE']]  # Drops continuous variables

    # Encode the features
    enc = TargetEncoder(target_type="multiclass", random_state=1234).set_output(transform="pandas")
    enc.fit(train[cat_predictors], train[target])  # Fit the encoder
    train_trans = enc.transform(train[cat_predictors])  # Encode the train data
    test_trans = enc.transform(test[cat_predictors])  # Encode the test data

    # Replace the features with encoded features
    train = train.drop(cat_predictors, axis=1)
    train = pd.concat([train, train_trans], axis=1)

    test = test.drop(cat_predictors, axis=1)
    test = pd.concat([test, test_trans], axis=1)
    
    return train, test

## __Standardize Data__
This is primarily used to standardize data for the MLP since tree-based methods aren't sensitive to data scale. By default it is set to standardize only topographic variables given that they are the only continuous LANDFIRE features of interest. 

In [5]:
def standardize_data(train, test, features=['SLOPE', 'ASPECT', 'ELEVATION']):
    """
    Return train and test datasets with the features in the 'features' argument having been standardized.
    """

    # Fit the scaler to the training data
    scaler = StandardScaler().fit(train[features])

    # Scale the training data
    train[features] = scaler.transform(train[features])

    # Scale the test data
    test[features] = scaler.transform(test[features])

    return train, test


## __Get List of Lists of Predictor Sets__
In the interest of assessing model performance using multiple sets of predictors, this function utilizes an excel file to generate a list of lists of sets of predictors that can be iterated through the model train/evaluation functions. 

In [6]:
def get_runs(runs_dict_fpath=paths_dict['runs_dict_fpath']):
    """
    Returns a list of lists, where each list is a set of predictors to evaluate model performance with. 

    runs_dict_fpath is the path to an excel file where each column corresponds to a feature, and each row corresponds to a set
    of predictors. If a cell contains a '1', then that feature is included that set of predictors.
    """
    df = pd.read_csv(runs_dict_fpath)
    
    # Drop the "Run" column
    df = df.drop('Run', axis=1)
    
    runs = []

    for i in range(len(df)):
        row = df.iloc[i]
        runs.append(row.loc[row==1].index.tolist())
    
    return runs


## __Generate Combinations of Features of Interest__
Alternatively, automatically generate all possible combinations of a set of potential predictor features to evaluate. 

__Caution__: The size of the list of lists of predictor sets can quickly become unmanageable when too many features of interest are included. 

In [7]:
def generate_combinations(attributes, target):
    """
    Returns a list of lists of all possible combinations of the list of attributes provided. 
    """

    attributes = [x for x in attributes if x not in [target]]  # Drop the target variable
    predictors = []

    for i in range(len(attributes)):
        combs = [list(x) for x in combinations(attributes, i)]
        for item in combs:
            predictors.append(item)

    return predictors[1:]    

## __Pre-Process the Data__

### __Create Data Dictionaries to Append Features__
Some features are separate attributes of the LANDFIRE dataset (e.g. BPS Fire Regime) and others are useful for results analysis (e.g. FDst attributes). These can be mapped to points using LANDFIRE CSVs. The below creates dictionaries to perform that mapping. 

This function is called by join_features.

In [8]:
def read_ref_data(ref_data_dir=paths_dict["ref_data_dir"]):
    """
    Returns a dictionary of dictionaries of mappings between LANDFIRE raster values and other attributes associated with those values. 
    """
    data_dir = ref_data_dir
    BPS_fname = "LF20_BPS_220.csv"
    LF16_EVT_fname = "LF16_EVT_200.csv"
    LF22_FDST_fname = "LF22_FDST_230.csv"
    LF22_FVT_fname = "LF22_FVT_230.csv"
    LF22_EVT_fname = "LF22_EVT_230.csv"
    
    # Create empty dictionary
    LF_ref_dicts = {}

    # Get BPS reference dictionary
    BPS_df = pd.read_csv(os.path.join(data_dir, BPS_fname))
    LF_ref_dicts["BPS_NAME"] = dict(BPS_df[['VALUE', 'BPS_NAME']].values)
    LF_ref_dicts["BPS_FRG"] = dict(BPS_df[['VALUE', 'FRG_NEW']].values)

    # Get FDST reference dictionaries 
    FDST_df = pd.read_csv(os.path.join(data_dir, LF22_FDST_fname ))
    LF_ref_dicts["FDST_TYPE"] = dict(FDST_df[['VALUE', 'D_TYPE']].values)
    LF_ref_dicts["FDST_SEV"] = dict(FDST_df[['VALUE', 'D_SEVERITY']].values)
    LF_ref_dicts["FDST_TSD"] = dict(FDST_df[['VALUE', 'D_TIME']].values)

    # Get EVT reference dictionaries
    EVT_df = pd.read_csv(os.path.join(data_dir, LF16_EVT_fname))
    LF_ref_dicts["EVT_PHYS"] = dict(EVT_df[['VALUE', 'EVT_PHYS']].values)
    LF_ref_dicts["EVT_GP_N"] = dict(EVT_df[['VALUE', 'EVT_GP_N']].values) 
    LF_ref_dicts["EVT_CLASS"] = dict(EVT_df[['VALUE', 'EVT_CLASS']].values) 

    # Get FVT reference dictionaries
    LF22_FVT_df = pd.read_csv(os.path.join(data_dir, LF22_FVT_fname))
    LF_ref_dicts['LF22_FVT'] = dict(LF22_FVT_df[['VALUE', 'EVT_FUEL_N']].values)
    
    ## Map FVT to EVT Groupings
    LF22_EVT_df = pd.read_csv(os.path.join(data_dir, LF22_EVT_fname))
    LF_ref_dicts['FVT_EVT'] = dict(LF22_EVT_df[['EVT_FUEL', 'EVT_FUEL_N']].values)
    LF_ref_dicts["FVT_PHYS"] = dict(LF22_EVT_df[['EVT_FUEL', 'EVT_PHYS']].values)
    LF_ref_dicts["FVT_GP_N"] = dict(LF22_EVT_df[['EVT_FUEL', 'EVT_GP_N']].values) 
    LF_ref_dicts["FVT_CLASS"] = dict(LF22_EVT_df[['EVT_FUEL', 'EVT_CLASS']].values) 
    LF_ref_dicts["FVT_LF"] = dict(LF22_EVT_df[['EVT_FUEL', 'EVT_LF']].values)

    return LF_ref_dicts
                         
        

### __Append Features using Data Dictionaries__
Append in selected features using the LANDFIRE data dictionaries.

__Note:__ Items in feature_list must be in the source_layers dictionary. 

In [9]:
def join_features(sample_points, feature_list = ['BPS_NAME', 'BPS_FRG', 'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N', 'EVT_CLASS']):
    """
    Returns the sample_points layer with the features in feature_list appended. 

    Items in feature_list must be in the source_layers dictionary.  
    """
    
    LF_ref_dicts = read_ref_data()
    
    source_layers = {
        'BPS_NAME' : 'BPS', 
        'BPS_FRG' : 'BPS',
        'FDST_TYPE' : 'LF22_FDST',
        'FDST_SEV' : 'LF22_FDST', 
        'FDST_TSD' : 'LF22_FDST', 
        'EVT_PHYS' : 'LF16_EVT',
        'EVT_GP_N' : 'LF16_EVT',
        'EVT_CLASS' : 'LF16_EVT'
    }

    # Iterate through feature_list and append features to sample_points
    for feature in feature_list:
        sample_points[feature] = sample_points[source_layers[feature]].map(LF_ref_dicts[feature]).copy()

    return sample_points

### __Low Count Filter__
Some classes have exceedingly low representation in the dataset. In order to run train_test_split, the target class must have a count greater than 1. The following will filter out classes with counts of 1. Given the size of the dataset excluding these classes will not detrimentally impact model performance.

In [10]:
def low_count_filter(sample_points, target):  
    """
    Returns modified sample_points dataframe. Classes in the target feature that have counts of 1 are removed. 
    """  
    # Group by FVT
    LF22_FVT_counts = sample_points.groupby(target).count()

    # Identify FVTs with low observation counts
    low_count_FVT = LF22_FVT_counts[LF22_FVT_counts.iloc[:,0] < 5].index.tolist()

    # Remove those FVTs from sample_points
    sample_points = sample_points.loc[~sample_points[target].isin(low_count_FVT)]
    print(f"Low count {target}: {low_count_FVT}")
    print(f"After removing low count {target}: {sample_points.shape}")
    print(f"Sample Points Attributes: {sample_points.columns}")

    return sample_points

### __Prepare the Sample Points__
Before training the model, the sample points need to be filtered. Filtering steps are:
- Remove near points - defined as points that are less than 70m apart.
- Remove Null points (-9999/-1111 values) - these will not be updated in the final raster. 
- Remove undisturbed values - the model is intended to update points that have been disturbed in the last 10 years. 
- Remove points with agricultural or developed FVT classes - these are updated using datasets not used for other classes. 
- Remove observations with classes that have very low representation (less than 5 observations in the class) in the dataset. This is done to allow train_test_split to run. 
- Optionally, join in additional features. feature_list may be modified to select which features to add in (see join_features for further details). 

In [11]:
def data_prep(sample_point_fpath, target):
    """
    Returns dataframe of processed sample points. Sample points are read in from shapefile. Processing includes filtering out null points and developed/agricultural FVTs as 
    well as appending additional features. 
    """

    # Read in gdf
    sample_points = gpd.read_file(sample_point_fpath)
    print(f"Total Number of points: {sample_points.shape[0]}")

    # Filter out near points
    sample_points = sample_points.loc[sample_points['NEAR_FID'] == -1]
    print(f"After removing near points: {sample_points.shape[0]}")

    # Drop unneeeded columns if present
    sample_points = sample_points.drop(['Classified', 'GrndTruth', 'NEAR_FID', 'NEAR_DIST'], axis=1,
                                       errors='ignore')
    
    # Remove observations with -9999/-1111 in any field 
    sample_points = sample_points.loc[~sample_points.isin([-1111, -9999]).any(axis=1)]
    print(f"After removing null points: {sample_points.shape[0]}")

    # Filter out points that weren't disturbed
    sample_points = sample_points.loc[sample_points['LF22_FDST'] != 0]
    print(f"After removing undisturbed points: {sample_points.shape[0]}")

    # Join in additional features
    sample_points = join_features(sample_points)

    # Filter out agricultural and developed points
    developed_fvt = list(range(20,33)) + list(range(2901,2906))
    ag_fvt = [80, 81, 82] + list(range(2960, 2971))
    fvt_filter = developed_fvt + ag_fvt
    sample_points = sample_points.loc[~sample_points['LF22_FVT'].isin(fvt_filter)]

    # Filter out classes in the target feature with very low counts
    sample_points = low_count_filter(sample_points, target)

    return sample_points


## __Instantiate Estimators__

### __Histogram Based Gradient Boosting Classifier__
Scikit-learn implementation of Histogram Based Gradient Boosting Classifier

In [12]:
def histGradientBoostingClassifier(categorical_feature_list, class_weight='Balanced', seed=1234):
    """
    Returns specified histogram-based gradient boosting classifier. 
    """

    hgb_classifier = HistGradientBoostingClassifier(
        categorical_features=categorical_feature_list,  # Natively handle categorical variables
        class_weight=class_weight,
        random_state=seed,
        learning_rate=0.01,
        max_iter=100
    )

    return hgb_classifier

### __Catboost Gradient Boosting__
Catboost implementation of Gradient Boosting Classifier

In [13]:
def catBoost_hgbc(cat_features):
    """
    Returns specified gradient boosting classifier from catBoost library. 
    """
    cb_hgbc = CatBoostClassifier(
        iterations=2000, 
        learning_rate=0.01,
        loss_function="MultiClass",
        #boosting_type="Plain",
        cat_features=cat_features,  # List of categorical features
        auto_class_weights='Balanced', # Used to handle class imbalance
        random_seed=1234,
        task_type='GPU'  # Train the model on GPU
    )

    return cb_hgbc

### __Random Forest Classifier__
Scikit-learn implementation Random Forest Classifier


In [14]:
def randomForestClassifier(n_est=100, min_samples_leaf=50, bootstrap=True, oob_score=True, 
                           n_jobs=-1, random_state=1234, max_features='auto', class_weight=None):
    """
    Returns specified Random Forest Classifier. 
    """
    rf_classifier = RandomForestClassifier(
        n_estimators=n_est,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        max_features=max_features,
        class_weight=class_weight,
        max_depth = None
    )

    return rf_classifier


### __Multi-Layer Perceptron__
Scikit-learn implementation of Multi-Layer Perceptron. Note that this has not been tuned.

In [15]:
def multiLayerPerceptron():
    """
    Returns specified Multi Layer Perceptron.
    """
    clf = MLPClassifier(
        max_iter = 10000,
        activation = 'relu',  # 'relu' is the default
        hidden_layer_sizes = (256, 128, 64)
    )

    return clf

## __Train Model__
Trains and returns HGBC model provided data. Subsets the training data to a provided list of predictors and predicts the specified target feature.

__Note__: Aspect, Elevation, and Slope are the only continuous LANDFIRE datasets, therefore any feature not in that list is assumed to be categorical.

In [16]:
def train_model(train_data, predictors, target, model_type='HGBC'):
    """
    Returns trained model of specified type given a set of predictors. 
    """

    class_weight = "balanced"

    # Get list of predictors for run
    cat_features = [x for x in predictors if x not in ['ASPECT', 'ELEVATION', 'SLOPE']]  # Categorical features for HGBC models  
    
    # Setting the data type to categorical speeds up model training for catBoost
    if model_type == "catBoost":
        train_data[cat_features] = train_data[cat_features].astype("category")

    # Separate training data predictors/response
    y_train = train_data[target].copy()
    X_train = train_data[predictors].copy()

    # Fit selected model_type
    if model_type == 'RF':
        model = randomForestClassifier(n_est=100, min_samples_leaf=50, bootstrap=True, oob_score=True,
                                            n_jobs=-1, random_state=1234, max_features='sqrt', class_weight=class_weight)
    elif model_type == 'HGBC':
        model = histGradientBoostingClassifier(categorical_feature_list=cat_features, class_weight=class_weight)
    
    elif model_type == 'catBoost':
        model = catBoost_hgbc(cat_features=cat_features)

    elif model_type == 'MLP':
        print('Defining MLP...')
        model = multiLayerPerceptron()
    
    # Fit the model with the training data
    print('Fitting Model...')
    model.fit(X_train, y_train)

    # Return the fit model
    return model         
    

## __Evaluate Model__
Provided a trained model, test data, a set of predictors, and a target - return a list of predictions metrics on prediction accuracy.

In [17]:
def eval_model(model, test_data, predictors, target, model_type='HGBC'):
    """
    Returns a dictionary containing 1. 'metrics' : a list of metrics quantifying model performance, and 2. 'predictions' : a list of predictions corresponding to 
    each observation in the sample_points data.
    """
       
    # Separate the predictors from target
    y_test = test_data[target].copy()
    x_test = test_data[predictors].copy()

    # Perform prediction
    y_pred = model.predict(x_test)

    # Get metrics
    accuracy = round(accuracy_score(y_test, y_pred), 3)
    bal_acc = round(balanced_accuracy_score(y_test, y_pred),3)
    recall = round(recall_score(y_test, y_pred, average='macro'), 3)
    precision = round(precision_score(y_test, y_pred, average='macro'), 3)
    f1 = round(f1_score(y_test, y_pred, average='macro'), 3)

    print(f"Predictors: {predictors}")
    print(f"Accuracy: {accuracy}")
    print(f"Balanced Accuracy: {bal_acc}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F1: {f1}")

    return {
        "metrics" : [accuracy, bal_acc, recall, precision, f1, predictors],
        "predictions" : y_pred
    }


## __Results Evaluation Functions__

### __Plot the Class Distribution__
Plots the distribution of the variable of interest. Can be set to display the raw counts for each class or the percentage of the dataste that the class accounts for. 

In [18]:
def plot_distribution(df, group_var, metric='Count', sort=True, title=f'Distribution', figsize=(15, 50)):
    """
    Plots the distribution of group_var. 'metric' can be set to either 'Count' or 'Percent' to change the display type. 
    The 'sort' argument specifies whether to sort the display in order from most common to least common. 
    """

    # Get count of target class
    df_gp = df.groupby(group_var).count().reset_index()
    df_gp = df_gp.iloc[:, 0:2].rename(columns={df_gp.columns[1] : 'Count'})

    # Get classes proportional representation
    df_gp['Percent'] = df_gp['Count'] / df_gp['Count'].sum() * 100
    df_gp[group_var] = df_gp[group_var].astype('str')  # For plotting change FVT to str
    if sort == True: 
        df_gp = df_gp.sort_values(by="Count", ascending=True)

    # Visualize the distribution
    fig, ax = plt.subplots(figsize=figsize)
    hbars = ax.barh(df_gp[group_var],
                df_gp[metric],
                align='center')
    ax.set(title=title,
        xlabel= f'{metric}', 
        ylabel=f'{group_var}')  
    ax.bar_label(hbars, fmt=" %.2f")
    
    # Remove whitespace at margins
    plt.ylim(ax.patches[0].get_y(), ax.patches[-1].get_y() + ax.patches[-1].get_height()+0.5)

    plt.show()

### __Plot Confusion Matrix__
Generates a confusion matrix of true labels vs. predicted labels.
Normalization options are: 
- 'true' : Normalizes over the rows.
- 'pred' : Normalizes over the columns. 
- 'all' : Normalizes over the entire population. 

In [19]:
# Plot a confusion matrix
def plot_cm(y_true, y_pred, normalize=None, figsize=10, title="Confusion Matrix"):
    """
    Plots confusion matrix of true values (y_true) against predicted values (y_pred). Can be normalized by setting 
    normalize='true'/'pred'/'all'.
    """

    np.set_printoptions(precision=2) 

    # Define title
    title = f"{title}, normalize = {normalize}"

    # Plot confusion matrix
    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                                                   cmap=plt.cm.Blues,
                                                   normalize=normalize,
                                                   xticks_rotation='vertical'
                                                   )
    disp.ax_.set_title(title)

    # Set plot size
    fig = disp.ax_.get_figure()
    fig.set_figwidth(figsize)
    fig.set_figheight(figsize)

    # Display results
    plt.show()

# __Main__

----

## Train Model

In [20]:
# Load in data and perform train/test split.
seed = 1234

# Define predictors and target
predictors = ['LF20_FVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH']
target = 'LF22_FVT'

# Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

# Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

# Define the type of model to be used
model_type = 'HGBC'  

# Train model
model = train_model(train_data=sample_points, predictors=predictors, target=target, model_type=model_type)

# Save model 
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
model_fpath = os.path.join(paths_dict['model_dir'], model_fname)
dump(model, model_fpath)
print(f"Model written to: {model_fpath}")

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVT: [2016, 2064, 2066, 2099, 2128, 2177, 2184, 2211, 2219, 2260, 2567, 2910, 2911]
After removing low count LF22_FVT: (160346, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')
Fitting Model...
Model written to: C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model\models\LF22_FVT_HGBC_model_2024-05-16_11-30-32


## __FVT__

In [68]:
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVT'

## Define train/test split proportion
train_frac = 0.7
test_frac = 0.3

## Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVT: [2016, 2064, 2066, 2099, 2128, 2177, 2184, 2211, 2219, 2260, 2567, 2910, 2911]
After removing low count LF22_FVT: (163057, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')


In [69]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors
predictors = ['LF20_FVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH']

# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type == 'RF' or model_type == 'MLP':
    train, test = targetEncoder(train_points, test_points, predictors, target)
    predictors = train.columns.tolist()

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, features=predictors)
else:
    train, test = train_points, test_points

# Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

# Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)

Fitting Model...
Score on the Training Set: 
Predictors: ['LF20_FVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH']
Accuracy: 0.733
Balanced Accuracy: 0.899
Recall: 0.899
Precision: 0.68
F1: 0.756
Score on the test set: 
Predictors: ['LF20_FVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH']
Accuracy: 0.67
Balanced Accuracy: 0.558
Recall: 0.558
Precision: 0.462
F1: 0.491


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Save out results
## Concatenate predictions to test set
test[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=f'{target}_model_results')
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
test.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


Create a test set using the FVT predictions. This will then be fed into the FVC/FVH models to see how replacing the true FVT values with predicted values will impact the FVC/FVH predictions

In [None]:
fvt_test = test.copy()
fvt_test["LF22_FVT_Predictions"] = results['predictions']

### __Train model to Create Predicted FVT Raster__
Use all sample points to train model. 

In [22]:
# Load in data and perform train/test split.
seed = 1234

# Define predictors and target
predictors = ['LF20_FVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH']
target = 'LF22_FVT'

# Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

# Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

# Define the type of model to be used
model_type = 'HGBC'  

# Train model
model = train_model(train_data=sample_points, predictors=predictors, target=target, model_type=model_type)

# Save model 
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
model_fpath = os.path.join(paths_dict['model_dir'], model_fname)
dump(model, model_fpath)
print(f"Model written to: {model_fpath}")

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVT: [2016, 2064, 2066, 2099, 2128, 2177, 2184, 2211, 2219, 2260, 2567, 2910, 2911]
After removing low count LF22_FVT: (160346, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')
Fitting Model...


KeyError: 'model_fname'

In [24]:
# Save model 
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
model_fpath = os.path.join(paths_dict['model_dir'], model_fname)
dump(model, model_fpath)
print(f"Model written to: {model_fpath}")

Model written to: C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model\models\LF22_FVT_HGBC_model_2024-05-10_16-49-07


## __FVC__

In [42]:
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVC'

## Define train/test split proportion
train_frac = 0.7
test_frac = 0.3

## Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVC: [117]
After removing low count LF22_FVC: (163088, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')
      LF22_FVT  LF20_FVT  LF16_FVT  LF16_EVT  LF14_EVT  LF22_FDST  ZONE   BPS  \
7742      2126      2146      2274      9309      3145        113    12  1069   
7743      2146      2146      2146      7146      3145        113    12  1069   
7744      2126      2126      2126      7126      3019        113    12  1050   
7745

In [43]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors
predictors = ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME']


# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type == 'RF' or model_type == 'MLP':
    train, test = targetEncoder(train_points, test_points, predictors, target)
    predictors = train.columns.tolist()

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, predictors, target)
else:
    train, test = train_points, test_points

# Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

# Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)

Fitting Model...
Score on the Training Set: 
Predictors: ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME']
Accuracy: 0.991
Balanced Accuracy: 0.996
Recall: 0.996
Precision: 0.966
F1: 0.976
Score on the test set: 
Predictors: ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME']
Accuracy: 0.991
Balanced Accuracy: 0.988
Recall: 0.988
Precision: 0.96
F1: 0.965


In [49]:
# Save out results
## Concatenate predictions to test_points set
test_points[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=paths_dict['new_dir_name'])
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
test_points.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


KeyError: 'new_dir_name'

In [63]:
## Create a model using the full dataset
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVC'

## Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors
predictors = ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME']

# Train model
model = train_model(train_data=sample_points, predictors=predictors, target=target, model_type=model_type)

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(paths_dict['model_dir'], model_fname))

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVC: [117]
After removing low count LF22_FVC: (163088, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')
Fitting Model...


['C:\\Users\\mcco573\\OneDrive - PNNL\\Documents\\_Projects\\BPA Wildfire\\Fuel Attributes Model\\models\\LF22_FVC_HGBC_model_2024-05-15_15-53-00']

### Run on multiple sets of predictors
Used to compare/contrast the performance of the model using various sets of predictors. 

In [None]:
# Run multiple sets of predictors to compare

# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors - includes target and predictors
predictors_list = [
    ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE'],
    ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME'],
    ['LF22_FVT', 'LF20_FVC', 'LF16_FVC', 'LF22_FDST', 'ZONE'],
    ['LF22_FVT', 'LF20_FVC', 'LF16_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME'],
    ['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF20_FVC', 'LF16_FVC', 'LF20_FVH', 'LF16_FVH', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME']
]

# Define results DF
results_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'predictors', 'model_type', 'train/test'])

# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type == 'RF' or model_type == 'MLP':
    train, test = targetEncoder(train_points, test_points, predictors, target)
    predictors = train.columns.tolist()

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, predictors, target)
else:
    train, test = train_points, test_points

for predictors in predictors_list: 
    # Train model
    model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

    # Evaluate Model
    print("Score on the Training Set: ")
    results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)
    results_df.loc[len(results_df)] = results['metrics'] + [f'{model_type}', 'train']

    print("Score on the test set: ")
    results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)
    results_df.loc[len(results_df)] = results['metrics'] + [f'{model_type}', 'test']

In [None]:
results_df

### __FVC with Predicted FVT__


In [None]:
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVC'

## Define train/test split proportion
train_frac = 0.8
test_frac = 0.2

## Define path to sample points
sample_points_fpath = os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

In [None]:
# Define and run model
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

## Define set of predictors - includes target and predictors
predictors = ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE']

## Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type == 'RF' or model_type == 'MLP':
    train, test = targetEncoder(train_points, test_points, predictors, target)
    predictors = train.columns.tolist()

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, predictors, target)
else:
    train, test = train_points, test_points

## Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

## Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on test set with predicted FVTs")
results = eval_model(model=model, test_data=fvt_test, predictors=predictors, target=target, model_type=model_type)


In [None]:
# Save out results
## Concatenate predictions to test_points set
test_points[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=paths_dict['new_dir_name'])
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
test_points.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


##  __FVH__

In [66]:
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVH'

## Define train/test split proportion
train_frac = 0.7
test_frac = 0.3

## Define path to sample points
sample_points_fpath = os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVH: []
After removing low count LF22_FVH: (163092, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')


In [67]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors - includes target and predictors
predictors = ['LF22_FVT', 'LF20_FVH', 'LF22_FDST', 'ZONE', 'BPS_NAME']
#predictors = ['LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH',  target]

# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type == 'RF' or model_type == 'MLP':
    train, test = targetEncoder(train_points, test_points, predictors, target)
    predictors = train.columns.tolist()

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, predictors, target)
else:
    train, test = train_points, test_points

# Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

# Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)

Fitting Model...
Score on the Training Set: 
Predictors: ['LF22_FVT', 'LF20_FVH', 'LF22_FDST', 'ZONE', 'BPS_NAME']
Accuracy: 0.993
Balanced Accuracy: 0.994
Recall: 0.994
Precision: 0.953
F1: 0.966
Score on the test set: 
Predictors: ['LF22_FVT', 'LF20_FVH', 'LF22_FDST', 'ZONE', 'BPS_NAME']
Accuracy: 0.992
Balanced Accuracy: 0.991
Recall: 0.991
Precision: 0.941
F1: 0.954


In [61]:
# Save out results
## Concatenate predictions to test_points set
test_points[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=paths_dict['new_dir_name'])
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
test_points.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


KeyError: 'new_dir_name'

In [65]:
## Create a model using the full dataset
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVH'

## Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors
predictors = ['LF22_FVT', 'LF20_FVH', 'LF22_FDST', 'ZONE', 'BPS_NAME']

# Train model
model = train_model(train_data=sample_points, predictors=predictors, target=target, model_type=model_type)

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(paths_dict['model_dir'], model_fname))

Total Number of points: 200390
After removing near points: 196097
After removing null points: 163092
After removing undisturbed points: 163092
Low count LF22_FVH: []
After removing low count LF22_FVH: (163092, 31)
Sample Points Attributes: Index(['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF16_EVT', 'LF14_EVT', 'LF22_FDST',
       'ZONE', 'BPS', 'ASPECT', 'SLOPE', 'ELEVATION', 'LF20_EVT', 'LF20_SCla',
       'LF20_FVH', 'LF20_FVC', 'LF22_FVH', 'LF22_FVC', 'LF16_FVH', 'LF16_FVC',
       'PRED_FVT', 'LF22_F40', 'BPS_FRG_NE', 'geometry', 'BPS_NAME', 'BPS_FRG',
       'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N',
       'EVT_CLASS'],
      dtype='object')
Fitting Model...


['C:\\Users\\mcco573\\OneDrive - PNNL\\Documents\\_Projects\\BPA Wildfire\\Fuel Attributes Model\\models\\LF22_FVH_HGBC_model_2024-05-15_15-57-07']

### Evaluate a list of models
Run multiple sets of predictors to evaluate against each other. 

In [None]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC

# Generate combinations of potential predictor sets
##predictors = ['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'EVT_GP_N', 'EVT_PHYS', 'EVT_CLASS']
##predictors_list = generate_combinations(predictors, 'LF22_FVT')

# Alternatively, get list of lists of predictors sets from file 
##predictors_list = get_runs()

# Alternatively, define a list of lists of attribute combinations
predictors_list = [
    ['ASPECT', 'ELEVATION', 'SLOPE', 'LF22_FDST', 'ZONE'],
    ['ASPECT', 'ELEVATION', 'SLOPE', 'LF22_FDST', 'ZONE'],
    ['ASPECT', 'ELEVATION', 'SLOPE', 'LF20_EVT', 'LF22_FDST', 'ZONE'],
    ['LF20_EVT', 'LF22_FDST', 'ZONE', 'BPS_NAME']

]

# Dataframe to store results
result_df = pd.DataFrame(columns=['Accuracy', 'Balanced_Accuracy', 'Recall', 'Precision', 'F1', 'Predictors'])

# Iterate through combinations of predictors
for predictors in predictors_list:
    # Train model
    model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

    # Evaluate Model
    results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)['metrics']

    result_df.loc[len(result_df)] = results

result_df

# Model Evaluation
----

!!! __NOTE:__ THIS HAS NOT BEEN UPDATED TO REFLECT CHANGES TO FUNCTOINS


As of 4/23/24 the best performance achieved has been an accuracy of ~60% using a HGBC and the predictors: ['LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF22_FVT']. 

All parameters were set to default except for: 
- categorical_features = List of categorical features
- class_weight = "balanced"
- random_state = 1234
- learning_rate = .01
- max_iter = 50

After tinkering with a number of hyperparameters, but not conducting a full grid search, it appears that tuning the parameters may not yield significant improvements. 

__Goal__: Try to tease out what is causing usch low model performance. Is it due to under/over fitting, or is the set of predictors insufficent to further separate out the classes? 


In [None]:
# Run the model 
seed = 1234
target = 'LF22_FVT'

# Define train/test split
train_frac = 0.8
test_frac = 0.2

# Define path to sample points
sample_points_fpath = paths_dict['sample_point_fpath']

# Read in and prepare data
sample_points = data_prep(sample_points_fpath)

# Perform Train/Test Split
train, test = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

# Standardize the data
train, test = standardize_topo(train, test)

# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC

# Define set of attributes - includes target and predictors
attributes = ['LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_SCla', target]

# Train model
model = train_model(train_data=train, attributes=attributes, model_type=model_type)

# Evaluate Model
results = eval_model(model=model, test_data=test, attributes=attributes, model_type=model_type)

## Compare distributions of actual vs. predicted
In general we see a trend where the model underpredicts the most common classes and overpredicts the less common classes.

In [None]:
# Define metric to display - Percent of total or Count
metric = 'count'  # 'count' / 'perc'

# Create dataframe of predictions vs. true labels
pred_df = pd.DataFrame(data={'True' : test['LF22_FVT'], 'Prediction' : results['predictions']})

# Get counts of predictions
pred_gp = pred_df.groupby('Prediction').count().reset_index()
pred_gp = pred_gp[['Prediction', 'True']].rename(columns={'True':'pred_count'})
pred_gp['pred_perc'] = pred_gp['pred_count'] / pred_gp['pred_count'].sum() * 100
pred_gp['Prediction'] = pred_gp['Prediction'].astype('str')

# Get counts of true
true_gp = pred_df.groupby('True').count().reset_index()
true_gp = true_gp[['True', 'Prediction']].rename(columns={'Prediction':'true_count'})
true_gp['true_perc'] = true_gp['true_count'] / true_gp['true_count'].sum() * 100
true_gp['True'] = true_gp['True'].astype('str')

# Join the dataframes
pred_gp = pred_gp.set_index('Prediction')
true_gp = true_gp.set_index('True')

fvt_gp = true_gp.join(pred_gp, how='inner')
fvt_gp = fvt_gp.sort_values(by='true_count')

# Plot the data side-by-side
ind = np.arange(len(fvt_gp))  # The y locations for the groups
width = 0.3  # The width of the bars

fig, ax = plt.subplots(figsize=(15,40))
true_bars = ax.barh(ind+width/2, fvt_gp[f'true_{metric}'], width, label='Actual')
pred_bars = ax.barh(ind-width/2, fvt_gp[f'pred_{metric}'], width, label='Predicted')

ax.set_title('Actual vs. Predicted FVT Distribution')
ax.set_ylabel('FVT')
ax.set_xlabel(f'{metric}')
ax.set_yticks(ind)
ax.set_yticklabels(fvt_gp.index.values)
ax.legend()
ax.bar_label(true_bars, fmt=" %.2f")
ax.bar_label(pred_bars, fmt=" %.2f")

plt.ylim(ax.patches[0].get_y()-0.5, ax.patches[-1].get_y() + ax.patches[-1].get_height()+0.5)

plt.show()

### How much does each class contribute to the total classification error?
Where is the classification error coming from? The top most frequently misclassified, 2045 and 2080, contribute close to 8% of the error, and the rest is distributed across the other classes. These two classes also happen to be the most dominant on the landscape. Although it's interesting that the classifier performed better on 2028, despite it be almost as prevalent as 2045.

- 2045: Tr Northern Rocky Mountain Dry-Mesic Montane Mixed Conifer Forest
- 2080: Sh Inter-Mountain Basins Big Sagebrush Shrubland
- 2028: Tr Mediterranean California Mesic Mixed Conifer Forest and Woodland

In [None]:
# Define metric to display - Count or Percent
metric = 'perc_total_misclass'  # 'count' / 'perc' / 'perc_total_misclass

# Look at misclass distributions
fvt_gp['diff_count'] = fvt_gp['true_count'] - fvt_gp['pred_count']  # Get the count of misclasses
fvt_gp['diff_perc'] = abs(fvt_gp['diff_count']) / abs(fvt_gp['diff_count']).sum() * 100  # Get the percentage of total misclasses
fvt_gp['diff_perc_total_misclass']  = fvt_gp['diff_perc'] * (1 - results['metrics'][0])  # Get the percent contribution to misclassification
fvt_gp_diff_sort = fvt_gp.sort_values(by='diff_perc')

# Plot the data side-by-side
fig, ax = plt.subplots(figsize=(15,40))
diff_bars = ax.barh(fvt_gp_diff_sort.index.values, fvt_gp_diff_sort[f'diff_{metric}'])


ax.set_ylabel('FVT')
ax.set_xlabel(f'{metric}')
ax.set_yticks(ind)
ax.set_yticklabels(fvt_gp_diff_sort.index.values)
ax.bar_label(diff_bars, fmt=" %.2f")

plt.ylim(ax.patches[0].get_y()-0.5, ax.patches[-1].get_y() + ax.patches[-1].get_height()+0.5)

print(f"Total Misclass Error: {round((1-results['metrics'][0]) * 100, 2)}%")

plt.show()

## Evaluate the confusion matrix
Looking at the two most commonly misclassified classes (2080, 2045), how are they being misclassified? 

__2045__: Tr Northern Rocky Mountain Dry-Mesic Montane Mixed Conifer Forest

60% of samples were classified correctly. False predictions were fairly spread over the classes, but the most common mispredictions were:
- 2047 (10%): Tr Northern Rocky Mountain Mesic Montane Mixed Conifer Forest
- 2053 (7.6%): Tr Northern Rocky Mountain Ponderosa Pine Woodland and Savanna
- 2227 (6.3%): Tr Pseudotsuga menziesii Forest Alliance

__2080__: Sh Inter-Mountain Basins Big Sagebrush Shrubland

58% of samples were correctly classifed. False predictions were also fairly spread over the classes, but the most common mispredictoins were:
- 2125 (9.8%): Sh Inter-Mountain Basins Big Sagebrush Steppe
- 2065 (6.1%): Sh Columbia Plateau Scabland Shrubland
- 2123 (5.1%): He Columbia Plateau Steppe and Grassland
- 2282 (4.4%): Sh Great Basin & Intermountain Ruderal Shrubland
- 2273 (3.8%): He Great Basin & Intermountain Introduced Annual Grassland


In [None]:
# Visualize where misclasses are occurring using confusion matrix
plot_cm(y_true = test[target], y_pred=results["predictions"], normalize='true', figsize=60)

## What are the class-specific misclassification distributions? 

In [None]:
LF_ref_dicts = read_ref_data()

pred_df['True_FVT_NAME'] = pred_df['True'].map(LF_ref_dicts['LF22_FVT']) 
pred_df['Pred_FVT_NAME'] = pred_df['Prediction'].map(LF_ref_dicts['LF22_FVT']) 

pred_df['True_EVT_CLASS'] = pred_df['True'].map(LF_ref_dicts['FVT_CLASS']) 
pred_df['Pred_EVT_CLASS'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_CLASS']) 

pred_df['True_EVT_PHYS'] = pred_df['True'].map(LF_ref_dicts['FVT_PHYS']) 
pred_df['Pred_EVT_PHYS'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_PHYS']) 

pred_df['True_GP_N'] = pred_df['True'].map(LF_ref_dicts['FVT_GP_N']) 
pred_df['Pred_GP_N'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_GP_N']) 

pred_df['True_LF'] = pred_df['True'].map(LF_ref_dicts['FVT_LF']) 
pred_df['Pred_LF'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_LF']) 

### __2045__: Tr Northern Rocky Mountain Dry-Mesic Montane Mixed Conifer Forest


In [None]:
true_2045 = pred_df.loc[pred_df['True'] == 2045]
class_distribution(true_2045, metric = 'Percent', variable="Pred_FVT_NAME", group_var="True", sort='False')

### __2080__: Sh Inter-Mountain Basins Big Sagebrush Shrubland

In [None]:
true_2080 = pred_df.loc[pred_df['True'] == 2080]
class_distribution(true_2080, metric = 'Percent', variable="Pred_FVT_NAME", group_var="True", sort='False')

## Are the misclassifications ecologically similar?
-----
If the misclassifications are ecologically similar then they may not effect the F40 classification.

__QUESTIONS__: 
1. Are the results ecologically similar enough to use with the F40 model?
2. Is there a pattern that suggests what the model is struggling to separate?

In [None]:
# Get the predictions and join the EVT_* groupings
pred_df = pd.DataFrame(data={'True' : test['LF22_FVT'], 'Prediction' : results['predictions']})

LF_ref_dicts = read_ref_data()

pred_df['True_FVT_NAME'] = pred_df['True'].map(LF_ref_dicts['LF22_FVT']) 
pred_df['Pred_FVT_NAME'] = pred_df['Prediction'].map(LF_ref_dicts['LF22_FVT']) 

pred_df['True_EVT_FUEL'] = pred_df['True'].map(LF_ref_dicts['FVT_EVT'])
pred_df['Pred_EVT_FUEL'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_EVT'])

pred_df['True_EVT_CLASS'] = pred_df['True'].map(LF_ref_dicts['FVT_CLASS']) 
pred_df['Pred_EVT_CLASS'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_CLASS']) 

pred_df['True_EVT_PHYS'] = pred_df['True'].map(LF_ref_dicts['FVT_PHYS']) 
pred_df['Pred_EVT_PHYS'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_PHYS']) 

pred_df['True_GP_N'] = pred_df['True'].map(LF_ref_dicts['FVT_GP_N']) 
pred_df['Pred_GP_N'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_GP_N']) 

pred_df['True_LF'] = pred_df['True'].map(LF_ref_dicts['FVT_LF']) 
pred_df['Pred_LF'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_LF']) 

# Remove the NaNs - the confusion matrix won't work with them
null_count = len(pred_df.loc[pred_df.isnull().any(axis=1)])
total_count = len(pred_df)
print(f"Removed {null_count} null values out of {total_count} points ({round(null_count/total_count * 100, 2)}%).")
df = pred_df.loc[~pred_df.isnull().any(axis=1)]


In [None]:
pred_df.to_csv(r"C:\Users\mcco573\OneDrive - PNNL\Desktop\temp.csv")

In [None]:
sample_points.to_csv(r"C:\Users\mcco573\OneDrive - PNNL\Desktop\temp_sample_pts.csv")

In [None]:
# Get the accuracy of LF assignment
accuracy = accuracy_score(df['True_LF'], df['Pred_LF']) * 100
print(f"Accuracy: {round(accuracy, 2)}%")

# Plot the confusion matrix
plot_cm(df['True_LF'], df['Pred_LF'], figsize=10, normalize=None, title="EVT_LF Groups")

In [None]:
# Plot EVT_CLASS Grouping

# Get the accuracy of CLASS assignment
accuracy = accuracy_score(df['True_EVT_CLASS'], df['Pred_EVT_CLASS']) * 100
print(f"Accuracy: {round(accuracy, 2)}%")

# Plot the confusion matrix
plot_cm(df['True_EVT_CLASS'], df['Pred_EVT_CLASS'], figsize=10, normalize='true', title="EVT_CLASS Groups")

In [None]:
# Plot EVT_PHYS Groupings

# Get the accuracy of PHYS assignment
accuracy = accuracy_score(df['True_EVT_PHYS'], df['Pred_EVT_PHYS']) * 100
print(f"Accuracy: {round(accuracy, 2)}%")

# Plot the confusion matrix
plot_cm(df['True_EVT_PHYS'], df['Pred_EVT_PHYS'], figsize=15, normalize='true', title='EVT_PHYS Groups')

In [None]:
# Plot EVT_GP_N Groupings

# Get the accuracy of PHYS assignment
accuracy = accuracy_score(df['True_GP_N'], df['Pred_GP_N']) * 100
print(f"Accuracy: {round(accuracy, 2)}%")

plot_cm(df['True_GP_N'], df['Pred_GP_N'], figsize=60, normalize='true', title="EVT_GP_N Groups")

In [None]:
pred_df.head()

In [None]:
pred_df.columns

## Evaluating the effect of minority classes
----

## Evaluate FVT Class Distribution
Before digging into where misclasses are occurring, look at the distribution of FVT classes. At first inspection there are many classes with very few occurrences. The increase in cardinality that results from including these classes may significantly degrade model performance by obscuring signal. 


In [None]:
# Visualize overall class distribution of sample points
class_distribution(sample_points, metric='Percent', variable='LF22_FVT', sort=True)

In [None]:
# Define path to sample points
sample_points_fpath = paths_dict['sample_point_fpath']

# Read in and prepare data
sample_points = data_prep(sample_points_fpath)

print(f"\n")
## Get counts of target class
variable = 'LF22_FVT'
df_fvt_gp = sample_points.groupby(variable).count().reset_index()
df_fvt_gp = df_fvt_gp[[variable, 'LF20_FVT']].rename(columns={'LF20_FVT':'Count'})
df_fvt_gp['Percent'] = df_fvt_gp['Count'] / df_fvt_gp['Count'].sum() * 100
df_fvt_gp[variable] = df_fvt_gp[variable].astype('str')  # For plotting change FVT to str
df_fvt_gp = df_fvt_gp.sort_values(by='Count', ascending=False)

## How many classes comprise 90% of the data?
cumsum = 0
majority_classes = []

# Get list of the most common classes that comprise 90% of the data
for i in range(len(df_fvt_gp)):
    cumsum += df_fvt_gp.iloc[i]['Percent']
    majority_classes.append(int(df_fvt_gp.iloc[i]['LF22_FVT']))

    if cumsum >= 90:
        break

print(f"Cumulative Percent: {cumsum}")
print(f"{len(majority_classes)} Majority Classes out of {len(df_fvt_gp.index)} Total:\n{majority_classes}")


### Assess Model Performance after dropping the classes comprising the bottom 10% of the data
We may be able to improve model performance by dropping the 166 classes that comprise the bottom 10% of the data. In doing so, we sacrifice being able to identify those classes in order to correctly classify the most common classes. 

Dropping those classes improved classification accuracy by only about 8%, but balanced accuracy increased by closer to 20%. I suspect the set of predictors I have are insufficient to adequately separate the classes apart - some piece of information is missing. This likely has to do with the state and transition models used to model sucession. This could be verified by checking where misclassifications are occurring. 


In [None]:
# Run the model 
seed = 1234
target = 'LF22_FVT'

# Define train/test split
train_frac = 0.8
test_frac = 0.2

# Keep only the majority classes identified above
sample_points = sample_points.loc[sample_points['LF22_FVT'].isin(majority_classes)]
print(f"After removing all minority classes: {len(sample_points)}")

# Perform Train/Test Split
train, test = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

# Standardize the data
train, test = standardize_topo(train, test)

# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC

# Define set of attributes - includes target and predictors
attributes = ['LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', target]
#attributes = ['LF16_FVT', 'LF20_FVT', 'LF14_EVT', 'ZONE', 'LF22_FDST', 'BPS_NAME', 'BPS_FRG', target]

# Train model
model = train_model(train_data=train, attributes=attributes, model_type=model_type)

# Evaluate Model
results = eval_model(model=model, test_data=test, attributes=attributes, model_type=model_type)

## Compare distributions of actual vs. predicted (Majority Class Only)
In general we see a trend where the model underpredicts the most common classes and overpredicts the less common classes.

In [None]:
# Define metric to display - Percent of total or Count
metric = 'count'  # 'count' / 'perc'

# Create dataframe of predictions vs. true labels
pred_df = pd.DataFrame(data={'True' : test['LF22_FVT'], 'Prediction' : results['predictions']})

# Get counts of predictions
pred_gp = pred_df.groupby('Prediction').count().reset_index()
pred_gp = pred_gp[['Prediction', 'True']].rename(columns={'True':'pred_count'})
pred_gp['pred_perc'] = pred_gp['pred_count'] / pred_gp['pred_count'].sum() * 100
pred_gp['Prediction'] = pred_gp['Prediction'].astype('str')

# Get counts of true
true_gp = pred_df.groupby('True').count().reset_index()
true_gp = true_gp[['True', 'Prediction']].rename(columns={'Prediction':'true_count'})
true_gp['true_perc'] = true_gp['true_count'] / true_gp['true_count'].sum() * 100
true_gp['True'] = true_gp['True'].astype('str')

# Join the dataframes
pred_gp = pred_gp.set_index('Prediction')
true_gp = true_gp.set_index('True')

fvt_gp = true_gp.join(pred_gp, how='inner')
fvt_gp = fvt_gp.sort_values(by='true_count')

# Plot the data side-by-side
ind = np.arange(len(fvt_gp))  # The y locations for the groups
width = 0.3  # The width of the bars

fig, ax = plt.subplots(figsize=(15,40))
true_bars = ax.barh(ind+width/2, fvt_gp[f'true_{metric}'], width, label='Actual')
pred_bars = ax.barh(ind-width/2, fvt_gp[f'pred_{metric}'], width, label='Predicted')

ax.set_title('Actual vs. Predicted FVT Distribution')
ax.set_ylabel('FVT')
ax.set_xlabel(f'{metric}')
ax.set_yticks(ind)
ax.set_yticklabels(fvt_gp.index.values)
ax.legend()
ax.bar_label(true_bars, fmt=" %.2f")
ax.bar_label(pred_bars, fmt=" %.2f")

plt.ylim(ax.patches[0].get_y()-0.5, ax.patches[-1].get_y() + ax.patches[-1].get_height()+0.5)

plt.show()

## Evaluate the confusion matrix (Majority Class Only)
Looking at the two most commonly misclassified classes (2080, 2045), how are they being misclassified? 

__2045__: Tr Northern Rocky Mountain Dry-Mesic Montane Mixed Conifer Forest

60% of samples were classified correctly. False predictions were fairly spread over the classes, but the most common mispredictions were:
- 2047 (10%): Tr Northern Rocky Mountain Mesic Montane Mixed Conifer Forest
- 2053 (7.6%): Tr Northern Rocky Mountain Ponderosa Pine Woodland and Savanna
- 2227 (6.3%): Tr Pseudotsuga menziesii Forest Alliance

__2080__: Sh Inter-Mountain Basins Big Sagebrush Shrubland

58% of samples were correctly classifed. False predictions were also fairly spread over the classes, but the most common mispredictoins were:
- 2125 (9.8%): Sh Inter-Mountain Basins Big Sagebrush Steppe
- 2065 (6.1%): Sh Columbia Plateau Scabland Shrubland
- 2123 (5.1%): He Columbia Plateau Steppe and Grassland
- 2282 (4.4%): Sh Great Basin & Intermountain Ruderal Shrubland
- 2273 (3.8%): He Great Basin & Intermountain Introduced Annual Grassland


In [None]:
# Visualize where misclasses are occurring using confusion matrix
plot_cm(y_true = test[target], y_pred=results["predictions"], normalize='true', figsize=60)

## What are the class-specific misclassification distributions? (Majority Class Only)

In [None]:
LF_ref_dicts = read_ref_data()

pred_df['True_FVT_NAME'] = pred_df['True'].map(LF_ref_dicts['LF22_FVT']) 
pred_df['Pred_FVT_NAME'] = pred_df['Prediction'].map(LF_ref_dicts['LF22_FVT']) 

pred_df['True_EVT_CLASS'] = pred_df['True'].map(LF_ref_dicts['FVT_CLASS']) 
pred_df['Pred_EVT_CLASS'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_CLASS']) 

pred_df['True_EVT_PHYS'] = pred_df['True'].map(LF_ref_dicts['FVT_PHYS']) 
pred_df['Pred_EVT_PHYS'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_PHYS']) 

pred_df['True_GP_N'] = pred_df['True'].map(LF_ref_dicts['FVT_GP_N']) 
pred_df['Pred_GP_N'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_GP_N']) 

pred_df['True_LF'] = pred_df['True'].map(LF_ref_dicts['FVT_LF']) 
pred_df['Pred_LF'] = pred_df['Prediction'].map(LF_ref_dicts['FVT_LF']) 

In [None]:
true_2045 = pred_df.loc[pred_df['True'] == 2045]
class_distribution(true_2045, metric = 'Percent', variable="Pred_FVT_NAME", group_var="True", sort='False')

### __2080__: Sh Inter-Mountain Basins Big Sagebrush Shrubland

In [None]:
true_2080 = pred_df.loc[pred_df['True'] == 2080]
class_distribution(true_2080, metric = 'Percent', variable="Pred_FVT_NAME", group_var="True", sort='False')

## Are the misclassifications ecologically similar? (Majority Class Only)

If the misclassifications are ecologically similar then they may not effect the F40 classification.

__QUESTIONS__: 
1. Are the results ecologically similar enough to use with the F40 model?
2. Is there a pattern that suggests what the model is struggling to separate?

In [None]:
# Plot EVT_PHYS Groupings

# Get the accuracy of PHYS assignment
accuracy = accuracy_score(df['True_EVT_PHYS'], df['Pred_EVT_PHYS']) * 100
print(f"Accuracy: {round(accuracy, 2)}%")

# Plot the confusion matrix
plot_cm(df['True_EVT_PHYS'], df['Pred_EVT_PHYS'], figsize=10, normalize='true', title='EVT_PHYS Groups')

In [None]:
# Plot EVT_GP_N Groupings

# Get the accuracy of PHYS assignment
accuracy = accuracy_score(df['True_GP_N'], df['Pred_GP_N']) * 100
print(f"Accuracy: {round(accuracy, 2)}%")

plot_cm(df['True_GP_N'], df['Pred_GP_N'], figsize=40, normalize='true', title="EVT_GP_N Groups")

In [None]:
pred_df

In [None]:
pred_df.columns