# ABOUT

__Author__: Pat McCornack

__Date__: 4/9/24

__Purpose__: Develop model to update FVT using most recent FDist data. 

------

In [None]:
import os
import datetime as dt
from joblib import dump
from itertools import combinations

import numpy as np
import pandas as pd
import geopandas as gpd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import TargetEncoder, StandardScaler
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier


import matplotlib.pyplot as plt




## Define Paths
Define the set of filepaths used by the notebook so it is easy to switch between working off local files and the PNNL drive. Set active_root_dir to either local_root_dir or pnnl_root_dir depending on which one you are working from.

In [None]:
# Candidate project roots: a local OneDrive copy and the shared PNNL drive
local_root_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model"
pnnl_root_dir = r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\Fuel Attributes Model"

# Root that every path below is resolved against; switch to pnnl_root_dir to work off the shared drive
active_root_dir = local_root_dir

# All locations used by the notebook, keyed by purpose
paths_dict = dict(
    # Where to save outputs
    out_base_dir=os.path.join(active_root_dir, r"model_outputs\tabular"),
    # Location of LF csvs (e.g. LF22_FVT_230.csv)
    ref_data_dir=os.path.join(active_root_dir, r"..\LF_raster_data\_tables"),
    # Location of shapefile to train model on
    sample_points_dir=os.path.join(active_root_dir, r"data\sample_points"),
    # Name of shapefile to train model on
    sample_points_fname=r"sample_points_4-17-24_200k_Disturbed.shp",
    # Used to test different sets of predictors
    runs_dict_fpath=os.path.join(active_root_dir, r"data\runs_dict.csv"),
    # Directory holding saved models
    model_dir=os.path.join(active_root_dir, "models"),
)

# __Functions__
----


## __Create a Directory to Output Modeling Results__
Names the output directory using the datetime that the script was run. 
Returns the name of the directory. The returned directory is used to output the trained model and/or results. 

In [None]:
def make_dir(base_dir, new_dir_name='model_results'):
    """
    Creates a timestamped directory under base_dir and returns its path.

    The directory is named '<new_dir_name>_<YYYY-mm-dd_HH-MM-SS>' using the time of the call.
    Missing parent directories are created as well; an error is raised if the directory
    already exists.
    """
    timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_dir = os.path.join(base_dir, "_".join([new_dir_name, timestamp]))
    os.makedirs(output_dir)

    return output_dir

## __Categorical Variable Encoder__
Categorical variables must be encoded before being used in some model types. Note that due to HGBC's native encoding support this function is not needed when using that model type.

__Note:__ Aspect, elevation, and slope are the only LANDFIRE variables of interest that are not categorical, which is why they're removed from the predictor list before the data is encoded. 

In [None]:
def targetEncoder(train, test, predictors, target, continuous=('ASPECT', 'ELEVATION', 'SLOPE')):
    """
    Returns the train and test sets with the categorical predictors target-encoded.

    train / test : dataframes containing at least the predictors and the target.
    predictors   : list of features being used as predictors in the model.
    target       : name of the target feature.
    continuous   : predictors that are left untouched (not encoded).

    Both returned frames are subset to predictors + [target]; the categorical predictors are
    replaced by the encoder's output columns, so the column names of the result differ from
    the input. The target column is still present in both frames.
    """
    # Subset to attributes of interest
    attributes = list(predictors) + [target]
    train = train[attributes]
    test = test[attributes]

    # Everything that is not declared continuous is treated as categorical
    cat_predictors = [x for x in predictors if x not in continuous]
    if not cat_predictors:
        # Nothing to encode
        return train, test

    enc = TargetEncoder(target_type="multiclass", random_state=1234).set_output(transform="pandas")

    # fit_transform encodes the training rows with cross fitting, so a row's encoding is not
    # computed from its own target value; fit(...).transform(train) would leak the target.
    train_trans = enc.fit_transform(train[cat_predictors], train[target])
    # The test set is encoded with the encodings learned on the full training data
    test_trans = enc.transform(test[cat_predictors])

    # Replace the features with encoded features
    train = pd.concat([train.drop(cat_predictors, axis=1), train_trans], axis=1)
    test = pd.concat([test.drop(cat_predictors, axis=1), test_trans], axis=1)

    return train, test

## __Standardize Data__
This is primarily used to standardize data for the MLP since tree-based methods aren't sensitive to data scale. By default it is set to standardize only topographic variables given that they are the only continuous LANDFIRE features of interest. 

In [None]:
def standardize_data(train, test, features=('SLOPE', 'ASPECT', 'ELEVATION')):
    """
    Return copies of the train and test datasets with the columns in 'features' standardized.

    The scaler is fit on the training data only and then applied to both sets, so no
    information from the test set influences the scaling. The input dataframes are not modified.
    """
    features = list(features)

    # Work on copies so the caller's frames (often slices of a larger frame) are left untouched
    train = train.copy()
    test = test.copy()

    # Fit the scaler to the training data
    scaler = StandardScaler().fit(train[features])

    # Scale both sets with the training statistics
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])

    return train, test


## __Get List of Lists of Predictor Sets__
To assess model performance with multiple sets of predictors, this function reads a CSV file and generates a list of predictor sets (a list of lists) that can be iterated through the model train/evaluation functions.

In [None]:
def get_runs(runs_dict_fpath=paths_dict['runs_dict_fpath']):
    """
    Returns a list of lists, one list of predictor names per run.

    runs_dict_fpath is the path to a CSV file with a 'Run' column plus one column per feature.
    Each row describes one set of predictors: a feature belongs to the set when its cell equals 1.
    """
    # The "Run" column only labels the row, it is not a feature
    run_table = pd.read_csv(runs_dict_fpath).drop('Run', axis=1)

    return [flags[flags == 1].index.tolist() for _, flags in run_table.iterrows()]


## __Generate Combinations of Features of Interest__
Alternatively, automatically generate all possible combinations of a set of potential predictor features to evaluate. 

__Caution__: The size of the list of lists of predictor sets can quickly become unmanageable when too many features of interest are included. 

In [None]:
def generate_combinations(attributes, target):
    """
    Returns a list of lists of all non-empty combinations of the attributes provided.

    The target is removed from the attributes first. Combinations are ordered by size
    (single features first, the complete set last) and, within a size, in the order
    produced by itertools.combinations. Returns an empty list if no attributes remain.
    """
    candidates = [x for x in attributes if x != target]  # Drop the target variable

    # Sizes run from 1 to len(candidates) inclusive so that the complete set is included
    return [
        list(combo)
        for size in range(1, len(candidates) + 1)
        for combo in combinations(candidates, size)
    ]

## __Pre-Process the Data__

### __Create Data Dictionaries to Append Features__
Some features are separate attributes of the LANDFIRE dataset (e.g. BPS Fire Regime) and others are useful for results analysis (e.g. FDst attributes). These can be mapped to points using LANDFIRE CSVs. The below creates dictionaries to perform that mapping. 

This function is called by join_features.

In [None]:
def read_ref_data(ref_data_dir=paths_dict["ref_data_dir"]):
    """
    Returns a dictionary of dictionaries of mappings between LANDFIRE raster values and other
    attributes associated with those values.

    Each inner dictionary maps the values of a key column to the values of an attribute column
    from one of the LANDFIRE CSVs in ref_data_dir. If a key value appears in several rows, the
    last row wins.
    """
    # (csv file, key column, {output dictionary name: attribute column})
    table_specs = [
        # BPS reference dictionaries
        ("LF20_BPS_220.csv", 'VALUE', {
            "BPS_NAME": 'BPS_NAME',
            "BPS_FRG": 'FRG_NEW',
        }),
        # FDST reference dictionaries
        ("LF22_FDST_230.csv", 'VALUE', {
            "FDST_TYPE": 'D_TYPE',
            "FDST_SEV": 'D_SEVERITY',
            "FDST_TSD": 'D_TIME',
        }),
        # EVT reference dictionaries
        ("LF16_EVT_200.csv", 'VALUE', {
            "EVT_PHYS": 'EVT_PHYS',
            "EVT_GP_N": 'EVT_GP_N',
            "EVT_CLASS": 'EVT_CLASS',
        }),
        # FVT reference dictionary
        ("LF22_FVT_230.csv", 'VALUE', {
            'LF22_FVT': 'EVT_FUEL_N',
        }),
        # FVT to EVT groupings, keyed on the EVT_FUEL column
        ("LF22_EVT_230.csv", 'EVT_FUEL', {
            'FVT_EVT': 'EVT_FUEL_N',
            "FVT_PHYS": 'EVT_PHYS',
            "FVT_GP_N": 'EVT_GP_N',
            "FVT_CLASS": 'EVT_CLASS',
            "FVT_LF": 'EVT_LF',
        }),
    ]

    LF_ref_dicts = {}
    for fname, key_col, outputs in table_specs:
        table = pd.read_csv(os.path.join(ref_data_dir, fname))
        for dict_name, value_col in outputs.items():
            LF_ref_dicts[dict_name] = dict(table[[key_col, value_col]].values)

    return LF_ref_dicts
                         
        

### __Append Features using Data Dictionaries__
Append in selected features using the LANDFIRE data dictionaries.

__Note:__ Items in feature_list must be in the source_layers dictionary. 

In [None]:
def join_features(sample_points, feature_list=('BPS_NAME', 'BPS_FRG', 'FDST_TYPE', 'FDST_SEV', 'FDST_TSD', 'EVT_PHYS', 'EVT_GP_N', 'EVT_CLASS')):
    """
    Returns a copy of the sample_points layer with the features in feature_list appended.

    Each feature is derived by mapping the raster values of its source column through the
    matching LANDFIRE reference dictionary (see read_ref_data). Values without an entry in the
    reference dictionary become NaN. The input dataframe is not modified.

    Raises KeyError if an item in feature_list is not in the source_layers dictionary.
    """

    LF_ref_dicts = read_ref_data()

    # Column of sample_points that holds the raster value each feature is looked up from
    source_layers = {
        'BPS_NAME' : 'BPS',
        'BPS_FRG' : 'BPS',
        'FDST_TYPE' : 'LF22_FDST',
        'FDST_SEV' : 'LF22_FDST',
        'FDST_TSD' : 'LF22_FDST',
        'EVT_PHYS' : 'LF16_EVT',
        'EVT_GP_N' : 'LF16_EVT',
        'EVT_CLASS' : 'LF16_EVT'
    }

    # Fail before touching the data rather than part-way through the loop
    unknown = [feature for feature in feature_list if feature not in source_layers]
    if unknown:
        raise KeyError(f"Features not in source_layers: {unknown}")

    # Copy so that columns are not written onto the caller's frame (often a filtered slice)
    sample_points = sample_points.copy()

    # Iterate through feature_list and append features to sample_points
    for feature in feature_list:
        sample_points[feature] = sample_points[source_layers[feature]].map(LF_ref_dicts[feature])

    return sample_points

### __Low Count Filter__
Some classes have exceedingly low representation in the dataset. In order to run a stratified train_test_split, every class of the target must have more than one observation. The following filters out classes with fewer than 5 observations. Given the size of the dataset, excluding these classes will not detrimentally impact model performance.

In [None]:
def low_count_filter(sample_points, target, min_count=5):
    """
    Returns sample_points with the rows of under-represented target classes removed.

    A class is removed when it has fewer than min_count rows. Rows are counted per class
    regardless of missing values in the other columns. The low-count classes, the resulting
    shape and the column names are printed.
    """
    # Number of rows per class of the target
    class_counts = sample_points.groupby(target).size()

    # Identify classes with low observation counts
    low_count_FVT = class_counts[class_counts < min_count].index.tolist()

    # Remove those classes from sample_points
    sample_points = sample_points.loc[~sample_points[target].isin(low_count_FVT)]
    print(f"Low count {target}: {low_count_FVT}")
    print(f"After removing low count {target}: {sample_points.shape}")
    print(f"Sample Points Attributes: {sample_points.columns}")

    return sample_points

### __Prepare the Sample Points__
Before training the model, the sample points need to be filtered. Filtering steps are:
- Remove near points - defined as points that are less than 70m apart.
- Remove Null points (-9999/-1111 values) - these will not be updated in the final raster. 
- Remove undisturbed values - the model is intended to update points that have been disturbed in the last 10 years. 
- Remove points with agricultural or developed FVT classes - these are updated using datasets not used for other classes. 
- Remove observations with classes that have very low representation (less than 5 observations in the class) in the dataset. This is done to allow train_test_split to run. 
- Optionally, join in additional features. feature_list may be modified to select which features to add in (see join_features for further details). 

In [None]:
def data_prep(sample_point_fpath, target):
    """
    Reads the sample points shapefile and returns the processed points.

    Steps, in order: keep points with NEAR_FID == -1, drop bookkeeping columns, drop rows holding
    -1111/-9999 in any field, drop undisturbed points (LF22_FDST == 0), append the reference
    features (join_features), drop developed/agricultural FVTs, and finally drop classes of the
    target with very low counts (low_count_filter). Point counts are printed along the way.
    """
    points = gpd.read_file(sample_point_fpath)
    print(f"Total Number of points: {points.shape[0]}")

    # Near-point filter - presumably NEAR_FID is -1 when no neighbor was found; confirm against the sampling step
    points = points.loc[points['NEAR_FID'] == -1]
    print(f"After removing near points: {points.shape[0]}")

    # Drop unneeded columns if present
    unneeded_cols = ['Classified', 'GrndTruth', 'NEAR_FID', 'NEAR_DIST']
    points = points.drop(columns=unneeded_cols, errors='ignore')

    # Remove observations with -9999/-1111 in any field
    has_null_value = points.isin([-1111, -9999]).any(axis=1)
    points = points.loc[~has_null_value]
    print(f"After removing null points: {points.shape[0]}")

    # Keep only points that were disturbed
    points = points.loc[points['LF22_FDST'] != 0]
    print(f"After removing undisturbed points: {points.shape[0]}")

    # Join in additional features
    points = join_features(points)

    # Filter out developed and agricultural points
    excluded_fvt = [
        *range(20, 33), *range(2901, 2906),  # developed
        80, 81, 82, *range(2960, 2971),      # agricultural
    ]
    points = points.loc[~points['LF22_FVT'].isin(excluded_fvt)]

    # Filter out classes in the target feature with very low counts
    return low_count_filter(points, target)


## __Instantiate Estimators__

### __Histogram Based Gradient Boosting Classifier__
Scikit-learn implementation of Histogram Based Gradient Boosting Classifier

In [None]:
def histGradientBoostingClassifier(categorical_feature_list, class_weight='balanced', seed=1234,
                                   learning_rate=0.01, max_iter=100):
    """
    Returns an unfitted histogram-based gradient boosting classifier.

    categorical_feature_list : features the model should treat as categorical.
    class_weight             : 'balanced', a dict of class weights, or None. String values are
                               lower-cased because scikit-learn only accepts 'balanced'.
    seed                     : random state of the estimator.
    learning_rate, max_iter  : boosting learning rate and maximum number of iterations.
    """
    if isinstance(class_weight, str):
        class_weight = class_weight.lower()

    hgb_classifier = HistGradientBoostingClassifier(
        categorical_features=categorical_feature_list,  # Natively handle categorical variables
        class_weight=class_weight,
        random_state=seed,
        learning_rate=learning_rate,
        max_iter=max_iter
    )

    return hgb_classifier

### __Catboost Gradient Boosting__
Catboost implementation of Gradient Boosting Classifier

In [None]:
def catBoost_hgbc(cat_features):
    """
    Returns an unfitted multiclass gradient boosting classifier from the catBoost library.

    cat_features is the list of features the model should treat as categorical.
    """
    # boosting_type is intentionally left at the library default
    params = {
        "iterations": 2000,
        "learning_rate": 0.01,
        "loss_function": "MultiClass",
        "cat_features": cat_features,        # List of categorical features
        "auto_class_weights": 'Balanced',    # Used to handle class imbalance
        "random_seed": 1234,
        "task_type": 'GPU',                  # Train the model on GPU
    }

    return CatBoostClassifier(**params)

### __Random Forest Classifier__
Scikit-learn implementation of Random Forest Classifier


In [None]:
def randomForestClassifier(n_est=100, min_samples_leaf=50, bootstrap=True, oob_score=True,
                           n_jobs=-1, random_state=1234, max_features='sqrt', class_weight=None):
    """
    Returns an unfitted Random Forest Classifier with the given settings.

    n_est is the number of trees; the remaining arguments are passed straight through to
    scikit-learn's RandomForestClassifier. max_depth is left unlimited.

    max_features='auto' is accepted for backward compatibility and translated to 'sqrt', which
    is what 'auto' meant for classifiers before it was removed from scikit-learn.
    """
    if max_features == 'auto':
        max_features = 'sqrt'

    rf_classifier = RandomForestClassifier(
        n_estimators=n_est,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        max_features=max_features,
        class_weight=class_weight,
        max_depth=None
    )

    return rf_classifier


### __Multi-Layer Perceptron__
Scikit-learn implementation of Multi-Layer Perceptron. Note that this has not been tuned.

In [None]:
def multiLayerPerceptron(seed=1234):
    """
    Returns an unfitted Multi Layer Perceptron classifier.

    The network has three hidden layers (256, 128, 64) with ReLU activation and is allowed up to
    10000 iterations. seed fixes the random state so that runs are reproducible, matching the
    other estimators in this notebook.
    """
    clf = MLPClassifier(
        max_iter=10000,
        activation='relu',  # 'relu' is the default
        hidden_layer_sizes=(256, 128, 64),
        random_state=seed
    )

    return clf

## __Train Model__
Trains and returns a model of the selected type (RF, HGBC, catBoost, or MLP) on the provided data. Subsets the training data to a provided list of predictors and predicts the specified target feature.

__Note__: Aspect, Elevation, and Slope are the only continuous LANDFIRE datasets, therefore any feature not in that list is assumed to be categorical.

In [None]:
def train_model(train_data, predictors, target, model_type='HGBC'):
    """
    Returns a trained model of the specified type given a set of predictors.

    train_data : dataframe holding the predictors and the target; it is not modified.
    predictors : list of column names used as model inputs.
    target     : name of the column to predict.
    model_type : one of 'RF', 'HGBC', 'catBoost', 'MLP'.

    Raises ValueError if model_type is not one of the supported types.
    """
    supported_types = ('RF', 'HGBC', 'catBoost', 'MLP')
    if model_type not in supported_types:
        raise ValueError(f"Unknown model_type '{model_type}'; expected one of {supported_types}")

    class_weight = "balanced"

    # Any predictor that is not one of the continuous topographic variables is treated as categorical
    cat_features = [x for x in predictors if x not in ['ASPECT', 'ELEVATION', 'SLOPE']]

    # Separate training data predictors/response (copies, so the caller's frame is left untouched)
    y_train = train_data[target].copy()
    X_train = train_data[predictors].copy()

    # Setting the data type to categorical speeds up model training for catBoost
    if model_type == "catBoost" and cat_features:
        X_train[cat_features] = X_train[cat_features].astype("category")

    # Instantiate selected model_type
    if model_type == 'RF':
        model = randomForestClassifier(n_est=100, min_samples_leaf=50, bootstrap=True, oob_score=True,
                                       n_jobs=-1, random_state=1234, max_features='sqrt', class_weight=class_weight)
    elif model_type == 'HGBC':
        model = histGradientBoostingClassifier(categorical_feature_list=cat_features, class_weight=class_weight)
    elif model_type == 'catBoost':
        model = catBoost_hgbc(cat_features=cat_features)
    else:  # 'MLP'
        print('Defining MLP...')
        model = multiLayerPerceptron()

    # Fit the model with the training data
    print('Fitting Model...')
    model.fit(X_train, y_train)

    # Return the fit model
    return model
    

## __Evaluate Model__
Provided a trained model, test data, a set of predictors, and a target, returns a dictionary holding a list of metrics that quantify prediction performance and the predictions themselves.

In [None]:
def eval_model(model, test_data, predictors, target, model_type='HGBC'):
    """
    Scores a fitted model on test_data and returns a dictionary with two entries:
    1. 'metrics' : [accuracy, balanced accuracy, macro recall, macro precision, macro F1, predictors],
       with every score rounded to 3 decimals, and
    2. 'predictions' : the model's prediction for each observation in test_data.

    The scores are also printed. model_type is accepted for symmetry with train_model but is not used.
    """
    # Separate the predictors from target
    y_test = test_data[target].copy()
    x_test = test_data[predictors].copy()

    # Perform prediction
    y_pred = model.predict(x_test)

    # Compute the scores, keyed by the label they are printed under
    raw_scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred, average='macro'),
        "Precision": precision_score(y_test, y_pred, average='macro'),
        "F1": f1_score(y_test, y_pred, average='macro'),
    }
    scores = {label: round(value, 3) for label, value in raw_scores.items()}

    print(f"Predictors: {predictors}")
    for label, value in scores.items():
        print(f"{label}: {value}")

    return {
        "metrics" : [*scores.values(), predictors],
        "predictions" : y_pred
    }


## __Results Evaluation Functions__

### __Plot the Class Distribution__
Plots the distribution of the variable of interest. Can be set to display the raw count for each class or the percentage of the dataset that the class accounts for.

In [None]:
def plot_distribution(df, group_var, metric='Count', sort=True, title='Distribution', figsize=(15, 50)):
    """
    Plots the distribution of group_var as a horizontal bar chart.

    'metric' can be set to either 'Count' or 'Percent' to change the display type.
    The 'sort' argument specifies whether to order the bars by count.
    Rows are counted per class regardless of missing values in the other columns.
    """

    # Get the number of rows in each class
    df_gp = df.groupby(group_var).size().reset_index(name='Count')

    # Get classes proportional representation
    df_gp['Percent'] = df_gp['Count'] / df_gp['Count'].sum() * 100
    df_gp[group_var] = df_gp[group_var].astype('str')  # For plotting change the class labels to str
    if sort:
        df_gp = df_gp.sort_values(by="Count", ascending=True)

    # Visualize the distribution
    fig, ax = plt.subplots(figsize=figsize)
    hbars = ax.barh(df_gp[group_var],
                    df_gp[metric],
                    align='center')
    ax.set(title=title,
           xlabel=f'{metric}',
           ylabel=f'{group_var}')
    ax.bar_label(hbars, fmt=" %.2f")

    # Remove whitespace at margins (nothing to trim when there are no bars)
    if ax.patches:
        plt.ylim(ax.patches[0].get_y(), ax.patches[-1].get_y() + ax.patches[-1].get_height() + 0.5)

    plt.show()

### __Plot Confusion Matrix__
Generates a confusion matrix of true labels vs. predicted labels.
Normalization options are: 
- 'true' : Normalizes over the rows.
- 'pred' : Normalizes over the columns. 
- 'all' : Normalizes over the entire population. 

In [None]:
# Plot a confusion matrix
def plot_cm(y_true, y_pred, normalize=None, figsize=10, title="Confusion Matrix"):
    """
    Draws a square confusion matrix of true labels (y_true) versus predicted labels (y_pred).

    normalize may be 'true', 'pred', 'all' or None and is appended to the plot title.
    figsize is used for both the width and the height of the figure.
    Note that numpy's print precision is set to 2 as a side effect.
    """
    np.set_printoptions(precision=2)

    # Plot confusion matrix
    display_kwargs = {
        "cmap": plt.cm.Blues,
        "normalize": normalize,
        "xticks_rotation": 'vertical',
    }
    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, **display_kwargs)
    disp.ax_.set_title(f"{title}, normalize = {normalize}")

    # Set plot size
    disp.ax_.get_figure().set_size_inches(figsize, figsize)

    # Display results
    plt.show()

# __Main__

----

## __FVT__

In [None]:
# Load in data and perform train/test split.
seed = 1234
target = 'LF22_FVT'

## Define train/test split proportion
train_frac = 0.7
test_frac = 0.3

## Define path to sample points (the key is 'sample_points_fname', as defined in paths_dict)
sample_points_fpath = os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split, stratified on the target
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac,
                                             random_state=seed, shuffle=True, stratify=sample_points[target])

In [None]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors
predictors = ['LF20_FVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH']

# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type in ('RF', 'MLP'):
    train, test = targetEncoder(train_points, test_points, predictors, target)
    # The encoded frames still contain the target column; it must not be used as a predictor
    predictors = [col for col in train.columns if col != target]

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, features=predictors)
else:
    train, test = train_points, test_points

# Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

# Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)

In [None]:
# Save out results
# Writes the test-set predictions, the test-set metrics and the fitted model into a new
# timestamped directory under paths_dict['out_base_dir'].
## Concatenate predictions to test set
## (results holds the output of the most recent eval_model call)
test[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
## results['metrics'] is [accuracy, balanced accuracy, recall, precision, f1, predictors]
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=f'{target}_model_results')
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
test.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
# The model file is written with joblib and has no file extension
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


Create a test set using the FVT predictions. This will then be fed into the FVC/FVH models to see how replacing the true FVT values with predicted values will impact the FVC/FVH predictions.

In [None]:
# Copy of the FVT test set with the FVT model's predictions attached as a new column
fvt_test = test.assign(LF22_FVT_Predictions=results['predictions'])

## __FVC__

In [None]:
# Load in data and perform train/test split.
# The sample points are re-read and re-filtered because the low-count filter in data_prep
# depends on the target.
seed = 1234
target = 'LF22_FVC'

## Define train/test split proportion
train_frac = 0.7
test_frac = 0.3

## Define path to sample points
sample_points_fpath =  os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
## Stratified on the target so both splits contain every remaining class
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

In [None]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors
predictors = ['LF22_FVT', 'LF20_FVC', 'LF16_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME']


# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type in ('RF', 'MLP'):
    train, test = targetEncoder(train_points, test_points, predictors, target)
    # The encoded frames still contain the target column; it must not be used as a predictor
    predictors = [col for col in train.columns if col != target]

    # For multi-layer perceptron, encoded data needs to be standardized
    # (standardize_data takes the columns to scale; the target is not scaled)
    if model_type == 'MLP':
        train, test = standardize_data(train, test, features=predictors)
else:
    train, test = train_points, test_points

# Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

# Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)

In [None]:
# Save out results
# Writes the test-set predictions, the test-set metrics and the fitted model into a new
# timestamped directory under paths_dict['out_base_dir'].
## Concatenate predictions to test set
## (results holds the output of the most recent eval_model call)
test[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
## results['metrics'] is [accuracy, balanced accuracy, recall, precision, f1, predictors]
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=f'{target}_model_results')
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
test.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
# The model file is written with joblib and has no file extension
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


### Run on multiple sets of predictors
Used to compare/contrast the performance of the model using various sets of predictors. 

In [None]:
# Run multiple sets of predictors to compare

# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define the sets of predictors to compare (the target is passed separately)
predictors_list = [
    ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE'],
    ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME'],
    ['LF22_FVT', 'LF20_FVC', 'LF16_FVC', 'LF22_FDST', 'ZONE'],
    ['LF22_FVT', 'LF20_FVC', 'LF16_FVC', 'LF22_FDST', 'ZONE', 'BPS_NAME'],
    ['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF20_FVC', 'LF16_FVC', 'LF20_FVH', 'LF16_FVH', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME']
]

# Define results DF
results_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'predictors', 'model_type', 'train/test'])

for predictors in predictors_list:
    # Extra processing if using Random Forest or Multi-Layer Perceptron.
    # Done per predictor set because encoding renames/replaces the categorical columns.
    if model_type in ('RF', 'MLP'):
        train, test = targetEncoder(train_points, test_points, predictors, target)
        # The encoded frames still contain the target column; it must not be used as a predictor
        model_predictors = [col for col in train.columns if col != target]

        # For multi-layer perceptron, encoded data needs to be standardized
        if model_type == 'MLP':
            train, test = standardize_data(train, test, features=model_predictors)
    else:
        train, test = train_points, test_points
        model_predictors = predictors

    # Train model
    model = train_model(train_data=train, predictors=model_predictors, target=target, model_type=model_type)

    # Evaluate Model
    # The last entry of results['metrics'] is the column list the model saw; the original
    # predictor names are recorded instead so rows stay comparable across model types.
    print("Score on the Training Set: ")
    results = eval_model(model=model, test_data=train, predictors=model_predictors, target=target, model_type=model_type)
    results_df.loc[len(results_df)] = results['metrics'][:-1] + [predictors, f'{model_type}', 'train']

    print("Score on the test set: ")
    results = eval_model(model=model, test_data=test, predictors=model_predictors, target=target, model_type=model_type)
    results_df.loc[len(results_df)] = results['metrics'][:-1] + [predictors, f'{model_type}', 'test']

In [None]:
# Display the train/test metrics collected for every predictor set
results_df

### __FVC with Predicted FVT__


In [None]:
# Load in data and perform train/test split.
# NOTE(review): this section uses an 80/20 split while the FVT and FVC sections above use 70/30 - confirm this is intended
seed = 1234
target = 'LF22_FVC'

## Define train/test split proportion
train_frac = 0.8
test_frac = 0.2

## Define path to sample points
sample_points_fpath = os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
## Stratified on the target so both splits contain every remaining class
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

In [None]:
# Define and run model
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

## Define set of predictors (the target is passed separately)
predictors = ['LF22_FVT', 'LF20_FVC', 'LF22_FDST', 'ZONE']

## Build the evaluation set: the FVT test points with the true FVT replaced by the FVT model's
## predictions, so the scores show the impact of feeding predicted FVTs into the FVC model.
## NOTE(review): fvt_test comes from the FVT train/test split, so some of its points may also be
## in this section's training set - confirm whether that overlap is acceptable.
## NOTE(review): this assumes fvt_test still holds the raw columns (i.e. the FVT model was not run as RF/MLP).
eval_points = fvt_test.copy()
eval_points['LF22_FVT'] = eval_points['LF22_FVT_Predictions']

## Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type in ('RF', 'MLP'):
    train, test = targetEncoder(train_points, eval_points, predictors, target)
    # The encoded frames still contain the target column; it must not be used as a predictor
    predictors = [col for col in train.columns if col != target]

    # For multi-layer perceptron, encoded data needs to be standardized
    if model_type == 'MLP':
        train, test = standardize_data(train, test, features=predictors)
else:
    train, test = train_points, eval_points

## Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

## Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on test set with predicted FVTs")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)


In [None]:
# Save out results
## Concatenate predictions to the points they were made for.
## The predictions in results were made on the FVT test set (fvt_test), not on test_points,
## so they are attached to a copy of fvt_test.
pred_points = fvt_test.copy()
pred_points[f'PRED_{target}'] = results['predictions']

## Create dataframe of metrics
## results['metrics'] is [accuracy, balanced accuracy, recall, precision, f1, predictors]
metrics_df = pd.DataFrame(columns=['accuracy', 'balanced accuracy', 'recall', 'precision' ,'f1 score', 'attributes', 'model_type'])
metrics_df.loc[0] = results['metrics'] + [model_type]

## Save out the dataframes
out_dir = make_dir(paths_dict['out_base_dir'], new_dir_name=f'{target}_predFVT_model_results')
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
pred_points.to_csv(os.path.join(out_dir, preds_out_fname))

metrics_out_fname = f"Metrics_{target}_{model_type}.csv"
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Save out model to results dir
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


##  __FVH__

In [None]:
# Load in data and perform train/test split.
# The sample points are re-read and re-filtered because the low-count filter in data_prep
# depends on the target.
seed = 1234
target = 'LF22_FVH'

## Define train/test split proportion
train_frac = 0.8
test_frac = 0.2

## Define path to sample points
sample_points_fpath = os.path.join(paths_dict['sample_points_dir'], paths_dict['sample_points_fname'])

## Read in and prepare data
sample_points = data_prep(sample_points_fpath, target)

## Perform Train/Test Split
## Stratified on the target so both splits contain every remaining class
train_points, test_points = train_test_split(sample_points, train_size=train_frac, test_size=test_frac, 
                               random_state=seed, shuffle=True, stratify=sample_points[target])

In [None]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC / catBoost / MLP

# Define set of predictors (the target is passed separately)
predictors = ['LF22_FVT', 'LF20_FVH', 'LF16_FVH', 'LF22_FDST', 'ZONE', 'BPS_NAME']
#predictors = ['LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'LF20_FVC', 'LF20_FVH',  target]

# Extra processing if using Random Forest or Multi-Layer Perceptron
if model_type in ('RF', 'MLP'):
    train, test = targetEncoder(train_points, test_points, predictors, target)
    # The encoded frames still contain the target column; it must not be used as a predictor
    predictors = [col for col in train.columns if col != target]

    # For multi-layer perceptron, encoded data needs to be standardized
    # (standardize_data takes the columns to scale; the target is not scaled)
    if model_type == 'MLP':
        train, test = standardize_data(train, test, features=predictors)
else:
    train, test = train_points, test_points

# Train model
model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

# Evaluate Model
print("Score on the Training Set: ")
results = eval_model(model=model, test_data=train, predictors=predictors, target=target, model_type=model_type)

print("Score on the test set: ")
results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)

In [None]:
# Persist the evaluation outputs: predictions, metrics table, and the fitted model
## Attach the predictions held in `results` (most recent eval_model call) to test_points
test_points[f'PRED_{target}'] = results['predictions']

## Build a one-row table: the metrics list from `results` followed by the model type
metrics_df = pd.DataFrame(
    columns=[
        'accuracy',
        'balanced accuracy',
        'recall',
        'precision',
        'f1 score',
        'attributes',
        'model_type',
    ]
)
metrics_df.loc[0] = results['metrics'] + [model_type]

## Resolve the output directory and the names of the two CSV files
## NOTE(review): 'new_dir_name' is not among the keys of the paths_dict literal defined
## near the top of the file - confirm it is set before this cell runs.
out_dir = make_dir(
    paths_dict['out_base_dir'],
    new_dir_name=paths_dict['new_dir_name'],
)
preds_out_fname = f"Predictions_{target}_{model_type}.csv"
metrics_out_fname = f"Metrics_{target}_{model_type}.csv"

## Write the predictions and the metrics to the output directory
test_points.to_csv(os.path.join(out_dir, preds_out_fname))
metrics_df.to_csv(os.path.join(out_dir, metrics_out_fname))

# Dump the fitted model next to the CSVs, with a timestamp in the file name
datetime = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_fname = f"{target}_{model_type}_model_{datetime}"
dump(model, os.path.join(out_dir, model_fname))


### Evaluate a list of models
Train and evaluate the model on several predictor sets to compare their performance.

In [None]:
# Define the type of model to be used
model_type = 'HGBC'  # RF / HGBC
# NOTE(review): unlike the single-model cell, this loop applies no target encoding and
# reuses the `train` / `test` frames left by that cell - confirm before switching to 'RF'.

# Option 1: generate combinations of potential predictor sets
##predictors = ['LF22_FVT', 'LF20_FVT', 'LF16_FVT', 'LF14_EVT', 'LF22_FDST', 'ZONE', 'ASPECT', 'SLOPE', 'ELEVATION', 'BPS_NAME', 'EVT_GP_N', 'EVT_PHYS', 'EVT_CLASS']
##predictors_list = generate_combinations(predictors, 'LF22_FVT')

# Option 2: get list of lists of predictor sets from file
##predictors_list = get_runs()

# Option 3: define a list of lists of attribute combinations.
# Each predictor set appears once: a repeated entry would only retrain the identical
# model and add a duplicate row to the results table.
predictors_list = [
    ['ASPECT', 'ELEVATION', 'SLOPE', 'LF22_FDST', 'ZONE'],
    ['ASPECT', 'ELEVATION', 'SLOPE', 'LF20_EVT', 'LF22_FDST', 'ZONE'],
    ['LF20_EVT', 'LF22_FDST', 'ZONE', 'BPS_NAME'],
]

# Dataframe to store results - one row per predictor set
result_df = pd.DataFrame(columns=['Accuracy', 'Balanced_Accuracy', 'Recall', 'Precision', 'F1', 'Predictors'])

# Iterate through combinations of predictors
for predictors in predictors_list:
    # Train model on the current predictor set
    model = train_model(train_data=train, predictors=predictors, target=target, model_type=model_type)

    # Evaluate model on the test data, keeping only the metrics entry of the result
    results = eval_model(model=model, test_data=test, predictors=predictors, target=target, model_type=model_type)['metrics']

    # Append the metrics as the next row of the results table
    result_df.loc[len(result_df)] = results

# Display the comparison table as the cell output
result_df