# ABOUT
__Author__: Pat McCornack

__Date__: 4/15/2024

__Purpose:__ Script to apply a trained model to predict FVT values. Outputs a raster of predicted FVT values across the BPA service territory. 

----

In [None]:
import os
import glob

import pandas as pd

import rasterio
from rasterio.merge import merge
from rasterio.windows import Window

import datetime as dt

from joblib import load
from joblib import Parallel, delayed

from IPython.display import clear_output


In [None]:
# Candidate project roots; point active_root_dir at the one to use for this run
local_root_dir = r"C:\Users\mcco573\OneDrive - PNNL\Documents\_Projects\BPA Wildfire\Fuel Attributes Model"
pnnl_root_dir = r"\\pnl\projects\BPAWildfire\data\Landfire\fuels_modeling\Fuel Attributes Model"

active_root_dir = pnnl_root_dir

# Every directory / file name used by the notebook, resolved against the active root
paths_dict = dict(
    data_dir=os.path.join(active_root_dir, r'..\LF_raster_data\bpa_service_territory'),
    out_base_dir=os.path.join(active_root_dir, r'model_outputs\geospatial'),
    # Name of the created results directory - the run datetime is appended to it
    new_dir_name='LF22_Pred_FVT',
    ref_data_dir=os.path.join(active_root_dir, r'..\LF_raster_data\_tables'),
    model_dir=os.path.join(active_root_dir, 'models'),
    model_fname="LF22_FVT_HGBC_model_2024-05-10_16-49-07",
)

# __Functions__

-----

## __Make Directory__
Creates a directory where data will be output - labeled with the datetime that the script is run. Returns name of directory.

In [None]:
def make_dir(base_dir, new_dir_name):
    """
    Create a timestamped output directory.

    The directory is created at `<base_dir>/<new_dir_name>_<YYYY-mm-dd_HH-MM-SS>`,
    using the time of the call. Intermediate directories are created as needed;
    an error is raised if the target already exists.

    Returns:
    - Path of the created directory as a string
    """
    stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    target = os.path.join(base_dir, "_".join([new_dir_name, stamp]))
    os.makedirs(target)
    return target

## __Feature Engineering__
Use values from the rasters to create new features. 

### Read Reference Data
Use the LANDFIRE csv tables to create dictionaries to be used to map raster layer values to other features.
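1. Separate LF22_FDST into separate features for type, severity, and time since disturbance. (Note: the current `read_ref_data` does not build this mapping; only the BPS_NAME lookup below is implemented.)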
2. Map the BPS value to BPS_NAME in order to reduce cardinality. 

In [None]:
ref_data_dir = paths_dict['ref_data_dir']
def read_ref_data(ref_data_dir=ref_data_dir):
    """
    Build lookup dictionaries from the LANDFIRE attribute tables.

    Reads "LF20_BPS_220.csv" from `ref_data_dir` and maps each entry of its
    VALUE column to the matching BPS_NAME entry.

    Returns:
    - A dictionary of dictionaries keyed by feature name; currently it holds
      the single key "BPS_NAME".
    """
    bps_table = pd.read_csv(os.path.join(ref_data_dir, "LF20_BPS_220.csv"))

    # Later rows win if a VALUE appears more than once
    value_to_name = dict(zip(bps_table['VALUE'], bps_table['BPS_NAME']))

    return {"BPS_NAME": value_to_name}
        

### Join Features
Use dictionaries created using read_ref_data to map raster values to new features. 

In [None]:
def join_features(df, feature_list = ['BPS_NAME']):
    """
    Add derived feature columns to `df` using the LF attribute lookup tables.

    For every name in `feature_list`, the raster layer column it derives from
    is mapped through the dictionary of the same name returned by
    read_ref_data(). Values with no entry in the lookup become NaN.

    The columns are added to `df` in place and the same dataframe is returned.
    """
    lookup_tables = read_ref_data()

    # Which raster layer column each derived feature is computed from
    layer_for_feature = {'BPS_NAME': 'BPS'}

    for name in feature_list:
        layer_col = layer_for_feature[name]
        df[name] = df[layer_col].map(lookup_tables[name]).copy()

    return df

## __Preprocess window dataframe__
Prepares the data to be run through the model. Separates out null, non-disturbed, and agricultural/developed observations. Returns a tuple of: 
1. A clean dataframe to be run through the model. 
2. A dataframe of the dropped observations to be rejoined to model predictions. This allows for the data to be reshaped to a 2D numpy array and written as a raster. 

In [None]:
def data_prep(df):
    """
    Split a window dataframe into rows to predict on and rows to pass through.

    Input:
    - A dataframe built from the flattened numpy arrays read from the raster chunks.
      It must contain the 'BPS', 'LF22_FDST' and 'LF20_FVT' columns.
    Returns:
    - Dataframe of the rows left after filtering (with the joined BPS_NAME feature).
    - Single-column dataframe ('FVT_Pred') holding the LF20_FVT value of every
      filtered-out row, keeping the original index so the rows can be
      re-appended after prediction.

    Rows are filtered out, in this order, when they
    1. contain -1111 or -9999 (null) in any column,
    2. have LF22_FDST == 0 (non-disturbed), or
    3. have an agricultural or developed LF20_FVT code.
    """
    null_codes = [-1111, -9999]
    developed_codes = [*range(20, 33), *range(2901, 2906)]
    agricultural_codes = [80, 81, 82, *range(2960, 2971)]
    excluded_fvt = developed_codes + agricultural_codes

    # Join in additional features
    df = join_features(df, feature_list = ['BPS_NAME'])

    # 1. Null rows: a null code in any column
    null_points = df[df.isin(null_codes).any(axis=1)]
    df = df.drop(index=null_points.index)

    # 2. Non-disturbed rows
    non_disturbed = df[df['LF22_FDST'] == 0]
    df = df.drop(index=non_disturbed.index)

    # 3. Agricultural and developed rows
    ag_dev = df[df['LF20_FVT'].isin(excluded_fvt)]
    df = df.drop(index=ag_dev.index)

    # Keep only the existing FVT of the removed rows, under the prediction column name
    removed = pd.concat([null_points, non_disturbed, ag_dev], axis=0)
    dropped = removed[['LF20_FVT']].rename(columns={'LF20_FVT': 'FVT_Pred'})

    return df, dropped


## __Predict FVT__
### Read Window
This function is used to read in chunks of the full raster to be processed. The raster data is stored as blocks with height=1 and width=41855 (the raster width), so we read in chunks composed of these blocks (e.g. 1000 blocks at a time). This corresponds to the row_slice argument. 

In [None]:
def windowed_read(ras, row_slice):
    """
    Read a full-width horizontal strip (window) of a raster.

    Inputs:
    - ras: Path of the raster; it is opened here with rasterio.open.
    - row_slice: Row extent of the window in the form (row_start, row_end).

    Returns:
    - data: The array rasterio returns for the window. No band index is
      requested, so the band axis is kept (the caller indexes [0] to get the
      2D band).
    - win: The Window object describing the strip.
    - win_transform: The affine transform of the window, used to georeference
      the output written for that chunk.
    """
    with rasterio.open(ras) as dataset:
        # The window always spans every column; only the rows vary
        window = Window.from_slices(row_slice, (0, dataset.width))
        transform = dataset.window_transform(window)
        pixels = dataset.read(window=window)

    return pixels, window, transform

## __Apply Trained Model__ 
Applies a trained model to the prepared data to predict FVT for each observation (i.e. pixel). After making predictions, rejoins the previously dropped observations so that the dataframe can be reshaped to a 2D numpy array and written to a raster. The index is used to keep track of relative pixel locations in the dataframe. This function returns a dataframe with predictions for each non-null and burnable observation. 

The model was not trained using null (-9999, -1111), non-disturbed, or agricultural/developed classes.

In [None]:
def predict_FVT(model, df):
    """
    Predict FVT for every modelable row of `df` and re-attach the rows that were filtered out.

    Inputs:
    - model: Trained model exposing `predict`; it receives every column of the
      cleaned dataframe returned by data_prep.
    - df: Dataframe of raster values for one window, one row per pixel.

    Returns:
    - A single-column dataframe ('FVT_Pred') with one row per input row, sorted by
      index. Filtered-out rows carry the value supplied by data_prep; the
      remaining rows carry the model prediction.
    """
    # Split into rows to predict on and rows to pass through unchanged
    clean_df, dropped = data_prep(df)

    # Nothing to predict: every row was filtered out. `dropped` is a concatenation of
    # several filtered groups, so it must still be sorted back into pixel order -
    # otherwise the caller's reshape to 2D would scramble pixel positions.
    if clean_df.shape[0] == 0:
        return dropped.sort_index()

    # Every remaining column is used as a predictor
    y_pred = model.predict(clean_df)

    # Re-attach the filtered-out rows and restore pixel order so the result can be
    # reshaped back to a raster
    predicted = pd.DataFrame({"FVT_Pred": y_pred}, index=clean_df.index)
    return pd.concat([dropped, predicted]).sort_index()


In [None]:
def window_predict(row, model_fpath, win_height, raster_dict, ras_shape, out_dir, out_meta):
    """
    Predict FVT for one horizontal strip of the input rasters and write it to its own GeoTIFF chunk.

    Inputs:
    - row: First raster row of the strip.
    - model_fpath: Path of the trained model, loaded here with joblib.
    - win_height: Number of rows per strip; the last strip is clipped to the raster height.
    - raster_dict: Mapping of predictor name -> raster file path. All rasters are read
      with the same window, so they are expected to share one grid.
    - ras_shape: Shape of the rasters; only ras_shape[0] (row count) is used.
    - out_dir: Directory the chunk file is written to.
    - out_meta: Rasterio metadata used as a template for the chunk. It is copied, not modified.

    Returns:
    - None. Writes `data_chunk_<row>.tif` into out_dir.
    """
    if not raster_dict:
        raise ValueError("raster_dict must contain at least one raster")

    model = load(model_fpath)

    # Clip the strip so it does not run past the last raster row
    row_start = row
    row_end = min(row + win_height, ras_shape[0])
    row_slice = (row_start, row_end)

    # Read the same window from each raster; each one becomes a flattened column
    data_dict = {}
    for var, fpath in raster_dict.items():
        data_chunk, _, data_transform = windowed_read(fpath, row_slice)
        data_dict[var] = data_chunk.ravel()

    df = pd.DataFrame(data_dict)

    clear_output()

    # Run model to predict FVT for the window
    out_arr = predict_FVT(model, df)

    # Rows must be in pixel (index) order before reshaping back to the window shape;
    # sorting is cheap when they already are. [0] drops the band axis.
    out_arr_2D = out_arr.sort_index().to_numpy().reshape(data_chunk.shape)[0]

    # Work on a copy so the caller's metadata template is left untouched
    chunk_meta = dict(out_meta)
    chunk_meta.update({
        'height': out_arr_2D.shape[0],
        'width': out_arr_2D.shape[1],
        'transform': data_transform
    })

    # Name the chunk after its first row: unique per strip, unlike a wall-clock
    # timestamp, which parallel workers could share and so overwrite each other's output
    out_file = f"data_chunk_{row_start:08d}.tif"
    with rasterio.open(os.path.join(out_dir, out_file), 'w+', **chunk_meta) as out:
        out.write(out_arr_2D, indexes=1)

# __Run Model Predictions__
----

In [None]:
# Define filepath to model
model_fpath = os.path.join(paths_dict['model_dir'], paths_dict['model_fname'])

# Define the source raster file names. The keys become the dataframe column names
# passed to the model.
# NOTE(review): several keys say LF20 while the files are named LC22 - confirm the
# keys match the feature names the model was trained with.
raster_paths = {
    "LF20_FVT" : os.path.join(paths_dict['data_dir'], "LC22_FVT_220_bpa.tif"),
    "LF22_FDST" : os.path.join(paths_dict['data_dir'], "LC22_FDst_230_bpa.tif"),
    "LF20_FVC" : os.path.join(paths_dict['data_dir'], "LC22_FVC_220_bpa.tif"),
    "LF20_FVH" : os.path.join(paths_dict['data_dir'], "LC22_FVH_220_bpa.tif"),
    "ZONE" : os.path.join(paths_dict['data_dir'], "us_lf_zones_bpa.tif"),
    "ASPECT" : os.path.join(paths_dict['data_dir'], "LC20_Asp_220_bpa.tif"),
    "SLOPE" : os.path.join(paths_dict['data_dir'], "LC20_SlpD_220_bpa.tif"),
    "ELEVATION" : os.path.join(paths_dict['data_dir'], "LC20_Elev_220_bpa.tif"),
    "BPS" : os.path.join(paths_dict['data_dir'], "LC22_BPS_220_bpa.tif")
}

# Create the timestamped directory the data chunks are written to
out_dir = make_dir(base_dir=paths_dict['out_base_dir'], new_dir_name=paths_dict['new_dir_name'])

# Arbitrarily use the LF20_FVT raster for the output metadata template and the raster
# dimensions. Both are read inside one `with` block so the file handle is closed again.
with rasterio.open(raster_paths['LF20_FVT']) as src:
    out_meta = src.meta.copy()
    ras_shape = src.shape

# Define window height and number of parallel workers
win_height = 1000  # Number of rows to process at once
n_jobs = 24

# Run predict function in parallel, one task per strip of win_height rows
Parallel(n_jobs=n_jobs)(
    delayed(window_predict)(row, model_fpath, win_height, raster_paths, ras_shape, out_dir, out_meta)
    for row in range(0, ras_shape[0], win_height)
)

## Mosaic the Data Chunks
Combines the chunks generated above into a single raster. 

In [None]:
# Get the file paths of the generated data chunks
raster_fpaths = sorted(glob.glob(os.path.join(out_dir, "*.tif")))
if not raster_fpaths:
    raise FileNotFoundError(f"No data chunks (*.tif) found in {out_dir}")

# Open every chunk, merge them, and always close the handles again - even on failure
src_files_to_mosaic = []
try:
    for fpath in raster_fpaths:
        src_files_to_mosaic.append(rasterio.open(fpath))

    # Merge the data chunks into a single raster
    mosaic, out_trans = merge(src_files_to_mosaic)

    # Use one chunk's metadata as the template; size and transform are replaced below
    out_meta = src_files_to_mosaic[-1].meta.copy()
finally:
    for src in src_files_to_mosaic:
        src.close()

out_meta.update({
    "driver" : "GTiff",
    "height" : mosaic.shape[1],
    "width" : mosaic.shape[2],
    "transform" : out_trans
})

# Write out the mosaic raster next to the chunk directory, named after it
fname = out_dir + '.tif'
with rasterio.open(fname, "w", **out_meta) as dest:
    dest.write(mosaic)

print(f"Raster written to {fname}")

# Delete the chunks
#for raster in raster_fpaths:
    #os.remove(raster)