In [313]:
import os, warnings
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime, timedelta

# Load functions to load data and make decision trees
%run ./load_nc_and_subset.ipynb
%run ./DecisionTree.ipynb

warnings.simplefilter('ignore') # Ignore warnings. 

In [2]:
# Function to create and train a random forest

# Function to make predictions using a random forest


In [3]:
# Function to create and train a boosted ensemble model

# Function to make predictions using a boosted ensemble model


In [25]:
# Load the data
sesrFName = 'sesr_all_years_USDMTimeScale_conus.nc'
spiFName  = 'SPI_all_years_USDMTimeScale_conus.nc'
usdmFName = 'USDM_grid_all_years.nc'

sesrSName = 'sesr'
spiSName  = 'SPI'
usdmSName = 'USDM'

sesr = LoadNC(sesrFName, sesrSName)
spi  = LoadNC(spiFName, spiSName)
usdm = LoadNCnomask(usdmFName, usdmSName)

In [205]:
# Correct the mask
def load2Dnc(filename, SName, path = '../Data/'):
    '''
    '''
    
    with Dataset(path + filename, 'r') as nc:
        var = nc.variables[SName][:,:]
        
    return var

mask = load2Dnc('land.nc', 'land')
lat = load2Dnc('lat_narr.nc', 'lat') # Dataset is lat x lon
lon = load2Dnc('lon_narr.nc', 'lon') # Dataset is lat x lon

# Turn positive lon values into negative
for i in range(len(lon[:,0])):
    ind = np.where( lon[i,:] > 0 )[0]
    lon[i,ind] = -1*lon[i,ind]

# Turn mask from time x lat x lon into lat x lon x time
T, I, J = mask.shape

maskNew = np.ones((I, J, T)) * np.nan
maskNew[:,:,0] = mask[0,:,:] # No loop is needed since the time dimension has length 1

# Subset the data to the same values as the criteria data
LatMin = 25
LatMax = 50
LonMin = -130
LonMax = -65
maskSub, LatSub, LonSub = SubsetData(maskNew, lat, lon, LatMin = LatMin, LatMax = LatMax,
                                     LonMin = LonMin, LonMax = LonMax) 

In [206]:
# Prepare the data

# For simplicity, consider only cases where there is or is not drought. Worry about intensity another day.
usdm['USDM'][usdm['USDM'] > 0] = 1
usdm['USDM'][usdm['USDM'] == 0] = -1

# Ensure the land-sea mask has been applied
usdm['USDM'][maskSub[:,:,0] == 0] = np.nan
sesr['sesr'][maskSub[:,:,0] == 0] = np.nan
spi['SPI'][maskSub[:,:,0] == 0] = np.nan

# Collect the training data (2019, a null year, and 2011 an extreme drought year)
TestInd = np.where( (usdm['year'] == 2011) | (usdm['year'] == 2019) )[0]
TrainValInd = np.where( (usdm['year'] != 2011) & (usdm['year'] != 2019) )[0]

sesr_test = sesr['sesr'][:,:,TestInd]
spi_test  = spi['SPI'][:,:,TestInd]
usdm_test = usdm['USDM'][:,:,TestInd]

sesr_TrainVal = sesr['sesr'][:,:,TrainValInd]
spi_TrainVal  = spi['SPI'][:,:,TrainValInd]
usdm_TrainVal = usdm['USDM'][:,:,TrainValInd]

# Collect the training and validation datasets. Use 2017, drought in northern plains, as a validation set
ValInd = np.where(usdm['year'] == 2017)[0]
TrainInd = np.where( (usdm['year'] != 2011) & (usdm['year'] != 2017) & (usdm['year'] != 2019) )[0]

sesr_train = sesr['sesr'][:,:,TrainInd]
spi_train  = spi['SPI'][:,:,TrainInd]
usdm_train = usdm['USDM'][:,:,TrainInd]

sesr_val = sesr['sesr'][:,:,ValInd]
spi_val  = spi['SPI'][:,:,ValInd]
usdm_val = usdm['USDM'][:,:,ValInd]

# Transform the data into 1D arrays for easier iterations in the SL learning.
I, J, T_train = usdm_train.shape
T_val         = usdm_val.shape[-1]
T_train_val   = usdm_TrainVal.shape[-1]
T_test        = usdm_test.shape[-1]

# USDM is the label, therefore is the y vector in the learning algorithms.
y_train     = usdm_train.reshape(I*J*T_train, order = 'F')
y_val       = usdm_val.reshape(I*J*T_val, order = 'F')
y_train_val = usdm_TrainVal.reshape(I*J*T_train_val, order = 'F')
y_test      = usdm_test.reshape(I*J*T_test, order = 'F')


sesr_train1D     = sesr_train.reshape(I*J*T_train, order = 'F')
sesr_val1D       = sesr_val.reshape(I*J*T_val, order = 'F')
sesr_train_val1D = sesr_TrainVal.reshape(I*J*T_train_val, order = 'F')
sesr_test1D      = sesr_test.reshape(I*J*T_test, order = 'F')

spi_train1D     = spi_train.reshape(I*J*T_train, order = 'F')
spi_val1D       = spi_val.reshape(I*J*T_val, order = 'F')
spi_train_val1D = spi_TrainVal.reshape(I*J*T_train_val, order = 'F')
spi_test1D      = spi_test.reshape(I*J*T_test, order = 'F')


# Finally, the features are the compination of SESR and SPI
# For x, the first column is SESR, the second is SPI. shape[-1] is the number of features
x_train     = np.asarray([sesr_train1D, spi_train1D]).T
x_val       = np.asarray([sesr_val1D, spi_val1D]).T
x_train_val = np.asarray([sesr_train_val1D, spi_train_val1D]).T
x_test      = np.asarray([sesr_test1D, spi_test1D]).T

# Quick note for the case of perceptrons, any type of regression, etc.
# A column for the bias needs to be included, so a few more lines are needed.
# e.g., for the training data set, the following line would be needed:
# ones_train = np.ones((T_train))
# x_train = np.asarray([ones_train, sesr_train1D, spi_train1D]).T
# and so on for the other datasets.

In [311]:
# Test the decision tree
# Create some simple attributes

attributes = np.asarray([[-1.3, -0.5], [-0.8, 0]])

tree = DecisionTreeTrain(x_train, y_train, attributes, max_depth = 5, multiplier = 1)

yhat, P_pos, P_neg = DecisionTreePredict(tree, x_val, attributes)

print(np.sum(usdm['USDM'] == 1))
print(np.sum(usdm['USDM'] == -1))
print(tree)
print(yhat)
print(P_pos)
print(P_neg)
print(np.nansum(P_pos + P_neg), np.sum(~np.isnan(y_val)))



1087809
6387231
[[None None None 0 0]
 [None 0 -1.3 1 0]
 [1 1 -0.5 2 0]
 [1 1 0.0 2 1]
 [None 0 -0.8 1 1]
 [-1 1 -0.5 2 0]
 [-1 1 0.0 2 1]]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
5997.0 744640




In [316]:
# Tree is satisfacorially tested. Create a tree for the experiment
# Create attributes based on the USDM classification for SPI
# Since the calculated SPI values for this expermint seem to be small, use values one tick up for SPI
attributes = np.asarray([[-2.0, -1.6], [-1.6, -1.3], [-1.3, -0.8], [-0.8, 0], [np.nanmax(sesr['sesr']), np.nanmax(spi['SPI'])]])
# Note increments stop at the D0 classification. Any values above -0.8 SESr or 0 SPI should be at 
# or above normal moisture, so no drought.
# Maximum values are then used to ensure the entire dataset in divided into subsets.

# Since droughts are extremes, toy with the multiplier parameter for a better fit. Go from 1 to 10, 
# since the number no drought cases is about an order of magnitude higher than drought cases
multipliers = np.arange(1, 10, 0.5)

# Temperory y_val to make the cross entropy calculations easier.
y_val[y_val == -1] = 0

# Make a tree for each multiplier using training data, and calculate the entropy using validitation data
# to determine which is best.
for mult in multipliers:
    tree = DecisionTreeTrain(x_train, y_train, attributes, max_depth = 5, multiplier = mult)
    yhat, P_pos, P_neg = DecisionTreePredict(tree, x_val, attributes)
    
    # Calculate the cross entropy.
    Cross_Entropy = -1/(np.sum(~np.isnan(y_val))) * np.nansum(y_val * np.log(P_pos + 1e-5) + (1 - y_val) * np.log(1 - P_pos + 1e-5))
    
    print('The cross entropy for a tree with multiplier %4.2f is: %4.3f' %(mult, Cross_Entropy))

The cross entropy for a tree with multiplier 1.00 is: 0.392
The cross entropy for a tree with multiplier 1.50 is: 0.392
The cross entropy for a tree with multiplier 2.00 is: 0.392
The cross entropy for a tree with multiplier 2.50 is: 0.392
The cross entropy for a tree with multiplier 3.00 is: 0.392
The cross entropy for a tree with multiplier 3.50 is: 0.085
The cross entropy for a tree with multiplier 4.00 is: 0.085
The cross entropy for a tree with multiplier 4.50 is: 0.085
The cross entropy for a tree with multiplier 5.00 is: 0.085
The cross entropy for a tree with multiplier 5.50 is: 0.085
The cross entropy for a tree with multiplier 6.00 is: 0.085
The cross entropy for a tree with multiplier 6.50 is: 0.062
The cross entropy for a tree with multiplier 7.00 is: 0.062
The cross entropy for a tree with multiplier 7.50 is: 0.062
The cross entropy for a tree with multiplier 8.00 is: 0.062
The cross entropy for a tree with multiplier 8.50 is: 0.062
The cross entropy for a tree with multip

In [318]:
# High multipliers did best. Take multiplier = 9. 
multiplier = 9

# Restore y_val.
y_val[y_val == 0] = -1

# Create the final Tree model
tree = DecisionTreeTrain(x_train_val, y_train_val, attributes, max_depth = 5, multiplier = multiplier)

# Make some predictions
tree_yhat, tree_P_pos, tree_P_neg = DecisionTreePredict(tree, x_test, attributes)