In [80]:
import os
import math
import pandas as pd
import numpy as np
# Display configuration: two-decimal numpy output and untruncated pandas frames.
np.set_printoptions(precision=2)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` means "never truncate cell contents". The former sentinel -1 was
# deprecated in pandas 1.0 and is rejected by the option validator in pandas 2.x.
pd.set_option('display.max_colwidth', None)

%matplotlib notebook
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider
from IPython.core.display import display, HTML
import seaborn as sn

# import tensorflow.contrib.learn as skflow
# import tensorflow as tf
# tf.logging.set_verbosity(tf.logging.ERROR)
# from tensorflow.python.data import Dataset

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import power_transform
from scipy.stats import boxcox
from scipy.special import inv_boxcox


from datetime import datetime

# import logging
# logging.basicConfig(level=logging.INFO)
# logging.info('Tensorflow %s' % tf.__version__) # 1.4.1

In [81]:
def binary_threshold(series, threshold):
    """Maps each element to 1 when it is strictly greater than threshold, otherwise 0"""
    def _flag(value):
        if value > threshold:
            return 1
        return 0

    return series.apply(_flag)

def clip(series, clip_to_min, clip_to_max):
    """
    Bounds every value of the series to the range [clip_to_min, clip_to_max]
    e.g. ([-2, 4, 1], 0, 3) -> [0, 3, 1]
    """
    # TODO: Use this to clip negative fire sizes from regression model
    def _bounded(value):
        # raise to the lower bound first, then cap at the upper bound
        raised = max(value, clip_to_min)
        return min(raised, clip_to_max)

    return series.apply(_bounded)

def z_score_transform(series):
    """
    Standardizes a series: each element becomes (x - mean) / std_dv.

    Returns a dict with
      "series": the standardized series
      "params": {"mean", "std_dv"} as computed by series.mean() / series.std(),
                i.e. the arguments needed by inv_z_score_transform
    """
    center = series.mean()
    spread = series.std()

    def _standardize(value):
        return (value - center) / spread

    return {
        "series": series.apply(_standardize),
        "params": {"mean": center, "std_dv": spread},
    }

def inv_z_score_transform(series, mean, std_dv):
    """Turns each z-score in the series back into x = z * std_dv + mean"""
    def _restore(z_value):
        return z_value*std_dv+mean

    return series.apply(_restore)
    
def root_transform(series):
    """
    Shifts the series so its minimum becomes 1, then takes the square root
    of each element.

    Returns a dict with
      "series": the transformed series
      "params": {"min_val"}, the minimum of the input (needed to invert)
    """
    floor = series.min()

    def _shifted_root(value):
        return math.sqrt(1.0+value-floor)

    return {
        "series": series.apply(_shifted_root),
        "params": {"min_val": floor},
    }

def inv_root_transform(series, min_val):
    """
    Inverts root_transform: squares each element, then undoes the shift
    that moved the original minimum to 1.

    series: root-transformed values
    min_val: the "min_val" parameter reported by root_transform
    Returns a pandas Series of values on the original scale.
    """
    # Return the Series itself; a stray trailing comma previously wrapped it
    # in a 1-tuple.
    return series.apply(lambda x: math.pow(x, 2)-1.0+min_val)

def box_cox_transform(series):
    """
    Shifts the series so its minimum becomes 1 (Box-Cox needs positive data),
    then applies scipy's Box-Cox transformation with a fitted lambda.

    Returns a dict with
      "series": the transformed values exactly as returned by scipy.stats.boxcox
                (not re-wrapped in a pandas Series)
      "params": {"min_val", "lmbda"}, the shift and the fitted lambda
    """
    offset = series.min()
    shifted = series.apply(lambda value: 1.0+value-offset)
    transformed, fitted_lambda = boxcox(shifted)
    params = {
        "min_val": offset,
        "lmbda": fitted_lambda,
    }
    return {"series": transformed, "params": params}
    
def inv_box_cox_transform(series, min_val, lmbda):
    """
    Applies an inverse Box-Cox transformation with a given lambda value, then
    undoes the shift that moved the original minimum to 1.

    series: Box-Cox transformed values. May be 1-D (e.g. the "series" output of
            box_cox_transform) or 2-D with one row per sample, in which case
            the first entry of every row is used.
    min_val: the "min_val" parameter reported by box_cox_transform
    lmbda: the "lmbda" parameter reported by box_cox_transform
    Returns a pandas Series. A 1-D pandas Series input keeps its index; every
    other input gets a fresh default index.
    """
    restored = np.asarray(inv_boxcox(series, lmbda))
    if restored.ndim <= 1:
        # 1-D input: there is no per-row wrapper to unpack, so shift directly.
        index = series.index if isinstance(series, pd.Series) else None
        return pd.Series(restored.reshape(-1)-1.0+min_val, index=index)
    # 2-D input (one row per sample): take the first entry of each row.
    return pd.Series([row[0]-1.0+min_val for row in restored])
    
def linear_transform(series):
    """
    Applies min-max scaling so the minimum maps to -1 and the maximum to +1.

    Returns a dict with
      "series": the scaled series
      "params": {"min_val", "scale"} where scale is half the value range
    """
    lowest = series.min()
    highest = series.max()
    half_range = (highest - lowest) / 2.0

    def _rescale(value):
        return (value - lowest) / half_range - 1.0

    return {
        "series": series.apply(_rescale),
        "params": {"min_val": lowest, "scale": half_range},
    }

def inv_linear_transform(series, min_val, scale):
    """
    Undoes the min-max scaling done by linear_transform.

    linear_transform computes y = (x - min_val) / scale - 1, so the inverse is
    x = (y + 1) * scale + min_val.

    series: scaled values
    min_val, scale: the "params" reported by linear_transform
    Returns a pandas Series of values on the original scale.
    """
    # The +1 must be added before multiplying by scale; adding it afterwards
    # is only correct when scale happens to be 1.
    return series.apply(lambda x: (x+1.0)*scale+min_val)

def log_transform(series):
    """
    Shifts the series so its minimum becomes 1, then takes the natural log
    of each element.

    Returns a dict with
      "series": the transformed series
      "params": {"min_val"}, the minimum of the input (needed to invert)
    """
    floor = series.min()

    def _shifted_log(value):
        return math.log(1.0+value-floor)

    return {
        "series": series.apply(_shifted_log),
        "params": {"min_val": floor},
    }

def inv_log_transform(series, min_val):
    """Inverts log_transform: exponentiates each value, then removes the shift"""
    def _restore(log_value):
        return math.exp(log_value)-1.0+min_val

    return series.apply(_restore)

def normalize(dataframe):
    """
    Returns a version of the input DataFrame that has all its features normalized.

    Each column is matched by substring against the feature groups below and
    transformed by the first group that matches (Box-Cox, then linear, log,
    root, z-score). Columns matching no group are only cast to float32.

    Returns (normalized, params):
      normalized: float32 DataFrame sharing the input's index
      params: dict mapping each transformed column name to the parameters
              its transform reported (needed to invert it later)
    """
    z_features = []
    root_features = []
    log_features = []
    linear_features = ["temperature", "humidity"]
    power_features = ["Size", "windSpeed", "windGust", "dewPoint", "windBearing", "pressure"]

    # Start from the input's index. An index-less frame adopts the index of
    # the first column assigned: a bare array (the Box-Cox output) gives it a
    # fresh 0..n-1 index, and later Series columns are then label-aligned
    # against that, misaligning rows on a shuffled or filtered frame.
    normalized = pd.DataFrame(index=dataframe.index)
    params = {}

    def matches(column, keys):
        """True when any key is a substring of the column name."""
        return any(key in column for key in keys)

    def store(column, res):
        """Records a transform result and its parameters for one column."""
        normalized[column] = res["series"].astype('float32')
        params[column] = res["params"]

    for column in dataframe.columns:
        s = dataframe[column]
        if matches(column, power_features):
            store(column, box_cox_transform(s))
        elif matches(column, linear_features):
            store(column, linear_transform(s))
        elif matches(column, log_features):
            store(column, log_transform(s))
        elif matches(column, root_features):
            store(column, root_transform(s))
        elif matches(column, z_features):
            store(column, z_score_transform(s))
        else:
            normalized[column] = s.astype('float32')
    return normalized, params

In [82]:
def ignition_index(indexed_feature):
    """Extracts the hour relative to fire ignition from an indexed feature.
    e.g.
    "temperature_24" -> 24

    Every known feature name and the underscore separator are removed from
    the string; whatever remains is parsed with int(), which raises
    ValueError when the remainder is not an integer.
    """
    # Removal order is preserved from the original chain of replacements.
    removable = (
        "summary",
        "precipType",
        "temperature",
        "apparentTemperature",
        "dewPoint",
        "humidity",
        "pressure",
        "windSpeed",
        "windGust",
        "windBearing",
        "cloudCover",
        "uvIndex",
        "visibility",
        "precipIntensity",
        "precipProbability",
        "_",
    )
    remainder = indexed_feature
    for token in removable:
        remainder = remainder.replace(token, "")
    return int(remainder)

def indexed_feature_to_feature(indexed_feature):
    """
    Extracts the feature property name from an indexed feature name.
    e.g. "temperature_24" -> "temperature"

    Underscores are always removed. When ignition_index cannot parse an hour
    from the name (it raises ValueError, e.g. for "Size"), the name is
    returned with nothing else stripped.
    """
    f = indexed_feature.replace("_", "")
    try:
        ii = str(ignition_index(indexed_feature))
    except ValueError:
        # No parsable hour index: nothing more to strip. Only ValueError is
        # caught so unrelated failures and interrupts are not silently hidden.
        return f
    return f.replace(ii, "")

def has_numbers(string):
    """Determines if a string contains at least one digit character"""
    for char in string:
        if char.isdigit():
            return True
    return False

In [83]:
# def construct_feature_columns(input_features):
#     """Construct the TensorFlow Feature Columns.
#     Args:
#         input_features: The names of the numerical input features to use.
#     Returns:
#         A set of feature columns
#     """ 
#     return set([tf.feature_column.numeric_column(my_feature)
#                   for my_feature in input_features.columns])

In [84]:
# def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
#     """Trains a neural network model.

#     Args:
#         features: pandas DataFrame of features
#         targets: pandas DataFrame of targets
#         batch_size: Size of batches to be passed to the model
#           shuffle: True or False. Whether to shuffle the data.
#         num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
#     Returns:
#         Tuple of (features, labels) for next data batch
#     """

#     # Convert pandas data into a dict of np arrays.
#     features = {key: np.array(value) for key, value in dict(features).items()}

#     # Construct a dataset, and configure batching/repeating.
#     ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit
#     ds = ds.batch(batch_size).repeat(num_epochs)

#     # Shuffle the data, if specified.
#     if shuffle:
#         ds = ds.shuffle(10)

#     # Return the next batch of data.
#     features, labels = ds.make_one_shot_iterator().get_next()
#     return features, labels

In [92]:
def preprocess_master(dataframe, features):
    """
    Prepares input features from California wildfire data.

    dataframe: A Pandas DataFrame of wildfire events whose time-indexed columns
      are named like "temperature_24", "temperature0", "temperature24", ...
    features: Feature names to keep. A column is a candidate when any of these
      names is a substring of the column name.

    A candidate column is kept when it has no digits in its name (not time
    indexed) or when its hour index is a multiple of 24 (daily data). All
    other columns are dropped, then every row with a NaN in a remaining
    column is removed.

    Returns: A Pandas DataFrame containing the features to be used for the model.
    """
    def is_requested(col):
        # any requested feature is a substring of the column name
        return any(f in col for f in features)

    def is_static_or_daily(col):
        if not has_numbers(col):
            return True  # not time indexed
        return ignition_index(col) % 24 == 0  # daily samples only

    # Test the cheap feature match first: the hour index is then only parsed
    # for columns that were asked for, so an unrelated column that happens to
    # contain digits (e.g. "Unnamed: 0") is dropped instead of raising
    # ValueError inside ignition_index.
    unwanted = [col for col in dataframe.columns
                if not (is_requested(col) and is_static_or_daily(col))]
    return dataframe.drop(columns=unwanted).dropna()

def test_preprocess_master():
    """
    Checks preprocess_master on an empty frame that carries one column per
    kind of raw feature: only the requested features at daily (or no) hour
    index should survive. Returns the filtered frame for inspection.
    """
    raw_columns = [
        "Event", "Latitude", "Longitude", "summary0", "precipType_24", "temperature24",
        "apparentTemperature_48", "dewPoint0", "humidity_24", "pressure48", "windSpeed48", "windGust48",
        "windBearing0", "cloudCover_366", "uvIndex366", "visibility96", "Size", "Costs",
    ]
    wanted = [
        "temperature", "dewPoint", "humidity", "pressure",
        "windSpeed", "windGust", "windBearing", "Size",
    ]
    expected_columns = [
        "temperature24", "dewPoint0", "humidity_24", "pressure48",
        "windSpeed48", "windGust48", "windBearing0", "Size",
    ]
    df = preprocess_master(pd.DataFrame(columns=raw_columns), wanted)
    assert np.array_equal(df.columns, expected_columns)
    return df

def get_split(dataframe, features):
    """
    Creates a training, validation, and test data set from the given
    Pandas DataFrame and array of feature names.

    Rows are split by position in their current order (no shuffling happens
    here): the first 80% for training, and the remainder halved between
    validation (first half, rounded down) and test.

    dataframe: DataFrame holding the input columns and the target column.
    features: Sequence of feature names. Only the last entry is used, as the
      name of the target column; every other column of the frame becomes a
      model input.

    Returns a 7-tuple:
      (training_features, training_targets,
       validation_features, validation_targets,
       test_features, test_targets,
       all_targets)
    where all_targets is the target column over every row.
    """
    df = dataframe.copy()

    # want to predict the last listed feature
    target_feature = features[-1]
    all_targets = df[target_feature]

    # create an 80/10/10 train/validation/test positional split
    training_percent = .80
    total_rows = len(df.index)
    training_length = math.floor(total_rows * training_percent)
    rest_size = total_rows - training_length
    validation_end = training_length + math.floor(rest_size / 2)

    def column_split(subset):
        """Separates one row subset into (input columns, target column)."""
        return subset.drop(columns=target_feature), subset[target_feature]

    # .iloc makes the positional intent explicit regardless of index dtype
    training_features, training_targets = column_split(df.iloc[:training_length])
    validation_features, validation_targets = column_split(df.iloc[training_length:validation_end])
    test_features, test_targets = column_split(df.iloc[validation_end:])

    return (
        training_features,
        training_targets,
        validation_features,
        validation_targets,
        test_features,
        test_targets,
        all_targets,
    )

In [86]:
def feature_to_columns(df, feature):
    '''
    Lists, in column order, every column name of the dataframe that contains
    the feature string.
    e.g. "temperature" -> ..., "temperature_24", "temperature0", "temperature24", ...
    '''
    matching = []
    for name in df.columns.to_list():
        if feature in name:
            matching.append(name)
    return matching

def get_feature_bounds(df, feature):
    '''
    Returns the min and max value of a feature variable in the given dataframe.

    The bounds are taken across every column selected by
    feature_to_columns(df, feature), using the "min" and "max" rows of
    DataFrame.describe() for those columns.

    Raises a ValueError if no column is selected for the feature.
    '''
    cols = feature_to_columns(df, feature)
    if not cols:
        # fail early with a message that names the feature instead of
        # letting describe() reject the empty column selection
        raise ValueError("no columns found for feature: " + repr(feature))

    # summarise the selection once and read both rows from the same table
    summary = df[cols].describe()
    mn = min(summary.loc["min"].to_list())
    mx = max(summary.loc["max"].to_list())
    return mn, mx

# Load the Data

In [87]:
# Load data split from previous run

# X_train = pd.read_csv('.\\data\\'+'run_2'+'\\X_train.csv').astype("float32")
# y_train = pd.read_csv('.\\data\\'+'run_2'+'\\y_train.csv').astype("float32")
# X_valid = pd.read_csv('.\\data\\'+'run_2'+'\\X_valid.csv').astype("float32")
# y_valid = pd.read_csv('.\\data\\'+'run_2'+'\\y_valid.csv').astype("float32")
# X_test = pd.read_csv('.\\data\\'+'run_2'+'\\X_test.csv').astype("float32")
# y_test = pd.read_csv('.\\data\\'+'run_2'+'\\y_test.csv').astype("float32")

# y_all = pd.concat([y_train, y_valid, y_test])
# print(y_all.describe())

# # contains train, valid, test
# y_all.to_csv('.\\data\\'+RUN_NAME+'\\y_all.csv')

In [95]:
# load master data
csv_path = "data/wildfire_events_full.csv"
df = pd.read_csv(csv_path, low_memory=False)

# shuffle the data to prevent bias; the generator is seeded so that a
# kernel restart followed by "Run All" reproduces the same row order
SHUFFLE_SEED = 42  # change to draw a different (still reproducible) shuffle
df = df.reindex(np.random.RandomState(SHUFFLE_SEED).permutation(df.index))

# # turn each time-indexed feature into just the feature name
# cols = pd.DataFrame(df.columns.to_list())
# cols.columns = ["feature"]
# cols["feature"] = cols["feature"].map(indexed_feature_to_feature)
# cols = cols.drop_duplicates().reset_index(drop=True)
# available_features = cols["feature"].to_list()
# cols

# Features available
# Event, Latitude, Longitude, summary, precipType, temperature, apparentTemperature,
# dewPoint, humidity, pressure, windSpeed, windGust, windBearing, cloudCover,
# uvIndex, visibility, Size, Costs

In [97]:
# Select the features to begin modeling, leaving the last feature as the target

# After testing I found these features to have these distributions:
# temperature: normal
# pressure: normal
# dewPoint: transform normal
# humidity: transform normal
# windSpeed: transform normal
# windGust: transform normal
# Size: transform normal
# windBearing: not transform normal
# cloudCover: not transform normal
# visibility: not transform normal
# precipIntensity: 
# precipProbability: 
# Note: cloudCover and visibility are both heavily skewed because their measurements are clipped, so the skew is likely not random => don't use them!

features = [
    "temperature",
    "dewPoint",
    "humidity",
    "pressure",
    "windSpeed",
    "windGust",
    "Size",  # target feature, kept last
]

# build the modeling dataframe from the master data and summarise every column
all_df = preprocess_master(df, features)
all_df.describe()

Unnamed: 0,temperature_336,dewPoint_336,humidity_336,pressure_336,windSpeed_336,windGust_336,temperature_312,dewPoint_312,humidity_312,pressure_312,windSpeed_312,windGust_312,temperature_288,dewPoint_288,humidity_288,pressure_288,windSpeed_288,windGust_288,temperature_264,dewPoint_264,humidity_264,pressure_264,windSpeed_264,windGust_264,temperature_240,dewPoint_240,humidity_240,pressure_240,windSpeed_240,windGust_240,temperature_216,dewPoint_216,humidity_216,pressure_216,windSpeed_216,windGust_216,temperature_192,dewPoint_192,humidity_192,pressure_192,windSpeed_192,windGust_192,temperature_168,dewPoint_168,humidity_168,pressure_168,windSpeed_168,windGust_168,temperature_144,dewPoint_144,humidity_144,pressure_144,windSpeed_144,windGust_144,temperature_120,dewPoint_120,humidity_120,pressure_120,windSpeed_120,windGust_120,temperature_96,dewPoint_96,humidity_96,pressure_96,windSpeed_96,windGust_96,temperature_72,dewPoint_72,humidity_72,pressure_72,windSpeed_72,windGust_72,temperature_48,dewPoint_48,humidity_48,pressure_48,windSpeed_48,windGust_48,temperature_24,dewPoint_24,humidity_24,pressure_24,windSpeed_24,windGust_24,temperature0,dewPoint0,humidity0,pressure0,windSpeed0,windGust0,temperature24,dewPoint24,humidity24,pressure24,windSpeed24,windGust24,temperature48,dewPoint48,humidity48,pressure48,windSpeed48,windGust48,temperature72,dewPoint72,humidity72,pressure72,windSpeed72,windGust72,temperature96,dewPoint96,humidity96,pressure96,windSpeed96,windGust96,temperature120,dewPoint120,humidity120,pressure120,windSpeed120,windGust120,temperature144,dewPoint144,humidity144,pressure144,windSpeed144,windGust144,temperature168,dewPoint168,humidity168,pressure168,windSpeed168,windGust168,temperature192,dewPoint192,humidity192,pressure192,windSpeed192,windGust192,temperature216,dewPoint216,humidity216,pressure216,windSpeed216,windGust216,temperature240,dewPoint240,humidity240,pressure240,windSpeed240,windGust240,temperature264,dewPoint264,humidity264,pressure264,windSpeed264,w
indGust264,temperature288,dewPoint288,humidity288,pressure288,windSpeed288,windGust288,temperature312,dewPoint312,humidity312,pressure312,windSpeed312,windGust312,temperature336,dewPoint336,humidity336,pressure336,windSpeed336,windGust336,Size
count,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0
mean,61.074134,45.744768,0.612752,1013.235164,1.615266,5.575062,61.071518,45.660793,0.610713,1013.193352,1.592922,5.577803,61.089094,45.93402,0.614009,1013.132129,1.564587,5.515504,60.891608,46.082752,0.621586,1013.147135,1.539864,5.471166,60.936852,45.90521,0.61675,1013.20923,1.487576,5.463749,61.179581,45.912514,0.612186,1013.390034,1.55547,5.529807,61.659558,46.081631,0.607475,1013.269253,1.497486,5.539638,61.802548,46.195595,0.606263,1013.116614,1.571552,5.532424,61.739366,46.305663,0.609162,1013.123533,1.558097,5.458233,61.935108,45.970815,0.600555,1013.162933,1.532888,5.506274,62.074134,45.76786,0.59171,1013.157792,1.524394,5.426489,62.134349,45.586988,0.586059,1013.140476,1.503239,5.490555,62.51923,45.033998,0.568743,1013.129706,1.536195,5.629309,62.873975,44.110453,0.549003,1013.187169,1.57684,5.746682,63.701268,42.900306,0.516795,1013.095527,1.816727,6.371144,63.786217,42.685108,0.514519,1013.11487,1.892888,6.669173,62.970453,43.769513,0.545391,1013.117429,1.61393,5.877746,62.394009,44.511857,0.569943,1013.0541,1.556795,5.715459,62.103726,45.186342,0.58487,1013.128494,1.528007,5.552072,61.938324,45.386399,0.594066,1013.043058,1.576954,5.570227,61.890249,45.453488,0.595595,1013.017339,1.558981,5.538834,62.043296,45.597497,0.596195,1013.095787,1.549694,5.480328,62.227475,45.759343,0.597622,1013.273669,1.608109,5.678709,62.197894,45.852559,0.597407,1013.236467,1.578165,5.603046,62.110612,46.099479,0.603986,1013.151359,1.48675,5.493194,61.981869,46.321065,0.610057,1013.090555,1.496104,5.522208,61.720464,46.27752,0.611178,1013.170385,1.541552,5.598222,61.53752,46.34026,0.614972,1013.23239,1.594009,5.721019,61.315164,46.304224,0.617973,1013.299162,1.571857,5.616082,4898.768448
std,8.938955,9.756195,0.216189,3.545709,1.762139,3.416785,8.899852,9.458919,0.216737,3.536657,1.753927,3.4463,8.883105,9.260391,0.210425,3.253639,1.654099,3.324581,8.807334,9.201914,0.211314,3.361573,1.556889,3.207422,8.779877,9.528122,0.209738,3.475909,1.664975,3.334576,8.853321,9.341624,0.210274,3.458969,1.656468,3.346648,8.575708,9.475325,0.212693,3.464442,1.523956,3.301132,8.574813,9.191239,0.213357,3.497551,1.72485,3.403167,8.884215,8.8651,0.210653,3.423311,1.707866,3.367951,8.909001,9.172951,0.216011,3.318785,1.636505,3.376019,8.868542,9.281443,0.212218,3.383545,1.571558,3.237352,8.848203,9.178044,0.210362,3.427293,1.569185,3.383985,8.965044,9.462515,0.211214,3.499949,1.783938,3.555916,9.150936,10.521938,0.219024,3.572653,1.906563,4.000926,9.293359,11.651804,0.22509,3.768229,2.13291,4.717722,8.759216,12.784277,0.227922,4.233125,2.229501,5.362652,8.731791,11.634963,0.226836,3.974061,1.812433,3.883837,8.706777,11.071256,0.228114,3.695495,1.757196,3.794172,8.946278,10.356875,0.221992,4.141156,1.669167,3.615077,9.059867,10.51187,0.224598,3.729307,1.791777,3.699376,9.250738,10.44751,0.222636,3.747015,1.835213,3.529624,9.172412,10.635261,0.224322,3.811668,1.780889,3.605075,9.413134,10.893461,0.227029,3.876752,1.840718,3.869044,9.391235,10.435322,0.222631,3.803471,1.882248,3.830578,9.407619,10.44766,0.2226,3.740229,1.580805,3.458956,9.13695,10.116502,0.219504,3.771952,1.615013,3.439694,9.068484,9.885913,0.215431,3.652851,1.709614,3.668303,9.128141,9.698793,0.21298,3.480385,1.701568,3.616554,9.288288,9.532388,0.209731,3.616983,1.705689,3.57925,22203.844462
min,30.06,1.18,0.06,998.5,0.0,0.0,32.3,3.75,0.17,1000.21,0.0,0.0,28.45,10.1,0.13,1002.64,0.0,0.0,33.76,1.95,0.11,1001.25,0.0,0.0,32.28,2.53,0.09,998.3,0.0,0.0,22.19,6.24,0.11,1003.48,0.0,0.0,30.15,12.4,0.09,1002.8,0.0,0.7,31.52,4.1,0.13,1000.1,0.0,0.0,29.39,7.13,0.16,1001.48,0.0,0.0,33.76,-2.42,0.16,1002.63,0.0,0.69,33.15,-4.72,0.15,1003.6,0.0,0.0,31.98,1.87,0.15,1000.3,0.0,0.0,31.64,5.58,0.11,1002.3,0.0,0.0,22.91,-5.33,0.06,1001.3,0.0,0.0,23.75,-3.85,0.07,1002.2,0.0,0.57,28.45,-2.08,0.05,995.7,0.0,0.34,36.6,-0.83,0.08,999.76,0.0,0.37,34.44,3.09,0.06,996.3,0.0,0.0,28.36,-0.2,0.05,994.11,0.0,0.5,26.32,-1.36,0.07,1001.39,0.0,0.61,27.65,-4.67,0.07,1000.1,0.0,0.29,27.58,-3.99,0.06,996.1,0.0,0.78,28.59,-3.82,0.06,992.99,0.0,0.0,29.62,-1.67,0.1,997.51,0.0,0.86,32.77,-1.13,0.07,997.0,0.0,0.0,36.07,-1.72,0.1,985.85,0.0,0.61,32.79,5.0,0.13,986.18,0.0,0.55,25.25,-0.89,0.1,1001.9,0.0,0.0,24.08,6.16,0.08,1003.42,0.0,0.66,0.1
25%,55.65,40.815,0.43,1011.02,0.525,3.245,55.555,40.975,0.43,1011.055,0.51,3.28,55.585,41.39,0.44,1011.11,0.51,3.22,55.335,41.4,0.44,1011.095,0.505,3.225,55.245,41.0,0.44,1011.115,0.43,3.18,55.65,40.995,0.44,1011.235,0.51,3.285,56.25,41.005,0.43,1011.05,0.47,3.25,56.3,41.44,0.42,1010.995,0.47,3.285,56.265,41.325,0.43,1011.0,0.48,3.24,56.005,40.55,0.41,1011.005,0.47,3.29,56.04,40.445,0.4,1011.02,0.49,3.26,56.205,40.26,0.41,1010.89,0.49,3.35,56.445,39.315,0.39,1010.94,0.46,3.31,56.785,38.195,0.365,1011.0,0.43,3.185,57.82,36.97,0.335,1010.535,0.475,3.37,57.915,36.175,0.335,1010.6,0.52,3.41,57.12,37.4,0.37,1010.535,0.53,3.48,56.75,39.315,0.38,1010.6,0.48,3.32,56.175,39.82,0.4,1010.76,0.51,3.345,56.065,40.625,0.4,1010.7,0.47,3.22,56.11,40.44,0.4,1010.53,0.47,3.245,55.995,40.135,0.4,1010.8,0.48,3.31,56.355,40.635,0.41,1010.8,0.455,3.19,56.16,41.08,0.41,1010.715,0.43,3.22,55.93,40.465,0.42,1010.56,0.47,3.2,56.215,41.48,0.435,1010.8,0.46,3.26,56.04,41.135,0.44,1010.9,0.43,3.075,55.67,41.37,0.44,1011.115,0.5,3.22,55.395,41.27,0.45,1011.0,0.47,3.24,50.0
50%,60.83,47.67,0.61,1013.11,0.98,4.68,60.63,47.55,0.62,1012.96,1.0,4.71,60.59,47.48,0.62,1013.01,1.05,4.66,60.83,47.73,0.62,1013.12,1.04,4.6,61.17,47.75,0.63,1013.09,0.95,4.68,61.36,47.89,0.62,1013.18,1.01,4.63,61.42,47.64,0.61,1013.18,0.96,4.53,61.73,48.01,0.6,1012.92,1.02,4.66,61.72,47.88,0.61,1012.94,0.95,4.65,61.73,47.32,0.6,1012.98,1.01,4.59,61.86,47.18,0.58,1013.0,0.97,4.6,62.22,46.74,0.57,1013.0,0.93,4.55,62.29,46.23,0.55,1013.0,0.99,4.6,63.18,45.82,0.53,1012.93,0.92,4.58,63.96,44.85,0.48,1012.95,1.08,4.85,64.25,45.67,0.48,1012.88,1.11,5.0,63.48,46.3,0.52,1012.76,1.03,4.78,62.73,46.93,0.55,1012.82,0.96,4.54,62.45,47.37,0.56,1012.8,0.97,4.56,62.05,47.55,0.6,1012.63,0.99,4.44,62.05,47.53,0.6,1012.69,0.95,4.57,62.31,47.58,0.61,1012.83,0.96,4.62,62.42,48.51,0.61,1013.09,0.98,4.57,62.39,47.91,0.6,1013.12,0.94,4.47,62.17,48.34,0.61,1013.02,0.93,4.58,61.83,48.13,0.61,1012.92,0.96,4.59,61.94,48.21,0.61,1013.0,0.92,4.64,61.83,48.04,0.63,1012.92,0.99,4.69,61.39,47.76,0.62,1013.0,0.96,4.55,209.0
75%,66.94,52.695,0.8,1015.31,2.22,6.965,66.94,52.425,0.795,1014.995,2.035,6.885,67.09,52.3,0.8,1015.05,2.075,6.935,66.37,52.625,0.8,1015.0,2.06,6.745,66.33,52.52,0.79,1015.155,1.93,6.66,66.625,52.5,0.79,1015.255,1.99,7.03,67.385,52.99,0.8,1015.125,2.04,7.045,67.915,52.705,0.8,1015.065,1.98,6.905,68.06,52.47,0.795,1015.09,2.065,6.665,68.235,52.72,0.79,1015.15,1.94,6.785,68.245,52.62,0.78,1015.22,2.01,6.7,68.44,52.295,0.77,1015.29,1.975,6.585,68.99,51.69,0.74,1015.085,1.93,6.825,69.35,51.57,0.73,1015.035,2.045,7.005,70.025,51.425,0.7,1014.9,2.41,7.575,69.77,51.7,0.7,1015.05,2.215,7.865,69.175,52.055,0.74,1015.025,1.995,7.16,68.8,51.95,0.76,1015.09,2.045,6.85,68.425,52.36,0.77,1015.005,1.855,6.685,68.39,52.39,0.785,1015.195,1.99,6.685,68.375,52.455,0.79,1015.025,2.035,6.81,68.55,53.095,0.79,1015.2,2.01,6.475,68.95,53.0,0.79,1015.515,2.1,6.97,68.94,53.025,0.78,1015.38,2.06,6.785,68.87,53.315,0.79,1015.41,1.95,6.76,68.255,52.99,0.8,1015.055,1.925,6.65,67.855,52.855,0.8,1015.375,1.955,6.935,67.325,53.03,0.8,1015.215,2.11,7.175,67.445,52.895,0.8,1015.54,1.92,7.035,1041.5
max,92.25,62.5,1.0,1028.52,19.99,33.59,95.74,63.63,1.0,1028.11,18.53,33.54,92.79,67.33,1.0,1026.46,21.26,29.9,91.17,65.9,1.0,1026.0,13.33,21.92,92.35,65.42,1.0,1028.63,16.64,21.85,95.43,66.58,1.0,1027.19,15.39,25.62,97.53,65.51,1.0,1030.18,10.68,21.92,96.13,64.54,1.0,1028.38,13.83,24.99,90.74,63.66,1.0,1029.53,12.33,24.93,88.77,67.3,1.0,1028.52,14.67,27.99,96.38,64.13,1.0,1026.49,10.91,22.45,92.95,64.27,1.0,1026.99,12.44,23.87,96.34,66.17,1.0,1026.99,24.6,32.29,93.19,65.21,1.0,1028.11,20.27,29.56,95.02,65.46,1.0,1030.4,16.79,37.98,100.19,68.08,1.0,1032.9,17.42,42.57,91.05,68.93,1.0,1029.31,19.63,32.4,90.13,66.98,1.0,1028.3,15.81,37.98,89.93,65.19,1.0,1065.22,13.58,37.98,91.4,63.8,1.0,1031.85,17.5,26.84,94.99,69.32,1.0,1026.92,17.98,31.58,92.64,69.76,1.0,1026.49,19.92,32.12,92.2,69.78,1.0,1027.9,18.23,35.69,91.4,63.71,1.0,1029.99,18.42,37.98,93.16,69.69,1.0,1025.56,10.45,27.33,94.47,73.91,1.0,1026.16,15.48,22.87,93.16,72.94,1.0,1028.52,14.56,28.97,92.84,69.76,1.0,1028.44,12.57,24.96,94.48,69.76,1.0,1027.9,17.8,29.9,281893.0


# Exploratory Data Analysis

In [98]:
# Partition the data by fire size quartile and choose the working dataframe

# stretch the notebook container to the full page width
display(HTML("<style>.container { width:100% !important; }</style>"))

f = "Size"
sizes = all_df[f]
size_summary = sizes.describe()
Q1, Q2, Q3 = (size_summary[label] for label in ("25%", "50%", "75%"))
IQR = Q3 - Q1

# rows whose fire size lies strictly below the 25%, 50% and 75% quantiles
# (cumulative subsets: every row of q1 is also in q2, and every row of q2 in q3)
q1, q2, q3 = (all_df[sizes < bound] for bound in (Q1, Q2, Q3))

# rows within Tukey's fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR], i.e. without outlier fires
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
tukey_df = all_df[(lower_fence <= sizes) & (sizes <= upper_fence)]

# working dataframe for the following cells; this is currently the full data,
# assign tukey_df (or q1/q2/q3) here instead to work with outlier reduced sizes
qdf = all_df
qdf.describe()

Unnamed: 0,temperature_336,dewPoint_336,humidity_336,pressure_336,windSpeed_336,windGust_336,temperature_312,dewPoint_312,humidity_312,pressure_312,windSpeed_312,windGust_312,temperature_288,dewPoint_288,humidity_288,pressure_288,windSpeed_288,windGust_288,temperature_264,dewPoint_264,humidity_264,pressure_264,windSpeed_264,windGust_264,temperature_240,dewPoint_240,humidity_240,pressure_240,windSpeed_240,windGust_240,temperature_216,dewPoint_216,humidity_216,pressure_216,windSpeed_216,windGust_216,temperature_192,dewPoint_192,humidity_192,pressure_192,windSpeed_192,windGust_192,temperature_168,dewPoint_168,humidity_168,pressure_168,windSpeed_168,windGust_168,temperature_144,dewPoint_144,humidity_144,pressure_144,windSpeed_144,windGust_144,temperature_120,dewPoint_120,humidity_120,pressure_120,windSpeed_120,windGust_120,temperature_96,dewPoint_96,humidity_96,pressure_96,windSpeed_96,windGust_96,temperature_72,dewPoint_72,humidity_72,pressure_72,windSpeed_72,windGust_72,temperature_48,dewPoint_48,humidity_48,pressure_48,windSpeed_48,windGust_48,temperature_24,dewPoint_24,humidity_24,pressure_24,windSpeed_24,windGust_24,temperature0,dewPoint0,humidity0,pressure0,windSpeed0,windGust0,temperature24,dewPoint24,humidity24,pressure24,windSpeed24,windGust24,temperature48,dewPoint48,humidity48,pressure48,windSpeed48,windGust48,temperature72,dewPoint72,humidity72,pressure72,windSpeed72,windGust72,temperature96,dewPoint96,humidity96,pressure96,windSpeed96,windGust96,temperature120,dewPoint120,humidity120,pressure120,windSpeed120,windGust120,temperature144,dewPoint144,humidity144,pressure144,windSpeed144,windGust144,temperature168,dewPoint168,humidity168,pressure168,windSpeed168,windGust168,temperature192,dewPoint192,humidity192,pressure192,windSpeed192,windGust192,temperature216,dewPoint216,humidity216,pressure216,windSpeed216,windGust216,temperature240,dewPoint240,humidity240,pressure240,windSpeed240,windGust240,temperature264,dewPoint264,humidity264,pressure264,windSpeed264,w
indGust264,temperature288,dewPoint288,humidity288,pressure288,windSpeed288,windGust288,temperature312,dewPoint312,humidity312,pressure312,windSpeed312,windGust312,temperature336,dewPoint336,humidity336,pressure336,windSpeed336,windGust336,Size
count,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0,883.0
mean,61.074134,45.744768,0.612752,1013.235164,1.615266,5.575062,61.071518,45.660793,0.610713,1013.193352,1.592922,5.577803,61.089094,45.93402,0.614009,1013.132129,1.564587,5.515504,60.891608,46.082752,0.621586,1013.147135,1.539864,5.471166,60.936852,45.90521,0.61675,1013.20923,1.487576,5.463749,61.179581,45.912514,0.612186,1013.390034,1.55547,5.529807,61.659558,46.081631,0.607475,1013.269253,1.497486,5.539638,61.802548,46.195595,0.606263,1013.116614,1.571552,5.532424,61.739366,46.305663,0.609162,1013.123533,1.558097,5.458233,61.935108,45.970815,0.600555,1013.162933,1.532888,5.506274,62.074134,45.76786,0.59171,1013.157792,1.524394,5.426489,62.134349,45.586988,0.586059,1013.140476,1.503239,5.490555,62.51923,45.033998,0.568743,1013.129706,1.536195,5.629309,62.873975,44.110453,0.549003,1013.187169,1.57684,5.746682,63.701268,42.900306,0.516795,1013.095527,1.816727,6.371144,63.786217,42.685108,0.514519,1013.11487,1.892888,6.669173,62.970453,43.769513,0.545391,1013.117429,1.61393,5.877746,62.394009,44.511857,0.569943,1013.0541,1.556795,5.715459,62.103726,45.186342,0.58487,1013.128494,1.528007,5.552072,61.938324,45.386399,0.594066,1013.043058,1.576954,5.570227,61.890249,45.453488,0.595595,1013.017339,1.558981,5.538834,62.043296,45.597497,0.596195,1013.095787,1.549694,5.480328,62.227475,45.759343,0.597622,1013.273669,1.608109,5.678709,62.197894,45.852559,0.597407,1013.236467,1.578165,5.603046,62.110612,46.099479,0.603986,1013.151359,1.48675,5.493194,61.981869,46.321065,0.610057,1013.090555,1.496104,5.522208,61.720464,46.27752,0.611178,1013.170385,1.541552,5.598222,61.53752,46.34026,0.614972,1013.23239,1.594009,5.721019,61.315164,46.304224,0.617973,1013.299162,1.571857,5.616082,4898.768448
std,8.938955,9.756195,0.216189,3.545709,1.762139,3.416785,8.899852,9.458919,0.216737,3.536657,1.753927,3.4463,8.883105,9.260391,0.210425,3.253639,1.654099,3.324581,8.807334,9.201914,0.211314,3.361573,1.556889,3.207422,8.779877,9.528122,0.209738,3.475909,1.664975,3.334576,8.853321,9.341624,0.210274,3.458969,1.656468,3.346648,8.575708,9.475325,0.212693,3.464442,1.523956,3.301132,8.574813,9.191239,0.213357,3.497551,1.72485,3.403167,8.884215,8.8651,0.210653,3.423311,1.707866,3.367951,8.909001,9.172951,0.216011,3.318785,1.636505,3.376019,8.868542,9.281443,0.212218,3.383545,1.571558,3.237352,8.848203,9.178044,0.210362,3.427293,1.569185,3.383985,8.965044,9.462515,0.211214,3.499949,1.783938,3.555916,9.150936,10.521938,0.219024,3.572653,1.906563,4.000926,9.293359,11.651804,0.22509,3.768229,2.13291,4.717722,8.759216,12.784277,0.227922,4.233125,2.229501,5.362652,8.731791,11.634963,0.226836,3.974061,1.812433,3.883837,8.706777,11.071256,0.228114,3.695495,1.757196,3.794172,8.946278,10.356875,0.221992,4.141156,1.669167,3.615077,9.059867,10.51187,0.224598,3.729307,1.791777,3.699376,9.250738,10.44751,0.222636,3.747015,1.835213,3.529624,9.172412,10.635261,0.224322,3.811668,1.780889,3.605075,9.413134,10.893461,0.227029,3.876752,1.840718,3.869044,9.391235,10.435322,0.222631,3.803471,1.882248,3.830578,9.407619,10.44766,0.2226,3.740229,1.580805,3.458956,9.13695,10.116502,0.219504,3.771952,1.615013,3.439694,9.068484,9.885913,0.215431,3.652851,1.709614,3.668303,9.128141,9.698793,0.21298,3.480385,1.701568,3.616554,9.288288,9.532388,0.209731,3.616983,1.705689,3.57925,22203.844462
min,30.06,1.18,0.06,998.5,0.0,0.0,32.3,3.75,0.17,1000.21,0.0,0.0,28.45,10.1,0.13,1002.64,0.0,0.0,33.76,1.95,0.11,1001.25,0.0,0.0,32.28,2.53,0.09,998.3,0.0,0.0,22.19,6.24,0.11,1003.48,0.0,0.0,30.15,12.4,0.09,1002.8,0.0,0.7,31.52,4.1,0.13,1000.1,0.0,0.0,29.39,7.13,0.16,1001.48,0.0,0.0,33.76,-2.42,0.16,1002.63,0.0,0.69,33.15,-4.72,0.15,1003.6,0.0,0.0,31.98,1.87,0.15,1000.3,0.0,0.0,31.64,5.58,0.11,1002.3,0.0,0.0,22.91,-5.33,0.06,1001.3,0.0,0.0,23.75,-3.85,0.07,1002.2,0.0,0.57,28.45,-2.08,0.05,995.7,0.0,0.34,36.6,-0.83,0.08,999.76,0.0,0.37,34.44,3.09,0.06,996.3,0.0,0.0,28.36,-0.2,0.05,994.11,0.0,0.5,26.32,-1.36,0.07,1001.39,0.0,0.61,27.65,-4.67,0.07,1000.1,0.0,0.29,27.58,-3.99,0.06,996.1,0.0,0.78,28.59,-3.82,0.06,992.99,0.0,0.0,29.62,-1.67,0.1,997.51,0.0,0.86,32.77,-1.13,0.07,997.0,0.0,0.0,36.07,-1.72,0.1,985.85,0.0,0.61,32.79,5.0,0.13,986.18,0.0,0.55,25.25,-0.89,0.1,1001.9,0.0,0.0,24.08,6.16,0.08,1003.42,0.0,0.66,0.1
25%,55.65,40.815,0.43,1011.02,0.525,3.245,55.555,40.975,0.43,1011.055,0.51,3.28,55.585,41.39,0.44,1011.11,0.51,3.22,55.335,41.4,0.44,1011.095,0.505,3.225,55.245,41.0,0.44,1011.115,0.43,3.18,55.65,40.995,0.44,1011.235,0.51,3.285,56.25,41.005,0.43,1011.05,0.47,3.25,56.3,41.44,0.42,1010.995,0.47,3.285,56.265,41.325,0.43,1011.0,0.48,3.24,56.005,40.55,0.41,1011.005,0.47,3.29,56.04,40.445,0.4,1011.02,0.49,3.26,56.205,40.26,0.41,1010.89,0.49,3.35,56.445,39.315,0.39,1010.94,0.46,3.31,56.785,38.195,0.365,1011.0,0.43,3.185,57.82,36.97,0.335,1010.535,0.475,3.37,57.915,36.175,0.335,1010.6,0.52,3.41,57.12,37.4,0.37,1010.535,0.53,3.48,56.75,39.315,0.38,1010.6,0.48,3.32,56.175,39.82,0.4,1010.76,0.51,3.345,56.065,40.625,0.4,1010.7,0.47,3.22,56.11,40.44,0.4,1010.53,0.47,3.245,55.995,40.135,0.4,1010.8,0.48,3.31,56.355,40.635,0.41,1010.8,0.455,3.19,56.16,41.08,0.41,1010.715,0.43,3.22,55.93,40.465,0.42,1010.56,0.47,3.2,56.215,41.48,0.435,1010.8,0.46,3.26,56.04,41.135,0.44,1010.9,0.43,3.075,55.67,41.37,0.44,1011.115,0.5,3.22,55.395,41.27,0.45,1011.0,0.47,3.24,50.0
50%,60.83,47.67,0.61,1013.11,0.98,4.68,60.63,47.55,0.62,1012.96,1.0,4.71,60.59,47.48,0.62,1013.01,1.05,4.66,60.83,47.73,0.62,1013.12,1.04,4.6,61.17,47.75,0.63,1013.09,0.95,4.68,61.36,47.89,0.62,1013.18,1.01,4.63,61.42,47.64,0.61,1013.18,0.96,4.53,61.73,48.01,0.6,1012.92,1.02,4.66,61.72,47.88,0.61,1012.94,0.95,4.65,61.73,47.32,0.6,1012.98,1.01,4.59,61.86,47.18,0.58,1013.0,0.97,4.6,62.22,46.74,0.57,1013.0,0.93,4.55,62.29,46.23,0.55,1013.0,0.99,4.6,63.18,45.82,0.53,1012.93,0.92,4.58,63.96,44.85,0.48,1012.95,1.08,4.85,64.25,45.67,0.48,1012.88,1.11,5.0,63.48,46.3,0.52,1012.76,1.03,4.78,62.73,46.93,0.55,1012.82,0.96,4.54,62.45,47.37,0.56,1012.8,0.97,4.56,62.05,47.55,0.6,1012.63,0.99,4.44,62.05,47.53,0.6,1012.69,0.95,4.57,62.31,47.58,0.61,1012.83,0.96,4.62,62.42,48.51,0.61,1013.09,0.98,4.57,62.39,47.91,0.6,1013.12,0.94,4.47,62.17,48.34,0.61,1013.02,0.93,4.58,61.83,48.13,0.61,1012.92,0.96,4.59,61.94,48.21,0.61,1013.0,0.92,4.64,61.83,48.04,0.63,1012.92,0.99,4.69,61.39,47.76,0.62,1013.0,0.96,4.55,209.0
75%,66.94,52.695,0.8,1015.31,2.22,6.965,66.94,52.425,0.795,1014.995,2.035,6.885,67.09,52.3,0.8,1015.05,2.075,6.935,66.37,52.625,0.8,1015.0,2.06,6.745,66.33,52.52,0.79,1015.155,1.93,6.66,66.625,52.5,0.79,1015.255,1.99,7.03,67.385,52.99,0.8,1015.125,2.04,7.045,67.915,52.705,0.8,1015.065,1.98,6.905,68.06,52.47,0.795,1015.09,2.065,6.665,68.235,52.72,0.79,1015.15,1.94,6.785,68.245,52.62,0.78,1015.22,2.01,6.7,68.44,52.295,0.77,1015.29,1.975,6.585,68.99,51.69,0.74,1015.085,1.93,6.825,69.35,51.57,0.73,1015.035,2.045,7.005,70.025,51.425,0.7,1014.9,2.41,7.575,69.77,51.7,0.7,1015.05,2.215,7.865,69.175,52.055,0.74,1015.025,1.995,7.16,68.8,51.95,0.76,1015.09,2.045,6.85,68.425,52.36,0.77,1015.005,1.855,6.685,68.39,52.39,0.785,1015.195,1.99,6.685,68.375,52.455,0.79,1015.025,2.035,6.81,68.55,53.095,0.79,1015.2,2.01,6.475,68.95,53.0,0.79,1015.515,2.1,6.97,68.94,53.025,0.78,1015.38,2.06,6.785,68.87,53.315,0.79,1015.41,1.95,6.76,68.255,52.99,0.8,1015.055,1.925,6.65,67.855,52.855,0.8,1015.375,1.955,6.935,67.325,53.03,0.8,1015.215,2.11,7.175,67.445,52.895,0.8,1015.54,1.92,7.035,1041.5
max,92.25,62.5,1.0,1028.52,19.99,33.59,95.74,63.63,1.0,1028.11,18.53,33.54,92.79,67.33,1.0,1026.46,21.26,29.9,91.17,65.9,1.0,1026.0,13.33,21.92,92.35,65.42,1.0,1028.63,16.64,21.85,95.43,66.58,1.0,1027.19,15.39,25.62,97.53,65.51,1.0,1030.18,10.68,21.92,96.13,64.54,1.0,1028.38,13.83,24.99,90.74,63.66,1.0,1029.53,12.33,24.93,88.77,67.3,1.0,1028.52,14.67,27.99,96.38,64.13,1.0,1026.49,10.91,22.45,92.95,64.27,1.0,1026.99,12.44,23.87,96.34,66.17,1.0,1026.99,24.6,32.29,93.19,65.21,1.0,1028.11,20.27,29.56,95.02,65.46,1.0,1030.4,16.79,37.98,100.19,68.08,1.0,1032.9,17.42,42.57,91.05,68.93,1.0,1029.31,19.63,32.4,90.13,66.98,1.0,1028.3,15.81,37.98,89.93,65.19,1.0,1065.22,13.58,37.98,91.4,63.8,1.0,1031.85,17.5,26.84,94.99,69.32,1.0,1026.92,17.98,31.58,92.64,69.76,1.0,1026.49,19.92,32.12,92.2,69.78,1.0,1027.9,18.23,35.69,91.4,63.71,1.0,1029.99,18.42,37.98,93.16,69.69,1.0,1025.56,10.45,27.33,94.47,73.91,1.0,1026.16,15.48,22.87,93.16,72.94,1.0,1028.52,14.56,28.97,92.84,69.76,1.0,1028.44,12.57,24.96,94.48,69.76,1.0,1027.9,17.8,29.9,281893.0


In [99]:
# Visualize the distribution of fire size in a single dimension

def _titled_axes(title, **figure_kwargs):
    """Creates a new figure holding a single titled axes and returns (fig, ax)."""
    fig, ax = plt.subplots(**figure_kwargs)
    ax.set_title(title)
    return fig, ax

val = 0. # this is the value where you want the data to appear on the y-axis.

# plot the distribution of all fire sizes: every size marked on one horizontal line ...
all_sizes = all_df["Size"]
fig, ax = _titled_axes("all fire sizes")
ax.plot(all_sizes, np.zeros_like(all_sizes) + val, 'x')

# ... and as a histogram, drawn on an explicitly passed axes rather than
# on whichever axes happens to be current
fig, ax = _titled_axes("all fire sizes", figsize=(8, 8))
all_hist = all_sizes.hist(bins=20, ax=ax, xlabelsize=10)

# plot the same two views for the fire sizes of the working dataframe (qdf)
outlier_reduced_sizes = qdf["Size"]
fig, ax = _titled_axes("outlier reduced fire sizes")
ax.plot(outlier_reduced_sizes, np.zeros_like(outlier_reduced_sizes) + val, 'x')

fig, ax = _titled_axes("outlier reduced fire sizes", figsize=(8, 8))
out_red_hist = outlier_reduced_sizes.hist(bins=20, ax=ax, xlabelsize=10)

# show only after every figure exists so that none of them is left out
plt.show()



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Feature Selection

In [100]:
# Find and remove all redundant features

correlation_matrix = qdf.drop("Size", axis=1).corr() # remove target feature
correlation_threshold = 0.8 # features correlated above this (in absolute value) count as redundant

# find all pair-wise correlated descriptive features via their Pearson correlation:
# the strict lower triangle pairs each feature with the features that come before it
# in column order, and a feature is marked as soon as one such pair exceeds the threshold
exceeds_threshold = correlation_matrix.abs().values > correlation_threshold
redundant_rows = np.tril(exceeds_threshold, k=-1).any(axis=1)
correlated_features = set(correlation_matrix.columns[redundant_rows])

# the working dataframe (qdf) without the pair-wise correlated descriptive features
cdf = qdf.drop(columns=sorted(correlated_features))

removed_count = len(correlated_features)
total_count = len(qdf.columns)
print(str(removed_count)+" features removed from "+str(total_count)+" total:")
print(sorted(correlated_features))
print("\n")
print(str(len(cdf.columns))+" features remaining:")
print(sorted(cdf.columns.to_list()))

89 features removed from 175 total:
['dewPoint0', 'dewPoint120', 'dewPoint144', 'dewPoint168', 'dewPoint192', 'dewPoint216', 'dewPoint24', 'dewPoint240', 'dewPoint264', 'dewPoint288', 'dewPoint312', 'dewPoint336', 'dewPoint48', 'dewPoint72', 'dewPoint96', 'dewPoint_120', 'dewPoint_144', 'dewPoint_168', 'dewPoint_192', 'dewPoint_216', 'dewPoint_24', 'dewPoint_240', 'dewPoint_264', 'dewPoint_288', 'dewPoint_312', 'dewPoint_48', 'dewPoint_72', 'dewPoint_96', 'humidity0', 'humidity120', 'humidity144', 'humidity168', 'humidity192', 'humidity216', 'humidity240', 'humidity264', 'humidity288', 'humidity312', 'humidity336', 'humidity48', 'humidity72', 'humidity96', 'humidity_120', 'humidity_144', 'humidity_168', 'humidity_192', 'humidity_216', 'humidity_24', 'humidity_240', 'humidity_264', 'humidity_288', 'humidity_312', 'humidity_48', 'humidity_72', 'humidity_96', 'pressure288', 'pressure48', 'pressure72', 'pressure_120', 'temperature0', 'temperature120', 'temperature144', 'temperature168', 't

In [101]:
# Rank the descriptive features by the strength of their correlation to the target
# Note: Can perform "filter" feature selection by keeping only top n descriptive features, but not done currently

def _rank_by_size_correlation(frame):
    """Averages "Size" per "feature" index label and sorts the result in descending order."""
    averaged = frame.groupby("feature", sort=False)[["Size"]].mean()
    return averaged.sort_values("Size", ascending=False)

# create correlation matrix of Pearson coefficients
corr_df = cdf.corr(method="pearson")

# create heatmap of correlation matrix for all features
# fig = plt.figure(3)
# fig, ax = plt.subplots(figsize=(5,5))
# sn.heatmap(corr_df, annot=False, linewidths=.5, cmap="YlGnBu")
# plt.show()

# the target feature's own row is not a descriptive feature, so drop it
corr_df = corr_df.drop(index=["Size"])

# keep only the magnitude of each descriptive feature's correlation to the target feature
corr_df["Size"] = corr_df["Size"].abs()
corr_df.index.name = "feature"

# ranking of the individual (indexed) descriptive features
abs_corr_df_specific = _rank_by_size_correlation(corr_df)
abs_corr_specific_list = abs_corr_df_specific.index.to_list()
print("top features most correlated to target:")
print(len(abs_corr_specific_list))
print(abs_corr_specific_list) # Note: same names as cdf.columns.to_list() excluding target feature, ordered by correlation
print("\n")

# turn each indexed feature into just the feature name, so that the
# ranking below averages over all rows that share a feature name
corr_df.index = corr_df.index.map(indexed_feature_to_feature)

# ranking of the descriptive features on average
abs_corr_df_feature = _rank_by_size_correlation(corr_df)
abs_corr_feature_list = abs_corr_df_feature.index.to_list()
print("top features most correlated to target on average:")
print(abs_corr_feature_list)
abs_corr_df_feature

top features most correlated to target:
85
['windGust72', 'humidity24', 'windGust24', 'windGust48', 'pressure336', 'pressure264', 'pressure96', 'pressure24', 'pressure192', 'windSpeed24', 'pressure240', 'pressure_96', 'pressure_72', 'pressure216', 'pressure168', 'windSpeed72', 'pressure_312', 'pressure120', 'windGust144', 'pressure_48', 'pressure_144', 'temperature_336', 'pressure_168', 'pressure_192', 'pressure_288', 'windSpeed_144', 'pressure312', 'pressure0', 'pressure144', 'windSpeed288', 'windGust312', 'pressure_240', 'pressure_336', 'windSpeed216', 'windGust96', 'windGust168', 'dewPoint_336', 'windGust_72', 'windSpeed_240', 'windSpeed48', 'windSpeed_288', 'pressure_264', 'windSpeed_192', 'pressure_24', 'windSpeed_120', 'windSpeed240', 'windSpeed_96', 'windSpeed_216', 'windSpeed_264', 'windSpeed264', 'windGust240', 'windSpeed192', 'windSpeed144', 'windGust_216', 'windSpeed_24', 'windGust_48', 'windSpeed_312', 'pressure_216', 'windSpeed_48', 'windSpeed_168', 'windSpeed312', 'windGu

Unnamed: 0_level_0,Size
feature,Unnamed: 1_level_1
humidity,0.085205
pressure,0.077952
temperature,0.071959
dewPoint,0.042464
windGust,0.03628
windSpeed,0.033746


In [102]:
# Split the ranked features by ignition state (pre, during, post): an underscore
# in the name marks a pre-ignition feature, an ignition index of 0 marks the
# ignition reading, and everything else is post-ignition.
pre_ignition_features = [name for name in abs_corr_specific_list if "_" in name]
ignition_features = [name for name in abs_corr_specific_list if ignition_index(name) == 0]
post_ignition_features = [
    name for name in abs_corr_specific_list
    if not ("_" in name or ignition_index(name) == 0)
]

# report the size and members of each group
for heading, group in (
    ("number of pre-ignition features: \n", pre_ignition_features),
    ("number of ignition features: \n", ignition_features),
    ("number of post-ignition features: \n", post_ignition_features),
):
    print(heading, len(group))
    print(group)
    print("\n")

number of pre-ignition features: 
 44
['pressure_96', 'pressure_72', 'pressure_312', 'pressure_48', 'pressure_144', 'temperature_336', 'pressure_168', 'pressure_192', 'pressure_288', 'windSpeed_144', 'pressure_240', 'pressure_336', 'dewPoint_336', 'windGust_72', 'windSpeed_240', 'windSpeed_288', 'pressure_264', 'windSpeed_192', 'pressure_24', 'windSpeed_120', 'windSpeed_96', 'windSpeed_216', 'windSpeed_264', 'windGust_216', 'windSpeed_24', 'windGust_48', 'windSpeed_312', 'pressure_216', 'windSpeed_48', 'windSpeed_168', 'windGust_264', 'humidity_336', 'windGust_336', 'windGust_96', 'windGust_192', 'windSpeed_336', 'windGust_168', 'windGust_288', 'windGust_120', 'windGust_24', 'windSpeed_72', 'windGust_240', 'windGust_144', 'windGust_312']


number of ignition features: 
 2
['pressure0', 'windSpeed0']


number of post-ignition features: 
 39
['windGust72', 'humidity24', 'windGust24', 'windGust48', 'pressure336', 'pressure264', 'pressure96', 'pressure24', 'pressure192', 'windSpeed24', 'pr

In [103]:
# Histogram of the signed ignition index of every ranked descriptive feature
def g(x, f):
    """Return x negated when f is a pre-ignition feature (its name contains "_")."""
    if "_" in f:
        return -x
    return x

ignition_indexes = [g(ignition_index(name), name) for name in abs_corr_specific_list]

fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title("ignition index vs feature count")
ignition_indexes_hist = plt.hist(x=ignition_indexes, bins=100)

<IPython.core.display.Javascript object>

In [104]:
# View distribution of all descriptive features after outlier and correlation filtering.
# One histogram per ranked feature column plus the target "Size"; xlabelsize=1
# shrinks the x tick labels so the many subplots stay readable.
cdf_hist = cdf[abs_corr_specific_list+["Size"]].hist(bins=20, figsize=(16, 16), xlabelsize=1)

<IPython.core.display.Javascript object>

In [105]:
# Some features look skewed: plot the distribution of absolute skew over all
# columns of cdf (ideally the mass sits close to 0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title("absolute feature skew distribution")

# one row per column of cdf, holding the magnitude of its skew
skew_df = cdf.skew().abs().to_frame(name="skew")
skew_df.index.name = "feature"

skew_hist = skew_df["skew"].hist(bins=100, figsize=(10, 5), xlabelsize=10)

<IPython.core.display.Javascript object>

In [106]:
# let's rank each feature type by its mean absolute skew
# The index (one entry per time-indexed column) is first rewritten in place to
# the bare feature name, then rows sharing a name are averaged.
# NOTE: skew_df is overwritten here, so the per-column skews are no longer
# available under this name after the cell has run.
skew_df.index = skew_df.index.map(indexed_feature_to_feature)
skew_df = skew_df.groupby("feature", sort=False)[["skew"]].mean().sort_values("skew",ascending=False)
skew_df

Unnamed: 0_level_0,skew
feature,Unnamed: 1_level_1
Size,8.271004
windSpeed,2.854831
windGust,2.001872
dewPoint,1.141494
pressure,0.456542
humidity,0.144651
temperature,0.098523


## Feature Normalization

In [107]:
# Let's normalize the outlier reduced feature filtered dataframe
# normalize() (defined earlier in the notebook) returns the transformed frame
# together with the per-column parameters that were used; those parameters are
# needed later to invert the transform on "Size".
ndf, params = normalize(cdf)
ndf.head(10)

Unnamed: 0,temperature_336,dewPoint_336,humidity_336,pressure_336,windSpeed_336,windGust_336,pressure_312,windSpeed_312,windGust_312,pressure_288,windSpeed_288,windGust_288,pressure_264,windSpeed_264,windGust_264,pressure_240,windSpeed_240,windGust_240,pressure_216,windSpeed_216,windGust_216,pressure_192,windSpeed_192,windGust_192,pressure_168,windSpeed_168,windGust_168,pressure_144,windSpeed_144,windGust_144,windSpeed_120,windGust_120,pressure_96,windSpeed_96,windGust_96,pressure_72,windSpeed_72,windGust_72,pressure_48,windSpeed_48,windGust_48,pressure_24,windSpeed_24,windGust_24,pressure0,windSpeed0,humidity24,pressure24,windSpeed24,windGust24,windSpeed48,windGust48,windSpeed72,windGust72,pressure96,windSpeed96,windGust96,pressure120,windSpeed120,windGust120,pressure144,windSpeed144,windGust144,pressure168,windSpeed168,windGust168,pressure192,windSpeed192,windGust192,pressure216,windSpeed216,pressure240,windSpeed240,windGust240,pressure264,windSpeed264,windGust264,windSpeed288,windGust288,pressure312,windSpeed312,windGust312,pressure336,windSpeed336,windGust336,Size
645,-0.041647,3574.299805,-0.12766,7.672649,0.71719,1.819356,4.084489,0.661081,1.533761,2.26494,0.738254,1.662855,0.716995,0.526615,1.465278,4.674052,0.561292,1.695821,3.742727,0.806929,1.831889,3.239204,0.463959,1.579363,6.364965,0.679022,1.782974,3.80511,0.833473,1.776116,0.914123,1.916249,1.955879,0.80688,2.079108,5.65968,0.881971,1.447946,5.337403,0.293056,1.48811,4.940679,0.510757,1.390408,3.607938,0.551392,-0.663158,9.04912,0.29942,1.396302,0.639906,1.699459,0.766385,1.466577,5.111508,0.697273,1.449367,3.387045,0.753614,1.372069,3.870867,0.772079,1.520549,9.272153,0.46342,1.360856,14.499221,0.815252,1.434067,6.19724,0.541355,7.921846,0.599906,1.713589,42.073101,0.69315,1.537868,0.66488,1.448653,4.186083,0.642524,1.449156,2.659111,0.574342,1.558257,6.29592
803,-0.618588,2610.867432,0.489362,11.099131,1.044726,2.897571,9.370628,0.635805,1.778075,10.259719,0.710459,1.590428,12.977681,0.523261,1.353891,11.776882,0.663475,1.580402,6.204667,0.560417,1.592282,7.863421,0.547432,1.789101,10.240376,0.616724,1.920775,7.969058,0.906694,1.985357,0.959085,2.259379,6.91759,0.613867,1.647726,9.69418,0.877283,1.880763,7.240977,0.872908,1.786284,7.703976,0.819421,1.726217,6.915693,0.947445,-0.515789,12.657547,0.931048,1.938548,1.081308,2.405507,0.861142,2.022577,6.20391,0.723433,1.900274,5.982411,0.571847,1.119965,6.499406,0.603611,1.542542,13.052905,0.736579,1.554844,20.006689,0.63259,1.572066,8.849126,0.744179,11.253675,0.67274,1.850395,64.804703,0.721642,1.743728,0.755257,1.97599,6.162165,0.869055,1.785477,5.088601,0.705175,1.740784,6.047731
206,-0.019778,7831.049805,0.574468,12.705016,0.117208,1.496195,7.932352,0.434963,1.600698,7.478351,0.821321,1.679802,11.941228,0.133312,1.369279,14.024368,0.630211,1.644596,5.614795,0.639584,1.615986,6.369277,0.14105,1.21514,9.906569,0.637146,1.523594,8.272284,0.212625,1.360998,0.484583,1.673391,6.975836,0.510368,1.416974,7.329491,0.264186,1.577102,8.250084,0.163465,1.452663,9.707095,0.686556,1.492646,6.933377,0.696451,0.515789,12.690328,0.108904,0.9125,0.546748,1.564049,0.184282,1.252517,7.554385,0.263282,1.451923,5.895161,0.682038,1.647876,6.557399,0.766809,1.36028,16.763809,0.823604,1.650816,28.087433,0.310724,1.487439,12.266507,0.710483,15.127214,0.619766,1.65817,66.87709,1.144454,2.283262,1.105614,2.278484,10.49342,0.455539,1.359153,10.172993,0.842079,2.326633,1.818007
1670,0.205982,6819.549805,0.06383,11.061496,0.148802,0.796911,7.33007,0.192702,0.836597,7.279754,0.10949,0.990887,8.267903,0.265328,0.911011,8.96387,0.256326,0.83268,4.416686,0.290407,1.227064,4.515903,0.368385,0.872477,5.443516,0.09231,0.865955,6.765339,0.427584,1.354707,0.498841,1.235207,5.461365,0.29414,0.97206,8.706923,0.213097,1.12861,7.221321,0.281572,1.328684,6.72341,0.18489,1.072,6.179328,0.300394,0.515789,13.1935,0.233021,0.783927,0.427639,0.941454,0.577327,1.273306,7.49814,0.177847,0.487369,6.277347,0.274068,0.933858,6.923044,0.354026,1.11943,15.03726,0.285762,0.644765,20.572065,0.178383,0.867294,8.035661,0.083469,10.80532,0.559089,1.527223,58.428825,0.132855,0.588042,0.406937,1.067502,7.739872,0.479203,1.035612,7.406649,0.384957,0.90595,3.551633
1186,-0.063193,4628.266602,0.085106,7.875844,0.265023,1.411804,4.600209,0.199686,1.275062,1.700209,0.248905,1.247709,6.11926,0.220564,1.177598,7.131465,0.155504,1.224044,3.732544,0.083793,1.083577,5.604885,0.206763,1.122949,8.557072,0.117313,1.089874,5.341911,0.281179,1.261264,0.233032,1.040667,0.735579,0.337731,1.289487,7.477895,0.219784,1.262722,5.940903,0.178075,1.019185,5.548236,0.329993,1.120863,4.375852,0.185859,-0.052632,10.900025,0.1006,1.125209,0.443767,1.413419,0.454031,1.420473,4.248186,0.504871,1.647172,2.165092,0.363149,1.344475,5.201472,0.116735,0.893184,12.499969,0.262464,1.027051,17.152882,0.206498,1.074757,7.288536,0.038653,8.808414,0.320257,1.416469,39.524467,0.148323,1.33914,0.100501,0.750161,4.280476,0.345238,1.267643,6.134308,0.520175,1.323051,3.642725
1877,0.299566,9381.393555,0.276596,13.252509,0.440269,1.449535,7.426427,0.357949,1.242827,8.16741,0.534379,1.425493,10.58236,0.374049,1.353891,11.253838,0.440381,1.308166,4.759089,0.739897,1.63187,7.086656,0.475429,1.314526,9.822808,0.289483,1.54015,7.264346,0.325331,1.217789,0.321424,1.037616,6.214787,0.348004,1.180908,8.608453,0.199447,1.226351,6.123818,0.452593,1.171799,5.819011,0.280778,1.088744,4.767812,0.163955,0.010526,9.24904,0.404303,1.181305,0.491959,1.388886,0.376135,1.286711,7.140351,0.269259,1.301963,6.386215,0.132566,1.207812,7.132165,0.244219,1.273325,14.391398,0.412488,1.142327,20.486706,0.163723,1.18535,8.879387,0.496868,12.089482,0.3149,1.345581,69.926392,0.257397,1.173547,0.459804,0.980473,7.712991,0.227719,1.221364,5.585719,0.452616,1.139025,7.430736
482,-0.086991,5434.068848,0.276596,14.332879,0.907479,2.38546,8.929755,0.845515,1.952438,6.599742,0.968727,1.965348,7.237526,0.960626,2.054469,11.132195,1.030086,2.346046,6.615275,1.060671,2.361466,8.38649,1.079022,2.490973,9.115416,1.106752,2.599524,5.347542,1.1386,2.389847,1.050326,2.526603,5.131856,0.976109,2.174465,11.63659,0.981734,1.754095,9.460824,1.014926,1.988378,8.485186,1.012217,1.910605,7.622911,1.023293,0.368421,14.062088,1.023346,1.839546,0.890342,1.879507,0.924596,1.705852,6.26181,1.026994,2.246719,5.950184,1.047005,1.930498,8.241359,1.073009,2.033867,16.971033,1.069429,2.37553,22.216597,1.028133,1.784209,8.087508,1.066448,11.460181,1.028939,2.389487,71.771111,1.083015,2.19827,0.99347,2.222674,9.816221,1.05987,2.123944,6.299149,1.027404,2.244074,3.617844
313,0.345232,3865.327148,-0.425532,9.199327,0.742387,2.108253,6.533897,0.496238,1.595576,7.28686,0.242393,1.507883,9.96373,0.506039,1.876348,11.481552,0.558529,1.739409,5.087064,0.520637,1.678988,5.431592,0.363489,1.610491,7.271462,0.057138,2.016605,5.386912,0.466705,1.729599,0.417435,1.618655,5.853127,0.348004,1.578447,9.752064,0.525051,1.423494,7.76803,0.536042,1.902737,6.027476,0.469219,1.513896,4.838752,0.714115,0.578947,12.585383,0.480884,1.426547,0.15589,1.390549,0.211591,1.509388,7.142522,0.39285,1.697489,5.648694,0.644887,1.420925,6.994328,0.470865,1.687408,13.508032,0.504921,1.992139,21.73579,0.542283,1.640574,9.685503,0.513409,12.313455,0.19217,1.82756,63.631443,0.406487,1.790391,0.100501,1.704301,6.676687,0.323916,1.783545,5.614938,0.226101,1.555596,5.150246
1322,-0.37096,5971.908691,0.93617,13.988462,0.838839,2.220128,9.21023,0.618632,1.709962,7.556156,1.066743,2.271462,14.135759,0.723519,1.739439,12.455507,0.358496,1.669654,3.752898,0.520637,1.816348,6.672734,0.486578,1.638942,9.161127,0.67683,1.833598,6.076782,0.816707,1.816932,0.719018,1.796656,8.507166,0.578856,1.552338,11.318206,0.576351,1.969401,8.827892,0.304259,1.556444,8.782686,0.324788,1.470468,6.137716,0.392489,-0.284211,12.348755,0.54787,1.445961,0.491959,1.839697,0.609033,1.433408,7.331613,0.443201,1.374374,6.985246,0.477159,1.225459,7.781986,0.505141,1.430635,13.80075,0.395361,1.399731,19.910713,0.470162,1.654802,7.842978,0.541355,9.7499,0.626906,1.927555,55.894684,0.515497,1.552312,0.292866,1.323377,7.453597,0.389881,1.451563,6.513413,0.606212,2.012567,4.170759
1276,0.080238,7646.689453,0.382979,10.661711,0.239997,1.414067,6.428373,0.316378,1.299046,6.274058,0.457349,1.452502,11.079168,0.171449,1.419768,11.269033,0.091932,1.338789,5.037541,0.227954,1.461503,5.171476,0.11719,1.291334,8.629678,0.156605,1.501014,7.218013,0.320045,1.371859,0.491778,1.351608,4.701694,0.530352,1.428734,6.874288,0.449883,1.491516,4.883769,0.238995,1.166151,6.503235,0.44266,1.29604,5.232315,0.569562,0.178947,10.86646,0.399845,1.23188,0.263471,1.429298,0.553308,1.358118,6.128509,0.345561,1.303596,4.958252,0.191488,1.128747,5.456071,0.501858,1.30104,10.756564,0.35874,1.341461,16.600204,0.386466,1.366147,8.035661,0.927574,11.180257,0.626906,1.39645,61.017178,0.148323,1.479178,0.370605,1.536752,4.211305,0.259843,1.396073,5.243769,0.304269,1.348689,4.257797


In [108]:
# Inspect the per-column normalization parameters: each entry carries "min_val"
# plus either "scale" or "lmbda", depending on the column (see output below).
params

{'temperature_336': {'min_val': 30.06, 'scale': 31.095},
 'dewPoint_336': {'min_val': 1.18, 'lmbda': 2.4720311728169824},
 'humidity_336': {'min_val': 0.06, 'scale': 0.47},
 'pressure_336': {'min_val': 998.5, 'lmbda': 0.8937171374772127},
 'windSpeed_336': {'min_val': 0.0, 'lmbda': -0.689676774173806},
 'windGust_336': {'min_val': 0.0, 'lmbda': -0.029647408294403037},
 'pressure_312': {'min_val': 1000.21, 'lmbda': 0.7322340821352256},
 'windSpeed_312': {'min_val': 0.0, 'lmbda': -0.7006409129449823},
 'windGust_312': {'min_val': 0.0, 'lmbda': -0.19269252103755727},
 'pressure_288': {'min_val': 1002.64, 'lmbda': 0.8525816177280043},
 'windSpeed_288': {'min_val': 0.0, 'lmbda': -0.611713403104055},
 'windGust_288': {'min_val': 0.0, 'lmbda': -0.14239373174138115},
 'pressure_264': {'min_val': 1001.25, 'lmbda': 0.9397260853332039},
 'windSpeed_264': {'min_val': 0.0, 'lmbda': -0.6815520842898113},
 'windGust_264': {'min_val': 0.0, 'lmbda': -0.1549384483011007},
 'pressure_240': {'min_val': 99

In [109]:
# now let's see the effect of normalization by generating those distribution plots again
# (one histogram per column of ndf, target included)
ndf_hist = ndf.hist(bins=20, figsize=(16, 16), xlabelsize=1)

<IPython.core.display.Javascript object>

In [110]:
# Re-check skew on the normalized frame: distribution of absolute skew over all
# columns of ndf (ideally the mass sits close to 0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title("absolute feature skew distribution")

# one row per column of ndf, holding the magnitude of its skew
norm_skew_df = ndf.skew().abs().to_frame(name="skew")
norm_skew_df.index.name = "feature"

norm_skew_hist = norm_skew_df["skew"].hist(bins=100, figsize=(10, 5), xlabelsize=10)

# show the most skewed columns first
norm_skew_df = norm_skew_df.sort_values("skew", ascending=False)
norm_skew_df

<IPython.core.display.Javascript object>

Unnamed: 0_level_0,skew
feature,Unnamed: 1_level_1
pressure96,0.409115
pressure24,0.248389
humidity24,0.238187
dewPoint_336,0.231895
pressure192,0.178228
pressure264,0.173217
windSpeed0,0.131339
pressure_312,0.129554
windSpeed216,0.128722
windSpeed24,0.125473


In [111]:
# now let's rank each feature type by its mean absolute skew
# Same procedure as for the un-normalized skews: rewrite the index in place to
# the bare feature name, then average the rows that share a name.
# NOTE: norm_skew_df is overwritten, so the per-column view is lost after this cell.
norm_skew_df.index = norm_skew_df.index.map(indexed_feature_to_feature)
norm_skew_df = norm_skew_df.groupby("feature", sort=False)[["skew"]].mean().sort_values("skew",ascending=False)
norm_skew_df

Unnamed: 0_level_0,skew
feature,Unnamed: 1_level_1
dewPoint,0.231895
humidity,0.144651
pressure,0.115695
windSpeed,0.111114
temperature,0.098523
Size,0.009035
windGust,0.005674


In [112]:
# Create data splits using outlier reduced, feature-filtered, normalized dataframe
X_train,y_train,X_valid,y_valid,X_test,y_test, y_all = get_split(ndf, ndf.columns.to_list())

# Create new run (indexed by current date and time)
RUN_NAME = 'run_'+datetime.now().strftime("%Y%m%d_%H%M%S")

# data folder
outdir = './data/'+RUN_NAME

# makedirs (unlike mkdir) also creates a missing ./data parent, and exist_ok
# makes re-running the cell safe without a separate existence check
os.makedirs(outdir, exist_ok=True)

def output(data, filename):
    """Write `data` as CSV to `filename` inside the current run folder (`outdir`).

    Args:
        data: object exposing ``to_csv`` (the notebook passes DataFrames and
            the target splits).
        filename: file name relative to `outdir`.

    The header row is written. The previous call passed the *string* "False",
    which is truthy, so headers were always written; ``header=True`` states that
    explicitly and keeps the produced files unchanged. Switch to
    ``header=False`` only if header-less files are really wanted.
    """
    data.to_csv(os.path.join(outdir, filename), header=True)
    
# export the full frames and every data split to csv in the run folder
for frame, csv_name in (
    (all_df, "all_data.csv"),
    (ndf, "ndf.csv"),
    (X_train, "X_train.csv"),
    (y_train, "y_train.csv"),
    (X_valid, "X_valid.csv"),
    (y_valid, "y_valid.csv"),
    (X_test, "X_test.csv"),
    (y_test, "y_test.csv"),
    (y_all, "y_all.csv"),
):
    output(frame, csv_name)

# create metrics file
with open(outdir+"/metrics_"+RUN_NAME+".txt", "w") as text_file:
    # summarize distribution of target feature for each data split
    for heading, targets in (
        ("Training targets summary:\n", y_train),
        ("Validation targets summary:\n", y_valid),
        ("Test targets summary:\n", y_test),
        ("All fire targets summary:\n", y_all),
    ):
        text_file.write(heading)
        text_file.write(targets.describe().to_string()+'\n\n')

# Linear Models

In [114]:
# 2D linear regression: fit the target against the single most correlated feature

# get most correlated descriptive feature
feature = abs_corr_specific_list[0]
target = "Size"

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.set_ylabel(target)
ax1.set_xlabel(feature)
ax1.set_title(feature+" vs "+target)

# create the regression model on the original dataframe
# X = all_df[feature].values.reshape(-1, 1)
# Y = all_df[target].values.reshape(-1, 1)

# create the regression model on the normalized dataframe (ndf);
# reshape(-1, 1) turns each column into the 2D (n_samples, 1) array sklearn expects
X = ndf[feature].values.reshape(-1, 1)
Y = ndf[target].values.reshape(-1, 1)
lr = LinearRegression()
lr.fit(X, Y)
Y_pred = lr.predict(X)

# extract the model parameters
# (Y is 2D, so coef_ is indexed [0][0] and intercept_ [0])
coeff = lr.coef_
intercept = lr.intercept_
equation = target+" = "+str(coeff[0][0])+" * "+feature+" + "+str(intercept[0])
print(equation)
r_sq = lr.score(X, Y)
print("r^2 = ",r_sq)

# NOTE: errors are computed on ndf["Size"], i.e. in normalized units of the
# target, not in the original size units.
mae = metrics.mean_absolute_error(Y, Y_pred)
rmse = math.sqrt(metrics.mean_squared_error(Y, Y_pred))
residuals = Y-Y_pred

results = pd.DataFrame({"actual": Y.flatten(), "predicted": Y_pred.flatten(), "residuals": residuals.flatten()})
print("mae: ", mae)
print("rmse: ", rmse)

# 2D scatter plot comparing two variables, with the fitted line in red
# (plt.scatter draws on the current axes, which is ax1 here)
img = plt.scatter(X, Y)
ax1.plot(X, Y_pred, color="red")
plt.show()

# create residual plot to check for heteroscedasticity
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_xlabel("predicted")
ax.set_ylabel("residuals")
ax.set_title("residual plot")

img = plt.scatter(results["predicted"], results["residuals"])
plt.show()

<IPython.core.display.Javascript object>

Size = 0.54904395 * windGust72 + 3.7833323
r^2 =  0.009435072651126974
mae:  1.2814109
rmse:  1.6105437247953063


<IPython.core.display.Javascript object>

In [115]:
# High dimensional linear regression (using all absolutely correlated descriptive features)
target = "Size" # predict size

# use the top absolutely correlated features as inputs
features = abs_corr_specific_list
print("using features: \n", abs_corr_specific_list)
print("\n")
# create the regression model on the normalized dataframe (ndf)
X = ndf[features]
Y = ndf[target].values.reshape(-1, 1)
# X = X_train[features]
# Y = y_all.values.reshape(-1, 1)
lr = LinearRegression()
lr.fit(X, Y)
Y_pred = lr.predict(X)

# set all negative fire predictions to zero
# m = lambda x: [max(x[0], 0)] # TODO: Use clip function
# Y_pred = np.array([m(y) for y in Y_pred])

# extract the model parameters, note rounded values!
coefficients = lr.coef_.round(decimals=2)
intercept = lr.intercept_.round(decimals=2)

# construct the equation from model parameters: one "coeff*feature" term per
# input column followed by the intercept
equation = target+" = "
for coeff, f in zip(coefficients[0], features):
    equation+=str(coeff)+"*"+f+" + "
equation+=str(intercept[0])

print(equation, "\n")

r_sq = lr.score(X, Y)
print("r^2 = ",r_sq)

# map actual and predicted targets back from the normalized scale using the
# parameters recorded for "Size" during normalization
Y_inv = inv_box_cox_transform(Y, **params["Size"])
Y_pred_inv = inv_box_cox_transform(Y_pred, **params["Size"])

# mae = metrics.mean_absolute_error(Y, Y_pred)
# rmse = math.sqrt(metrics.mean_squared_error(Y, Y_pred))
# residuals = Y-Y_pred

# errors are reported on the inverse-transformed (original) scale
mae = metrics.mean_absolute_error(Y_inv, Y_pred_inv)
rmse = math.sqrt(metrics.mean_squared_error(Y_inv, Y_pred_inv))
residuals = Y_inv-Y_pred_inv
print("mae: ", mae)
print("rmse: ", rmse)

results = pd.DataFrame({"Y": Y.flatten(), "Y_pred": Y_pred.flatten(), 
                        "Y_inv": Y_inv, "Y_pred_inv": Y_pred_inv, 
                        "residuals": residuals})

results.hist(bins=20, figsize=(8, 8), xlabelsize=1)

# create residual plot to check for heteroscedasticity
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_xlabel("predicted")
ax.set_ylabel("residuals")
ax.set_title("residual plot")

img = plt.scatter(results["Y_pred_inv"], results["residuals"])

# plot actual vs predicted
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_xlabel("actual")
ax.set_ylabel("predicted")
ax.set_title("actual vs predicted")

# both axes must be on the same (inverse-transformed) scale; plotting the
# normalized Y against Y_pred_inv mixed two different units
img = plt.scatter(results["Y_inv"], results["Y_pred_inv"])
results # note: 1 acre ~ 0.75 football field

using features: 
 ['windGust72', 'humidity24', 'windGust24', 'windGust48', 'pressure336', 'pressure264', 'pressure96', 'pressure24', 'pressure192', 'windSpeed24', 'pressure240', 'pressure_96', 'pressure_72', 'pressure216', 'pressure168', 'windSpeed72', 'pressure_312', 'pressure120', 'windGust144', 'pressure_48', 'pressure_144', 'temperature_336', 'pressure_168', 'pressure_192', 'pressure_288', 'windSpeed_144', 'pressure312', 'pressure0', 'pressure144', 'windSpeed288', 'windGust312', 'pressure_240', 'pressure_336', 'windSpeed216', 'windGust96', 'windGust168', 'dewPoint_336', 'windGust_72', 'windSpeed_240', 'windSpeed48', 'windSpeed_288', 'pressure_264', 'windSpeed_192', 'pressure_24', 'windSpeed_120', 'windSpeed240', 'windSpeed_96', 'windSpeed_216', 'windSpeed_264', 'windSpeed264', 'windGust240', 'windSpeed192', 'windSpeed144', 'windGust_216', 'windSpeed_24', 'windGust_48', 'windSpeed_312', 'pressure_216', 'windSpeed_48', 'windSpeed_168', 'windSpeed312', 'windGust288', 'windGust_264', '

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Y,Y_pred,Y_inv,Y_pred_inv,residuals
0,6.29592,5.502198,3081.998926,866.491602,2215.507324
1,6.047731,5.207273,2050.000635,554.169702,1495.830933
2,1.818007,3.443044,6.000001,47.721281,-41.72128
3,3.551633,4.018229,55.000009,102.379022,-47.379013
4,3.642725,4.067051,62.000009,109.38492,-47.384911
5,7.430736,4.343588,23024.996484,159.885263,22865.111221
6,3.617844,4.103036,59.999994,114.871614,-54.87162
7,5.150246,3.148158,508.999933,32.592863,476.40707
8,4.170759,4.855817,125.999994,330.4284,-204.428406
9,4.257797,4.163409,141.999979,124.738901,17.261078


In [524]:
# Visualize a 3D scatter plot for multiple features over time
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# x and y axes: the second- and first-ranked descriptive features;
# z axis (f3) and colour (f4, the target feature) are both "Size"
f1, f2 = abs_corr_specific_list[1], abs_corr_specific_list[0]
f3 = f4 = "Size"

# select the dataframe to visualize
plot_df = ndf

# get feature bounds (used to keep the axis limits constant between redraws)
(f1_min, f1_max), (f2_min, f2_max), (f3_min, f3_max) = (
    get_feature_bounds(plot_df, name) for name in (f1, f2, f3)
)

def draw_scatter(i, ax, df):
    """Draw the 3D feature scatter on `ax` and title it with a linear fit.

    Reads the notebook-level names f1..f4 (column names), f1_min..f3_max
    (axis bounds) and `fig`.

    Args:
        i: slider position. Currently has NO effect on the plot: the per-feature
            hour offsets that would use it are disabled (see the commented block
            below), so every slider position draws the same data.
        ax: 3D axes to draw on.
        df: dataframe containing the columns named by f1..f4.

    Returns:
        The artist returned by ``ax.scatter`` (the caller uses it for the colorbar).
    """
    # set the hour offset per feature (disabled). To re-enable, restore:
    #     factor = 24
    #     index = i*factor
    #     suffix = lambda i, x: "" if x in ["Costs", "Latitude", "Longitude"] else (str(i) if i >= 0 else "_"+str(abs(i)))
    #     x1=f1+suffix(index, f1)
    #     x2=f2+suffix(index, f2)
    #     x3=f3+suffix(index, f3)
    #     x4=f4
    x1=f1
    x2=f2
    x3=f3
    x4=f4

    # set the axis values per feature
    x = df[x1]
    y = df[x2]
    z = df[x3]
    c = df[x4]

    ax.set_xlabel(x1)
    ax.set_ylabel(x2)
    ax.set_zlabel(x3)

    # set constant axis bounds
    ax.set_xlim(f1_min, f1_max)
    ax.set_ylim(f2_min, f2_max)
    ax.set_zlim(f3_min, f3_max)

    linear_features = [x1,x2]
    linear_target = x3

    # create the linear regression model
    X = df[linear_features] # use first two features as predictors to third
    Y = df[linear_target].values.reshape(-1, 1)
    lr = LinearRegression()
    lr.fit(X, Y)

    # extract the model parameters, note rounded values!
    coefficients = lr.coef_.round(decimals=2)
    intercept = lr.intercept_.round(decimals=2)

    # construct the equation from model parameters; the loop uses its own names
    # so it no longer clobbers the `i` argument
    equation = linear_target+" = "
    for coeff, name in zip(coefficients[0], linear_features):
        equation+=str(coeff)+" * "+name+" + "
    equation+=str(intercept[0])

    # ax.plot(x, y, lr.predict(X).flatten())

    # construct the line of best fit (needs fix)
    #     lx = np.arange(f1_min, f1_max, 0.715)
    #     ly = np.arange(f2_min, f2_max, 0.25)
    #     inter = np.full((1, len(lx)), intercept[0])
    #     lz = lx*coefficients[0][0] + ly*coefficients[0][1] + inter[0]
    #     ax.plot(lx, ly, lz)

    r_sq = lr.score(X, Y)
    r_sq_str = "r^2 = "+str(r_sq)

    img = ax.scatter(x, y, z, c=c, cmap="jet")

    ax.set_title(label=equation+"\n"+r_sq_str, fontsize=8)
    fig.suptitle(x1+" vs "+x2+" vs "+x3)
    return img



init_slider_val = 0

# Draw the initial scatterplot, then attach a colorbar labelled with the target feature
img = draw_scatter(init_slider_val, ax, plot_df)
colorbar = fig.colorbar(img)
colorbar.set_label(f4)

# Dedicated axes for the slider: [left, bottom, width, height] in figure coordinates
slider_ax = plt.axes([0.1, 0.05, 0.8, 0.05])

# Slider over -14..14 days, starting at 0 and displayed as an integer
a_slider = Slider(
    ax=slider_ax,
    label='days',
    valmin=-14,
    valmax=14,
    valinit=0.,
    valfmt="%i",
)

# Create integer slider offsets
def set_slider(s,val):
    """Snap slider `s` to the integer nearest `val` and refresh its display.

    Stores the rounded value on the slider, moves vertices 2 and 3 of the
    slider polygon to that x position (presumably the right edge of the filled
    bar; confirm against the matplotlib version in use) and rewrites the value
    label using the slider's own format string.
    """
    snapped = int(round(val))
    s.val = snapped
    s.poly.xy[2] = (snapped, 1)
    s.poly.xy[3] = (snapped, 0)
    s.valtext.set_text(s.valfmt % snapped)

# function to be executed each time slider
# value changes, variable of this function will
# be assigned the value of the slider
def update(a):
    """Slider callback: snap the slider to an integer and redraw the scatter.

    `a` is the raw value reported by the slider.
    """
    # snap the widget first, then read back the integer it now holds
    set_slider(a_slider, a)
    day_offset = int(a_slider.val)

    # wipe the 3D axes and draw the scatter again for the new position
    ax.clear()
    draw_scatter(day_offset, ax, plot_df)

    # schedule a repaint of the figure
    fig.canvas.draw_idle()

# execute update on slider change
# (update receives the slider's new value as its only argument)
a_slider.on_changed(update)

# display the figure together with the slider
plt.show()

<IPython.core.display.Javascript object>

In [126]:
# TODO: Use high dimensional regression equation on train/valid/test split (via cdf) and check RMSE values
# - Then try polynomial terms in high dimensional linear regression to maximize r^2

# - Then try to fit nonlinear functions like x^2 with mlp
# - Then see if you can generalize

In [277]:
# Load data splits from previous run

# norm_X_train = pd.read_csv('.\\data\\'+'run_2'+'\\norm_X_train.csv', index_col=0).astype("float32")
# norm_y_train = pd.read_csv('.\\data\\'+'run_2'+'\\norm_y_train.csv', index_col=0).astype("float32")


# norm_X_valid = pd.read_csv('.\\data\\'+'run_2'+'\\norm_X_valid.csv', index_col=0).astype("float32")
# norm_y_valid = pd.read_csv('.\\data\\'+'run_2'+'\\norm_y_valid.csv', index_col=0).astype("float32")

# norm_X_test = pd.read_csv('.\\data\\'+'run_2'+'\\norm_X_test.csv', index_col=0).astype("float32")
# norm_y_test = pd.read_csv('.\\data\\'+'run_2'+'\\norm_y_test.csv', index_col=0).astype("float32")

# print(norm_X_train.describe())
# print(norm_X_valid.describe())
# print(norm_X_test.describe())



In [340]:
# # normalize the descriptive and target features across each data split
# norm_X_train = normalize(X_train)
# norm_X_valid = normalize(X_valid)
# norm_X_test = normalize(X_test)
# norm_y_train = normalize(y_train)
# norm_y_valid = normalize(y_valid)
# norm_y_test = normalize(y_test)

# # export normalized data splits to csv
# norm_X_train.to_csv('.\\data\\'+RUN_NAME+'\\norm_X_train.csv')
# norm_y_train.to_csv('.\\data\\'+RUN_NAME+'\\norm_y_train.csv')
# norm_X_valid.to_csv('.\\data\\'+RUN_NAME+'\\norm_X_valid.csv')
# norm_y_valid.to_csv('.\\data\\'+RUN_NAME+'\\norm_y_valid.csv')
# norm_X_test.to_csv('.\\data\\'+RUN_NAME+'\\norm_X_test.csv')
# norm_y_test.to_csv('.\\data\\'+RUN_NAME+'\\norm_y_test.csv')

# # plot distribution of normalized target feature for each data split
# norm_y_train_hist = norm_y_train.hist(bins=20, figsize=(18, 12), xlabelsize=10)
# plt.savefig('.\\data\\'+RUN_NAME+'\\norm_y_train_hist.png')
# norm_y_valid_hist = norm_y_valid.hist(bins=20, figsize=(18, 12), xlabelsize=10)
# plt.savefig('.\\data\\'+RUN_NAME+'\\norm_y_valid_hist.png')
# norm_y_test_hist = norm_y_test.hist(bins=20, figsize=(18, 12), xlabelsize=10)
# plt.savefig('.\\data\\'+RUN_NAME+'\\norm_y_test_hist.png')

# # summarize distribution of normalized descriptive features for each data split
# print("Training descriptive features summary:")
# print(norm_X_train.describe())
# print("Validation descriptive features summary:")
# print(norm_X_valid.describe())
# print("Test descriptive features summary:")
# print(norm_X_test.describe())

# # summarize distribution of normalized target feature for each data split
# print("Training targets summary:")
# print(y_train.describe())
# print("Validation targets summary:")
# print(y_valid.describe())
# print("Test targets summary:")
# print(y_test.describe())
# print("All fire targets summary:")
# print(y_all.describe())


# Neural Network Models

In [341]:
# define the tf input functions for training
def training_input_fn(batch_size=1):
    """Return a zero-argument callable that feeds the training split to my_input_fn.

    The callable passes the normalized training features, the "Size" column of
    the normalized training targets and the requested `batch_size`.
    """
    def input_fn():
        return my_input_fn(norm_X_train,
                           norm_y_train["Size"],
                           batch_size=batch_size)
    return input_fn

def validation_input_fn(batch_size=1):
    """Build a zero-argument input function over the normalized validation split.

    The returned callable invokes ``my_input_fn`` with ``norm_X_valid`` as
    features and the ``"Size"`` column of ``norm_y_valid`` as targets, for a
    single pass (``num_epochs=1``) without shuffling.

    ``batch_size`` is forwarded to ``my_input_fn``. It used to be accepted
    and then silently dropped, so callers could not actually control the
    validation batch size.
    NOTE(review): callers relying on the default now pass ``batch_size=1``
    explicitly; confirm that matches ``my_input_fn``'s own default.
    """
    return lambda: my_input_fn(norm_X_valid,
                               norm_y_valid["Size"],
                               batch_size=batch_size,
                               num_epochs=1,
                               shuffle=False)

In [66]:
# Build the TensorFlow feature columns from the normalized training features
feature_columns = construct_feature_columns(norm_X_train)

# Learning hyper-parameters
STEPS_PER_EPOCH = 100
EPOCHS = 100
BATCH_SIZE = 100
LEARNING_RATE = 0.001

# Network architecture: every hidden layer gets
# floor(number of feature columns / neuron_split) neurons
neuron_split = 2**0
num_neurons = math.floor(len(norm_X_train.columns)/neuron_split)
num_layers = 1
hidden_layers = [num_neurons] * num_layers
print(hidden_layers)

# Dropout probability handed to the regressor
dropout = 0.5

# Run tag: one "<units>_" per hidden layer, then "D0" + the dropout in tenths
run_str = ''.join('%s_' % units for units in hidden_layers) + 'D0%s' % (int(dropout*10))

# Checkpoints and summaries for this run are stored under MODEL_PATH
MODEL_PATH = './models/DNNRegressors/' + RUN_NAME + '/' + run_str
logging.info('Saving to %s' % MODEL_PATH)

# Validation metric and run configuration (checkpoint every 100 steps,
# time-based checkpointing disabled)
validation_metrics = {"RMSE": tf.contrib.metrics.streaming_root_mean_squared_error}
test_config = skflow.RunConfig(save_checkpoints_steps=100, save_checkpoints_secs=None)

# Optimizer used to train the network
my_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)

# Assemble the regressor with a single output value per example
regressor = skflow.DNNRegressor(
    feature_columns=feature_columns,
    label_dimension=1,
    optimizer=my_optimizer,
    hidden_units=hidden_layers,
    model_dir=MODEL_PATH,
    dropout=dropout,
    config=test_config,
)


INFO:root:Saving to ./models/DNNRegressors/run_8/87_D05


1
[87]


In [67]:
TRAINING = True   # True: fit the regressor; False: skip fitting and run the plot-only branch
WITHPLOT = False  # True: additionally draw and save prediction plots

# Train it
if TRAINING:
    logging.info('Train the DNN Regressor...\n')
    MSEs = []   # for plotting
    STEPS = []  # for plotting

    # NOTE: range(EPOCHS+1) performs EPOCHS+1 fit rounds (epoch = 0..EPOCHS),
    # which is why the last progress line is labelled EPOCHS+1.
    for epoch in range(EPOCHS+1):

        # Fit the DNNRegressor for another STEPS_PER_EPOCH steps
        regressor.fit(input_fn=training_input_fn(batch_size=BATCH_SIZE),
                      steps=STEPS_PER_EPOCH)
        # To follow training, start TensorBoard in a terminal:
        #   tensorboard --logdir='./models/DNNRegressors/'
        # then open localhost:6006 in a browser.

        # Evaluate on the validation split every 10th round
        if epoch%10==0:
            eval_dict = regressor.evaluate(input_fn=validation_input_fn(),
                                           metrics=validation_metrics)
            # print(eval_dict)
            print('Epoch %i: %.5f RMSE' % (epoch+1, eval_dict['RMSE']))

            if WITHPLOT:
                # Generate a plot for this round to see the network learning.
                # NOTE(review): X and y are not defined in this cell; confirm
                # they exist before enabling WITHPLOT.
                y_pred = regressor.predict(x={'X': X}, as_iterable=False)

                E = (y.reshape((1,-1))-y_pred)
                MSE = np.mean(E**2.0)
                step = (epoch+1) * STEPS_PER_EPOCH
                title_string = '%s DNNRegressor after %06d steps (MSE=%.5f)' % \
                                (MODEL_PATH.split('/')[-1], step, MSE)

                MSEs.append(MSE)
                STEPS.append(step)

                fig = plt.figure(figsize=(9,4))
                ax1 = fig.add_subplot(1, 4, (1, 3))
                ax1.plot(X, y, label='function to predict')
                ax1.plot(X, y_pred, label='DNNRegressor prediction')
                ax1.legend(loc=2)
                ax1.set_title(title_string)
                ax1.set_ylim([0, 1])

                ax2 = fig.add_subplot(1, 4, 4)
                ax2.plot(STEPS, MSEs)
                ax2.set_xlabel('Step')
                ax2.set_xlim([0, EPOCHS*STEPS_PER_EPOCH])
                ax2.set_ylabel('Mean Square Error')
                ax2.set_ylim([0, 0.01])

                plt.tight_layout()
                plt.savefig(MODEL_PATH + '_%05d.png' % (epoch+1), dpi=72)
                logging.info('Saved %s' % MODEL_PATH + '_%05d.png' % (epoch+1))

                # Close the figure so repeated rounds do not accumulate open figures
                plt.close()

    # Now it's trained. We can try to predict some values.
else:
    logging.info('No training today, just prediction')
    try:
        # final Plot
        if WITHPLOT:
            plt.plot(X, y, label='function to predict')
            plt.plot(X, regressor.predict(x={'X': X}, as_iterable=False),
                     label='DNNRegressor prediction')
            plt.legend(loc=2)
            plt.ylim([0, 1])
            plt.title('%s DNNRegressor' % MODEL_PATH.split('/')[-1])
            plt.tight_layout()
            plt.savefig(MODEL_PATH + '.png', dpi=72)
            plt.close()
    except Exception:
        # `logging.Error` does not exist, so the old handler raised
        # AttributeError instead of reporting the failure. logging.exception
        # logs the message at ERROR level together with the traceback.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        logging.exception('Prediction failed! Maybe first train a model?')

INFO:root:Train the DNN Regressor...



Epoch 1: 2.52311 RMSE
Epoch 11: 2.46751 RMSE
Epoch 21: 2.47693 RMSE
Epoch 31: 2.47769 RMSE
Epoch 41: 2.51087 RMSE
Epoch 51: 2.48353 RMSE
Epoch 61: 2.49004 RMSE
Epoch 71: 2.49172 RMSE
Epoch 81: 2.48900 RMSE
Epoch 91: 2.49584 RMSE
Epoch 101: 2.51023 RMSE


In [68]:
# Dump the learned hidden-layer parameters (weights and biases) of the regressor
for variable_name in regressor.get_variable_names():
    _var_label = str(variable_name)

    # Only hidden-layer variables are of interest here
    if not _var_label.startswith('dnn/hiddenlayer'):
        continue
    # ...and of those only the weight and bias tensors
    if not _var_label.endswith(('weights', 'biases')):
        continue

    print('\n%s:' % variable_name)
    weights = regressor.get_variable_value(variable_name)
    print(weights)
    print('size: %i' % weights.size)


dnn/hiddenlayer_0/biases:
[ 2.47e-02 -1.64e-02  8.13e-03 -1.82e-02 -1.67e-02 -3.44e-02 -2.06e-03
 -1.55e-02  4.90e-03  4.71e-03  0.00e+00 -5.19e-03  0.00e+00 -8.80e-03
 -2.10e-02 -6.01e-03 -1.91e-02  0.00e+00 -1.16e-02 -1.43e-02  0.00e+00
 -6.87e-03 -5.05e-03 -7.84e-03  0.00e+00 -2.48e-02  0.00e+00  0.00e+00
 -9.10e-03 -2.54e-02 -5.59e-03  0.00e+00  0.00e+00  1.85e-02  5.55e-03
 -3.30e-02 -2.72e-02  1.61e-02  0.00e+00  0.00e+00  0.00e+00 -2.08e-02
  0.00e+00 -2.87e-03  0.00e+00 -3.49e-02 -2.55e-02 -2.33e-02 -6.61e-02
 -1.41e-02 -2.20e-02  0.00e+00 -5.13e-03 -9.11e-03 -1.20e-02  0.00e+00
 -6.31e-03  3.30e-03  0.00e+00  6.79e-03  0.00e+00 -7.53e-03 -8.54e-03
  9.88e-05 -1.94e-02 -2.33e-02  6.27e-03 -3.09e-03 -2.18e-02  2.87e-03
  0.00e+00 -8.58e-03 -5.56e-03  0.00e+00 -9.37e-03  9.51e-04  0.00e+00
  0.00e+00  0.00e+00 -3.39e-02 -2.64e-03 -5.66e-03 -1.50e-02 -1.05e-02
 -2.82e-02  3.04e-02  0.00e+00]
size: 87

dnn/hiddenlayer_0/weights:
[[-0.45 -0.13 -0.13 ...  0.16 -0.23  0.02]
 [-0.26 -

In [69]:
def _emit_metric(out, label, value):
    """Print ``label`` then ``value`` and mirror both, one per line, into ``out``.

    ``out`` is any object with a ``write`` method (here the open metrics file).
    """
    for item in (label, value):
        text = str(item)
        print(text)
        out.write(text + '\n')


# create metrics file
with open(".\\data\\"+RUN_NAME+"\\metrics_"+run_str+".txt", "w") as text_file:
    # Test targets in original units and in normalized form, re-indexed from 0
    unnormalized_predict_targets = y_test["Size"].reset_index(drop=True)
    normalized_predict_targets = norm_y_test["Size"].reset_index(drop=True)

    def predict_test_input_fn(batch_size=1):
        """Return a zero-argument input function over the normalized test split."""
        def _input_fn():
            return my_input_fn(norm_X_test,
                               normalized_predict_targets,
                               num_epochs=1,
                               batch_size=batch_size,
                               shuffle=False)
        return _input_fn

    # Predict the whole test split; batch size is the number of test rows plus 2
    normalized_test_predictions = regressor.predict(
        input_fn=predict_test_input_fn(batch_size=len(normalized_predict_targets)+2),
        as_iterable=False)

    # MAE in normalized space (kept as a cell-level variable; not reported below)
    normalized_test_mae = metrics.mean_absolute_error(normalized_predict_targets, normalized_test_predictions)

    # Side-by-side table: back-transformed predictions (exp(p) - 1), true
    # targets and signed error (prediction - target)
    compare_df = pd.DataFrame()
    compare_df["unnormalized_test_predictions"] = pd.DataFrame(normalized_test_predictions)[0].apply(lambda p: math.exp(p)-1)
    compare_df["unnormalized_predict_targets"] = unnormalized_predict_targets
    compare_df["diff"] = compare_df["unnormalized_test_predictions"] - compare_df["unnormalized_predict_targets"]

    _signed_error = compare_df["diff"]
    _total_rows = len(_signed_error.index)

    # Errors >= 0 count as "predicted too high", errors < 0 as "predicted too low"
    over_df = _signed_error[_signed_error >= 0]
    over = len(over_df)

    under_df = _signed_error[_signed_error < 0]
    under = len(under_df)

    _emit_metric(text_file, "predicted too high count:", over)
    _emit_metric(text_file, "predicted too high percent of total:", over/_total_rows)
    _emit_metric(text_file, "predicted too high mean bias error:", over_df.mean())

    print('\n')
    text_file.write('\n')

    _emit_metric(text_file, "predicted too low count:", under)
    _emit_metric(text_file, "predicted too low percent of total:", under/_total_rows)
    _emit_metric(text_file, "predicted too low mean bias error:", under_df.mean())

    print('\n')
    text_file.write('\n')

    # Aggregate error metrics over the whole test split, in original units
    mae = compare_df["diff"].abs().mean()
    _emit_metric(text_file, "total mean absolute error:", mae)

    mbe = compare_df["diff"].mean()
    _emit_metric(text_file, "total mean bias error:", mbe)

    mse = compare_df["diff"].apply(lambda err: err**2).mean()
    _emit_metric(text_file, "total mean squared error:", mse)

    rmse = math.sqrt(mse)
    _emit_metric(text_file, "total root mean squared error:", rmse)

    # Show the full comparison table and persist it next to the metrics file
    print(compare_df)
    compare_df.to_csv('.\\data\\'+RUN_NAME+'\\compare_df_'+run_str+'.csv')


predicted too high count:
93
predicted too high percent of total:
0.5081967213114754
predicted too high mean bias error:
240.821998070441


predicted too low count:
90
predicted too low percent of total:
0.4918032786885246
predicted too low mean bias error:
-6085.158308448448


total mean absolute error:
3115.0857572727396
total mean bias error:
-2870.315857594586
total mean squared error:
159208831.31819496
total root mean squared error:
12617.798196127364
     unnormalized_test_predictions  unnormalized_predict_targets          diff
0                       314.634616                          40.0    274.634616
1                       268.294133                          53.0    215.294133
2                       460.502353                          26.0    434.502353
3                       277.365196                         700.0   -422.634804
4                       152.851427                        1080.0   -927.148573
5                       204.210761                       91281.0