In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error

import auxiliaries as aux

## Setup

In [2]:
# Run configuration: which animal and which tracking file to model
species = 'Deer'
file_name = 'GSM02927'

years_to_predict = 1
time_intervals = {'Moose': 3}.get(species, 4)  # 3 for Moose, 4 for every other species

# Build both data windows from the same species / file pair
future_window = aux.FuturePredictionWindow(species, file_name)
past_window = aux.PastPredictionWindow(species, file_name)

# Per-model test metrics, keyed by model name
results = dict()


def _report_window(banner, window):
    """Print the split sizes and the training-split mean / std. dev. of one window."""
    print(banner)
    print(f'Original Data Size: {len(window.orig_df)}')
    print(f'Training Data Size: {len(window.train_df)}')
    print(f'Testing Data Size: {len(window.test_df)}\n')

    print(f'Training Mean:\n{window.train_df.mean()}')
    print(f'Training Std. Dev.:\n{window.train_df.std()}\n')


_report_window('------Original Regressive Window------', future_window)
_report_window('------Reversed Regressive Window------', past_window)

### Future-Prediction Regressive Model

In [3]:
# Fully connected regressor: three ReLU hidden layers (256, 128, 128 units)
# followed by a 2-unit linear output layer. Each sample has shape (1, 3).
future_model = tf.keras.models.Sequential()
future_model.add(tf.keras.layers.Dense(256, input_shape=(1,3), activation='relu'))
future_model.add(tf.keras.layers.Dense(128, activation='relu'))
future_model.add(tf.keras.layers.Dense(128, activation='relu'))
future_model.add(tf.keras.layers.Dense(2))

# Compilation and training are delegated to the window object (defined in auxiliaries)
future_window.model_compilation_and_fitting(future_model)

In [4]:
# Score the future model on the window's test inputs/labels. return_dict=True
# makes evaluate() return a {metric_name: value} dict, which later cells extend
# with per-coordinate MAE / MSE entries.
results['Future-Prediction Regressive Model'] = future_model.evaluate(future_window.test_input, future_window.test_label, verbose=1, return_dict=True)

In [5]:
def graph_regressive(model, window, url_dest):
    """Generate a CSV comparing actual test-set positions with model predictions.

    Every test row is emitted twice in the result: once unchanged with
    ``id == 'original'`` and once with ``longitude`` / ``latitude`` replaced by
    the model's de-normalized prediction and ``id == 'predicted'``.

    Parameters
    ----------
    model : tf.keras.Sequential or sklearn.neighbors.KNeighborsRegressor
        Trained regressor that maps standardized
        ``[external-temperature, month, day]`` to standardized
        ``[longitude, latitude]``.
    window : object
        Data window exposing ``orig_df``, ``train_df``, ``test_df`` and
        ``timeline``. It is not modified by this function.
    url_dest : str
        Path of the CSV file to write (written without the index).

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the predicted rows, re-indexed from 0.

    Raises
    ------
    TypeError
        If ``model`` is neither of the supported model types.
    """
    # Imported here because a bare `import sklearn` does not guarantee that the
    # `sklearn.neighbors` submodule has been loaded.
    from sklearn.neighbors import KNeighborsRegressor

    feature_columns = ['external-temperature', 'month', 'day']  # order fed to the model
    target_columns = ['longitude', 'latitude']

    # Copy so that adding the timestamp column does not alter window.test_df.
    test_df = window.test_df.copy()
    # NOTE(review): assumes test_df is the portion of orig_df from the 70% mark onward — confirm in auxiliaries.
    test_df['timestamp'] = window.timeline[int(len(window.orig_df)*0.7):]

    # .copy() gives an independent frame, so adding 'id' is not a chained assignment.
    initial_df = test_df[['timestamp', 'month', 'day', 'external-temperature', 'longitude', 'latitude']].copy()
    initial_df['id'] = 'original'

    # Training-split statistics are loop-invariant: compute them once.
    train_df = window.train_df
    feature_mean = train_df[feature_columns].mean()
    feature_std = train_df[feature_columns].std()
    target_mean = train_df[target_columns].mean().values
    target_std = train_df[target_columns].std().values

    # Standardize all test inputs at once with the training statistics.
    normed_inputs = ((initial_df[feature_columns] - feature_mean) / feature_std).to_numpy(dtype=float)

    if isinstance(model, tf.keras.Sequential):
        # The network takes (batch, 1, 3) and returns (batch, 1, 2).
        normed_outputs = model(normed_inputs.reshape(-1, 1, 3)).numpy().reshape(-1, 2)
    elif isinstance(model, KNeighborsRegressor):
        normed_outputs = np.asarray(model.predict(normed_inputs)).reshape(-1, 2)
    else:
        # Previously this fell through and hit an undefined (or stale) variable.
        raise TypeError(f'Unsupported model type: {type(model).__name__}')

    # Map the predictions back to real longitude / latitude.
    predicted_coords = normed_outputs*target_std + target_mean

    add_on_df = initial_df.assign(
        longitude=predicted_coords[:, 0],
        latitude=predicted_coords[:, 1],
        id='predicted',
    )

    full_df = pd.concat([initial_df, add_on_df], ignore_index=True)
    full_df.to_csv(url_dest, index=False)

    return full_df

# Build the original-vs-predicted comparison for the future model and save it
# under CSVFiles/TestPerformanceCSV/TestPredictions/<species>/.
future_test_results_df = graph_regressive(future_model, future_window, f'CSVFiles/TestPerformanceCSV/TestPredictions/{species}/{file_name}_futureprediction_testpredictions.csv')

In [6]:
# Show the combined frame: 'original' rows first, then the 'predicted' rows
future_test_results_df

In [7]:
# Standardize every column except the two bookkeeping ones
important_columns = set(future_test_results_df.columns) - {'timestamp', 'id'}
future_norm_columns = list(important_columns)

# Use the training-split statistics, i.e. the same scale the model works in
future_train_subset = future_window.train_df[future_norm_columns]

normed_future_test_results_df = future_test_results_df.copy(deep=True)
normed_future_test_results_df[future_norm_columns] = (
    (future_test_results_df[future_norm_columns] - future_train_subset.mean())/future_train_subset.std()
)
normed_future_test_results_df

In [8]:
# Column groups: model inputs and the coordinates being predicted
input_columns = ['month', 'day', 'external-temperature']
output_columns = ['longitude', 'latitude']

# Standardized ground-truth coordinates, as an (n_rows, n_outputs) array
future_original_output = normed_future_test_results_df.loc[
    normed_future_test_results_df['id'] == 'original', output_columns
].to_numpy()
future_original_output = future_original_output.reshape(
    (future_original_output.shape[0], future_original_output.shape[-1])
)

# Standardized predicted coordinates, same layout
future_predicted_output = normed_future_test_results_df.loc[
    normed_future_test_results_df['id'] == 'predicted', output_columns
].to_numpy()
future_predicted_output = future_predicted_output.reshape(
    (future_predicted_output.shape[0], future_predicted_output.shape[-1])
)

# One error value per coordinate (multioutput='raw_values' disables averaging)
mae_values = dict(zip(
    ['mae_longitude', 'mae_latitude'],
    mean_absolute_error(future_original_output, future_predicted_output, multioutput='raw_values'),
))
mse_values = dict(zip(
    ['mse_longitude', 'mse_latitude'],
    mean_squared_error(future_original_output, future_predicted_output, multioutput='raw_values'),
))

# Merge the per-coordinate errors into this model's metrics (MAE first, then MSE)
results['Future-Prediction Regressive Model'].update({**mae_values, **mse_values})

In [9]:
# Show the metrics gathered so far for the future model
# (evaluate() output plus the per-coordinate MAE / MSE added above)
results['Future-Prediction Regressive Model']

In [10]:
# Great-circle distance between each actual position and its prediction
unnormed_future_original_output = future_test_results_df.loc[
    future_test_results_df['id'] == 'original', output_columns
].to_numpy()
unnormed_future_predicted_output = future_test_results_df.loc[
    future_test_results_df['id'] == 'predicted', output_columns
].to_numpy()

r = 3958.756 + 636/5280 # Average radius of Earth to sea level plus elevation of Trentino


def _future_haversine(original_coord, predicted_coord):
    """Haversine distance between two (longitude, latitude) pairs given in degrees, scaled by r."""
    lat_term = 1 - math.cos(math.radians(predicted_coord[1] - original_coord[1]))
    lon_term = 1 - math.cos(math.radians(predicted_coord[0] - original_coord[0]))
    half_chord_sq = (
        lat_term
        + math.cos(math.radians(predicted_coord[1]))*math.cos(math.radians(original_coord[1]))*lon_term
    )/2
    return 2*r*math.asin(math.sqrt(half_chord_sq))


future_distances = [
    _future_haversine(actual, predicted)
    for actual, predicted in zip(unnormed_future_original_output, unnormed_future_predicted_output)
]

# Pair every distance with the timestamp of the row it belongs to
future_distance_df = pd.DataFrame({
    'Distances': future_distances,
    'Timestamp': future_test_results_df.loc[future_test_results_df['id'] == 'original', 'timestamp'],
})
future_distance_df['Timestamp'] = pd.DatetimeIndex(future_distance_df['Timestamp'])

In [11]:
# Bin the error distances into 12 equal-width bins over [0, 3]; values outside
# that range are ignored. With density=True, hist is a probability density, so
# hist * bin width is the fraction of in-range distances that fall in each bin.
hist, bins = np.histogram(future_distances, bins=12, range=(0, 3), density=True)

In [12]:
# Histogram of future_distances
# Derive the bar geometry from the bin edges rather than hard-coding it, so the
# plot stays correct if the bin count or range of the histogram changes.
bin_widths = np.diff(bins)
# hist is a density: density * width is the fraction per bin, and * 100 turns it
# into the percentage that the y-axis label promises.
con = plt.bar(bins[:-1] + bin_widths/2, hist*bin_widths*100, edgecolor='black', width=bin_widths)
plt.title(f"Distance Between Actual and Predicted Future Positions (n = {len(future_distances)})")
plt.xlabel("Distance (mi)")
plt.ylabel("Percentage of predictions in each group")
plt.show()

In [13]:
# Month to inspect, and the month-name -> month-number lookup
month = 'January'
month_to_date = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Select this month's error distances once; reused for the histogram and the title
month_distances = future_distance_df.loc[
    future_distance_df['Timestamp'].dt.month == month_to_date[month], 'Distances'
].values

hist, bins = np.histogram(month_distances, bins=12, range=(0, 3), density=True)
bin_widths = np.diff(bins)

# Histogram of future_distances by month
# hist is a density: density * width is the fraction per bin, and * 100 turns it
# into the percentage that the y-axis label promises.
con = plt.bar(bins[:-1] + bin_widths/2, hist*bin_widths*100, edgecolor='black', width=bin_widths)
plt.title(f"Error Distances in {month} (n = {len(month_distances)})")
plt.xlabel("Distance (mi)")
plt.ylabel("Percentage of predictions in each group")
plt.show()

In [14]:
# NOTE(review): csv_extension is defined in auxiliaries and not visible here;
# presumably it writes CSV output for the trained future model under the given
# '<species>/<file>_futureprediction' name — confirm against auxiliaries.
future_window.csv_extension(f'{species}/{file_name}_futureprediction', species, future_model)

In [15]:
# Persist the trained future model to ModelFiles/<species>/ as a .keras file
future_model.save(f'ModelFiles/{species}/{file_name}_futureprediction.keras')

### Past-Prediction Regressive Model

In [16]:
# Same architecture as the future model: three ReLU hidden layers
# (256, 128, 128 units) followed by a 2-unit linear output layer.
past_model = tf.keras.models.Sequential()
past_model.add(tf.keras.layers.Dense(256, input_shape=(1,3), activation='relu'))
past_model.add(tf.keras.layers.Dense(128, activation='relu'))
past_model.add(tf.keras.layers.Dense(128, activation='relu'))
past_model.add(tf.keras.layers.Dense(2))

# Compilation and training are delegated to the window object (defined in auxiliaries)
past_window.model_compilation_and_fitting(past_model)

In [17]:
# Score the past model on the window's test inputs/labels. return_dict=True
# makes evaluate() return a {metric_name: value} dict, which later cells extend
# with per-coordinate MAE / MSE entries.
results['Past-Prediction Regressive Model'] = past_model.evaluate(past_window.test_input, past_window.test_label, verbose=1, return_dict=True)

In [18]:
# Build the original-vs-predicted comparison for the past model and save it
# under CSVFiles/TestPerformanceCSV/TestPredictions/<species>/.
past_test_results_df = graph_regressive(past_model, past_window, f'CSVFiles/TestPerformanceCSV/TestPredictions/{species}/{file_name}_pastprediction_testpredictions.csv')

In [19]:
# Show the combined frame: 'original' rows first, then the 'predicted' rows
past_test_results_df

In [20]:
# Standardize every column except the two bookkeeping ones
important_columns = set(past_test_results_df.columns)
important_columns.remove('timestamp')
important_columns.remove('id')
past_norm_columns = list(important_columns)

# Standardize with the TRAINING-split mean / std. dev. (as the future-prediction
# cell does), not with statistics of the results frame itself. The results frame
# mixes actual and predicted rows, so its own statistics would put the past
# model's MAE / MSE on a different scale from the future model's and from the
# scale the network was trained in.
past_train_subset = past_window.train_df[past_norm_columns]

normed_past_test_results_df = past_test_results_df.copy(deep=True)
normed_past_test_results_df[past_norm_columns] = (
    (past_test_results_df[past_norm_columns] - past_train_subset.mean())/past_train_subset.std()
)
normed_past_test_results_df

In [21]:
# Column groups: model inputs and the coordinates being predicted
input_columns = ['month', 'day', 'external-temperature']
output_columns = ['longitude', 'latitude']

# Standardized ground-truth coordinates, as an (n_rows, n_outputs) array
past_original_output = normed_past_test_results_df.loc[
    normed_past_test_results_df['id'] == 'original', output_columns
].to_numpy()
past_original_output = past_original_output.reshape(
    (past_original_output.shape[0], past_original_output.shape[-1])
)

# Standardized predicted coordinates, same layout
past_predicted_output = normed_past_test_results_df.loc[
    normed_past_test_results_df['id'] == 'predicted', output_columns
].to_numpy()
past_predicted_output = past_predicted_output.reshape(
    (past_predicted_output.shape[0], past_predicted_output.shape[-1])
)

# One error value per coordinate (multioutput='raw_values' disables averaging)
mae_values = dict(zip(
    ['mae_longitude', 'mae_latitude'],
    mean_absolute_error(past_original_output, past_predicted_output, multioutput='raw_values'),
))
mse_values = dict(zip(
    ['mse_longitude', 'mse_latitude'],
    mean_squared_error(past_original_output, past_predicted_output, multioutput='raw_values'),
))

# Merge the per-coordinate errors into this model's metrics (MAE first, then MSE)
results['Past-Prediction Regressive Model'].update({**mae_values, **mse_values})

In [22]:
# Show the metrics gathered so far for the past model
# (evaluate() output plus the per-coordinate MAE / MSE added above)
results['Past-Prediction Regressive Model']

In [23]:
# Great-circle distance between each actual position and its prediction
unnormed_past_original_output = past_test_results_df.loc[
    past_test_results_df['id'] == 'original', output_columns
].to_numpy()
unnormed_past_predicted_output = past_test_results_df.loc[
    past_test_results_df['id'] == 'predicted', output_columns
].to_numpy()

r = 3958.756 + 636/5280 # Average radius of Earth to sea level plus elevation of Trentino


def _past_haversine(original_coord, predicted_coord):
    """Haversine distance between two (longitude, latitude) pairs given in degrees, scaled by r."""
    lat_term = 1 - math.cos(math.radians(predicted_coord[1] - original_coord[1]))
    lon_term = 1 - math.cos(math.radians(predicted_coord[0] - original_coord[0]))
    half_chord_sq = (
        lat_term
        + math.cos(math.radians(predicted_coord[1]))*math.cos(math.radians(original_coord[1]))*lon_term
    )/2
    return 2*r*math.asin(math.sqrt(half_chord_sq))


past_distances = [
    _past_haversine(actual, predicted)
    for actual, predicted in zip(unnormed_past_original_output, unnormed_past_predicted_output)
]

# Pair every distance with the timestamp of the row it belongs to
past_distance_df = pd.DataFrame({
    'Distances': past_distances,
    'Timestamp': past_test_results_df.loc[past_test_results_df['id'] == 'original', 'timestamp'],
})
past_distance_df['Timestamp'] = pd.DatetimeIndex(past_distance_df['Timestamp'])

In [24]:
# Bin the error distances into 12 equal-width bins over [0, 3]; values outside
# that range are ignored. With density=True, hist is a probability density, so
# hist * bin width is the fraction of in-range distances that fall in each bin.
hist, bins = np.histogram(past_distances, bins=12, range=(0,3), density=True)

In [25]:
# Histogram of distances
# Derive the bar geometry from the bin edges rather than hard-coding it, so the
# plot stays correct if the bin count or range of the histogram changes.
bin_widths = np.diff(bins)
# hist is a density: density * width is the fraction per bin, and * 100 turns it
# into the percentage that the y-axis label promises.
con = plt.bar(bins[:-1] + bin_widths/2, hist*bin_widths*100, edgecolor='black', width=bin_widths)
plt.title(f"Distance Between Actual and Predicted Positions (n = {len(past_distances)})")
plt.xlabel("Distance (mi)")
plt.ylabel("Percentage of predictions in each group")
plt.show()

In [26]:
# Month to inspect, and the month-name -> month-number lookup
month = 'January'
month_to_date = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Select this month's error distances once; reused for the histogram and the title
month_distances = past_distance_df.loc[
    past_distance_df['Timestamp'].dt.month == month_to_date[month], 'Distances'
].values

hist, bins = np.histogram(month_distances, bins=12, range=(0, 3), density=True)
bin_widths = np.diff(bins)

# Histogram of distances
# hist is a density: density * width is the fraction per bin, and * 100 turns it
# into the percentage that the y-axis label promises.
con = plt.bar(bins[:-1] + bin_widths/2, hist*bin_widths*100, edgecolor='black', width=bin_widths)
plt.title(f"Error Distances in {month} (n = {len(month_distances)})")
plt.xlabel("Distance (mi)")
plt.ylabel("Percentage of predictions in each group")
plt.show()

In [27]:
# NOTE(review): csv_extension is defined in auxiliaries and not visible here;
# presumably it writes CSV output for the trained past model under the given
# '<species>/<file>_pastprediction' name — confirm against auxiliaries.
past_window.csv_extension(f'{species}/{file_name}_pastprediction', species, past_model)

In [28]:
# Persist the trained past model to ModelFiles/<species>/ as a .keras file
past_model.save(f'ModelFiles/{species}/{file_name}_pastprediction.keras')

### Final Results

In [29]:
# One row per model, one column per metric
results_df = pd.DataFrame(results).T
# Write next to the test-prediction CSVs, which live under
# CSVFiles/TestPerformanceCSV/; the path previously lacked the 'CSVFiles/' prefix.
# NOTE(review): confirm that CSVFiles/TestPerformanceCSV/TestMetrics/<species>/ is the intended directory.
results_df.to_csv(f'CSVFiles/TestPerformanceCSV/TestMetrics/{species}/{file_name}_testmetrics.csv', index_label='Model_Name')
results_df