# figures

This notebook performs miscellaneous post-processing and formatting of the output CSVs that are then used for the figures and tables in the LaTeX source of the paper. Most of this processing formats CSVs that came out of `models.ipynb`. The majority of this notebook reformats column names, etc., so that the data is easier to load into LaTeX TikZ figures and tables.

In [None]:
import torch
#import geopandas as gpd
import pandas as pd
import os
import datetime as dt
import time
from copy import deepcopy
import datetime
from shapely.geometry import Point, LineString, Polygon, asShape, mapping
import requests
import numpy as np
from shapely.ops import cascaded_union, transform
from functools import partial
import pyproj
#import folium
import math
import requests
import concurrent.futures
import json
import plotly.graph_objects as go
from scipy import stats
import pickle
import sklearn.metrics
import statistics
#from matplotlib import cm, colors
#import seaborn as sn
#import matplotlib.pyplot as plt

# Show every column and every row when a DataFrame is displayed (no truncation).
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# All paths below are built from os.getcwd(), so step one directory up first
# (presumably to the project root that contains 'output_r' -- confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level each time.
os.chdir("../")
print(f"Current working directory: {os.getcwd()}")

## 1. fig 3 formatting

Figure 3b requires formatting the output results in both CO2 emissions and energy in kWh.

In [None]:
# Convert every bootstrap prediction from the CO2 target into energy in kWh.
#
# NOTE(review): the conversion factors are not documented anywhere in this
# notebook. Presumably .707 is kg CO2 per kWh of electricity, 10.18 is kg CO2
# per gallon of fuel and 37.95 is kWh per gallon of fuel -- confirm against the
# paper before relying on these descriptions.
ELECTRIC_DIVISOR = .707
FUEL_DIVISOR = 10.18
FUEL_MULTIPLIER = 37.95

results_kwh = {}

file = os.path.join(os.getcwd(), 'output_r', 'latex', 'bootresults.pkl')
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project.
with open(file, 'rb') as handle:
    results = pickle.load(handle)

# Keys have the form "<model>_<vehicle>"; each value is an iterable with one
# pandas object per simulation (presumably a Series of predictions).
for k, v in results.items():
    vehicle = k.split("_")[1]
    if vehicle == 'electric':
        results_kwh[k] = [sim / ELECTRIC_DIVISOR for sim in v]
    else:
        # Every non-electric vehicle (diesel, hybrid) uses the fuel conversion.
        results_kwh[k] = [(sim / FUEL_DIVISOR) * FUEL_MULTIPLIER for sim in v]

In [None]:
def get_bias_variance(df, result, target=None):
    """Compute per-sample absolute bias and variance of bootstrap predictions.

    For every row label of ``df`` the predictions stored under that label are
    collected from each simulation in ``result``. Samples with fewer than two
    predictions are skipped, because a sample variance needs at least two values.

    Args:
        df: DataFrame whose index labels identify the samples and whose
            ``target`` column holds the ground-truth value.
        result: Iterable of label-indexed prediction containers supporting
            ``.loc[label]`` (presumably one pandas Series per bootstrap
            simulation -- confirm against models.ipynb).
        target: Name of the ground-truth column. When omitted, the
            module-level ``TARGET`` is used, so two-argument calls behave as
            before.

    Returns:
        Tuple ``(biases, variances)`` of two equally long lists with one entry
        per sample that had at least two predictions.
    """
    if target is None:
        target = TARGET  # fall back to the notebook-level setting

    biases, variances = [], []
    for label, row in df.iterrows():
        y_true = row[target]
        y_preds = []
        for sim in result:
            try:
                y_preds.append(sim.loc[label])
            except KeyError:
                # This simulation holds no prediction for the sample; skip it.
                # Any other exception is a real error and is allowed to surface.
                continue
        if len(y_preds) > 1:
            biases.append(np.abs(np.mean(y_preds) - y_true))
            variances.append(statistics.variance(y_preds))
    return biases, variances

# --- Bias / variance of the bootstrap predictions against the 'target_kg' column ---
# NOTE(review): `data` is not assigned in any cell visible in this notebook. It
# is indexed by vehicle name and passed to get_bias_variance as the ground-truth
# frame -- confirm where it is supposed to be loaded.
TARGET = 'target_kg'  # read by get_bias_variance at call time
bias_data, variance_data = {}, {}
table_results = []

for model_key, simulations in results.items():
    vehicle_key = model_key.split("_")[1]
    biases, variances = get_bias_variance(data[vehicle_key], simulations)
    bias_data[model_key], variance_data[model_key] = biases, variances
    print(f"model: {model_key}, mean bias: {np.mean(biases)}, variance mean: {np.mean(variances)}")

In [None]:
# Condense the per-sample bias/variance lists into one table row per vehicle
# class and write the table for the paper.
class_labels = {'diesel': 'ICEV', 'hybrid': 'HV', 'electric': 'EV'}
model_labels = {'mtl': 'MTL', 'baseline': 'Baseline'}

table_results = []
for vehicle, class_label in class_labels.items():
    temp = {'Vehicle Class': class_label}
    for model, m in model_labels.items():
        key = f"{model}_{vehicle}"
        # Insertion order here fixes the column order of the CSV.
        temp[f"{m} Bias Mean"] = np.mean(bias_data[key])
        temp[f"{m} Bias Median"] = np.median(bias_data[key])
        temp[f"{m} Variance Mean"] = np.mean(variance_data[key])
        temp[f"{m} Variance Median"] = np.median(variance_data[key])
    table_results.append(temp)

df_table = pd.DataFrame.from_records(table_results).round(4)
outpath = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'biasvariancetable.csv')
df_table.to_csv(outpath, index=False)
df_table.head()

In [None]:
# Quantiles of the bias values currently held in `bias_data`, one column per
# model/vehicle pair.
# NOTE(review): this cell writes to 'fullbootdistkwh.csv', the same path that
# the later kWh quantile cell writes to, so on a full run this output is
# overwritten -- confirm the intended file name for this result.
quantiles = [0.01, 0.25, 0.5, 0.75, 0.99]
box = {
    f"{m}{vehicle}": pd.Series(bias_data[f"{m}_{vehicle}"]).quantile(quantiles).tolist()
    for m in ['mtl', 'baseline']
    for vehicle in ['diesel', 'hybrid', 'electric']
}
df_box = pd.DataFrame(box)
outpath = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'fullbootdistkwh.csv')
df_box.to_csv(outpath, index=False)
df_box

In [None]:
def get_bias_variance(df, result, target=None):
    """Compute per-sample absolute bias and variance of bootstrap predictions.

    This notebook also defines ``get_bias_variance`` in an earlier cell; the
    definition here is the one in effect for the cells that follow it.

    For every row label of ``df`` the predictions stored under that label are
    collected from each simulation in ``result``. Samples with fewer than two
    predictions are skipped, because a sample variance needs at least two values.

    Args:
        df: DataFrame whose index labels identify the samples and whose
            ``target`` column holds the ground-truth value.
        result: Iterable of label-indexed prediction containers supporting
            ``.loc[label]`` (presumably one pandas Series per bootstrap
            simulation -- confirm against models.ipynb).
        target: Name of the ground-truth column. When omitted, the
            module-level ``TARGET`` is used, so two-argument calls behave as
            before.

    Returns:
        Tuple ``(biases, variances)`` of two equally long lists with one entry
        per sample that had at least two predictions.
    """
    if target is None:
        target = TARGET  # fall back to the notebook-level setting

    biases, variances = [], []
    for label, row in df.iterrows():
        y_true = row[target]
        y_preds = []
        for sim in result:
            try:
                y_preds.append(sim.loc[label])
            except KeyError:
                # This simulation holds no prediction for the sample; skip it.
                # Any other exception is a real error and is allowed to surface.
                continue
        if len(y_preds) > 1:
            biases.append(np.abs(np.mean(y_preds) - y_true))
            variances.append(statistics.variance(y_preds))
    return biases, variances

# --- Bias / variance of the kWh-converted predictions against the 'target_kwh' column ---
# NOTE(review): `data` is not assigned in any cell visible in this notebook;
# confirm where the per-vehicle ground-truth frames are loaded.
TARGET = 'target_kwh'  # read by get_bias_variance at call time
bias_data = dict()
variance_data = dict()
table_results = list()

for model_key in results_kwh:
    ground_truth = data[model_key.split("_")[1]]
    biases, variances = get_bias_variance(ground_truth, results_kwh[model_key])
    bias_data[model_key] = biases
    variance_data[model_key] = variances
    print(f"model: {model_key}, mean bias: {np.mean(biases)}, variance mean: {np.mean(variances)}")

In [None]:
# Build the bias/variance summary table (one row per vehicle class) from the
# current `bias_data` / `variance_data` and write it for the paper.
vehicle_to_class = [('diesel', 'ICEV'), ('hybrid', 'HV'), ('electric', 'EV')]
model_to_label = [('mtl', 'MTL'), ('baseline', 'Baseline')]
# (column suffix, source dict, aggregation) in the order the columns must appear
summaries = [
    ('Bias Mean', bias_data, np.mean),
    ('Bias Median', bias_data, np.median),
    ('Variance Mean', variance_data, np.mean),
    ('Variance Median', variance_data, np.median),
]

table_results = []
for vehicle, vehicle_class in vehicle_to_class:
    temp = {'Vehicle Class': vehicle_class}
    for model, m in model_to_label:
        for suffix, source, aggregate in summaries:
            temp[f"{m} {suffix}"] = aggregate(source[f"{model}_{vehicle}"])
    table_results.append(temp)

df_table = pd.DataFrame.from_records(table_results)
df_table = df_table.round(4)
outpath = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'biasvariancetablekwh.csv')
df_table.to_csv(outpath, index=False)
df_table.head()

In [None]:
# Quantiles of the bias values currently held in `bias_data`, one column per
# model/vehicle pair, written as the box-plot input 'fullbootdistkwh.csv'.
quantiles = [0.01, 0.25, 0.5, 0.75, 0.99]
box = {}
for m in ['mtl', 'baseline']:
    for vehicle in ['diesel', 'hybrid', 'electric']:
        bias_series = pd.Series(bias_data[f"{m}_{vehicle}"])
        box[f"{m}{vehicle}"] = bias_series.quantile(quantiles).tolist()

df_box = pd.DataFrame(box)
outpath = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'fullbootdistkwh.csv')
df_box.to_csv(outpath, index=False)
df_box

## 2. mtl vs baseline formatting

Format the output from the MTL vs Baseline evaluation in models.ipynb.

In [None]:
# mtl_vs_baseline (test set)
# Collect the test-set MAE/MSE means of both models per vehicle and add the
# relative improvement of MTL over the baseline in percent.

file_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'mtl_vs_baseline.csv')
df = pd.read_csv(file_path).set_index('description')

vehicles = ['diesel', 'hybrid', 'electric']
metrics = ['mae', 'mse']

# The insertion order of this dict fixes the column order of the output CSV.
result = {f"{model}{metric}": [] for model in ('mtl', 'baseline') for metric in metrics}
result['vehicle'] = []

for vehicle in vehicles:
    result['vehicle'].append(vehicle)
    for metric in metrics:
        row_label = f"{vehicle}_test_{metric}"
        for model in ('mtl', 'baseline'):
            result[f"{model}{metric}"].append(df.at[row_label, f"{model}_mean"])


def _pct_improvement(row, metric):
    """Return by how many percent MTL lowers `metric` relative to the baseline in `row`."""
    baseline_value = row[f"baseline{metric}"]
    mtl_value = row[f"mtl{metric}"]
    return 100*((baseline_value - mtl_value) / baseline_value)


out_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'mtlvsbaseline.csv')
temp = pd.DataFrame(result)
temp['improvementmse'] = temp.apply(_pct_improvement, axis=1, args=('mse',))
temp['improvementmae'] = temp.apply(_pct_improvement, axis=1, args=('mae',))
# Disabled code kept from the original cell (it would have appended a 'total' row):
#r = {'mtlmae': temp['mtlmae'].sum(), 'mtlmse': temp['mtlmse'].sum(), 'baselinemae': temp['baselinemae'].sum(), 'baselinemse': temp['baselinemse'].sum(), 'vehicle': 'total'}
#temp = temp.append(r, ignore_index=True)
temp.to_csv(out_path, index=False)

## 3. Emissions correlation table 

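This relabels the rows (features) and columns (vehicle classes) of Table 3 in the paper and sorts the features by their correlation for the electric vehicle.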

In [None]:
# EC Correlation
# Give the correlation table readable feature names and the vehicle-class
# column labels used in the paper, then sort by the electric-vehicle column.

file_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'ec_correlation.csv')
df = pd.read_csv(file_path, index_col=0)
# The labels are assigned by position, so they must be listed in the same order
# as the rows of ec_correlation.csv (pandas raises ValueError if the number of
# labels differs from the number of rows).
ind = ["Average Speed", "Jam Factor", "Temperature", 'Precipitation Intensity', 'Wind Gust', 'Humidity', 'Visibility', 'Wind Speed', 'Speed Ratio', 'Change in Elevation', 'Elevation Difference', 'Time to Travel', 'Distance Travelled']
df.index = ind
df = df.sort_values(by=['electric'], ascending=False)
df = df.rename(columns={'diesel': 'ICEV', 'hybrid': 'HV', 'electric': 'EV'})
out_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'eccorrelation.csv')
df.to_csv(out_path, index_label='Feature')
df.head()

## 4. Bootstrap evaluation formatting

This formats the output used to make Figure 5 in the paper.

In [None]:
# bootstrap (full data)
# Strip the underscore from the six metric column names; every other column
# (including 'vehicle') keeps its name.

file_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'full_boot.csv')
column_map = {
    f"{model}_{metric}": f"{model}{metric}"
    for model in ('mtl', 'baseline')
    for metric in ('bias', 'mae', 'mse')
}
df = pd.read_csv(file_path).rename(columns=column_map)
out_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'fullboot.csv')
df.to_csv(out_path, index=False)

In [None]:
# bootstrap full - bias distribution
# Place the baseline and MTL bias distributions side by side, with the model
# name prefixed to each vehicle column.

def _load_prefixed(csv_name, prefix):
    """Read `csv_name` from output_r/latex and prefix the three vehicle columns with `prefix`."""
    frame = pd.read_csv(os.path.join(os.getcwd(), 'output_r', 'latex', csv_name))
    renames = {vehicle: f"{prefix}{vehicle}" for vehicle in ('diesel', 'hybrid', 'electric')}
    return frame.rename(columns=renames)


df = _load_prefixed('full_boot_baseline_bias_distribution.csv', 'baseline')
df2 = _load_prefixed('full_boot_mtl_bias_distribution.csv', 'mtl')

# Column-wise concatenation: rows are aligned on the default integer index.
df3 = pd.concat([df, df2], axis=1)
out_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'fullbootdist.csv')
df3.to_csv(out_path, index=False)

## 5. ITL

This section formats the data to be presented in Figure 6 of the paper.

In [None]:
# ITL: for every source -> target vehicle pair, average the runs at each target
# fraction and compute the percentage improvement of the target model over the
# baseline. All series are written as columns of one wide CSV.
file_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'itl_temp.csv')
df = pd.read_csv(file_path)

vehicles = ['diesel', 'hybrid', 'electric']
# NOTE: every list below must end up with the same length, otherwise building
# the DataFrame raises ValueError.
result = {
    'targetfrac': sorted(df['target_frac'].unique().tolist()),
    'targetsamples': sorted(df['target_samples'].unique().tolist()),
}
for source in vehicles:
    for target in vehicles:
        if source == target:
            continue
        pair = df[(df['source'] == source) & (df['target'] == target)]
        # numeric_only=True: 'source' and 'target' hold strings, and mean()
        # raises TypeError on such columns in pandas >= 2.0 instead of dropping them.
        temp = pair.groupby(['target_frac']).mean(numeric_only=True).sort_index()
        improvement_mse = 100 * ((temp['mse_baseline'] - temp['mse_target']) / temp['mse_baseline'])
        improvement_mae = 100 * ((temp['mae_baseline'] - temp['mae_target']) / temp['mae_baseline'])
        result[f"{source}{target}mseimprovement"] = improvement_mse.tolist()
        result[f"{source}{target}maeimprovement"] = improvement_mae.tolist()
        result[f"{source}{target}mtlmse"] = temp['mse_target'].tolist()
        result[f"{source}{target}mtlmae"] = temp['mae_target'].tolist()
        result[f"{source}{target}baselinemae"] = temp['mae_baseline'].tolist()
        result[f"{source}{target}baselinemse"] = temp['mse_baseline'].tolist()

df1 = pd.DataFrame(result)
df1['targetper'] = 100 * df1['targetfrac']  # target fraction expressed in percent
out_path = os.path.join(os.getcwd(), 'output_r', 'latex', 'paper', 'itl.csv')
df1.to_csv(out_path, index=False)