Aggregates prediction intervals

In [None]:
import os

# Show the working directory: the relative glob patterns and the 'Output/'
# paths used further down are all resolved against it.
working_directory = os.getcwd()
print(working_directory)

In [None]:
# Pull in list of model performance metrics
# Files are expected one directory level down; the later cells use the
# directory name as the 'LA' label for every row of the file.
import glob
import pandas as pd

# sorted(): glob returns matches in arbitrary, OS-dependent order, and the
# order here decides the row order of the concatenated frames built later.
file_list = sorted(
    f for f in glob.glob("*/*.csv")
    # the CSVs this notebook writes to 'Output/' also contain
    # 'Prediction intervals' in their names and must not be read back in
    if 'Prediction intervals' in f and 'Output' not in f
)
file_list_excel = sorted(
    f for f in glob.glob("*/*.xlsx") if 'Prediction intervals' in f
)
file_list = file_list + file_list_excel

# Paths containing 'all ' are loaded as per-threshold results; everything
# else is loaded as max / average summaries. Both lists keep file_list order
# (the original set difference returned the second list in arbitrary order).
file_list_thresholds = [f for f in file_list if 'all ' in f]
file_list_ave = [f for f in file_list if 'all ' not in f]

In [None]:
# Load every per-threshold file and label its rows with the LA (directory
# name) and the model id (file name without prefix and extension).
from pathlib import Path

file_dict = {}
for file_name in file_list_thresholds:
    path = Path(file_name)
    # file_list_thresholds can contain .xlsx workbooks, which read_csv
    # cannot parse, so pick the reader from the extension.
    if path.suffix.lower() == ".xlsx":
        file = pd.read_excel(file_name)
    else:
        file = pd.read_csv(file_name)
    # Path splits on the platform's separator, so this also works with the
    # backslash-separated paths glob returns on Windows.
    LA = path.parent.name
    # .stem drops the extension whichever it is (.csv or .xlsx)
    model_id = path.stem.replace("Prediction intervals - all ", "")
    file['LA'] = LA
    file['model_id'] = model_id
    file_dict[LA + '/' + model_id] = file

print(file_dict.keys())

# One long frame holding the rows of every model
results = pd.concat(file_dict.values(), axis=0, ignore_index=True)

In [None]:
# Load every max / average summary file and label its rows with the LA
# (directory name) and the model id (file name without prefix and extension).
from pathlib import Path

file_dict = {}
for file_name in file_list_ave:
    path = Path(file_name)
    # file_list_ave can contain .xlsx workbooks, which read_csv cannot parse,
    # so pick the reader from the extension. The first column is the index.
    if path.suffix.lower() == ".xlsx":
        file = pd.read_excel(file_name, index_col=0)
    else:
        file = pd.read_csv(file_name, index_col=0)
    # Path splits on the platform's separator, so this also works with the
    # backslash-separated paths glob returns on Windows.
    LA = path.parent.name
    # Two naming schemes are stripped here; .stem has already removed the
    # extension whichever it is (.csv or .xlsx).
    model_id = path.stem.replace("Prediction intervals - max, ave", "")
    model_id = model_id.replace("Prediction intervals max ave ", "")
    file['LA'] = LA
    file['model_id'] = model_id
    file_dict[LA + '/' + model_id] = file

print(file_dict.keys())

# One long frame holding the summary rows of every model
results_ave = pd.concat(file_dict.values(), axis=0, ignore_index=True)

In [None]:
# Mean of the two interval-width measures over all rows of results_ave,
# rounded to 4 decimals and saved as a one-column ('Mean') table.
width_columns = [
    'Average width of prediction interval',
    'Width of prediction interval at threshold value',
]
summary_prediction_interval = (
    results_ave[width_columns]
    .mean()
    .to_frame(name='Mean')
    .round(4)
)
summary_prediction_interval.to_csv('Output/Summary prediction intervals.csv')
summary_prediction_interval



In [None]:
# Drop the concatenated summary data for each model
# The 'Threshold' column mixes threshold values with the labels of the two
# summary rows; only the threshold rows are kept for grouping.
print(results.shape)
summary_row_labels = [
    'Width of prediction interval at threshold value',
    'Average width of prediction interval',
]
# .copy(): the column assignments below must modify an independent frame, not
# a slice of `results` (avoids SettingWithCopyWarning / chained assignment).
results_for_grouping = results.loc[
    ~results['Threshold'].isin(summary_row_labels)
].copy()

print(results_for_grouping.shape)

# With the label rows gone the column can be converted to numbers; the two
# prints show the unique values before and after the conversion.
print(results_for_grouping['Threshold'].unique())
results_for_grouping['Threshold'] = pd.to_numeric(results_for_grouping['Threshold'])
print(results_for_grouping['Threshold'].unique())

In [None]:
# Find the average for each threshold
# Each metric is averaged over all rows sharing a threshold value, rounded to
# 4 decimals (the Threshold column itself is left unrounded) and saved.
metric_columns = [
    'Precision',
    'Recall',
    'F score (beta = 0.1)',
    'Prediction interval (threshold +/- 0.03)',
]
prediction_interval_by_threshold = (
    results_for_grouping
    .groupby('Threshold')[metric_columns]
    .mean()
    .round(4)
    .reset_index()
)
prediction_interval_by_threshold.to_csv('Output/Prediction intervals by thresholds.csv')
prediction_interval_by_threshold


In [None]:
# Split up the model id so can do analysis by cv and data type
# The id is split on '_': part 1 becomes 'Cross Validation' and part 2
# becomes 'Data Included'; part 0 is not used.
model_ids = results_for_grouping['model_id'].str.split('_', expand=True)
# assign() returns a new frame, so no columns are set on a slice of `results`,
# and `_` is no longer rebound (assigning `_` at top level switches off
# IPython's "last output" variable for the rest of the session).
results_for_grouping = results_for_grouping.assign(**{
    'Cross Validation': model_ids[1],
    'Data Included': model_ids[2],
})

In [None]:
# Comparing thresholds by cv
# Two-sided Mann-Whitney U test of the interval widths for 'ss' vs 'ts'
# models, followed by a per-threshold table of the mean width for each.
from scipy.stats import mannwhitneyu

interval_column = 'Prediction interval (threshold +/- 0.03)'
cross_validation = results_for_grouping['Cross Validation']
# dropna(): missing widths must not reach the rank test, where they would
# propagate into (or silently distort) the statistic and p-value.
prediction_interval_ss = results_for_grouping.loc[cross_validation == 'ss', interval_column].dropna()
prediction_interval_ts = results_for_grouping.loc[cross_validation == 'ts', interval_column].dropna()

print(mannwhitneyu(prediction_interval_ss, prediction_interval_ts, alternative='two-sided'))

# pivot_table aggregates with the mean by default: one row per threshold,
# one column per cross validation code.
prediction_intervals_by_cv = pd.pivot_table(
    results_for_grouping,
    index=['Threshold'],
    columns='Cross Validation',
    values=interval_column,
)

prediction_intervals_by_cv[['ss', 'ts']] = prediction_intervals_by_cv[['ss', 'ts']].round(4)
# Replace the short codes with readable headings for the exported table
prediction_intervals_by_cv = prediction_intervals_by_cv.rename(
    columns={'ss': 'Learning from all cases', 'ts': 'Learning only from earlier cases'}
)
prediction_intervals_by_cv.to_csv('Output/Prediction intervals by cross validation.csv')
prediction_intervals_by_cv


In [None]:
# Comparing thresholds by data included
# Two-sided Mann-Whitney U test of the interval widths for 'str' vs 'all'
# models, followed by a per-threshold table of the mean width for each.
from scipy.stats import mannwhitneyu

interval_column = 'Prediction interval (threshold +/- 0.03)'
data_included = results_for_grouping['Data Included']
# dropna(): missing widths must not reach the rank test, where they would
# propagate into (or silently distort) the statistic and p-value.
prediction_interval_str = results_for_grouping.loc[data_included == 'str', interval_column].dropna()
prediction_interval_all = results_for_grouping.loc[data_included == 'all', interval_column].dropna()

print(mannwhitneyu(prediction_interval_str, prediction_interval_all, alternative='two-sided'))

# pivot_table aggregates with the mean by default: one row per threshold,
# one column per data-included code.
prediction_intervals_by_data_type = pd.pivot_table(
    results_for_grouping,
    index=['Threshold'],
    columns='Data Included',
    values=interval_column,
)

prediction_intervals_by_data_type[['str', 'all']] = prediction_intervals_by_data_type[['str', 'all']].round(4)
# Replace the short codes with readable headings for the exported table
prediction_intervals_by_data_type = prediction_intervals_by_data_type.rename(
    columns={'str': 'Just structured data', 'all': 'Structured and text data'}
)
prediction_intervals_by_data_type.to_csv('Output/Prediction intervals by data type.csv')
prediction_intervals_by_data_type

In [None]:
## Any correlation between threshold value and threshold width
# Pearson correlation between the threshold and the width of the prediction
# interval around it, over every model / threshold row.

from scipy.stats import pearsonr

# Only drop rows missing one of the two correlated values; a bare dropna()
# would also discard rows whose only gap is in an unrelated column
# (e.g. Precision or Recall), shrinking the sample for no reason.
pearsonr_columns = ['Threshold', 'Prediction interval (threshold +/- 0.03)']
results_for_pearsonr = results_for_grouping.dropna(subset=pearsonr_columns)
pearsonr(results_for_pearsonr['Threshold'], results_for_pearsonr['Prediction interval (threshold +/- 0.03)'])