## Post-process data

* Aggregate into single database

* Generate cleaning functions to detect and remove outlier models

In [None]:
import helix_funcs
import geopandas as gpd
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
# Below should only be run to combine the individual results if needed
#helix_funcs.combine_processed_results('./processed/grids1/','./master_grid1.csv')

## testing

* Remove unused data 

* Remove outliers based on sigma values

    - Loop over individual variables
    - Isolate individual SWL
    - Isolate individual shape_ids
    - Find the standard deviation and mean
    - If any value falls outside of 3 sigma, flag/remove it

* May also need to separate tables by variable due to size

In [None]:
%%time
# Load the aggregated results table
df = pd.read_csv('./master_grid1.csv')

# Drop un-wanted columns. The columns are named through the `columns=` keyword
# because passing the axis positionally (`df.drop(labels, 1)`) is rejected by
# pandas >= 2.0.
df = df.drop(columns=['season', 'is_monthly', 'month', 'min', 'max', 'std'])

In [None]:
# Number of rows in the table prior to dropping bad rows
len(df)

In [None]:
def sigma_filter(tmp_array, sigma=3, verbose=False):
    """Flag the values that lie outside ``sigma`` standard deviations of the mean.

    Parameters
    ----------
    tmp_array : sequence of numbers
        The values of one group (list, NumPy array, ...).
    sigma : int or float, optional
        Half-width of the accepted band, expressed in standard deviations
        as computed by ``np.std`` with its default arguments.
    verbose : bool, optional
        When True, print the group mean, standard deviation and accepted
        range. Off by default so that calling this once per group inside a
        loop does not flood the output.

    Returns
    -------
    list of bool or None
        One flag per input value, usable as a boolean index: True where the
        value is strictly below ``mean - sigma * std`` or strictly above
        ``mean + sigma * std``. None is returned when there are 3 or fewer
        values, which is treated as too few to determine group statistics.
    """
    if len(tmp_array) <= 3:
        return None

    values = np.asarray(tmp_array)
    shape_std = np.std(values)
    shape_mean = np.mean(values)
    lower_range = shape_mean - (sigma * shape_std)
    upper_range = shape_mean + (sigma * shape_std)
    if verbose:
        print('mean: ', shape_mean, 'std: ', shape_std)
        print('valid:', lower_range, 'to', upper_range)

    # Vectorised comparison; converted to a plain list so that callers can
    # still truth-test the result (a NumPy array would raise on `if result:`).
    outside = (values < lower_range) | (values > upper_range)
    return outside.tolist()

In [None]:
%%time

# For every (topic, variable, SWL, shape id) group, find the rows whose 'mean'
# lies outside of a statistical norm (defined as ±3sigma around the observed
# group mean) and record their index labels in bad_indexs, one list per group.

bad_indexs = []
verbose = False

group_columns = ['impact_tag', 'variable', 'swl_info', 'shape_id']

# A single groupby pass visits each combination once instead of re-filtering
# the table at every nesting level. groupby skips rows that have a missing
# value in any of the key columns.
for group_key, tmp_shape in df.groupby(group_columns, sort=False):
    if verbose: print('Group:', group_key, len(tmp_shape), 'items')
    tmp_values = tmp_shape['mean'].values
    tmp_indexs = sigma_filter(tmp_values, sigma=3)
    # sigma_filter returns None when the group is too small for statistics
    if tmp_indexs is None:
        continue
    outlier_mask = np.asarray(tmp_indexs, dtype=bool)
    if verbose:
        print('found', int(outlier_mask.sum()), '/', len(tmp_values), 'out of bounds')
    # Keep the index labels of the flagged rows of this group
    bad_indexs.append(list(tmp_shape.index[outlier_mask].values))
    


In [None]:
# Merge the per-group lists held in bad_indexs into one flat list of index
# labels, then order it ascending.
flat_indexs = []
for group_indexs in bad_indexs:
    flat_indexs.extend(group_indexs)

flat_indexs.sort()

In [None]:
%%time
# Remove the flagged rows (by index label) and report how many were removed
start_size = df.shape[0]
df = df.drop(index=flat_indexs)
end_size = df.shape[0]

print("Dropped", start_size - end_size,'from table')

In [None]:
# Write the dataframe to CSV; index=False leaves the row index out of the file
df.to_csv('./master_1deg_cleaned.csv', index=False)

In [None]:
#%%time
#df = df.drop(tmp_shape.index[tmp_indexs])

In [None]:
# len(indexes_to_keep)

In [None]:
# len(set(df.index))

In [None]:
# set(tmp_shape.index[tmp_indexs])