In [None]:
import csv
import matplotlib.pyplot as plt
import matplotlib.pyplot as 
import numpy as np
import time
%pylab inline

In [None]:
def filter_outliers_destr(lst, minimum, maximum):
    """Replace out-of-range samples in ``lst`` in place.

    While the smallest value is below ``minimum`` (and then while the largest
    value is above ``maximum``), the offending sample is overwritten with the
    mean of its two neighbours.  A sample at either end of the sequence has
    only one neighbour, so it takes that neighbour's value.

    Parameters
    ----------
    lst : mutable sequence of numbers
        Modified in place.
    minimum, maximum : number
        Inclusive bounds of the accepted range.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If an outlier has no neighbour at all (single-element input).

    Notes
    -----
    The loops only finish once every sample is inside the bounds; input whose
    samples are all out of range cannot be repaired by neighbour averaging.
    """
    if len(lst) == 0:
        # np.min / np.max raise on empty input; there is nothing to filter.
        return

    last = len(lst) - 1

    def neighbour_mean(idx):
        # At an edge, reuse the single existing neighbour instead of wrapping
        # around to the other end (idx - 1 == -1) or running off the end.
        left = lst[idx - 1] if idx > 0 else lst[idx + 1]
        right = lst[idx + 1] if idx < last else lst[idx - 1]
        return 0.5 * (left + right)

    while np.min(lst) < minimum:
        lowest = np.argmin(lst)
        lst[lowest] = neighbour_mean(lowest)
    while np.max(lst) > maximum:
        highest = np.argmax(lst)
        lst[highest] = neighbour_mean(highest)
        
def filter_outliers_newlist(lst, minimum, maximum):
    """Return a copy of ``lst`` with out-of-range samples replaced.

    While the smallest value is below ``minimum`` (and then while the largest
    value is above ``maximum``), the offending sample in the copy is
    overwritten with the mean of its two neighbours.  A sample at either end
    has only one neighbour, so it takes that neighbour's value.  The input
    sequence is left untouched.

    Parameters
    ----------
    lst : sequence of numbers
    minimum, maximum : number
        Inclusive bounds of the accepted range.

    Returns
    -------
    list
        The filtered copy.

    Raises
    ------
    IndexError
        If an outlier has no neighbour at all (single-element input).

    Notes
    -----
    The loops only finish once every sample is inside the bounds; input whose
    samples are all out of range cannot be repaired by neighbour averaging.
    """
    new_lst = list(lst)
    if not new_lst:
        # np.min / np.max raise on empty input; there is nothing to filter.
        return new_lst

    last = len(new_lst) - 1

    def neighbour_mean(idx):
        # At an edge, reuse the single existing neighbour instead of wrapping
        # around to the other end (idx - 1 == -1) or running off the end.
        left = new_lst[idx - 1] if idx > 0 else new_lst[idx + 1]
        right = new_lst[idx + 1] if idx < last else new_lst[idx - 1]
        return 0.5 * (left + right)

    while np.min(new_lst) < minimum:
        lowest = np.argmin(new_lst)
        # Write to the copy: writing to the input would leave the copy
        # unchanged, so this loop would never end.
        new_lst[lowest] = neighbour_mean(lowest)
    while np.max(new_lst) > maximum:
        highest = np.argmax(new_lst)
        new_lst[highest] = neighbour_mean(highest)

    return new_lst

def filter_spikes(data, min_roc, max_roc, toggle_nan=False):
    """Remove downward spikes (sudden drops followed by a flat run) from ``data``.

    A spike starts where the sample-to-sample difference is below ``min_roc``
    and continues for as long as the following differences are exactly zero.
    Every sample of the spike is either replaced with NaN (``toggle_nan``) or
    with a ramp that starts from the last sample before the spike.

    Parameters
    ----------
    data : iterable of numbers
        Not modified; a list copy is filtered.
    min_roc : number
        A difference below this value starts a spike (normally negative).
    max_roc : number
        Accepted for interface compatibility; this function does not use it.
    toggle_nan : bool, optional
        If true, spike samples become NaN instead of ramp values.

    Returns
    -------
    (list, list)
        The filtered copy, and a list of ``(index, old_value, new_value)``
        tuples, one per replaced sample.
    """
    fil_indices = []
    old_values = []
    new_values = []

    lst = list(data)

    # Differences are taken once, from the unfiltered samples.
    diff = np.diff(lst)
    n_diff = len(diff)

    i = 0
    while i < n_diff:
        if diff[i] < min_roc:
            fil_indices.append(i + 1)
            old_values.append(lst[i + 1])
            drops = 1
            j = i + 1
            # diff[j] == 0 means sample j + 1 is stuck at the dropped value,
            # so j + 1 is the index that belongs to the spike.  The bound
            # check stops the scan when the spike runs to the end of the data.
            while j < n_diff and diff[j] == 0:
                fil_indices.append(j + 1)
                old_values.append(lst[j + 1])
                drops += 1
                j += 1
            # The spike now covers indices i + 1 .. j inclusive.
            if toggle_nan:
                for k in range(i + 1, j + 1):
                    lst[k] = np.nan
                    new_values.append(lst[k])
            else:
                if j + 1 < len(lst):
                    # NOTE(review): the 0.5 factor makes the ramp end halfway
                    # between the two good samples rather than interpolating
                    # linearly; kept as-is, confirm it is intended.
                    slope = (0.5 * (lst[j + 1] - lst[i])) / drops
                else:
                    # No good sample after the spike: hold the last good value.
                    slope = 0.0
                for k in range(i + 1, j + 1):
                    lst[k] = lst[k - 1] + slope
                    new_values.append(lst[k])
            i = j
        i += 1
    filtered = list(zip(fil_indices, old_values, new_values))

    return lst, filtered

In [None]:
## FOR Rate of Change errors ##

def find_velocity_extremes_individual(data, threshold):
    """Find samples that jump by more than ``threshold`` on both sides.

    For every index ``i`` from 2 up to the second-to-last sample, the absolute
    differences to the previous and to the next sample are compared with
    ``threshold``.  When both exceed it, ``i`` and ``i + 1`` are reported.
    Indices whose three-sample window contains a NaN are skipped.

    Parameters
    ----------
    data : array-like
        Values convertible to float; not modified.
    threshold : number
        Must be positive.

    Returns
    -------
    list of int
        Reported indices, in scan order (may contain repeats).
    """
    assert threshold > 0
    # np.float was removed from NumPy; np.float64 is the type it aliased.
    values = np.asarray(data).astype(np.float64)
    extremes = []
    for i in range(2, len(values) - 1):
        previous, current, following = values[i - 1], values[i], values[i + 1]
        if np.isnan(previous) or np.isnan(current) or np.isnan(following):
            continue
        if abs(current - previous) > threshold and abs(following - current) > threshold:
            extremes.append(i)
            extremes.append(i + 1)
    return extremes

def find_timestamp_ROC_errors(timestamps, data_header, extremes):
    """Pair the timestamp at each index in ``extremes`` with ``data_header``.

    Returns a list of ``(timestamp, data_header)`` tuples in the order of
    ``extremes``.
    """
    return [(timestamps[position], data_header) for position in extremes]

def replace_velocity_extremes_individual_NaN(data, threshold):
    """Overwrite, in place, every rate-of-change extreme in ``data`` with NaN.

    The indices come from ``find_velocity_extremes_individual(data, threshold)``.
    ``data`` must be able to store a float NaN at those positions.
    Returns ``None``.
    """
    for position in find_velocity_extremes_individual(data, threshold):
        # np.nan instead of the bare ``NaN`` name, which only exists through
        # the %pylab star-import.
        data[position] = np.nan

In [None]:
## FOR '-9999' to NaN drops ##

## Returns the number of '-9999' drops in one measurement.
def find_measurement_drops(measurement):
    """Count the entries of ``measurement`` that equal the string "-9999"."""
    return sum(1 for entry in list(measurement) if entry == "-9999")

## Returns the total number of '-9999' drops in a set of measurements.
def find_total_drops(all_data):
    """Sum ``find_measurement_drops`` over every measurement in ``all_data``."""
    return sum(find_measurement_drops(series) for series in all_data)

## Prints the number of '-9999' drops in one measurement.
def print_measurement_drops(measurement):
    """Print ``<header> :  <drop count>`` for one measurement.

    The header is the first entry of ``measurement``.  Measurements whose
    header is " flag" or "Timestamp" print nothing.
    """
    label = measurement[0]
    if label == " flag" or label == "Timestamp":
        return
    print(label, ": ", find_measurement_drops(measurement))

## Prints the number of '-9999' drops in each measurement in a set of measurements.
def print_all_drops(all_data):
    """Print the per-measurement drop counts, then the overall total."""
    for series in all_data:
        print_measurement_drops(series)
    total = find_total_drops(all_data)
    print("TOTAL DROPS: ", total, "\n")

## Filter and replace all '-9999' drops with 'NaN' values in one measurement.    
def replace_drops_NaN(measurement):
    """Replace every "-9999" entry after the header with NaN, in place.

    Index 0 (the header) is never touched.

    Parameters
    ----------
    measurement : mutable sequence (list or NumPy array)
        Modified in place.  An object array or list receives a float NaN; a
        fixed-width string array stores the value in its text form instead.

    Returns
    -------
    The same ``measurement`` object, for convenience.
    """
    for position in range(1, len(measurement)):
        if measurement[position] == "-9999":
            # np.nan instead of the bare ``NaN`` name, which only exists
            # through the %pylab star-import.
            measurement[position] = np.nan
    return measurement

## Filter and replace all '-9999' drops with 'NaN' values for all measurements
#  in a set of measurements.
def replace_alldrops_NaN(all_data):
    """Run ``replace_drops_NaN`` on every measurement in ``all_data``.

    Measurements whose header (first entry) is " flag" or "Timestamp" are
    skipped.
    """
    for series in all_data:
        label = series[0]
        if label == " flag" or label == "Timestamp":
            continue
        replace_drops_NaN(series)
            
## Return the indices of all drops in a measurement.
def find_dropindex(measurement):
    """Return the indices of all "-9999" entries in ``measurement``.

    Index 0 (the header) is never reported.  ``measurement`` is only read, so
    any indexable sequence (list or NumPy array) is accepted.

    Returns
    -------
    list of int
        Matching indices in ascending order.
    """
    return [
        position
        for position in range(1, len(measurement))
        if measurement[position] == "-9999"
    ]

## Return the timestamps of drops in a specified measurement (string).
def find_timestamps_drops1(all_data, header):
    """Return the timestamps at which the measurement named ``header`` dropped.

    The header is looked up through ``get_headers``.  Timestamps are read from
    ``all_data[0]`` at the indices reported by ``find_dropindex``.  An unknown
    header yields an empty list.
    """
    index_by_header = get_headers(all_data)
    if header not in index_by_header:
        return []
    series = all_data[index_by_header[header]]
    return [all_data[0][position] for position in find_dropindex(series)]

## Return the timestamps of drops in a specified measurement (numerical index).
def find_timestamps_drops2(all_data, index):
    """Return the timestamps at which measurement number ``index`` dropped.

    ``index`` must be odd and satisfy ``0 < index < len(all_data)``; this is
    enforced with an assertion.  Timestamps are read from ``all_data[0]`` at
    the indices reported by ``find_dropindex``.
    """
    assert index % 2 == 1 and 0 < index < len(all_data)
    return [all_data[0][position] for position in find_dropindex(all_data[index])]

## Pair the timestamps of every drop in a measurement with the measurement's header, for every measurement.
#  If the number of drops reaches or exceeds the tolerance, record the number of drops instead.
def find_timestamps_drops_all(all_data, tolerance):
    """Map every measurement header to the timestamps of its drops.

    Measurements whose header (first entry) is " flag" or "Timestamp" are
    skipped.  A measurement with fewer than ``tolerance * len(all_data[0])``
    drops maps to the list returned by ``find_timestamps_drops1``.  Any other
    measurement maps to a summary string of the form ``"<count> drops"``
    instead of a list of timestamps.

    Parameters
    ----------
    all_data : sequence of measurements
        ``all_data[0]`` is the timestamp series.
    tolerance : number
        Fraction of the series length at which the timestamps are replaced by
        a summary.

    Returns
    -------
    dict
        ``{header: list of timestamps}`` or ``{header: "<count> drops"}``.
    """
    drops_by_header = {}
    for measurement in all_data:
        header = measurement[0]
        if header == " flag" or header == "Timestamp":
            continue
        drop_count = find_measurement_drops(measurement)
        if drop_count < tolerance * len(all_data[0]):
            drops_by_header[header] = find_timestamps_drops1(all_data, header)
        else:
            # Too many drops to list individually: store a key/value pair
            # (dict entries need exactly two items) holding only the count.
            drops_by_header[header] = str(drop_count) + " drops"
    return drops_by_header

# Print out the header, timestamp pairing for every measurement
def print_timestamps_drops_all(all_data, tolerance):
    """Print one ``<header> :  <value>`` line per entry of ``find_timestamps_drops_all``."""
    for header, drops in find_timestamps_drops_all(all_data, tolerance).items():
        print(header, ": ", drops)

# Return a dict mapping each measurement header to its index in the set.
def get_headers(all_data):
    """Map each measurement header to its position in ``all_data``.

    The header is the first entry of a measurement.  Position 0 is not
    included.  If a header occurs more than once, the last position wins.
    """
    return {all_data[position][0]: position for position in range(1, len(all_data))}

#######################################################################################################################

## FOR "-9999" drops to be interpolated ##

def interpolate_drops(measurement, dimensionality=0, max_gap=5):
    """Fill runs of "-9999" drops in a copy of ``measurement``.

    Index 0 (the header) is never touched.  Each run of consecutive "-9999"
    entries is handled as a whole:

    * a run of at most ``max_gap`` entries that has a value on both sides is
      filled by linear interpolation between those two values;
    * a longer run, or a run that touches the start or end of the data (so one
      neighbour is missing), is filled with NaN.

    Parameters
    ----------
    measurement : array-like
        Header followed by values; not modified.
    dimensionality : int, optional
        Only ``0`` triggers filling; any other value returns the data
        unchanged (as an array).
    max_gap : int, optional
        Longest run that is still interpolated.

    Returns
    -------
    numpy.ndarray
        Built with ``np.asarray`` from the filled list, so a text header makes
        the filled values come back in text form.
    """
    values = np.asarray(measurement).tolist()
    if dimensionality == 0:
        total = len(values)
        start = 1
        while start < total:
            if values[start] != "-9999":
                start += 1
                continue

            # Find the end of this run: `stop` is the first index after it.
            stop = start
            while stop < total and values[stop] == "-9999":
                stop += 1
            run_length = stop - start

            # A usable left neighbour must be data, not the header at index 0.
            has_both_neighbours = start > 1 and stop < total
            if run_length > max_gap or not has_both_neighbours:
                for k in range(start, stop):
                    values[k] = np.nan
            else:
                left = float(values[start - 1])
                right = float(values[stop])
                slope = (right - left) / (run_length + 1)
                for step, k in enumerate(range(start, stop), start=1):
                    values[k] = left + step * slope

            # Continue after the run; a `for` loop index cannot be advanced.
            start = stop
    return np.asarray(values)

In [None]:
def find_VB(data):
    """Return the indices (from 1 onwards) where ``data`` equals "VB".

    The number of matches is also printed.
    """
    vb_positions = [position for position in range(1, len(data)) if data[position] == "VB"]
    print(len(vb_positions))
    return vb_positions

def replace(data, ind):
    """Set ``data[x]`` to NaN for every index ``x`` in ``ind``, in place.

    ``data`` must be able to store a float NaN at those positions.
    Returns the same ``data`` object.
    """
    for position in ind:
        # np.nan instead of the bare ``NaN`` name, which only exists through
        # the %pylab star-import.
        data[position] = np.nan
    return data

In [None]:
def reduce_and_mean(inp, size):
    """Reduce ``inp`` to the NaN-ignoring mean of consecutive windows.

    The first entry of ``inp`` (the header) is dropped, the rest is converted
    to float and split into consecutive windows of ``size`` samples; the last
    window may be shorter.

    Parameters
    ----------
    inp : numpy.ndarray
        Header followed by values convertible to float.
    size : int
        Window length, at least 1.

    Returns
    -------
    numpy.ndarray
        Length ``len(inp[1:]) // size + 1``.  When the number of samples is an
        exact multiple of ``size`` the final slot has no window and stays 0.0
        (output length kept unchanged for existing callers).

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")
    data = inp[1:].astype(float)
    out = np.zeros(len(data) // size + 1)
    for slot, start in enumerate(range(0, len(data), size)):
        # Slice ends are exclusive: start + size covers the full window.
        out[slot] = np.nanmean(data[start:start + size])
    return out

def reduce_and_max(inp, size):
    """Reduce ``inp`` to the NaN-ignoring maximum of consecutive windows.

    The first entry of ``inp`` (the header) is dropped, the rest is converted
    to float and split into consecutive windows of ``size`` samples; the last
    window may be shorter.

    Parameters
    ----------
    inp : numpy.ndarray
        Header followed by values convertible to float.
    size : int
        Window length, at least 1.

    Returns
    -------
    numpy.ndarray
        Length ``len(inp[1:]) // size + 1``.  When the number of samples is an
        exact multiple of ``size`` the final slot has no window and stays 0.0
        (output length kept unchanged for existing callers).

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")
    data = inp[1:].astype(float)
    out = np.zeros(len(data) // size + 1)
    for slot, start in enumerate(range(0, len(data), size)):
        # Slice ends are exclusive: start + size covers the full window.
        out[slot] = np.nanmax(data[start:start + size])
    return out

def reduce_and_min(inp, size):
    """Reduce ``inp`` to the NaN-ignoring minimum of consecutive windows.

    The first entry of ``inp`` (the header) is dropped, the rest is converted
    to float and split into consecutive windows of ``size`` samples; the last
    window may be shorter.

    Parameters
    ----------
    inp : numpy.ndarray
        Header followed by values convertible to float.
    size : int
        Window length, at least 1.

    Returns
    -------
    numpy.ndarray
        Length ``len(inp[1:]) // size + 1``.  When the number of samples is an
        exact multiple of ``size`` the final slot has no window and stays 0.0
        (output length kept unchanged for existing callers).

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")
    data = inp[1:].astype(float)
    out = np.zeros(len(data) // size + 1)
    for slot, start in enumerate(range(0, len(data), size)):
        # Slice ends are exclusive: start + size covers the full window.
        out[slot] = np.nanmin(data[start:start + size])
    return out