# Filter wsn data

Load libraries

In [1]:
from wsn_client import query
import datetime, os
from matplotlib import pyplot
import numpy as np
import pandas as pd
import os.path


Load one station

In [244]:
# Time period
start = datetime.datetime(2019, 1, 1)
end = datetime.datetime(2020, 10, 10)

# Load station
#station_name =   [ f"sw-{x:03d}"  for x in range(1,14) ]
#for name in station_name:
name = "sw-001"
#var_oi = ['tmp_temperature','bme_hum'] #[]'type','acc_x','acc_y','acc_z','ds1820','mb_distance','vl_distance','tmp_temperature'
df = query.query('postgresql', name=name, time__gte=start, time__lte=end, limit=2000000000000)

# Check if empty
if df.empty: 
    print(name," --- Frame is >> EMPTY <<")
else:     
    # Remove non recorded data
    if 'type' in df.columns:
        df = df[df.type==0]
    print(name)
    # Remove duplicated indexes and frames
    df=df.drop_duplicates(['time', 'frame'])
    # Printing Number of columns and names
    print('Number of columns :', df.shape[1]) 
    print(df.columns)

sw-001
Number of columns : 29
Index(['time', 'frame', 'type', 'bat', 'acc_x', 'acc_y', 'acc_z', 'bme_tc',
       'int_tc', 'bme_hum', 'int_hum', 'bme_pres', 'int_pres', 'mb_distance',
       'vl_distance', 'tmp_temperature', 'ds1820', 'altitude', 'latitude',
       'longitude', 'gps_accuracy', 'gps_satellites', 'mlx_object',
       'mlx_ambient', 'received', 'momsn', 'iridium_cep', 'iridium_latitude',
       'iridium_longitude'],
      dtype='object')


In [243]:
# print("NaN percentage per column")
# print((df.isna().sum()/len(df)*100).round(2))

In [3]:
sensor_tmp = ['tmp_temperature','bme_tc']#, 'ds1820', 'mlx_ambient'
sensor_hum = ['bme_hum',]
sensor_wind = ['wind_dir','wind_gust','wind_speed']
sensor_snow = ['mb_distance','vl_distance']
sensor_pres = ['bme_pres']
sensor_acc = ['acc_x','acc_y','acc_z']

In [4]:
coord=df[df.columns[df.columns.isin(['latitude','longitude','altitude'])]][df.gps_satellites>7].agg(['mean','median', 'std']).round(4)

In [5]:
# Extract wanted and existing columns
coltoget2 = ['bme_tc','bme_hum', 'bme_pres', 'mlx_object','mb_distance',]
df[df.columns[df.columns.isin(coltoget2)]]

Unnamed: 0_level_0,bme_tc,bme_hum,bme_pres,mb_distance,mlx_object
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-04-06 15:00:00,22.820000,12.080078,103088.164062,"[2161, 2164, 2166, 2165, 2165]",
2019-04-06 15:10:00,23.020000,11.994141,103092.312500,"[1203, 1200, 1201, 1207, 1210]",
2019-04-06 15:20:00,23.590000,10.735352,103091.242188,"[1286, 1913, 1910, 1910, 1911]",
2019-04-06 15:30:00,23.889999,11.067383,103101.437500,"[1236, 1247, 1234, 1247, 1237]",
2019-04-06 15:40:00,24.070000,10.757812,103115.226562,"[1303, 1254, 1251, 1221, 1929]",
...,...,...,...,...,...
2020-01-07 19:00:00,-9.480000,100.000000,93323.742188,"[2666, 2661, 2652, 2653, 2652]",
2020-01-07 19:10:00,-9.530000,100.000000,93313.500000,"[2703, 2678, 2680, 2710, 2679]",
2020-01-07 19:20:00,-9.380000,100.000000,93264.695312,,
2020-01-07 19:30:00,-9.460000,100.000000,93313.304688,"[2715, 2719, 2728, 2726, 2724]",-9.410004


In [89]:
type(df)

pandas.core.frame.DataFrame

In [387]:
def quality_control_air_temperature(df,f_update=False,f_print=True):
    """Apply quality control on air temperature data.

    The following tests are applied:
        - Must pass monthly temperature limit. QC.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        data downloaded from wsn
    flag_update : boolean
        Flag to recompute the monthly min and max of the temperature (saved 
        in csv at data_qc/temperature_,monthly_limit.csv)
    """

    sensor_tmp = ['tmp_temperature','bme_tc']
        
    
    if all(~df.columns.isin(sensor_tmp)):
        if f_print: print('No temperature data defined as:')
        if f_print: print(sensor_tmp) 
        temp=None
        return(temp)
        
    # Update monthly limits and save it in a csv file (can be manually edited)
    filename='data_qc/temperature_monthly_limit.csv'
    if (f_update) | (~os.path.isfile(filename)): 
        update_file_temperature_monthly_limits(df.tmp_temperature,filename,f_print)
    
    # Load and select data source
    if f_print: print('Load temperature data')        
    col_to_get=df.columns.isin([sensor_tmp[0]])
    if any(col_to_get):
        temp=df[df.columns[col_to_get]]
        if f_print: print(temp)
        
        # Quality Control -- 1) -- Monthly limit
        limit_table=pd.read_csv(filename)
        temp[sensor_tmp[0] + '_qc']=df[[sensor_tmp[0]]].apply(lambda x: qc_temperature_monthly_limits(x,limit_table,
                                                                                                      var='temp'),axis=1)
    
    return(temp)



def update_file_temperature_monthly_limits(df,filename='data_qc/temperature_monthly_limit.csv',f_print=True):
    """Compute monthly air temperature minum and maximum.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        temperature data downloaded from wsn
    """
    if f_print: print('Update monthly limits for Air Temperature')
    monthly_data=df.groupby(df.index.month).agg(['min', 'max','std','count']).round(2)
    monthly_data=monthly_data.rename(columns = {'min':'temp_min','max':'temp_max',
                                                'std':'temp_std','count':'temp_count'})
    monthly_data.to_csv(filename,index_label='month')
    
    
def qc_temperature_monthly_limits(x,limit_table, var='temp'):
    """Compute whether points are in range and assign quality assessment.
    
    The function is built for apply() and needs to be run as followed:
    qc_range=df[['tmp_temperature']].apply(lambda x: fun(x,limit_table),axis=1)
    
    Parameters
    ----------
    x : pandas.core.frame.DataFrame (one column)
        temperature data downloaded from wsn
    limit_table : pandas.core.frame.DataFrame 
        defining the limit in temperature range per month
    var : string
        name of the variable
    """
    limit=limit_table[limit_table.month.values==x.name.month]
    out=(x<limit.iloc[0][var + '_min']) | (x>limit.iloc[0][var + '_max'])
    out.at[out]='QC1-1'
    return out


## Function to update monthly limits in temperature

In [389]:
tmp=quality_control_air_temperature(df)

Update monthly limits for Air Temperature
Load temperature data
                     tmp_temperature
time                                
2019-04-06 15:00:00          22.6875
2019-04-06 15:10:00          22.8750
2019-04-06 15:20:00          23.4375
2019-04-06 15:30:00          23.7500
2019-04-06 15:40:00          23.9375
...                              ...
2020-01-07 19:00:00          -8.6250
2020-01-07 19:10:00          -8.6875
2020-01-07 19:20:00          -8.5625
2020-01-07 19:30:00          -8.6250
2020-01-07 19:40:00         248.3125

[35868 rows x 1 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [390]:
# tmp=quality_control_air_temperature(df[df.columns[1:3]])

In [136]:
df.all()

None


In [85]:
df.columns[df.columns.isin(sensor_tmp)]

Index(['bme_tc', 'tmp_temperature'], dtype='object')