# Limits Generation - QC

This notebook aims to generate climatological and continuity tests limits using basic statistics concepts.



## Importing Libs

In [1]:
# IMPORT LIBS
import pandas as pd
idx = pd.IndexSlice
pd.set_option("display.precision", 2)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('default')

from glob import glob
import os

## Loading Data

In [2]:
# PATHS
# raw_data
data_path = '/home/remobs-dados/PNBOIA/pnboia_qc/data'

In [57]:
# LOAD DATA
# Raw data
os.chdir(data_path)

buoys_df_site = pd.read_csv('pnboia_site_data.csv',parse_dates=['Datetime'],index_col=['buoy','Datetime'])
buoys_df_site.sort_index(inplace=True)

buoys_df_raw = pd.read_csv('pnboia_raw_data.csv',parse_dates=['Datetime'],index_col=['buoy','Datetime'])
buoys_df_raw.sort_index(inplace=True)

# Filtered data

# Get buoys names
buoys_site = buoys_df_site.reset_index().loc[:,'buoy'].unique()
buoys_raw = buoys_df_raw.reset_index().loc[:,'buoy'].unique()

# site data params renaming

params_rename = {
                    'Cvel1':'cspd1',
                    'Cvel2':'cspd1',
                    'Cvel3':'cspd1',
                    'Dewp':'dewpt',
                    'Dpd':'tp',
                    'Gust':'gust',
                    'Humi':'rh',
                    'Pres':'pres',
                    'Pressao_Atm':'pres',
                    'Rad_Solar':'arad',
                    'Salinidade':'sss',
                    'Temp_Agua':'sst',
                    'Temp_Ar':'atmp',
                    'Umidade':'rh',
                    'Velocidade_Vento':'wspd',
                    'Wmax':'mxwvht',
                    'Wspd':'wspd',
                    'Wtmp':'sst',
                    'Wvht':'swvht',
                    'dpd':'tp',
                    'wvht':'swvht',
                    'Direcao_Vento':'wdir',
                    'Wdir':'wdir',
                    'Atmp':'atmp'
                 }

buoys_df_site.rename(columns=params_rename,inplace=True)

In [58]:
buoys_df_site.loc['antartica'].dropna(how='all',axis=1)

Unnamed: 0_level_0,lat,lon,wspd,wdir,wtmp,swvht,tp,mwd,spred,peak_mwd,peak_spred,mean_dpd
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-12-05 19:44:55,-62.21,-58.28,2.4,211.0,,1.99,17.1,79.0,77.0,286.0,213.0,11.0
2020-12-05 20:14:55,-62.21,-58.28,4.4,234.0,0.16,1.69,25.6,79.0,76.0,128.0,195.0,10.2
2020-12-05 20:44:55,-62.21,-58.28,6.0,240.0,,0.85,10.2,48.0,60.0,171.0,182.0,4.9
2020-12-05 21:14:55,-62.21,-58.28,7.2,251.0,0.04,0.86,10.2,52.0,62.0,170.0,182.0,4.7
2020-12-05 21:44:55,-62.21,-58.28,8.4,257.0,,0.88,11.4,60.0,65.0,148.0,196.0,4.3
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-03 12:14:55,-61.93,-57.57,6.0,80.0,1.22,1.57,7.9,64.0,64.0,334.0,333.0,6.0
2021-03-03 12:44:55,-61.93,-57.57,8.8,234.0,,1.67,8.5,63.0,68.0,329.0,329.0,6.2
2021-03-03 13:14:55,-61.92,-57.57,6.4,228.0,1.20,1.76,7.9,57.0,62.0,333.0,325.0,6.2
2021-03-03 13:44:55,-61.91,-57.57,6.4,222.0,,1.86,8.5,65.0,62.0,316.0,322.0,6.4


In [20]:
buoys_df_site.columns

Index(['Lat', 'Lon', 'Battery', 'bHead', 'wspd', 'wdir', 'gust', 'atmp',
       'pres', 'dewpt', 'rh', 'sst', 'cspd1', 'Cdir1', 'cspd1', 'Cdir2',
       'cspd1', 'Cdir3', 'swvht', 'mxwvht', 'tp', 'Mwd', 'Spread', 'lat',
       'lon', 'wspd', 'wdir', 'wtmp', 'swvht', 'tp', 'mwd', 'spred',
       'peak_mwd', 'peak_spred', 'mean_dpd', 'compass', 'gust', 'atmp',
       'dewpt', 'rh', 'pres', 'arad', 'sst', 'cspd1', 'cdir1', 'cspd2',
       'cdir2', 'cspd3', 'cdir3', 'swvht1', 'tp1', 'mxwvht1', 'wvdir1',
       'wvspread1', 'swvht2', 'tp2', 'wvdir2', 'Unnamed: 0', 'Condutividade',
       'wdir', 'Latitude', 'Longitude', 'pres', 'arad', 'sss', 'sst', 'atmp',
       'rh', 'wspd'],
      dtype='object')

## Outlier Limit

In [30]:
def gen_outlier_lim(data,std_factor=3.):
    # drop unwanted parameters
    data = data.drop(columns=['Lat', 'Lon', 'Battery', 'bHead','wdir','Cdir1', 
                                'Cdir2','Cdir3','Mwd','Spread', 'lat','lon', 
                                'mwd', 'spred', 'peak_mwd', 'peak_spred', 'mean_dpd',
                                'compass','cdir1','cdir2','cdir3','wvdir1',
                                'Latitude', 'Longitude', 'Unnamed: 0'
                                ])
    # get buoys names
    buoys = data.index.levels[0]
    # generate global df
    lims = pd.DataFrame(columns=['buoy','param','mean','std','lower_lim','upper_lim'])

    # generate limits for each buoy and concatenate to the global dataframe
    for buoy in buoys:
        res = data.loc[buoy].dropna(how='all',axis=1).describe().loc[['mean','std']].T
        res.index.names = ['param']
        res.reset_index(inplace=True)
        res['lower_lim'] = res['mean'] - res['std']*std_factor
        res['upper_lim'] = res['mean'] + res['std']*std_factor
        res['buoy'] = buoy
        lims = pd.concat([lims,res])
        
    lims.set_index(['buoy','param'],inplace=True)

    # replace negative lower_limits with 0.
    lims.loc[lims['lower_lim'] < 0,'lower_lim'] = 0.

    return lims

In [27]:
out_lims_site = gen_outlier_lim(buoys_df_site,std_factor=6.)

In [28]:
out_lims_site.loc[idx[:,['swvht1','swvht2','swvht']],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,lower_lim,upper_lim
buoy,param,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bmo_bc1_0,swvht1,1.65,0.51,0.0,4.73
bmo_santos,swvht1,2.12,0.74,0.0,6.54
bmo_bc1_0,swvht2,1.75,2.49,0.0,16.71
bmo_santos,swvht2,1.95,1.71,0.0,12.24
antartica,swvht,1.16,0.57,0.0,4.58
cabofrio,swvht,1.36,0.67,0.0,5.35
cabofrio2_0,swvht,1.93,0.62,0.0,5.64
fortaleza,swvht,1.65,0.37,0.0,3.86
itaguai_1,swvht,1.65,0.58,0.0,5.15
itajai_0,swvht,1.96,0.76,0.0,6.52


In [15]:
out_lims_site.index.levels[1].unique()

Index(['Condutividade', 'arad', 'atmp', 'cspd1', 'cspd2', 'cspd3', 'dewpt',
       'gust', 'mxwvht', 'mxwvht1', 'pres', 'rh', 'sss', 'sst', 'swvht',
       'swvht1', 'swvht2', 'tp', 'tp1', 'tp2', 'wspd', 'wtmp', 'wvdir2',
       'wvspread1'],
      dtype='object', name='param')

## Continuity Limit

In [45]:
def gen_cont_lims(data,std_factor=3.):
    # drop unwanted parameters
    data = data.drop(columns=['Lat', 'Lon', 'Battery', 'bHead','wdir','Cdir1', 
                                'Cdir2','Cdir3','Mwd','Spread', 'lat','lon', 
                                'mwd', 'spred', 'peak_mwd', 'peak_spred', 'mean_dpd',
                                'compass','cdir1','cdir2','cdir3','wvdir1',
                                'Latitude', 'Longitude', 'Unnamed: 0'
                                ])
    
    # get buoys names
    buoys = data.index.levels[0]
    # generate global df
    lims = pd.DataFrame(columns=['buoy','param','mean','std','lim'])

    # generate limits for each buoy and concatenate to the global dataframe
    for buoy in buoys:
        res = data.loc[buoy].dropna(how='all',axis=1).diff().describe().loc[['mean','std']].T
        res.index.names = ['param']
        res.reset_index(inplace=True)
        res['lim'] = res['std']*std_factor
        res['buoy'] = buoy
        lims = pd.concat([lims,res])
    
    lims.set_index(['buoy','param'],inplace=True)

    return lims

In [46]:
cont_lims_site = gen_cont_lims(buoys_df_site,std_factor=3.)

In [47]:
cont_lims_site

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,lim
buoy,param,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
antartica,wspd,6.65e-04,1.28,3.84
antartica,wtmp,,,
antartica,swvht,5.00e-04,0.23,0.69
antartica,tp,3.94e-03,4.90,14.71
bmo_bc1_0,wspd,-1.67e-02,1.65,4.96
...,...,...,...,...
vitoria_0,cspd1,1.71e-02,86.50,259.51
vitoria_0,cspd1,2.92e-02,84.24,252.72
vitoria_0,swvht,-2.44e-05,0.11,0.34
vitoria_0,mxwvht,-5.01e-05,0.38,1.13
