In [1]:
import datetime as dt
import numpy as np
import netCDF4 as nc
import pandas as pd
import glob
from salishsea_tools import geo_tools
import gsw
import os
import pytz
import matplotlib.pyplot as plt
import cmocean as cmo
import warnings
from sqlalchemy import create_engine, case, MetaData
from sqlalchemy.orm import create_session, aliased
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.sql import and_, or_, not_, func
from salishsea_tools import viz_tools

pd.set_option('display.max_colwidth', -1)

%matplotlib inline

In [2]:
basedir='/ocean/shared/SalishSeaCastData/DFO/CTD/'
dbname='DFO_CTD.sqlite'
datelims=()

In [3]:
engine = create_engine('sqlite:///' + basedir + dbname, echo = False)

In [4]:
grid = nc.Dataset('/data/eolson/MEOPAR/NEMO-forcing-new/grid/bathymetry_201702.nc')

In [5]:
md=MetaData()
md.reflect(engine)
Base = automap_base(metadata=md)
# reflect the tables in salish.sqlite:
Base.prepare()
# mapped classes have been created
# existing tables:
StationTBL=Base.classes.StationTBL
ObsTBL=Base.classes.ObsTBL
CalcsTBL=Base.classes.CalcsTBL
AncTBL=Base.classes.AncillaryTBL
#JDFLocsTBL=Base.classes.JDFLocsTBL
session = create_session(bind = engine, autocommit = False, autoflush = True)

In [6]:
qry=session.query(ObsTBL.Depth,ObsTBL.Pressure,CalcsTBL.Z,ObsTBL.Salinity,ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,CalcsTBL.Salinity_SA,CalcsTBL.Salinity_T0_C0_SA,CalcsTBL.Salinity_T1_C1_SA,
                  CalcsTBL.Temperature_CT,CalcsTBL.Temperature_Primary_CT,CalcsTBL.Temperature_Secondary_CT).\
    select_from(CalcsTBL).join(ObsTBL,ObsTBL.ID==CalcsTBL.ObsTBLID).filter(ObsTBL.Depth==ObsTBL.Depth).all()

In [7]:
df=pd.DataFrame(qry)

In [8]:
df.describe()

Unnamed: 0,Depth,Pressure,Z,Salinity,Salinity_T0_C0,Salinity_T1_C1,Salinity_SA,Salinity_T0_C0_SA,Salinity_T1_C1_SA,Temperature_CT,Temperature_Primary_CT,Temperature_Secondary_CT
count,1063996.0,1063996.0,1063996.0,36266.0,818468.0,207732.0,36266.0,818468.0,207732.0,36266.0,818466.0,207732.0
mean,484.878,490.4283,484.8602,21.883578,32.289047,32.635112,21.989125,32.451615,32.800109,10.172972,7.075404,6.463621
std,583.5839,591.3095,583.5957,10.375783,2.077741,2.030878,10.425777,2.093501,2.04643,2.43854,3.142686,2.89263
min,0.1,0.0,0.0,0.03,3.6922,9.4358,0.030145,3.710085,9.481491,2.341295,1.414291,1.412261
25%,78.3,79.0,78.3175,22.08,30.6669,30.767275,22.186822,30.816571,30.918819,7.817325,4.033313,3.697934
50%,216.0,218.0,216.0315,26.61,32.5463,33.8955,26.737975,32.702707,34.061461,10.368842,8.270452,7.173339
75%,737.3,745.0,737.3297,28.13,34.2469,34.3262,28.266203,34.425831,34.507366,11.491793,9.558143,8.886104
max,3230.6,3283.6,3230.645,32.7555,34.6501,34.6491,32.91341,34.83919,34.838142,22.36438,20.877578,19.87548


### Look at cases where Depth and Z are different:

In [9]:
np.min((df['Depth']-df['Z'])/(df['Z']+df['Depth'])*2),np.max((df['Depth']-df['Z'])/(df['Z']+df['Depth'])*2)

(-0.6590548136330314, 2.0)

In [10]:
df.loc[(np.abs((df['Depth']-df['Z'])/(df['Z']+df['Depth'])*2)>.1)&(np.abs((df['Depth']-df['Z']))>.3)]

Unnamed: 0,Depth,Pressure,Z,Salinity,Salinity_T0_C0,Salinity_T1_C1,Salinity_SA,Salinity_T0_C0_SA,Salinity_T1_C1_SA,Temperature_CT,Temperature_Primary_CT,Temperature_Secondary_CT
26905,2.3,2.0,1.982764,,29.3926,,,29.534709,,,11.387468,
1033661,1.3,1.0,0.991485,,27.6416,,,27.775756,,,9.437533,
1035199,1.1,2.0,1.982966,,28.1208,,,28.257224,,,7.757864,
1035200,2.6,3.0,2.974442,,28.1204,,,28.256767,,,7.758590,
1035585,1.3,2.0,1.982968,,26.9449,,,27.075610,,,6.999719,
...,...,...,...,...,...,...,...,...,...,...,...,...
1061645,0.6,1.0,0.991487,,26.7720,,,26.901929,,,11.355665,
1061646,1.6,2.0,1.982969,,26.7388,,,26.868515,,,11.326563,
1062429,2.5,3.0,2.974451,,27.1881,,,27.319927,,,9.828560,
1062810,1.2,2.0,1.982972,,27.5068,,,27.640224,,,9.970379,


### List column names for tables:

In [11]:
sorted([x.name for x in md.tables['StationTBL'].columns])

['EVENT_NUMBER',
 'ID',
 'Include',
 'LATITUDE',
 'LONGITUDE',
 'Lat',
 'Lon',
 'PLATFORM',
 'START_TIME',
 'STATION',
 'StartDay',
 'StartHour',
 'StartMonth',
 'StartTimeZone',
 'StartYear',
 'WATER_DEPTH',
 'WDIR',
 'WSPD',
 'sourceFile']

In [12]:
sorted([x.name for x in md.tables['ObsTBL'].columns])

['Conductance_Specific',
 'Conductance_Specific_units',
 'Conductivity',
 'Conductivity_Primary',
 'Conductivity_Primary_units',
 'Conductivity_Secondary',
 'Conductivity_Secondary_units',
 'Conductivity_units',
 'Density',
 'Density_units',
 'Depth',
 'Depth_units',
 'Fluorescence_URU_Seapoint',
 'Fluorescence_URU_Seapoint_units',
 'Fluorescence_URU_Wetlabs',
 'Fluorescence_URU_Wetlabs_units',
 'ID',
 'Include',
 'Number_of_bin_records',
 'Number_of_bin_records_units',
 'Oxygen_Dissolved_SBE',
 'Oxygen_Dissolved_SBE_1',
 'Oxygen_Dissolved_SBE_1_units',
 'Oxygen_Dissolved_SBE_units',
 'PAR',
 'PAR1',
 'PAR1_units',
 'PAR_1',
 'PAR_1_units',
 'PAR_Reference',
 'PAR_Reference_units',
 'PAR_units',
 'Pressure',
 'Pressure_units',
 'Salinity',
 'Salinity_T0_C0',
 'Salinity_T0_C0_units',
 'Salinity_T1_C1',
 'Salinity_T1_C1_units',
 'Salinity_units',
 'Speed_Sound',
 'Speed_Sound_units',
 'StationTBLID',
 'Temperature',
 'Temperature_Primary',
 'Temperature_Primary_units',
 'Temperature_Seco

In [13]:
sorted([x.name for x in md.tables['CalcsTBL'].columns])

['Include',
 'ObsTBLID',
 'Salinity_SA',
 'Salinity_T0_C0_SA',
 'Salinity_T1_C1_SA',
 'StationTBLID',
 'Temperature_CT',
 'Temperature_Primary_CT',
 'Temperature_Secondary_CT',
 'Z']

In [14]:
sorted([x.name for x in md.tables['AncillaryTBL'].columns])

['AGENCY',
 'COUNTRY',
 'DATA_DESCRIPTION',
 'MISSION',
 'MODEL',
 'PAR_CalConst',
 'PAR_CalDate',
 'PAR_Multiplier',
 'PAR_Offset',
 'PAR_Serial',
 'PAR_b',
 'PAR_m',
 'PROJECT',
 'RefPAR_CalDate',
 'RefPAR_ConvFact',
 'RefPAR_Multiplier',
 'RefPAR_Serial',
 'SCIENTIST',
 'SERIAL',
 'StationTBLID',
 'TYPE',
 'xmiss_CalDate',
 'xmiss_PathLen',
 'xmiss_Serial',
 'xmiss_b',
 'xmiss_m']

#### salinity variables: 'Salinity','Salinity_T0_C0', 'Salinity_T1_C1'
#### temperature variables:'Temperature','Temperature_Primary','Temperature_Secondary'

### How many Depths with no Pressure and vice versa?

In [15]:
print('Z without P:',session.query(ObsTBL.Depth).filter(ObsTBL.Pressure==None).count())
print('P without Z:',session.query(ObsTBL.Pressure).filter(ObsTBL.Depth==None).count())


Z without P: 0
P without Z: 1051722


### Other depth info:

In [16]:
print('Z min, max:',session.query(func.min(ObsTBL.Depth)).one(),session.query(func.max(ObsTBL.Depth)).one())
print('P min, max:',session.query(func.min(ObsTBL.Pressure)).one(),session.query(func.max(ObsTBL.Pressure)).one())

Z min, max: (0.1,) (3230.6,)
P min, max: (0.0,) (3283.6,)


### Other Variables:

In [17]:
for var in (ObsTBL.Salinity,ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,ObsTBL.Temperature,ObsTBL.Temperature_Primary,ObsTBL.Temperature_Secondary):
    print(var,'min max count:',session.query(func.min(var)).one(),session.query(func.max(var)).one(),session.query(var).filter(var!=None).count())

ObsTBL.Salinity min max count: (0.03,) (34.1957,) 77823
ObsTBL.Salinity_T0_C0 min max count: (1.361,) (34.6505,) 1278653
ObsTBL.Salinity_T1_C1 min max count: (3.5259,) (34.6514,) 756143
ObsTBL.Temperature min max count: (2.216,) (22.053,) 77823
ObsTBL.Temperature_Primary min max count: (1.6359,) (21.6947,) 1279520
ObsTBL.Temperature_Secondary min max count: (1.6371,) (20.245,) 756237


In [18]:
vlist=(ObsTBL.Salinity,ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,ObsTBL.Temperature,ObsTBL.Temperature_Primary,ObsTBL.Temperature_Secondary)
ulist=(ObsTBL.Salinity_units,ObsTBL.Salinity_T0_C0_units,ObsTBL.Salinity_T1_C1_units,ObsTBL.Temperature_units,
      ObsTBL.Temperature_Primary_units,ObsTBL.Temperature_Secondary_units)

In [19]:
for vvar,uvar in zip(vlist,ulist):
    print(uvar,'unique:')
    print('\t',[i for i in session.query(uvar).group_by(uvar).all()])
    print('\t','# missing units:',session.query(vvar,uvar).filter(and_(vvar!=None,uvar==None)).count())

ObsTBL.Salinity_units unique:
	 [(None,), ('PSS-78',)]
	 # missing units: 0
ObsTBL.Salinity_T0_C0_units unique:
	 [(None,), ('PSS-78',)]
	 # missing units: 0
ObsTBL.Salinity_T1_C1_units unique:
	 [(None,), ('PSS-78',)]
	 # missing units: 0
ObsTBL.Temperature_units unique:
	 [(None,), ("'deg_C'",), ("'deg_C(ITS90)'",), ("'deg_C_(ITS90)'",)]
	 # missing units: 0
ObsTBL.Temperature_Primary_units unique:
	 [(None,), ("'deg_C_(ITS90)'",)]
	 # missing units: 0
ObsTBL.Temperature_Secondary_units unique:
	 [(None,), ("'deg_C_(ITS90)'",)]
	 # missing units: 0


## Check which T&S variable combinations are present

In [20]:
df.loc[(~np.isnan(df['Salinity_T1_C1']))&(~np.isnan(df['Temperature_Primary']))]

KeyError: 'Temperature_Primary'

In [None]:
df.loc[(~np.isnan(df['Salinity_T1_C1']))&(~np.isnan(df['Temperature']))]

In [None]:
df.loc[(~np.isnan(df['Salinity_T0_C0']))&(~np.isnan(df['Temperature']))]

In [None]:
df.loc[(~np.isnan(df['Salinity_T0_C0']))&(~np.isnan(df['Temperature_Secondary']))]

In [None]:
df.loc[(~np.isnan(df['Salinity']))&(~np.isnan(df['Temperature_Secondary']))]

In [None]:
df.loc[(~np.isnan(df['Salinity']))&(~np.isnan(df['Temperature_Primary']))]

In [None]:
len(df.loc[(~np.isnan(df['Salinity']))&(~np.isnan(df['Temperature']))])

In [None]:
len(df.loc[(~np.isnan(df['Salinity_T0_C0']))&(~np.isnan(df['Temperature_Primary']))])

In [None]:
len(df.loc[(~np.isnan(df['Salinity_T1_C1']))&(~np.isnan(df['Temperature_Secondary']))])

### Plot All T S Data

In [None]:
df=pd.DataFrame(session.query(ObsTBL.Depth,ObsTBL.Pressure,ObsTBL.Salinity,ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,
                              ObsTBL.Temperature,ObsTBL.Temperature_Primary,ObsTBL.Temperature_Secondary,
                              StationTBL.Lat,StationTBL.Lon).select_from(ObsTBL).join(StationTBL,StationTBL.ID==ObsTBL.StationTBLID).all())

fig,ax=plt.subplots(1,3,figsize=(18,6))
ax[0].plot(df['Salinity'],df['Temperature'],'r.')
ax[1].plot(df['Salinity_T0_C0'],df['Temperature_Primary'],'c.')
ax[2].plot(df['Salinity_T1_C1'],df['Temperature_Secondary'],'m.')

In [None]:
fig,ax=plt.subplots(1,3,figsize=(18,6))
for iax in ax:
    viz_tools.set_aspect(iax, coords = 'map')
    viz_tools.plot_coastline(iax, grid, coords = 'map')
    iax.set_ylim(47, 52)
    iax.set_xlim(-130, -122);
ax[0].plot(df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lon']],
                    df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lat']],'ro')
ax[1].plot(df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lon']],
                    df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lat']],'co')
ax[2].plot(df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lon']],
                    df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lat']],'mo')

### Plot all included T&S data

In [None]:
df=pd.DataFrame(session.query(ObsTBL.Depth,ObsTBL.Pressure,ObsTBL.Salinity,ObsTBL.Salinity_T0_C0,
                              ObsTBL.Salinity_T1_C1,ObsTBL.Temperature,ObsTBL.Temperature_Primary,
                              ObsTBL.Temperature_Secondary,StationTBL.Lat,StationTBL.Lon).\
                select_from(ObsTBL).join(StationTBL,StationTBL.ID==ObsTBL.StationTBLID).\
                filter(ObsTBL.Include==True).all())

fig,ax=plt.subplots(1,3,figsize=(18,6))
ax[0].plot(df['Salinity'],df['Temperature'],'r.')
ax[1].plot(df['Salinity_T0_C0'],df['Temperature_Primary'],'c.')
ax[2].plot(df['Salinity_T1_C1'],df['Temperature_Secondary'],'m.')

In [None]:
fig,ax=plt.subplots(1,3,figsize=(18,6))
for iax in ax:
    viz_tools.set_aspect(iax, coords = 'map')
    viz_tools.plot_coastline(iax, grid, coords = 'map')
    iax.set_ylim(47, 52)
    iax.set_xlim(-130, -122);
ax[0].plot(df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lon']],
                    df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lat']],'ro')
ax[1].plot(df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lon']],
                    df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lat']],'co')
ax[2].plot(df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lon']],
                    df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lat']],'mo')

### Restrict to Salish Sea

In [None]:
qry=session.query(StationTBL.StartYear.label('Year'),StationTBL.StartMonth.label('Month'),
                      StationTBL.StartDay.label('Day'),StationTBL.StartHour.label('Hour'),
                      StationTBL.Lat,StationTBL.Lon,
                     ObsTBL.Depth,ObsTBL.Pressure,ObsTBL.Salinity,
                  ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,
                    ObsTBL.Temperature,ObsTBL.Temperature_Primary,ObsTBL.Temperature_Secondary,ObsTBL.sourceFile).\
                select_from(StationTBL).join(ObsTBL,ObsTBL.StationTBLID==StationTBL.ID).\
                filter(and_(StationTBL.Lat>47-3/2.5*(StationTBL.Lon+123.5),
                            StationTBL.Lat<47-3/2.5*(StationTBL.Lon+121)))
df=pd.DataFrame(qry.all())

In [None]:
fig,ax=plt.subplots(1,3,figsize=(18,6))
ax[0].plot(df['Salinity'],df['Temperature'],'r.')
ax[1].plot(df['Salinity_T0_C0'],df['Temperature_Primary'],'c.')
ax[2].plot(df['Salinity_T1_C1'],df['Temperature_Secondary'],'m.')

In [None]:
fig,ax=plt.subplots(1,3,figsize=(18,6))
for iax in ax:
    viz_tools.set_aspect(iax, coords = 'map')
    viz_tools.plot_coastline(iax, grid, coords = 'map')
    #iax.set_ylim(48, 50.5)
    #iax.set_xlim(-125.7, -122.5);
    iax.set_ylim(47, 52)
    iax.set_xlim(-130, -122);
ax[0].plot(df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lon']],
                    df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lat']],'ro')
ax[1].plot(df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lon']],
                    df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lat']],'co')
ax[2].plot(df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lon']],
                    df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lat']],'mo')

In [None]:
models=session.query(AncTBL.MODEL).distinct().all()
models

### Display CastAway stations

In [None]:
## Where are CastAway stations?
qry=session.query(StationTBL.StartYear.label('Year'),StationTBL.StartMonth.label('Month'),
                      StationTBL.StartDay.label('Day'),StationTBL.StartHour.label('Hour'),
                      StationTBL.Lat,StationTBL.Lon,
                     ObsTBL.Depth,ObsTBL.Pressure,ObsTBL.Salinity,
                  ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,
                    ObsTBL.Temperature,ObsTBL.Temperature_Primary,ObsTBL.Temperature_Secondary,ObsTBL.sourceFile).\
                select_from(StationTBL).join(ObsTBL,ObsTBL.StationTBLID==StationTBL.ID).\
                join(AncTBL,AncTBL.StationTBLID==StationTBL.ID).\
                filter(AncTBL.MODEL=='CastAway')
df=pd.DataFrame(qry.all())
fig,ax=plt.subplots(1,3,figsize=(18,6))
for iax in ax:
    viz_tools.set_aspect(iax, coords = 'map')
    viz_tools.plot_coastline(iax, grid, coords = 'map')
    #iax.set_ylim(48, 50.5)
    #iax.set_xlim(-125.7, -122.5);
    iax.set_ylim(47, 52)
    iax.set_xlim(-130, -122);
ax[0].plot(df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lon']],
                    df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lat']],'ro')
ax[1].plot(df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lon']],
                    df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lat']],'co')
ax[2].plot(df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lon']],
                    df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lat']],'mo')

### Check CastAway profiles are excluded:

In [None]:
qry=session.query(StationTBL.Include).select_from(StationTBL).join(AncTBL,AncTBL.StationTBLID==StationTBL.ID).\
    filter(AncTBL.MODEL=='CastAway').distinct().all()
print('Station Include:',qry)

In [None]:
qry=session.query(ObsTBL.Include).select_from(ObsTBL).join(AncTBL,AncTBL.StationTBLID==ObsTBL.StationTBLID).\
    filter(AncTBL.MODEL=='CastAway').distinct().all()
print('Obs Include:',qry)

#### check that this plot is linear or you are likely missing stations from obs table:

In [None]:
test=session.query(ObsTBL.StationTBLID).distinct().order_by(ObsTBL.StationTBLID).all()
plt.plot(test)

### Salish Sea with Include=True

In [None]:
qry=session.query(StationTBL.StartYear.label('Year'),StationTBL.StartMonth.label('Month'),
                      StationTBL.StartDay.label('Day'),StationTBL.StartHour.label('Hour'),
                      StationTBL.Lat,StationTBL.Lon,
                     ObsTBL.Depth,ObsTBL.Pressure,ObsTBL.Salinity,
                  ObsTBL.Salinity_T0_C0,ObsTBL.Salinity_T1_C1,
                    ObsTBL.Temperature,ObsTBL.Temperature_Primary,ObsTBL.Temperature_Secondary,ObsTBL.sourceFile).\
                select_from(StationTBL).join(ObsTBL,ObsTBL.StationTBLID==StationTBL.ID).\
                join(AncTBL,AncTBL.StationTBLID==StationTBL.ID).\
                filter(and_(StationTBL.Lat>47-3/2.5*(StationTBL.Lon+123.5),
                            StationTBL.Lat<47-3/2.5*(StationTBL.Lon+121),ObsTBL.Include==True))
df=pd.DataFrame(qry.all())
fig,ax=plt.subplots(1,3,figsize=(18,6))
ax[0].plot(df['Salinity'],df['Temperature'],'r.')
ax[1].plot(df['Salinity_T0_C0'],df['Temperature_Primary'],'c.')
ax[2].plot(df['Salinity_T1_C1'],df['Temperature_Secondary'],'m.')

In [None]:
fig,ax=plt.subplots(1,3,figsize=(18,6))
for iax in ax:
    viz_tools.set_aspect(iax, coords = 'map')
    viz_tools.plot_coastline(iax, grid, coords = 'map')
    #iax.set_ylim(48, 50.5)
    #iax.set_xlim(-125.7, -122.5);
    iax.set_ylim(47, 52)
    iax.set_xlim(-130, -122);
ax[0].plot(df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lon']],
                    df.loc[(df['Salinity']>0)&(df['Temperature']>0),['Lat']],'ro')
ax[1].plot(df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lon']],
                    df.loc[(df['Salinity_T0_C0']>0)&(df['Temperature_Primary']>0),['Lat']],'co')
ax[2].plot(df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lon']],
                    df.loc[(df['Salinity_T1_C1']>0)&(df['Temperature_Secondary']>0),['Lat']],'mo')

## Check for Duplicates

In [None]:
# search for duplicate stations and investigate:
a1=aliased(StationTBL)
a2=aliased(StationTBL)
dupsQRY=session.query(a1.ID.label('ID1'),a1.Include,a2.ID.label('ID2'),a2.Include,a1.sourceFile.label('source1'),a2.sourceFile.label('source2'),
                     a1.EVENT_NUMBER.label('EVENT_NUMBER1'),a2.EVENT_NUMBER.label('EVENT_NUMBER2'),
                     a1.PLATFORM.label('PLATFORM1'),a2.PLATFORM.label('PLATFORM2'),
                     a1.STATION.label('STATION1'),a2.STATION.label('STATION2'),
                     a1.WATER_DEPTH.label('WATER_DEPTH1'),a2.WATER_DEPTH.label('WATER_DEPTH2')).select_from(a1).join(a2,and_(
    a1.StartYear==a2.StartYear,
    a1.StartMonth==a2.StartMonth,
    a1.StartDay==a2.StartDay,
    a1.StartHour-a2.StartHour<0.001,
    a1.StartHour-a2.StartHour>-0.001,
    a1.Lat-a2.Lat<0.001,
    a1.Lat-a2.Lat>-0.001,
    a1.Lon-a2.Lon<0.001,
    a1.Lon-a2.Lon>-0.001,
    a1.ID!=a2.ID)).filter(a1.Include==True,a2.Include==True,a1.ID<a2.ID)


In [None]:
dfa=pd.DataFrame(dupsQRY.all())

In [None]:
dfa

In [None]:
# display previously eliminated duplicates:
a1=aliased(StationTBL)
a2=aliased(StationTBL)
dupsQRY=session.query(a1.ID.label('ID1'),a1.Include,a2.ID.label('ID2'),a2.Include,a1.sourceFile.label('source1'),a2.sourceFile.label('source2'),
                     a1.EVENT_NUMBER.label('EVENT_NUMBER1'),a2.EVENT_NUMBER.label('EVENT_NUMBER2'),
                     a1.PLATFORM.label('PLATFORM1'),a2.PLATFORM.label('PLATFORM2'),
                     a1.STATION.label('STATION1'),a2.STATION.label('STATION2'),
                     a1.WATER_DEPTH.label('WATER_DEPTH1'),a2.WATER_DEPTH.label('WATER_DEPTH2')).select_from(a1).join(a2,and_(
    a1.StartYear==a2.StartYear,
    a1.StartMonth==a2.StartMonth,
    a1.StartDay==a2.StartDay,
    a1.StartHour-a2.StartHour<0.001,
    a1.StartHour-a2.StartHour>-0.001,
    a1.Lat-a2.Lat<0.001,
    a1.Lat-a2.Lat>-0.001,
    a1.Lon-a2.Lon<0.001,
    a1.Lon-a2.Lon>-0.001,
    a1.ID!=a2.ID)).filter(a1.Include==True,a2.Include==False,a1.ID<a2.ID)
dfa=pd.DataFrame(dupsQRY.all())
dfa

In [None]:
for i,r in dfa.iterrows():
    print(r['source1'],'\n',r['source2'],'\n')

In [None]:
## check for obs include true where station include false; should be empty with adjusted procedure
test=pd.DataFrame(session.query(StationTBL.ID.label('StationID'),ObsTBL.ID.label('ObsID')).select_from(StationTBL).\
                  join(ObsTBL,ObsTBL.StationTBLID==StationTBL.ID).\
                  filter(and_(StationTBL.Include==False,ObsTBL.Include==True)).all())
test

In [None]:
grid.close()