In [1]:
### Serial numbers are mined from the raw data archive and pre-deploy/
### post-recovery images each year post-cruise.  Parameter files 
### 'rawFileSN.csv' and 'imageSN.csv' include the most recently acquired
### serial numbers, but must be refreshed each year after the cruise.

In [2]:
# Critical Metadata Verification

In [3]:
### import packages

import datetime
from datetime import date
import metadataFunctions as mf
import numpy as np
import os
from os import path
import pandas as pd
import re
import requests
import subprocess
from subprocess import Popen
import glob
import requests


In [4]:
### If running preliminary pre-cruise verification, use local 
### asset management repo, as local changes to be verified will
### not yet be on the main oceanobseratories repo.  Set useLocal to 'yes', 
### and enter path to local am repo.
###
### For post-cruise verification, set useLocal to 'no' in order to verify
### the amRepo that has been ingested into CI.
###
### 

useLocal = 'no'
amRepo_local = '/Users/wruef/repos/asset-management'
calRepo_local = '/Users/wruef/repos/calibrationFiles'
deployHistoryDir = path.dirname('/Users/wruef/testing/deployments/')

In [7]:
### Define date to append to output file names
today = date.today()
runDate = today.strftime("%Y%m%d")

In [8]:
### Define output file names
calVerifyFile = 'reportOuts/calibrationVerification_' + runDate + '.csv'
deployVerifyFile = 'reportOuts/deploymentVerification_' + runDate + '.csv'
sensorBulkMatchFile = 'reportOuts/sensorBulkVerification_' + runDate + '.txt'
missingCalFile = 'reportOuts/missingCalFiles_' + runDate + '.txt'
refDesList = 'reportOuts/refDesList_' + runDate + '.txt'

### Define input parameter files
RawInputFile = 'params/rawFileSN.csv'
imageSNfile = 'params/imageSN_2023.csv'
instrumentFile = 'params/RCA-InstrumentList.csv'


### Define directory pathways

### Repo URL to access directory listing
amRepo_head = 'https://github.com/oceanobservatories/asset-management/tree/master'
calRepo_head = 'https://github.com/OOI-CabledArray/calibrationFiles/tree/master'

if 'yes' in useLocal:
    amRepo = amRepo_local
    calRepo = calRepo_local
else:
    ### Repo base URL to directly load files
    amRepo = 'https://raw.githubusercontent.com/oceanobservatories/asset-management/master'
    
    ### Repo base URL to directly load files
    calRepo = 'https://raw.githubusercontent.com/OOI-CabledArray/calibrationFiles/master'


In [9]:
### Helper function to load file list from Github repo of calibration file format
### Repo URL is the main repo (as opposed to the raw file content base url)
### Optional to search for a specific type of file extention (i.e. 'csv')
### A file extension of "*" will return all files.
### Current regex works for calibration directories

def github_fileList(repo_url,fileExt):
    if '*' in fileExt:
        fileExt = '[a-z]*'
    
    fileRegEx = re.compile('{"name":"(AT.{25}\.' + fileExt + ')",')
    result = requests.get(repo_url).text
    filenameList = re.findall(fileRegEx, result)
    
    return filenameList

In [10]:
#repo_url = 'https://github.com/oceanobservatories/asset-management/tree/master/calibration/CTDBPN'
#github_fileList(repo_url, 'csv')

#repo_url = 'https://github.com/OOI-CabledArray/calibrationFiles/tree/master/CTDPFA'
#github_fileList(repo_url, '*')

In [11]:
### load in RCA Instrument list
assetList = pd.read_csv(instrumentFile)
assetList['mfgSN'] = assetList['mfgSN'].str.split(', ')
assetList['instrumentType'] = assetList['instrumentType'].str.split(',')

# import calibration coefficient maps and constants
CalCoefficientMap = pd.read_csv('params/coefficientMap.csv')
CalCoeff_dict = CalCoefficientMap.set_index('github').transpose().to_dict('list')

CalConstants = pd.read_csv('params/coefficientConstants.csv', sep=",", converters = {'value': np.float64}, float_precision='round_trip')       
CalConstants_dict = {}
for idx,values in CalConstants.iterrows():
    if values.sensor not in CalConstants_dict:
        CalConstants_dict[values.sensor]={}
    CalConstants_dict[values.sensor][values.coeff] = values.constant

# load in list of github calibration files that have gone through 2i-HITL checks
HITLcal = pd.read_csv('2i_HITL/2i_HITL_calibrationVerification.csv')

# load in list of github deployment instances that have gone through 2i-HITL checks
HITLdeploy = pd.read_csv('2i_HITL/2i_HITL_deploymentVerification.csv')


In [12]:
### load in bulk asset records from OOI asset-management github repo
sensorList = pd.read_csv(amRepo + '/bulk/sensor_bulk_load-AssetRecord.csv')
platformList = pd.read_csv(amRepo + '/bulk/platform_bulk_load-AssetRecord.csv')
cruiseList = pd.read_csv(amRepo + '/cruise/CruiseInformation.csv')

### Load in Cabled Array Deployment sheets from OOI asset-management github repo
CabledArray = pd.Series(['CE02SHBP','CE04OSBP','CE04OSPD','CE04OSPS','RS01SBPD','RS01SBPS',
                        'RS01SLBS','RS01SUM1','RS03AXBS','RS03AXPD','RS03AXPS','RS03INT2',
                        'RS03INT1','RS01SUM2','RS03CCAL','RS03ECAL','RS03ASHS'])

deploymentSheets = []
for array in CabledArray:
    deployFilePath = amRepo + '/deployment/' + array + '_Deploy.csv' 
    deploymentSheets.append(deployFilePath)
    
df_deploy = pd.concat([pd.read_csv(f, skip_blank_lines = True, comment='#') for f in deploymentSheets], ignore_index = True)


In [13]:
### load calibration directory list for sensors with cal sheets from OOI asset-management github repo,
### filtered for Cabled Array sensors (AssetID starts with 'AT')
### Excluding HYDLFA, OBSBBA, OBSSPA
githubFileList = []
sensorsWithCals = ['CTDBPN','CTDBPO','CTDPFA','CTDPFB','DOFSTA','DOSTAD','FLCDRA','FLNTUA','FLORDD', \
              'HYDBBA','NUTNRA','OPTAAC','OPTAAD', 'PARADA','PCO2WA', \
               'PCO2WB','PHSENA','PHSEND','SPKIRA','THSPHA','TMPSFA','TRHPHA','VEL3DA','ZPLSCB']


for sensor in sensorsWithCals:
    print(sensor)
    sensorDir = amRepo + '/calibration/' + sensor
    print(sensorDir)
    if 'yes' in useLocal:
        if os.path.exists(sensorDir):
            fileList = os.listdir(sensorDir)
        else:
            print('directory does not exist!')
    else:
        github_url = amRepo_head + '/calibration/' + sensor
        print(github_url)
        fileList = github_fileList(github_url,'csv')
        print(fileList)
    for calFile in fileList:
        if str.startswith(calFile,'AT'):
            githubFileList.append(sensorDir + '/' + calFile)

### load original vendor calibration directory list from OOI-CabledArray github repo
calRepoList = []
calRepoFileList = []
if 'yes' in useLocal:
    calRepoList = glob.glob(calRepo + '/*/*')
    for calFile in calRepoList:
        calRepoFileList.append(calFile.split('/')[6].split('.')[0])
else:
    for sensor in sensorsWithCals:
        sensorDir = calRepo + sensor
        github_url = calRepo_head + '/' + sensor
        fileList = github_fileList(github_url,'*')
        for calFile in fileList:
            if str.startswith(calFile,'AT'):
                calRepoList.append(github_url + '/' + calFile)
                calRepoFileList.append(calFile.split('.')[0])
 

CTDBPN
https://raw.githubusercontent.com/oceanobservatories/asset-management/master/calibration/CTDBPN
https://github.com/oceanobservatories/asset-management/tree/master/calibration/CTDBPN
['ATOSU-69827-00001__20121120.csv', 'ATOSU-69827-00001__20131207.csv', 'ATOSU-69827-00001__20160315.csv', 'ATOSU-69827-00001__20170218.csv', 'ATOSU-69827-00001__20171025.csv', 'ATOSU-69827-00001__20190911.csv', 'ATOSU-69827-00002__20150409.csv', 'ATOSU-69827-00002__20170202.csv', 'ATOSU-69827-00002__20180823.csv', 'ATOSU-69827-00002__20200929.csv', 'ATOSU-69827-00002__20221028.csv', 'ATOSU-69827-00003__20130101.csv', 'ATOSU-69827-00003__20131207.csv', 'ATOSU-69827-00003__20160315.csv', 'ATOSU-69827-00003__20171109.csv', 'ATOSU-69827-00003__20180929.csv', 'ATOSU-69827-00003__20191215.csv', 'ATOSU-69827-00003__20201014.csv', 'ATOSU-69827-00003__20211114.csv']
CTDBPO
https://raw.githubusercontent.com/oceanobservatories/asset-management/master/calibration/CTDBPO
https://github.com/oceanobservatories/asse

In [14]:
### Create metadata dictionaries

# Github Sensor bulk AssetID key to mfgSN
assetID_dict = pd.Series(sensorList["Manufacturer's Serial No./Other Identifier"].values, index=sensorList['ASSET_UID']).to_dict()

# Deployment sheet Reference Designator key to startDate, AssetID, rawFile
df_deploy_sort = df_deploy.sort_values(by=["Reference Designator","startDateTime"],ascending=False)

RefDes_dict = {}
for i in df_deploy_sort['Reference Designator'].unique():
    RefDes_dict[i] = [{'deployDate':datetime.datetime.strptime(df_deploy_sort['startDateTime'][j], '%Y-%m-%dT%H:%M:%S'), 'deployEnd':df_deploy_sort['stopDateTime'][j], 'AssetID':df_deploy_sort['sensor.uid'][j], 'deployNum':df_deploy_sort['deploymentNumber'][j],'calFile': 'none', 'calFile_verify': 'none', 'firstRawFile':'undef', 'rawSN':'undef','rawFile_verify':'none','imageAssetID':'undef','image_verify':'none'} for j in df_deploy_sort[df_deploy_sort['Reference Designator']==i].index]

# RCA AssetID key to mfgSN, instrumentType
asset_dict_RCA = assetList.set_index('assetID').T.to_dict('series')

# Github sensor cals AssetID key to calibration dates (extracted from fileNames)
sensorCals = {}
for githubFile in githubFileList:
    fileBits = re.search(r"/.*/.*/.*/.*/.*/((.*)__(.*).csv)",githubFile)
    if fileBits:
        if fileBits.group(2) not in sensorCals:
            sensorCals[fileBits.group(2)] = {'calFile': []}
        sensorCals[fileBits.group(2)]['calFile'].append([datetime.datetime.strptime(fileBits.group(3), '%Y%m%d'),fileBits.group(1)])
        

In [15]:
### Verify sensor bulk by comparing AT# and vendor serial numbers between github sensorBulk and RCA Instrument List

match = []
matchFormat = []
mismatch = []
missingFromSensorBulk = []
missingFromRCAlist = []

#*# identify assetsIDs in sensorBulk missing from RCA Instrument List
for key in assetID_dict:
    if 'ATAPL' in key or 'ATOSU' in key:
        if key not in asset_dict_RCA:
            missingFromRCAlist.append(key)
            
#*# identify mismatched serial numbers between sensorBulk and RCA Instrument List
for key,values in asset_dict_RCA.items():
    if key in assetID_dict:
        matchCategory=[]
        SensorBulkSN = str(assetID_dict[key]).strip()
        if 'nan' in SensorBulkSN:
            matchCategory.append('0')
        else:
            for SN in asset_dict_RCA[key]['mfgSN']:
                RCA_SN = SN.strip()
                if RCA_SN == SensorBulkSN:
                    matchCategory.append('2')
                elif mf.partialMatch(RCA_SN, SensorBulkSN, 3):
                    matchCategory.append('1')
                else:
                    matchCategory.append('0')                  
        if '2' in matchCategory:
            match.append([key,asset_dict_RCA[key]['mfgSN'],SensorBulkSN])
        elif '1' in matchCategory:
            matchFormat.append([key,asset_dict_RCA[key]['mfgSN'],SensorBulkSN])
        else:
            mismatch.append([key,asset_dict_RCA[key]['mfgSN'],SensorBulkSN])
#*# identify assetIDs in RCA list that are missing from sensorBulk
    else:
        missingFromSensorBulk.append(key)
        
        
with open(sensorBulkMatchFile,'w') as f:
    f.write('match\n')
    for entry in match:
        f.write('%s, %s, %s\n' % (entry[0], entry[1], entry[2]))
    f.write('matchFormat\n')
    for entry in matchFormat:
        f.write('%s, %s, %s\n' % (entry[0], entry[1], entry[2]))
    f.write('mismatch\n')
    for entry in mismatch:
        f.write('%s, %s, %s\n' % (entry[0], entry[1], entry[2]))
    f.write('missingFromSensorBulk\n')
    for entry in missingFromSensorBulk:
        f.write('%s\n' % entry)
    f.write('missingFromRCAlist\n')
    for entry in missingFromRCAlist:
        f.write('%s\n' % entry)
        

In [16]:
### Verify calibration files

calVerify_dict = {}
githubFileList_names = []
#*# for each calibration file in github Repo:
for githubFile in githubFileList:
    #*# extract instrument type and filename from path
    fileBits = re.search(r"/.*/.*/.*/.*/(.*)/((.*__.*).\S{3})",githubFile)
    if fileBits:
        instrument = fileBits.group(1)
        fileName = fileBits.group(2)
        fileNameSub = fileBits.group(3)
        githubFileList_names.append(fileNameSub)
        calVerify_dict[fileName] = {}
        calVerify_dict[fileName]['instrument'] = instrument
        #*# identify if file has undergone 2i-HITL verification  
        if fileName in HITLcal['githubFile'].tolist():
            calVerify_dict[fileName]['HITLstatus'] = HITLcal[HITLcal.githubFile == fileName].Status.item()
            calVerify_dict[fileName]['HITLnotes'] = HITLcal[HITLcal.githubFile == fileName].HITLnotes.item()
        else:
            calVerify_dict[fileName]['HITLstatus'] = 'NA'
            calVerify_dict[fileName]['HITLnotes'] = ' '
        #*# Verify gitHub cal file is in calRepoDrive list
        if fileNameSub not in calRepoFileList:
            calVerify_dict[fileName]['calRepo_check'] = 'NOMATCH'
        else:
            calVerify_dict[fileName]['calRepo_check'] = 'MATCH'  
            
        #*# Load in gitHub cal file
        try:
            githubCal = pd.read_csv(githubFile, sep=",", converters = {'value': np.float64}, float_precision='round_trip')
            fileLoad = 'Success'
            calVerify_dict[fileName]['fileParse'] = 'SUCCESS_TYPE1'
        except ValueError:
            try:
                githubCal = pd.read_csv(githubFile, sep=",", float_precision='round_trip')
                fileLoad = 'Success'
                calVerify_dict[fileName]['fileParse'] = 'SUCCESS_TYPE2'
            except ValueError:
                fileLoad = 'Fail'
                calVerify_dict[fileName]['fileParse'] = 'FAIL'
                
        if fileLoad == 'Success':
            #*# verify serial number is identical for each line and maps back to AT# in filename
            if 'serial' not in githubCal.columns:
                calVerify_dict[fileName]['serialNumber'] = 'NOTFOUND_FILE'
            else:
                if len(np.unique(githubCal['serial'])) > 1:
                    calVerify_dict[fileName]['serialNumber'] = 'MULTIPLE'
                else:
                    fileNameSNstring = re.search(r"/(\S{5}-\d{5}-\d{5})", githubFile)
                    if fileNameSNstring:
                        fileNameSN = str(assetID_dict[fileNameSNstring.group(1)]).strip()
                        if 'nan' in fileNameSN:
                            calVerify_dict[fileName]['serialNumber'] = 'NOTFOUND_SENSORBULK'
                        else:
                            if str(np.unique(githubCal['serial'])[0]) not in str(fileNameSN):
                                calVerify_dict[fileName]['serialNumber'] = 'MISMATCH_SENSORBULK'
                            else:
                                calVerify_dict[fileName]['serialNumber'] = 'MATCH_SENSORBULK'
            #*# identify any duplicate coefficient names, determine if values are identical 
            #*# (exception for OPTAA .ext cal sheets which have no column headers and are in a different format than 
            #*# other cal sheets)
            if '.ext' not in githubFile:
                duplicates = githubCal[githubCal.duplicated('name')]
                if not duplicates.empty:
                    for dup in duplicates['name']:
                        #*# retrieve dateframe of coefficient duplicates and drop all identical rows...
                        #*# if rows remain, coefficient values were not identical
                        if (githubCal[githubCal['name']==dup].drop_duplicates(keep=False)).empty:
                            calVerify_dict[fileName]['duplicateCoeff'] = 'DUPLICATES_IDENTICAL'
                        else:
                            calVerify_dict[fileName]['duplicateCoeff'] = 'DUPLICATES_NOTIDENTICAL'
                else:
                    calVerify_dict[fileName]['duplicateCoeff'] = 'NONE'
            #*# for each parse-able calibration file compare coefficients between github and calRepoDrive
            if calVerify_dict[fileName]['calRepo_check'] == 'NOMATCH':
                calVerify_dict[fileName]['vendorMatch'] = 'NAN'
            elif calVerify_dict[fileName]['calRepo_check'] == 'MATCH':
                calRepoDriveFile = calRepo + '/' + fileNameSub
                calCompare = mf.compareCalCoefficients(githubCal, calRepoDriveFile, CalCoeff_dict, CalConstants_dict)
                if calCompare[0] == 'NAN':
                    calVerify_dict[fileName]['vendorMatch'] = 'NOTCOMPARED'
                else:
                    calVerify_dict[fileName]['vendorMatch'] = calCompare
    else:
        print('invalid fileName format...')
                
calRepo_gitHub_missing = []

#*# for each file in calRepo identify any missing files from github
calRepo_gitHub_index = np.where(~np.isin(calRepoFileList,githubFileList_names))
for i in range(len(calRepo_gitHub_index[0])):
    calRepo_gitHub_missing.append(calRepoFileList[calRepo_gitHub_index[0][i]])
    
with open(missingCalFile,'w') as f:
    f.write('calRepo_gitHub_missing\n')
    for entry in set(calRepo_gitHub_missing):
        f.write('%s\n' % (entry))


In [17]:
with open(calVerifyFile,'w') as f:
    f.write('githubFile,instrument,calRepo_check,HITLstatus,HITLnotes,fileParse,serialNumber,duplicateCoeff,vendorMatch\n')
    for key,values in calVerify_dict.items():
        if 'ext' not in key:
            f.write("%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (key, calVerify_dict[key]['instrument'], calVerify_dict[key]['calRepo_check'], calVerify_dict[key]['HITLstatus'],calVerify_dict[key]['HITLnotes'],calVerify_dict[key]['fileParse'], calVerify_dict[key]['serialNumber'], calVerify_dict[key]['duplicateCoeff'], calVerify_dict[key]['vendorMatch']))
        

In [18]:
### Verify deployment sheets

#*# check for valid parameters in deployment sheet:

#*# does sensor ID exist in github sensorBulk?
sensorID_check = np.where(~np.isin(df_deploy["sensor.uid"],sensorList.ASSET_UID))
print('Sensor does not exist in sensorBulk: ')
print(df_deploy["sensor.uid"][sensorID_check[0]] + ' ' + df_deploy["Reference Designator"][sensorID_check[0]])

#*# does mooring ID exist in github platformBulk?
mooringID_check = np.where(~np.isin(df_deploy["mooring.uid"],platformList.ASSET_UID))
print('Mooring ID does not exist in platformBulk: ')
print(df_deploy["mooring.uid"][mooringID_check[0]] + ' ' + df_deploy["Reference Designator"][mooringID_check[0]])

#*# does cruise exist in github cruiseList?
CUID_check = np.where(~np.isin(df_deploy.CUID_Deploy,cruiseList.CUID))
print('Cruise does not exist in cruiseList: ')
print(df_deploy.CUID_Deploy[CUID_check[0]] + ' ' + df_deploy["Reference Designator"][CUID_check[0]])

#*# check for duplicate assetIDs within same deployment year AND same deployment number
print('Duplicate assetIDs within same deployment year AND same deployment number: ')
dates = pd.to_datetime(df_deploy['startDateTime'])
years = np.unique(dates.dt.year)
for year in years:
    print(year)
    yearIndex = np.where(dates.dt.year == year)
    sensor = df_deploy['sensor.uid'][yearIndex[0]]
    refDes = df_deploy['Reference Designator'][yearIndex[0]]
    deploy = df_deploy['deploymentNumber'][yearIndex[0]]
    deployList = np.unique(deploy)
    
    for d in deployList:
        subDeploymentIndex = np.where(deploy == d)
        depSensor = sensor.iloc[subDeploymentIndex[0]]
        depRef = refDes.iloc[subDeploymentIndex[0]]
        depDeploy = deploy.iloc[subDeploymentIndex[0]]
        idx = depSensor.duplicated(keep=False)
        print(depSensor[idx] + ' ' + depRef[idx] + ' ' + str(depDeploy[idx]))
        
#*# for each deployment in github repo        
for key,values in RefDes_dict.items():
    for deploy in RefDes_dict[key]:
        #*# load HITL notes
        keyYearString = key + '.' + deploy['deployDate'].strftime('%Y') + '.' + str(deploy['deployNum'])
        if keyYearString in HITLdeploy['referenceDesignatorYearDeployNum'].tolist():
            deploy['HITLstatus'] = HITLdeploy[HITLdeploy.referenceDesignatorYearDeployNum == keyYearString].Status.item()
            deploy['HITLnotes'] = HITLdeploy[HITLdeploy.referenceDesignatorYearDeployNum == keyYearString].HITLnotes.item()
        else:
            deploy['HITLstatus'] = 'NA'
            deploy['HITLnotes'] = ''         
        #*# assign calibration file
        if deploy['AssetID'] in sensorCals:
            calHistory = sensorCals[deploy['AssetID']]
            calDateList = list(filter(lambda d: d[0] < deploy['deployDate'], calHistory['calFile']))
            if calDateList:
                deploymentCalFile = min(calDateList, key = lambda x: abs(x[0]-deploy['deployDate']))
                #*# is calibration file available?
                if deploymentCalFile:
                    deploy['calFile'] = deploymentCalFile[1]
                    #*# is caldate within 1 year of deployment?
                    if deploy['deployDate'] - deploymentCalFile[0] > datetime.timedelta(days = 450):
                        deploy['calFile_verify'] = 'VALID_FILE_CAL_OLDER_THAN_15MONTHS'
                    else:
                        deploy['calFile_verify'] = 'VALID_FILE'
            else:
                deploy['calFile'] = 'noValidCalFile'
                deploy['calFile_verify'] = 'NO_VALID_FILE'
        
        
#*# identify if serial number in raw (first or data) file and if so compare with assetID map in github sensorBulk        
rawCheckSensors = ['CTD','SPK','NUT','PAR','FLOR','PREST','TMPSFA','OPTAA','ADCP','PAR']
rawCheckSensors_DP = ['ENG000000','VEL3DA105','FLCDRA103','FLNTUA103','DOSTAD105',
                      'VEL3DA103','FLCDRA102','FLNTUA102','DOSTAD104',
                      'VEL3DA303','FLCDRA302','FLNTUA302','DOSTAD304']
excludeNodes = []
#*# exclude MARUM PI sensor for now since there is no raw data in the archive
excludeSensors = ['CTDPFA110']

#*# load rawFileSN.csv
rawFileSN_list = pd.read_csv(RawInputFile)
rawKeyList = []
for key, values in RefDes_dict.items():
    for deployment in RefDes_dict[key]:
        try:
            deployment['rawSN'] = rawFileSN_list.loc[(rawFileSN_list['referenceDesignator'] == key) & (rawFileSN_list['deployYear'] == int(deployment['deployDate'].strftime('%Y'))), 'rawSerialNumber'].iloc[0]
            deployment['firstRawFile'] = rawFileSN_list.loc[(rawFileSN_list['referenceDesignator'] == key) & (rawFileSN_list['deployYear'] == int(deployment['deployDate'].strftime('%Y'))), 'rawFile'].iloc[0]
        except IndexError:
            if any(sensor in key[18:27] for sensor in rawCheckSensors) or any(sensor in key[18:27] for sensor in rawCheckSensors_DP):
                if all(sensor not in key[18:27] for sensor in excludeSensors):
                    if any(node_ex in key[9:14] for node_ex in excludeNodes):
                        print("node exclusion for: " + key)
                        next
                    else:
                        print('no raw file listed for: ' + key + ' ' + deployment['deployDate'].strftime('%Y'))
                        deployment['rawSN'] = '-99999'
                        deployment['firstRawFile'] = 'none'
                        rawKeyList.append(key)
                    


Sensor does not exist in sensorBulk: 
Series([], dtype: object)
Mooring ID does not exist in platformBulk: 
Series([], dtype: object)
Cruise does not exist in cruiseList: 
Series([], dtype: object)
Duplicate assetIDs within same deployment year AND same deployment number: 
2013
Series([], dtype: object)
2014
424     ATAPL-58320-00002 RS01SBPD-DP01A-06-DOSTAD104 ...
850     ATAPL-58320-00002 RS03AXPS-PC03A-4A-DOSTAD303 ...
1015    ATAPL-58340-00003 RS03INT1-MJ03C-07-RASFLA301 ...
1016    ATAPL-58340-00003 RS03INT1-MJ03C-07-D1000A301 ...
dtype: object
Series([], dtype: object)
2015
Series([], dtype: object)
1022    ATAPL-58340-00003 RS03INT1-MJ03C-07-RASFLA301 ...
1023    ATAPL-58340-00003 RS03INT1-MJ03C-07-D1000A301 ...
dtype: object
Series([], dtype: object)
2016
Series([], dtype: object)
1029    ATAPL-58340-00003 RS03INT1-MJ03C-07-RASFLA301 ...
1030    ATAPL-58340-00003 RS03INT1-MJ03C-07-D1000A301 ...
dtype: object
Series([], dtype: object)
2017
Series([], dtype: object)
Series([], dt

In [19]:
for key,values in RefDes_dict.items():
    for deployment in RefDes_dict[key]:
        if 'none' in deployment['firstRawFile']:
            if any(sensor in key[18:27] for sensor in rawCheckSensors) \
            and all(sensor not in key[18:27] for sensor in excludeSensors) \
            and all(node_ex not in key[9:14] for node_ex in excludeNodes):
                deployment['rawFile_verify'] = 'NO_FILE'
        elif 'undef' in deployment['firstRawFile']:
            deployment['rawFile_verify'] = 'NAN'
        else:
            if '-99999' in str(deployment['rawSN']):
                deployment['rawFile_verify'] = 'NO_SN'
            else:
                if str(deployment['rawSN']) in assetID_dict[deployment['AssetID']]:
                    deployment['rawFile_verify'] = 'MATCH'
                else:
                    rawAT = 'unknown'
                    for IDkey, IDvalue in assetID_dict.items():
                        if deployment['AssetID'][0:11] in IDkey:
                            if str(deployment['rawSN']) in IDvalue:
                                rawAT = IDkey
                    deployment['rawFile_verify'] = 'MISMATCH: raw: ' + str(deployment['rawSN']) + ':' + rawAT
        

In [20]:
#*# load image assetIDs
imageSN_list = pd.read_csv(imageSNfile)

for idx,image in imageSN_list.iterrows():
    if pd.isnull(image['imageSerialNumber']) and not pd.isnull(image['imageAssetID']):
        print('image Serial Number missing!')
    elif pd.isnull(image['imageAssetID']) and not pd.isnull(image['imageSerialNumber']):
        print('image AssetID missing!')
    # check for discrepancies between serial number and assetID
    if not pd.isnull(image['imageSerialNumber']) and not pd.isnull(image['imageAssetID']):
        if image['imageAssetID'] not in assetID_dict:
            print('unknown asset ID: ')
            print(image['imageAssetID'])
        else:
            imageSN = image['imageSerialNumber'].split(',')
            assetSN = assetID_dict[image['imageAssetID']]
            any_in = lambda imageSN, assetSN: any(i in assetSN for i in imageSN)
            if not any_in:
                print('no match')
    

image AssetID missing!
image AssetID missing!
image Serial Number missing!
image Serial Number missing!
image Serial Number missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image Serial Number missing!
unknown asset ID: 
ATAPL-67977-00008
unknown asset ID: 
ATOSU-6662-00006
image Serial Number missing!
image Serial Number missing!
unknown asset ID: 
ATPAL-68020-00006
image AssetID missing!
image Serial Number missing!
image AssetID missing!
image AssetID missing!
unknown asset ID: 
ATPAL-58322-00003
image Serial Number missing!
image AssetID missing!
image AssetID missing!
image Serial Number missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image Serial Number missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image Serial Number missing!
image AssetID missing!
image AssetID missing!
image AssetID missing!
image Serial Number missing!

In [23]:
latest_fuzzy_date = "20240109" # The fuzzy match script may not have been run the same day as is set to runDate...
# Here we read in the output of the fuzzy match notebook which has been curated. For example, multiple 
# potential assetID matches have been pared down to 1, or typos in SN or AssetID have been identified and corrected
# etc...
fuzzy_df = pd.read_csv(f"reportOuts/fuzzyMatches_HITL_{latest_fuzzy_date}.csv")

In [24]:
fuzzy_df.head()

Unnamed: 0.1,Unnamed: 0,referenceDesignator,deployYear,imageFile,imageSerialNumber,imageAssetID,matching_asset_ids,proportion_match,matching_mfg_sn,exact_SN_match,exact_assetID_match,any_match,HITL_match_notes
0,0,RS01SUM2-MJ01B-12-ADCPSK101,2018,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,18977,,['ATAPL-58419-00003'],,['18977'],True,,True,
1,1,RS01SLBS-LJ01A-10-ADCPTE101,2020,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,18813,,['ATAPL-68073-00002'],,['18813'],True,,True,
2,2,RS01SLBS-LJ01A-11-OPTAAC103,2020,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,248,ATAPL-69943-00008,['ATAPL-69943-00008'],,['248'],True,,True,
3,3,RS01SLBS-LJ01A-12-CTDPFB101,2020,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,4830-67627,ATAPL-67627-00002,['ATAPL-67627-00002'],,,,True,True,
4,4,RS01SLBS-LJ01A-12-DOSTAD101,2020,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,,ATAPL-58320-00014,['ATAPL-58320-00014'],,,,True,True,


In [25]:
fuzzy_df["matching_asset_ids"] = fuzzy_df["matching_asset_ids"].str.strip("[]'")
fuzzy_df["matching_mfg_sn"] = fuzzy_df["matching_mfg_sn"].str.strip("[]'")

fuzzy_df = fuzzy_df[["referenceDesignator", "imageFile", "deployYear", "matching_mfg_sn", "matching_asset_ids"]]
fuzzy_df = fuzzy_df.rename(columns={"matching_mfg_sn":"imageSerialNumber","matching_asset_ids":"imageAssetID"})

In [26]:
fuzzy_df.head()

Unnamed: 0,referenceDesignator,imageFile,deployYear,imageSerialNumber,imageAssetID
0,RS01SUM2-MJ01B-12-ADCPSK101,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,2018,18977.0,ATAPL-58419-00003
1,RS01SLBS-LJ01A-10-ADCPTE101,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,2020,18813.0,ATAPL-68073-00002
2,RS01SLBS-LJ01A-11-OPTAAC103,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,2020,248.0,ATAPL-69943-00008
3,RS01SLBS-LJ01A-12-CTDPFB101,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,2020,,ATAPL-67627-00002
4,RS01SLBS-LJ01A-12-DOSTAD101,/Volumes/Data0/Archive/Cruise_data/RCA/Visions...,2020,,ATAPL-58320-00014


In [76]:
# instrument types that are verifiable by manufacturer serial numbers extracted from raw files
verifiable_by_SN = ['CTD','SPK','NUT','PAR','FLOR','PREST','TMPSFA','OPTAA','ADCP','PAR']

In [94]:
# repo which contains subdirectories for instrument types that require calibration
calibration_repo_url = 'https://api.github.com/repos/oceanobservatories/asset-management/contents/calibration'

response = requests.get(calibration_repo_url)
data = response.json()

# extract subdirectories
instruments_requiring_calibration = [instrument['name'] for instrument in data if instrument['type'] == 'dir']

In [95]:
instruments_requiring_calibration

['ADCPAM',
 'ADCPAN',
 'ADCPSI',
 'ADCPSJ',
 'ADCPSK',
 'ADCPSL',
 'ADCPSN',
 'ADCPTA',
 'ADCPTB',
 'ADCPTC',
 'ADCPTD',
 'ADCPTE',
 'ADCPTF',
 'ADCPTG',
 'ADCPTM',
 'CTDAVN',
 'CTDBPC',
 'CTDBPD',
 'CTDBPE',
 'CTDBPF',
 'CTDBPN',
 'CTDBPO',
 'CTDBPP',
 'CTDGVM',
 'CTDMOG',
 'CTDMOH',
 'CTDMOQ',
 'CTDMOR',
 'CTDPFA',
 'CTDPFB',
 'DOFSTA',
 'DOFSTK',
 'DOSTAD',
 'DOSTAJ',
 'DOSTAM',
 'DOSTAN',
 'FLCDRA',
 'FLNTUA',
 'FLORDD',
 'FLORDG',
 'FLORDL',
 'FLORDM',
 'FLORTD',
 'FLORTJ',
 'FLORTK',
 'FLORTM',
 'FLORTN',
 'FLORTO',
 'HYDBBA',
 'METBKA',
 'NUTNRA',
 'NUTNRB',
 'NUTNRJ',
 'NUTNRM',
 'NUTNRN',
 'OPTAAC',
 'OPTAAD',
 'OPTAAJ',
 'PARADA',
 'PARADJ',
 'PARADK',
 'PARADM',
 'PARADN',
 'PCO2WA',
 'PCO2WB',
 'PCO2WC',
 'PHSENA',
 'PHSEND',
 'PHSENE',
 'PHSENF',
 'PRESFA',
 'PRESFB',
 'PRESFC',
 'SPKIRA',
 'SPKIRB',
 'SPKIRJ',
 'THSPHA',
 'TMPSFA',
 'TRHPHA',
 'VADCPA',
 'VEL3DA',
 'ZPLSCB']

In [103]:
for key, values in RefDes_dict.items():
    #print(key)
    for deployment in RefDes_dict[key]:
        if any(substring in key for substring in instruments_requiring_calibration):
            deployment["calibrationRequired"] = True
        else:
            deployment["calibrationRequired"] = 'NAN'

In [105]:
# This loop looks up the image assetID that we chose from the output of fuzzy match as it loops through the deployment dictionary
# It marks each instrument deployment image verification as eith MATCH, MISMATCH or NAN
for key, values in RefDes_dict.items():
    #print(key)
    for deployment in RefDes_dict[key]:
        try: #TODO important line below
            deployment['imageAssetID'] = fuzzy_df.loc[(fuzzy_df['referenceDesignator'] == key) & (fuzzy_df['deployYear'] == int(deployment['deployDate'].strftime('%Y'))), 'imageAssetID'].iloc[0]
            print(fuzzy_df.loc[(fuzzy_df['referenceDesignator'] == key) & (fuzzy_df['deployYear'] == int(deployment['deployDate'].strftime('%Y'))), 'imageAssetID'].iloc[0])
            if str(deployment['imageAssetID']) in deployment['AssetID']:
                deployment['image_verify'] = 'MATCH'
            else:
                deployment['image_verify'] = 'MISMATCH: image: ' + str(deployment['imageAssetID']) + ' deployment:' + str(deployment['AssetID'])
        except IndexError:
            deployment['image_verify'] = 'NAN'

ATAPL-71444-00002
ATAPL-58340-00002
ATAPL-58340-00002
ATAPL-58338-00001
ATAPL-58338-00003
ATAPL-58340-00002
nan
nan
ATCWK-67627-00010
ATCWK-67627-00008
ATCWK-67627-00011
nan
nan
ATAPL-58336-00002
ATAPL-58336-00010
ATAPL-58336-00010
ATAPL-70114-00007
ATAPL-70114-00002
ATAPL-70114-00002
nan
ATAPL-68020-00002
ATAPL-68020-00002
ATAPL-58341-00007
ATAPL-58341-00002
ATAPL-78452-00004
ATOSU-58341-00008
ATAPL-58332-00001
ATAPL-58332-00001
ATAPL-58322-00010
ATAPL-58322-00014
ATAPL-58322-00014
ATAPL-58337-00009
ATAPL-58337-00002
ATAPL-58337-00002
nan
ATAPL-66662-00010
nan
ATAPL-66662-00003
ATAPL-58322-00012
ATOSU-58346-00008
ATOSU-58346-00008
ATAPL-58337-00001
ATAPL-58337-00004
ATAPL-58337-00005
ATAPL-58320-00012
nan
ATOSU-68020-00005
ATAPL-66662-00011
ATAPL-66662-00004
ATAPL-66662-00004
ATAPL-58324-00009
ATAPL-58324-00004
ATAPL-78437-00005
nan
ATAPL-58345-00001
ATAPL-58345-00003
ATAPL-58345-00002
ATAPL-58315-00002
ATAPL-58315-00003
ATAPL-58320-00004
ATAPL-58320-00009
ATAPL-58320-00010
ATAPL-7011

In [106]:
for key, values in RefDes_dict.items():
    for deployment in RefDes_dict[key]:
        if (deployment['rawFile_verify'] == 'MATCH') or (deployment['HITLstatus'] == 'Clear') or (deployment['image_verify'] == 'MATCH'):
            deployment['verificationStatus'] = 'VERIFIED'
        elif any(substring in key for substring in verifiable_by_SN):
            deployment['verificationStatus'] = 'RAW_SN_POSSIBLE'
        else:
            deployment['verificationStatus'] = 'NOT_VERIFIED'
            

In [107]:
deployVerifyFile

'reportOuts/deploymentVerification_20240111.csv'

In [108]:
with open(deployVerifyFile,'w') as f:
    f.write('verificationStatus,referenceDesignator,deployYear,deploymentSheetAssetID,rawFileVerification,imageVerification,calFileVerification,calibrationRequired,HITLstatus,HITLnotes\n')
    for key,values in RefDes_dict.items():
        for deployment in RefDes_dict[key]:
            f.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (deployment['verificationStatus'],key, deployment['deployDate'].strftime('%Y'), deployment['AssetID'], deployment['rawFile_verify'], deployment['image_verify'], deployment['calFile_verify'], deployment['calibrationRequired'], deployment['HITLstatus'],deployment['HITLnotes']))
        

In [92]:
### Compile deployment and calibration file listing by instrument type

deployHistory_dict = {}
for i in df_deploy_sort['Reference Designator'].unique():
    deployHistory_dict[i] = [{'deployDate':datetime.datetime.strptime(df_deploy_sort['startDateTime'][j], '%Y-%m-%dT%H:%M:%S'), 'deployEnd':df_deploy_sort['stopDateTime'][j], 'AssetID':df_deploy_sort['sensor.uid'][j], 'deployNum':df_deploy_sort['deploymentNumber'][j],'vendorCalFile': 'none','githubCalFile': 'none','instrumentType': 'none','instrumentSN': 'none'} for j in df_deploy_sort[df_deploy_sort['Reference Designator']==i].index]

# Github sensor cals AssetID key to calibration dates (extracted from fileNames)
githubLink = amRepo_head + '/calibration/'
githubSensorCals = {}

for githubFile in githubFileList:
    if 'yes' in useLocal:
        fileLink = githubFile.replace(amRepo + '/calibration/',githubLink)
    else:
        fileLink = githubFile.replace(amRepo + '/calibration/',githubLink)
    fileBits = re.search(r"/.*/.*/.*/.*/.*/((.*)__(.*).csv)",githubFile)
    if fileBits:
        if fileBits.group(2) not in githubSensorCals:
            githubSensorCals[fileBits.group(2)] = {'calFile': []}
        githubSensorCals[fileBits.group(2)]['calFile'].append([datetime.datetime.strptime(fileBits.group(3), '%Y%m%d'),fileLink])
        
# Vendor sensor cals AssetID key to calibration dates (extracted from fileNames)
calRepoLink = calRepo_head + '/'
vendorSensorCals = {}

for vendorFile in calRepoList:
    if 'yes' in useLocal:
        fileLink = vendorFile.replace(calRepo + '/',calRepoLink)
    else:
        fileLink = vendorFile
    fileBits = re.search(r"/.*/.*/.*/.*/((.*)__([0-9]*).*\..*$)",vendorFile)

    if fileBits:
        if fileBits.group(2) not in vendorSensorCals:
            vendorSensorCals[fileBits.group(2)] = {'calFile': []}
        vendorSensorCals[fileBits.group(2)]['calFile'].append([datetime.datetime.strptime(fileBits.group(3), '%Y%m%d'),fileLink])
        


In [None]:
instType = []

for key,values in deployHistory_dict.items():
    for deploy in deployHistory_dict[key]:  
        #*# lookup instrumentType and instrumentSN
        if deploy['AssetID'] in asset_dict_RCA:
            deploy['instrumentType'] = '_'.join(asset_dict_RCA[deploy['AssetID']]['instrumentType']).replace('-','')
            deploy['instrumentSN'] = asset_dict_RCA[deploy['AssetID']]['mfgSN']
        else:
            print('AssetID not in RCA Asset List')
            print(deploy['AssetID'])
            deploy['instrumentType'] = 'noValidType'
            deploy['instrumentSN'] = ['noValidSN']
        instType.append(deploy['instrumentType'])
        #*# assign github calibration file
        if deploy['AssetID'] in githubSensorCals:
            calHistory = githubSensorCals[deploy['AssetID']]
            calDateList = list(filter(lambda d: d[0] < deploy['deployDate'], calHistory['calFile']))
            if calDateList:
                deploymentCalFile = min(calDateList, key = lambda x: abs(x[0]-deploy['deployDate']))
                #*# is calibration file available?
                if deploymentCalFile:
                    deploy['githubCalFile'] = deploymentCalFile[1]
            else:
                deploy['githubCalFile'] = 'noValidCalFile'
                
        #*# assign vendor calibration file
        if deploy['AssetID'] in vendorSensorCals:
            calHistory = vendorSensorCals[deploy['AssetID']]
            calDateList = list(filter(lambda d: d[0] < deploy['deployDate'], calHistory['calFile']))
            if calDateList:
                ### TODO: add capability to list multiple files as vendor file...i.e. OPTAAC ".cal" + ".dev"
                deploymentCalFile = min(calDateList, key = lambda x: abs(x[0]-deploy['deployDate']))
                #*# is calibration file available?
                if deploymentCalFile:
                    deploy['vendorCalFile'] = deploymentCalFile[1]
            else:
                deploy['vendorCalFile'] = 'noValidCalFile'

In [None]:
instTypes = set(instType)

for inst in instTypes:
    deploymentList = []
    for key,values in deployHistory_dict.items():
        for deployment in deployHistory_dict[key]:
            if inst in deployment['instrumentType']:
                deploymentList.append([deployment['instrumentType'],key,deployment['deployDate'],deployment['deployEnd'],deployment['AssetID'],deployment['instrumentSN'],deployment['githubCalFile'],deployment['vendorCalFile']])

    deploymentList_sorted = sorted(deploymentList, key = lambda deploymentList: (deploymentList[1], deploymentList[2]))
    deployHistoryFile = deployHistoryDir + '/' + inst + '_deployments.csv'    
    with open(deployHistoryFile,'w') as f:
        f.write('sensorType,referenceDesignator,startTime,endTime,assetID,instrumentSN,githubCalibrationFile,vendorCalibrationFile\n')
        for entry in deploymentList_sorted:
            f.write("%s,%s,%s,%s,%s,\"%s\",%s,%s\n" % (entry[0],entry[1],entry[2],entry[3],entry[4],entry[5],entry[6],entry[7]))
      

In [None]:
keyList = []
for inst in instTypes:
    for key,values in deployHistory_dict.items():
        for deployment in deployHistory_dict[key]:
            if inst in deployment['instrumentType']:
                keyList.append(key)

keyList_unique = set(keyList)

#print(keyList_unique)
with open(refDesList,'w') as f:
    f.write('referenceDesignator\n')
    for entry in keyList_unique:
        f.write("%s\n" % (entry))
        