In [1]:
# import libraries
import pandas as pd
import numpy as np
import re
import datetime

In [2]:
### Known MIO-specific metadata bounds, lookup lists and input files

### Sentinel string that marks "no data" in the summary files
fillValue = '-9999999'

### Geographic limits, each as [lower bound, upper bound]
latBounds = [40, 50]
longBounds = [-130, -120]

### Niskin bottles: highest numbered bottle plus the named (non-numbered) positions
maxNiskin = 24
auxNiskins = [
    'Aft',
    'Forward',
    'Port',
    'Starboard',
    'Aft Port',
    'Aft Starboard',
    'Forward Port',
    'Forward Starboard',
    'Majors',
]

### Cast-level metadata columns
metadataColumns = [
    'Cruise',
    'Station',
    'Target Asset',
    'Start Latitude [degrees]',
    'Start Longitude [degrees]',
    'Start Time [UTC]',
    'Cast',
    'Cast Flag',
    'Bottom Depth at Start Position [m]',
    'CTD File',
    'CTD File Flag',
]

### Input file names
templateFile = 'discreteSummary_template.csv'
flagsFile = 'flags.csv'
flagMapFile = 'flagMap.csv'

SummaryFiles = [
    'Cabled-6_TN326_Discrete_Summary.csv',
    'Cabled-7_SKQ201610S_Discrete_Summary.csv',
    'Cabled-8_RR1713-RR1717_Discrete_Summary.csv',
    'Cabled-9_RR1809-RR1812_Discrete_Summary.csv',
    'Cabled-10_AT4212_Discrete_Summary.csv',
    'Cabled-11_TN382_Discrete_Summary.csv',
]

### Import metadata lists
### cruises: dict keyed by the 'cruise' column, each value a dict of that row's other columns
cruises = (
    pd.read_csv('cruiseList.csv')
    .set_index('cruise')
    .T
    .to_dict()
)
### stations / targetAssets: plain lists taken from a single column of each file
stations = pd.read_csv('stationNames.csv')['station'].values.tolist()
targetAssets = pd.read_csv('targetAssets.csv')['targetAsset'].tolist()

In [3]:
### Niskin bottle labels: '1' .. str(maxNiskin), followed by the auxiliary (named) positions
Niskins = list(map(str, range(1, maxNiskin + 1)))
Niskins.extend(auxNiskins)

### Template file, read entirely as text
template = pd.read_csv(templateFile, dtype=str)

### Flag definitions as a dict keyed by the 'bitPosition' column
flags = (
    pd.read_csv(flagsFile)
    .set_index('bitPosition')
    .T
    .to_dict()
)

### Flag map as a dict keyed by the 'header' column
flagMap = (
    pd.read_csv(flagMapFile)
    .set_index('header')
    .T
    .to_dict()
)

In [4]:
### Validate every discrete summary file against the template and the known metadata.
### Each problem found is printed; the files themselves are never modified.
### Relies on names defined in earlier cells: SummaryFiles, template, cruises, stations,
### targetAssets, Niskins, flags, flagMap, metadataColumns, fillValue, latBounds, longBounds.
for summaryFile in SummaryFiles:
    ### Collects 'header:row:flag description' for every set flag bit (kept for optional inspection)
    flagList = []
    print(summaryFile)
    ### Load file; everything is read as text so fill values and flags keep their exact formatting
    summary = pd.read_csv(summaryFile, dtype=str)

    ### Check for empty cells
    ### np.where returns (row positions, column positions); zip them to get one
    ### zero-based (row, column) coordinate per empty cell.
    emptyRows, emptyCols = np.where(pd.isnull(summary).values)
    for emptyRow, emptyCol in zip(emptyRows, emptyCols):
        print('empty cell at ' + str(emptyRow) + ', ' + str(emptyCol))

    ### Verify all column header labels and order match template
    if list(template.keys()) != list(summary.keys()):
        print ("Headers do not match template")

    for index, row in summary.iterrows():
        rowLabel = ', row ' + str(index+1)
        for header in summary.keys():
            value = row[header]

            ### Empty cells are read as NaN (not a string); they were reported above and
            ### cannot be checked with the string tests below.
            if not isinstance(value, str):
                continue

            ### Verify all fill values are '-9999999'; fill values get no further checks
            if '-99' in value:
                if not re.search(r'^-9999999$', value):
                    print('fill value improperly formatted: ' + header + ': ' + value + rowLabel)
                continue

            ### Identify any flags in non-flag columns
            if re.search(r'^\*[0-1]{16}$', value) and 'Flag' not in header:
                print(header + ' includes misplaced flag: ' + value + rowLabel)

            ### Verify cruise in defined list
            if 'Cruise' in header:
                if value not in cruises:
                    print('Unknown cruise: ' + value + rowLabel)

            ### Verify station in defined list (cell may hold a comma-separated list)
            if 'Station' in header:
                for station in value.split(','):
                    if station not in stations:
                        print('Unknown station name: ' + value + rowLabel)

            ### Verify targetAsset in defined list (cell may hold a comma-separated list)
            if 'Target Asset' in header:
                for asset in value.split(','):
                    if asset not in targetAssets:
                        print('Unknown target asset: ' + value + rowLabel)

            ### Verify Niskin bottles in defined list
            if 'Niskin/Bottle Position' in header:
                if value not in Niskins:
                    print('Niskin bottle number out of range: ' + value + rowLabel)

            ### Verify each flag has an asterisk and a 16-character combination of zeroes and ones
            if 'Flag' in header:
                if re.search(r'^\*[0]{16}$', value):
                    print(header + ' is a blank flag: ' + value + rowLabel)
                if not re.search(r'^\*[0-1]{16}$', value):
                    print(header + ' not formatted correctly: ' + value + rowLabel)
                else:
                    ### Character 0 is the asterisk; characters 1..16 map to bit positions 15..0
                    for bit in range(1, 17):
                        if value[bit] != '0':
                            bitPosition = 16 - bit
                            ### Bit position 2 is skipped on purpose by the original logic
                            ### NOTE(review): the reason is not visible in this notebook - confirm
                            if bitPosition != 2:
                                flagString = flags[bitPosition][flagMap[header]['flagType']]
                                if 'Unassigned' in flagString:
                                    print(header + ' formatted as "Unassigned": ' + value + rowLabel)
                                flagList.append(header + ':' + str(index+1) + ':' + flagString)

            ### Verify each time is formatted as "YYYY-MM-DDTHH:mm:ss.000Z" and is within cruise dates
            if 'Time' in header:
                ### The '.' is escaped and the pattern is anchored at both ends so that values with
                ### a wrong separator or trailing text are reported instead of crashing strptime.
                if not re.search(r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\.\d\d\dZ$', value):
                    print(header + ' not formatted correctly: ' + value + rowLabel)
                else:
                    try:
                        sampleDate = datetime.datetime.strptime(value, '%Y-%m-%dT%H:%M:%S.%fZ')
                    except ValueError:
                        ### Right shape but not a real timestamp (e.g. month 13)
                        print(header + ' not formatted correctly: ' + value + rowLabel)
                    else:
                        ### An unknown cruise is already reported by the cruise check; without its
                        ### dates there is nothing to compare against, so skip the range test.
                        cruiseInfo = cruises.get(row['Cruise'])
                        if cruiseInfo is not None:
                            cruiseStart = datetime.datetime.strptime(cruiseInfo['startDate'], '%Y-%m-%dT%H:%M:%S')
                            cruiseEnd = datetime.datetime.strptime(cruiseInfo['endDate'], '%Y-%m-%dT%H:%M:%S')
                            if sampleDate < cruiseStart or sampleDate > cruiseEnd:
                                print('Sample date outside of cruise dates: ' + value + rowLabel)

            ### Verify each latitude within latBounds
            if 'Latitude' in header:
                try:
                    latitude = float(value)
                except ValueError:
                    print(header + ' latitude is not a number: ' + value + rowLabel)
                else:
                    if latitude < latBounds[0] or latitude > latBounds[1]:
                        print(header + ' latitude out of defined bounds: ' + value + rowLabel)

            ### Verify each longitude within longBounds
            if 'Longitude' in header:
                try:
                    longitude = float(value)
                except ValueError:
                    print(header + ' longitude is not a number: ' + value + rowLabel)
                else:
                    if longitude < longBounds[0] or longitude > longBounds[1]:
                        print(header + ' longitude out of defined bounds: ' + value + rowLabel)

    ### Cast-level checks; rows with an empty Cast cell were reported above and are skipped here
    casts = summary['Cast'].dropna().unique()
    for cast in casts:
        cast_df = summary.loc[summary['Cast'] == cast]

        ### Verify all metadata rows for a given cast are identical
        ### ROV dives are excluded from this check; only CTD casts are tested.
        ### The test is on the cast identifier itself: `'CTD' in cast_df['Cast']` would look
        ### at the Series index labels, not the values, and so would never be true.
        ### NOTE(review): assumes CTD cast identifiers contain the text 'CTD' - confirm against the data
        if 'CTD' in cast:
            for metadata in metadataColumns:
                if len(cast_df[metadata].unique()) > 1:
                    print(metadata + ' metadata not identical for all rows of cast ' + cast)

        ### Verify casts with no samples have fill values for all fields except metadata columns
        ### Character 15 of the CTD File Flag (the second-to-last bit) being '1' triggers this check.
        ctdFileFlag = cast_df['CTD File Flag'].values[0]
        if isinstance(ctdFileFlag, str) and fillValue not in ctdFileFlag:
            ### Slice instead of index so a too-short flag is simply treated as "not set"
            if ctdFileFlag[15:16] == '1':
                for header in summary.keys():
                    if header not in metadataColumns:
                        firstValue = cast_df[header].values[0]
                        if not (isinstance(firstValue, str) and re.search(r'^-9999999$', firstValue)):
                            print('Unexpected fill value for cast with no samples: ' + str(firstValue) + ', cast: ' + cast + ', ' + header)

    #print(flagList)    

Cabled-6_TN326_Discrete_Summary.csv
Cabled-7_SKQ201610S_Discrete_Summary.csv
Cabled-8_RR1713-RR1717_Discrete_Summary.csv
Cabled-9_RR1809-RR1812_Discrete_Summary.csv
Cabled-10_AT4212_Discrete_Summary.csv
Cabled-11_TN382_Discrete_Summary.csv
