In [1]:
import pandas as pd
import os
import re
import datetime as dt
import numpy as np
import csv
import operator

In [2]:
### define yearly variables
ROV = 'JASON'
cruiseYear = '2024'
worktag = 'GR041160'

In [3]:
fillValue = '-9999999'

In [4]:
compileDict = {}

In [5]:
### Define names for sample logs to be created ###
discreteSummaryFile = 'Cabled-15_AT50-29_Discrete_Summary.csv'
READMEfile = 'Cabled-15_README_notes'
nutLogFile = 'Cabled-15_AT50-29_Nutrients_Sample_Log_2024-10-09_ver-1-00.csv'
fluorLogFile = 'Cabled-15_AT50-29_Chlorophyll_Sample_Log_2024-10-09_ver-1-00.csv'
salLogFile = 'Cabled-15_AT50-29_Salinity_Sample_Log_2024-10-09_ver-1-00.csv'
dicLogFile = 'Cabled-15_AT50-29_DIC_Sample_Log_2024-10-09_ver-1-00.csv'

### Define data paths for cruise data ###
dataPath = '../AT50-29'

### Define discrete sample files already created ###
compileFile = 'Cabled-15_AT50-29_DiscreteCastLogs.xlsm'

nutFile  = 'Cabled-15_AT50-29_Nutrients_Sample_Data_2025-05-15_ver-1-00.xlsx'
fluorFile  = 'Cabled-15_AT50-29_Chlorophyll_Sample_Data_2025-07-01_ver-1-00.xlsx'
salFile  = 'Cabled-15_AT50-29_Salinity_Sample_Data_2025-06-02_ver-1-00.xlsx'
dicFile  = 'Cabled-15_AT50-29_DIC_Sample_Data_2025-04-25_ver-1-00.xlsx'

In [6]:
### If selected yes, script will generate sample logs for each sample type for distribution to analysis labs ###
generateSampleLogs = 'no'

In [7]:
### Define paths for cruise data and script dictionaries ###
CTDfilePath = 'CTD Data'
ROVfilePath = 'ROV Data'
bottleParams = 'bottleMap.csv'
headerFile = 'discreteSummaryHeaderMap.csv'
flagFile = 'flags.csv'
flagMapFile = 'flagMap.csv'

In [8]:
### ROV columns for CT2 (JASON) and CTD CSV (ROPOS) files can change depending on what instruments are installed.  
### Dictionary must be redefined (or verified) each year

ROVcolumnDict = {'temp': 4, 'cond': 5, 'press': 6, 'sal': 7}
#ROVcolumnDict = {'temp': 10, 'cond': 9, 'press': 11, 'sal': 13, 'oxy': 12}


In [9]:
### Load in map for discrete summary headers to data dictionaries created by loading excel sheets
headerMap_dict = pd.read_csv(headerFile, index_col=0, squeeze=True).to_dict()

### Load in map to assign column headers in bottle file to discrete summary columns
bottleMap_dict = pd.read_csv(bottleParams, index_col=0, squeeze=True).to_dict()

### Load in map to assign flag header columns to flag type
flagHeadersMap_dict = pd.read_csv(flagMapFile, index_col=0, squeeze=True).to_dict()

### Load in map to assign flag type to flag string and bit position
flagBitMap_dict = pd.read_csv(flagFile, index_col=0, squeeze=True).to_dict()



  headerMap_dict = pd.read_csv(headerFile, index_col=0, squeeze=True).to_dict()


  bottleMap_dict = pd.read_csv(bottleParams, index_col=0, squeeze=True).to_dict()


  flagHeadersMap_dict = pd.read_csv(flagMapFile, index_col=0, squeeze=True).to_dict()


  flagBitMap_dict = pd.read_csv(flagFile, index_col=0, squeeze=True).to_dict()


In [10]:
def flagBits(flagStrings,column):
    bits = []
    flagBitMap = list('*0000000000000000')
    flagKey = flagHeadersMap_dict[column]
    flagColumns = flagBitMap_dict[flagKey]
    strings = flagStrings.split(',')
    for flagStr in strings:
        for k,v in flagColumns.items():
            if flagStr.strip() in v:
                bits.append(k)

    for bit in bits:
        bitPosition = 16 - int(bit)
        flagBitMap[bitPosition] = '1'
    flag = "".join(flagBitMap)
    return flag

In [11]:
def parseBottleFile(btlFile):
    castDict = {}
    
    f = open(btlFile, 'r')
    btlLines = f.readlines()
    bottleTimes = []
    bottleData = []
    for line in btlLines:
        if line.startswith('*') or line.startswith('#') or re.search(r'.*Bottle.*Date.*',line) or re.search(r'.*Position.*Time.*',line):
            if '* NMEA Latitude' in line:
                # * NMEA Latitude = 45 49.81 N
                m = re.search(r'.* NMEA Latitude\s=\s(\d*)\s(\d*.\d*)\s.*',line)
                if m:
                    latitude = float(m.group(2))/60 + float(m.group(1))
                    castDict['Start Latitude [degrees]'] = latitude
            if '* NMEA Longitude' in line:
                # * NMEA Longitude = 129 44.77 W
                m = re.search(r'.* NMEA Longitude\s=\s(\d*)\s(\d*.\d*)\s.*',line)
                if m:
                    long = float(m.group(2))/60 + float(m.group(1))
                    # make longituede negative
                    castDict['Start Longitude [degrees]'] = -long
            if '* NMEA UTC' in line:
                # * NMEA UTC (Time) = Jul 30 2017 11:01:22
                # * NMEA UTC (Time) = Aug 03 2020  15:17:41
                # convert to: 2017-07-30T11:01:22.000Z
                m = re.search(r'.*NMEA\sUTC.*=.*([a-zA-Z]{3}).*(\d{2}).*(\d{4}).*(\d{2}:\d{2}:\d{2}).*',line)
                if m:
                    monthInt = '%02d' % dt.datetime.strptime(m.group(1), "%b").month
                    timeString = m.group(3) + '-' + str(monthInt) + '-' + m.group(2) + 'T' + m.group(4) + '.000Z'
                    castDict['Start Time [UTC]'] = timeString
            if re.search(r'.*Bottle.*Date.*',line):
                bottleHeader = line.split()
        else:
            dataLines = line.split()
            if re.search(r'.*\d{2}:\d{2}:\d{2}.*',dataLines[0]):
                bottleTimes.append(dataLines[0])
            elif re.search(r'.*[1-9]|1[1-9]|2[1-4].*',dataLines[0]):
                bottleData.append(dataLines)
    bottleHeader = bottleHeader[2:]
    castDict.setdefault('BottleData',{})
    for i in range(len(bottleData)):
        bottle = int(bottleData[i][0])
        castDict['BottleData'].setdefault(bottle,{})
        monthInt = '%02d' % dt.datetime.strptime(bottleData[i][1], "%b").month
        timeString = bottleData[i][3] + '-' + str(monthInt) + '-' + bottleData[i][2] + 'T' + bottleTimes[i] + '.000Z'
        castDict['BottleData'][bottle]['CTD Bottle Closure Time [UTC]'] = timeString
        dataList = bottleData[i][4:-1]
        for j in range(len(bottleHeader)):
            if bottleHeader[j] in bottleMap_dict:
                castDict['BottleData'][bottle][bottleMap_dict[bottleHeader[j]]] = dataList[j]
        if 'Ph' not in bottleHeader:
            castDict['BottleData'][bottle]['CTD pH'] = fillValue

    return castDict

In [12]:
def parseHeaderFile(hdrFile):
    hdrDict = {}
    
    f = open(hdrFile, 'r')
    hdrLines = f.readlines()
    headerTimes = []
    for line in hdrLines:
        if '* NMEA Latitude' in line:
            # * NMEA Latitude = 45 49.81 N
            m = re.search(r'.* NMEA Latitude\s=\s(\d*)\s(\d*.\d*)\s.*',line)
            if m:
                latitude = float(m.group(2))/60 + float(m.group(1))
                hdrDict['Start Latitude [degrees]'] = latitude
        if '* NMEA Longitude' in line:
            # * NMEA Longitude = 129 44.77 W
            m = re.search(r'.* NMEA Longitude\s=\s(\d*)\s(\d*.\d*)\s.*',line)
            if m:
                long = float(m.group(2))/60 + float(m.group(1))
                # make longituede negative
                hdrDict['Start Longitude [degrees]'] = -long
        if '* NMEA UTC' in line:
            # * NMEA UTC (Time) = Jul 30 2017 11:01:22
            # * NMEA UTC (Time) = Aug 03 2020  15:17:41
            # convert to: 2017-07-30T11:01:22.000Z
            m = re.search(r'.*NMEA\sUTC.*=.*([a-zA-Z]{3}).*(\d{2}).*(\d{4}).*(\d{2}:\d{2}:\d{2}).*',line)
            if m:
                monthInt = '%02d' % dt.datetime.strptime(m.group(1), "%b").month
                timeString = m.group(3) + '-' + str(monthInt) + '-' + m.group(2) + 'T' + m.group(4) + '.000Z'
                hdrDict['Start Time [UTC]'] = timeString
        
        
    return hdrDict

In [13]:
def parseROVfile(ROVfile,columns):
    ROVdata = []
    bottleTimes = []
    bottleData = []
    
    f = open(ROVfile, 'r')
    ROVlines = f.readlines()
    if ROVlines[0].startswith('IRLS'):
        ROV = 'ROPOS'
        dataLineStart = cruiseYear
        dateIndex = 0
        dateSplit = '-'
        for line in ROVlines:
            if line.startswith(cruiseYear):
                dataLine = line.split(',')
                data = dataLine[0].split(' ')
                timeString = date[0] + 'T' + date[1] + '.000Z'
                ROVdata.append([timeString,float(dataLine[columns['press']].strip(',')),float(dataLine[columns['temp']].strip(',')),float(dataLine[columns['cond']].strip(',')),float(dataLine[columns['sal']].strip(','))])       
    else:
        ROV = 'JASON'
        dataLineStart = 'CT2'
        dateIndex = 1
        dateSplit = '/'
        for line in ROVlines:
            if line.startswith('CT2'):
                dataLine = line.split(',')
                date = dataLine[1].replace('/','-')
                time = dataLine[2]
                timeString = date + 'T' + time + 'Z'
                ROVdata.append([timeString,float(dataLine[columns['press']].strip(',')),float(dataLine[columns['temp']].strip(',')),float(dataLine[columns['cond']].strip(',')),float(dataLine[columns['sal']].strip(','))])

    df_ROV = pd.DataFrame(ROVdata, columns = ['ROV Time', 'CTD Pressure [db]','CTD Temperature 1 [deg C]', 'CTD Conductivity 1 [S/m]','CTD Salinity 1 [psu]'])

    return df_ROV

In [14]:
def meanROVdata(sampleTime,meanWindow_mins,df_ROV):
    try:
        bottleTime = dt.datetime.strptime(sampleTime,"%Y-%m-%dT%H:%M:%S.%fZ")
    except:
        bottleTime = dt.datetime.strptime(sampleTime,"%Y-%m-%dT%H:%M:%SZ")
    windowStart = bottleTime - dt.timedelta(minutes=meanWindow_mins)
    try:
        df_ROV['ROV Time'] = pd.to_datetime(df_ROV['ROV Time'], format='%Y-%m-%dT%H:%M:%S.%fZ')
    except:
        try:
            df_ROV['ROV Time'] = pd.to_datetime(df_ROV['ROV Time'], format='%Y-%m-%dT%H:%M:%S.000Z')
        except:
            df_ROV['ROV Time'] = pd.to_datetime(df_ROV['ROV Time'], format='%Y-%m-%dT%H:%M:%SZ')  
    extractMask = (df_ROV['ROV Time'] > windowStart) & (df_ROV['ROV Time'] <= bottleTime)
    df_ROV = df_ROV.loc[extractMask].mean()
    
    return df_ROV

In [15]:
ROVfileList = []
ROVdict = {}

if ROV == 'ROPOS':
    rovCTDfile = '.csv'
    divePrefix = 'R'
    diveRegex = re.compile(r"(\d*)_ctd_dive_export")
elif ROV == 'JASON':
    rovCTDfile = '.ct2'
    divePrefix = 'J2'
    diveRegex = r'\/[A-Z]*.*-[0-9]*_(J2-\d*)'
       
for rootdir, dirs, files in os.walk(os.path.join(dataPath, ROVfilePath)):
    for file in files:
        if (rovCTDfile in file):
            ROVfile_full = os.path.join(rootdir, file)
            ROVfileList.append(ROVfile_full)

for ROVfile in ROVfileList:
    m = re.search(diveRegex, ROVfile)
    if m:
        cast = m.group(1)
        ROVdict[cast] = parseROVfile(ROVfile,ROVcolumnDict)            
    else:
        print('error retrieving cast number from file: ', ROVfile)
        
compileDict['ROVdict'] = ROVdict

In [16]:
bottleFileList = []
headerFileList = []
bottleDict = {}
headerDict = {}

for rootdir, dirs, files in os.walk(os.path.join(dataPath, CTDfilePath)):
    for file in files:
        if ('.btl' in file):
            btlFile_full = os.path.join(rootdir, file)
            bottleFileList.append(btlFile_full)
        if ('.hdr' in file):
            hdrFile_full = os.path.join(rootdir, file)
            headerFileList.append(hdrFile_full)

for btlFile in bottleFileList:
    m = re.search(r'.*(CTD-\d*).btl', btlFile)
    if m:
        cast = m.group(1)
        bottleDict[cast] = parseBottleFile(btlFile)            
    else:
        print('error retrieving cast number from file: ', btlFile)
        
for hdrFile in headerFileList:
    m = re.search(r'.*(CTD-\d*).hdr', hdrFile)
    if m:
        cast = m.group(1)
        headerDict[cast] = parseHeaderFile(hdrFile)
    else:
        print('error retrieving cast number from file: ', hdrFile)
        
compileDict['bottleDict'] = bottleDict
compileDict['headerDict'] = headerDict

In [17]:
# Import sheets from discrete summary compilation spreadsheet
df_casts = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'CastList',engine='openpyxl')

df_samples = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'SampleList',engine='openpyxl')
df_samples = df_samples[~df_samples['Cast'].isnull()]
df_samples.fillna(fillValue, inplace = True)

df_oxygen = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'OxygenLog_all',engine='openpyxl')
df_oxygen = df_oxygen[~df_oxygen['Unnamed: 4'].isnull()].reset_index()
new_header = df_oxygen.iloc[0] #grab the first row for the header
df_oxygen.fillna(fillValue, inplace = True)
df_oxygen = df_oxygen[1:].astype(str) #take the data less the header row, convert all to strings
df_oxygen.columns = new_header #set the header row as the df header
df_oxygen.rename(columns={'Oxygen Concentration (mL/L)':'Discrete Oxygen [mL/L]'}, inplace=True)
compileDict['df_oxygen'] = df_oxygen.filter(['Cast #', 'Niskin #', 'Sample Bottle #', 'Discrete Oxygen [mL/L]'])

df_chlorophyll = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'ChloroLog_all',engine='openpyxl')
df_chlorophyll = df_chlorophyll[~df_chlorophyll['Unnamed: 3'].isnull()].reset_index()
new_header = df_chlorophyll.iloc[0] #grab the first row for the header
df_chlorophyll.fillna(fillValue, inplace = True)
df_chlorophyll = df_chlorophyll[1:].astype(str) #take the data less the header row
df_chlorophyll.columns = new_header #set the header row as the df header
compileDict['df_chlorophyll'] = df_chlorophyll.filter(['Cast #','Niskin #','Sample Bottle #', 'Chlorophyll Vial', 'Volume Filtered', 'Acetone Volume'])

compileDict['df_CastLog_ROV'] = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'CastLog_ROV',engine='openpyxl')
compileDict['df_CTDflags'] = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'CTDflags',engine='openpyxl')
compileDict['df_CTDflags_ROV'] = pd.read_excel(os.path.join(dataPath, compileFile),sheet_name = 'CTDflags_ROV',engine='openpyxl')

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [18]:
# Create metadata dictionary from castList
metadataDict = {}
df_casts_columns = df_casts.columns.to_list()
df_casts_columns.remove('Cast')
df_casts.fillna(fillValue, inplace = True)
for index, row in df_casts.iterrows():
    if not isinstance(row['Cruise'], float):
        metadataDict.setdefault(row['Cast'],{})
        for col in df_casts_columns:
            metadataDict[row['Cast']][col] = row[col]
            
compileDict['metadataDict'] = metadataDict

In [19]:
### If generateSampleLogs is 'yes', produce sample logs for nutrients, salinity, chlorphyll, and DIC 
###
### Log formats share an identical header, and most column headers:
### CTD Station, CTD ID, Niskin Bottle Number, Target Depth (m), Sample Bottle Number
###
### For Chlorophyll samples, we add two additional columns: Volume Filtered (L), Extraction Volume (L)
### For DIC samples, we add two additional columns: In-Situ Temperature (avg), In-Situ Salinity (avg)
###
###
if 'yes' in generateSampleLogs:
    sampleLogTypes = {'nut':nutLogFile, 'sal':salLogFile, 'chloro':fluorLogFile, 'dic':dicLogFile}
    standardColumns = "%s,%s,%s,%s,%s" % ('CTD Station','CTD ID','Niskin Bottle Number','Target Depth (m)','Sample Bottle Number')

    for sampleType in sampleLogTypes:
        if 'nut' in sampleType:
            firstLine = 'Nutrient Analyses'
            #units = '\u03BCM'
            units = 'uM'
            columns = standardColumns
            bottleString = 'Nutrient Bottle Number'
        if 'sal' in sampleType:
            firstLine = 'Salinity Analyses'
            units = 'salinity'
            columns = standardColumns
            bottleString = 'Salinity Bottle Number'
        if 'chloro' in sampleType:
            firstLine = 'Chlorophyll Analyses'
            #units = '\u03BCg/L'
            units = 'ug/L'
            ChloroColumns = "%s,%s" % ('Volume Filtered (L)','Extraction Volume (L)')
            columns = standardColumns + ',' + ChloroColumns
            bottleString = 'Chlorophyll Bottle Number'
        if 'dic' in sampleType:
            firstLine = 'DIC Analysis for pCO2 and TCO2'
            units = 'none'
            dicColumns = "%s,%s" % ('In-Situ Temperature (avg)','In-Situ Salinity (avg)')
            columns = standardColumns + ',' + dicColumns
            bottleString = 'DIC Bottle Number'
            
        with open(sampleLogTypes[sampleType],'w') as f:
            f.write(firstLine + '\n')
            if 'none' in units:
                f.write('\n')
            else:
                f.write("%s,%s,%s,%s\n" % ('','','Units Required',units))
            f.write('\n')
            f.write("%s,%s\n" % ('Customer and Data Recipient','Wendi Ruef'))
            f.write("%s,%s\n" % ('Email','wruef@uw.edu'))
            f.write("%s,%s\n" % ('Office','206-221-6760'))
            f.write('\n')
            f.write("%s,%s,%s,%s\n" % ('Worktage',worktag,'Budget Contact','Jenny E'))
            f.write("%s,%s,%s,%s\n" % ('PI','D. Kelley', '','jenny9@uw.edu'))
            f.write("%s,%s,%s,%s\n" % ('','','','Office: 206-542-5279'))
            f.write('Total Samples' + '\n')
            f.write('\n')
            f.write('\n')
            f.write(columns + '\n')

            df_sub = df_samples[~df_samples[bottleString].str.match(fillValue, na = False)]
            df_sub_filtered = df_sub.filter(['Cast','Niskin/Bottle Position', 'Target Depth', bottleString])
            sampleRows = []
            for index, row in df_sub_filtered.iterrows():
                sampleRow = []
                sampleRow.append(str(compileDict['metadataDict'][row['Cast']]['Station']))
                sampleRow.append(str(compileDict['metadataDict'][row['Cast']]['CTD File']))
                sampleRow.append(str(row['Niskin/Bottle Position']))
                sampleRow.append(str(row['Target Depth']))
                if 'chloro' in sampleType:    
                    #print(row)
                    sampleRow.append(str(compileDict['df_chlorophyll'].loc[(compileDict['df_chlorophyll']['Sample Bottle #'].str.match(row[bottleString])) & (compileDict['df_chlorophyll']['Cast #'].str.match(row['Cast'])), 'Chlorophyll Vial'].values[0]))
                    sampleRow.append(str(compileDict['df_chlorophyll'].loc[(compileDict['df_chlorophyll']['Sample Bottle #'].str.match(row[bottleString])) & (compileDict['df_chlorophyll']['Cast #'].str.match(row['Cast'])), 'Volume Filtered'].values[0]))
                    sampleRow.append(str(compileDict['df_chlorophyll'].loc[(compileDict['df_chlorophyll']['Sample Bottle #'].str.match(row[bottleString])) & (compileDict['df_chlorophyll']['Cast #'].str.match(row['Cast'])), 'Acetone Volume'].values[0]))
                elif 'dic' in sampleType:
                    sampleRow.append(str(row[bottleString]))
                    if divePrefix not in row.Cast:
                        sampleRow.append(str(compileDict['bottleDict'][row.Cast]['BottleData'][row['Niskin/Bottle Position']]['CTD Temperature 1 [deg C]']))
                        sampleRow.append(str(compileDict['bottleDict'][row.Cast]['BottleData'][row['Niskin/Bottle Position']]['CTD Salinity 1 [psu]']))
                    elif divePrefix in row.Cast:
                        sampleTime = compileDict['df_CastLog_ROV'].loc[(compileDict['df_CastLog_ROV']['Dive'].str.match(row['Cast'])) & (compileDict['df_CastLog_ROV']['Niskin'].str.match(row['Niskin/Bottle Position'])), 'CTD Bottle Closure Time [UTC]'].values[0]
                        ROVctdData = meanROVdata(sampleTime,2,compileDict['ROVdict'][row['Cast']])
                        sampleRow.append(str(ROVctdData['CTD Temperature 1 [deg C]']))
                        sampleRow.append(str(ROVctdData['CTD Salinity 1 [psu]']))
                else:
                    sampleRow.append(str(row[bottleString]))
                sampleString = ",".join(sampleRow)
                f.write(sampleString + '\n')
            f.close()
                    

In [20]:
# Load sample data from outside labs

### Nutrients ###
if 'notYetReceived' not in nutFile:
    nutData = pd.read_excel(os.path.join(dataPath, nutFile),sheet_name = None, engine='openpyxl')

    df_list = []
    nut_header = ['index','bottle #','Discrete Phosphate [uM]','Discrete Silicate [uM]','Discrete Nitrate [uM]','Discrete Nitrite [uM]','Discrete Ammonium [uM]']
    for k,v in nutData.items():
        filteredData = v[(~v['Unnamed: 1'].isnull()) & (v['Unnamed: 2'].str.contains("CTD|R"))][['Unnamed: 1','Unnamed: 5','Unnamed: 6','Unnamed: 7','Unnamed: 8','Unnamed: 9']].reset_index()
        filteredData.columns = nut_header
        df_list.append(filteredData)
    
    compileDict['df_nuts'] = pd.concat(df_list, axis=0).fillna(fillValue)
else:
    compileDict['df_nuts'] = fillValue

### Salinity ###
if 'notYetReceived' not in salFile:
    salData = pd.read_excel(os.path.join(dataPath, salFile),sheet_name = None, engine='openpyxl')
    df_list = []
    sal_header = ['index','bottle #','Discrete Salinity [psu]']
    for k,v in salData.items():
        filteredData = v[~v['Unnamed: 4'].isnull()][['Unnamed: 3','Unnamed: 5']].reset_index()
        filteredData.columns = sal_header
        df_list.append(filteredData)
    
    compileDict['df_sal'] = pd.concat(df_list, axis=0).fillna(fillValue)
else:
    compileDict['df_sal'] = fillValue
    
### Chlorophyll ###
if 'notYetReceived' not in fluorFile:
    fluorData = pd.read_excel(os.path.join(dataPath, fluorFile),sheet_name = None, engine='openpyxl')
    df_list = []
    fluor_header = ['index','bottle #','Discrete Chlorophyll [ug/L]','Discrete Phaeopigment [ug/L]','Discrete Fo/Fa Ratio']
    for k,v in fluorData.items():
        filteredData = v[(~v['UW'].isnull()) & (v['UW'].apply(isinstance, args=(int,)))][['UW','Unnamed: 6','Unnamed: 7','Unnamed: 8']].reset_index()
        filteredData.columns = fluor_header
        df_list.append(filteredData)
    
    compileDict['df_fluor'] = pd.concat(df_list, axis=0).fillna(fillValue)
else:
    compileDict['df_fluor'] = fillValue
    
### DIC ###
if 'notYetReceived' not in dicFile:
    dicData = pd.read_excel(os.path.join(dataPath, dicFile),sheet_name = None, engine='openpyxl')
    df_list = []
    dic_header = ['index','bottle #','pCO2 Analysis Temp [deg C]','Calculated Alkalinity [umol/kg]',
                  'Discrete DIC [umol/kg]','Discrete pCO2 [uatm]','Calculated CO2aq [umol/kg]',
                  'Calculated Bicarb [umol/kg]','Calculated CO3 [umol/kg]','Calculated pH',
                  'Calculated Omega-C','Calculated Omega-A']
    for k,v in dicData.items():
        filteredData = v[(v[dicData['Sheet1'].keys()[0]].str.contains('DIC')) & (v['AnalysisT'].apply(isinstance, args=(float,)))][[dicData['Sheet1'].keys()[0],'AnalysisT','alk (µeq/kg)','TCO2 (µmol/kg)','pco2_in situ (µatm)','co2aq (µmol/kg)','bicarb (µmol/kg)','co3 (µmol/kg)','pHt','omega-C','omega-A']].reset_index()
        filteredData.columns = dic_header
        df_list.append(filteredData)
    compileDict['df_DIC'] = pd.concat(df_list, axis=0).fillna(fillValue)
else:
    compileDict['df_DIC'] = fillValue


  warn("""Cannot parse header or footer so it will be ignored""")


In [21]:
### Define columns (headers) and dataRow list
headers = [k for k,v in headerMap_dict.items()]
dataRows = []

In [22]:
### append data list with casts that are data only, no corresponding discrete samples
for index, row in df_casts.iterrows():
    dataRow = []
    if 'Data cast only, no Niskins triggered' in row['CTD File Flag']:
        for column in headers:
            source = headerMap_dict[column]
            if ',' in source:
                sources = source.split(',')
                if 'CTD' in row.Cast:
                    source = sources[0]
                elif 'J2' or 'R' in row.Cast:
                    source = sources[1]   
            if 'metadataDict' in source:
                if 'Flag' in column:
                    flagStrings = compileDict[source][row.Cast][column]
                    if fillValue in flagStrings:
                        dataCell = fillValue
                    else:
                        dataCell = flagBits(flagStrings, column)
                else:
                    dataCell = compileDict[source][row.Cast][column]
            else:
                if 'Cast' in column:
                    dataCell = row.Cast
                elif any(columnString in column for columnString in ['Start Latitude [degrees]','Start Longitude [degrees]','Start Time [UTC]']):
                    dataCell = compileDict['headerDict'][row.Cast][column]
                else:
                    dataCell = fillValue
            
            dataRow.append(dataCell)
        dataRows.append(dataRow)


In [23]:
### apppend data list with each row of discrete samples
for index, row in df_samples.iterrows():
    dataRow = []
    for column in headers:
        source = headerMap_dict[column]
        if ',' in source:
            sources = source.split(',')
            if 'CTD' in row.Cast:
                source = sources[0]
            elif 'J2' or 'R' in row.Cast:
                source = sources[1]      
        if 'df_samples' in source:
            if 'Flag' in column:
                if fillValue in row[column]:
                    dataCell = fillValue
                else:
                    flagStrings = row[column]
                    dataCell = flagBits(flagStrings, column)
            else:
                dataCell = row[column]
            
        elif any(sourceString in source for sourceString in ['metadataDict','bottleDict']):
            if 'bottleDict_bottle' in source:
                dataCell = compileDict['bottleDict'][row.Cast]['BottleData'][row['Niskin/Bottle Position']][column]  
            else:
                if 'Flag' in column:
                    flagStrings = compileDict[source][row.Cast][column]
                    if fillValue in flagStrings:
                        dataCell = fillValue
                    else:
                        dataCell = flagBits(flagStrings, column)
                else:
                    dataCell = compileDict[source][row.Cast][column]
            
        elif any(sourceString in source for sourceString in ['df_CTDflags','df_CTDflags_ROV']):
            df = compileDict[source]
            cell = df.loc[df['Parameter'] == column, 'Parameter Flag']
            if len(cell) > 0:
                flagStrings = cell.values[0]
                dataCell = flagBits(flagStrings,column)
            else:
                dataCell = fillValue
                
        elif 'df_CastLog_ROV' in source:
            df = compileDict[source]
            cell = df.loc[((df['Dive'] == row.Cast) & (df['Niskin'] == row['Niskin/Bottle Position'])), column]
            dataCell = cell.values[0]

        elif 'ROVdict' in source:
            df = compileDict['df_CastLog_ROV']
            bottleTime = df.loc[((df['Dive'] == row.Cast) & (df['Niskin'] == row['Niskin/Bottle Position'])), 'CTD Bottle Closure Time [UTC]'].values[0]
            cell = meanROVdata(bottleTime,2,ROVdict[row.Cast])
            dataCell = cell[column]
            
        elif 'fill' in source:
            dataCell = fillValue

        elif 'df_oxygen' in source:
            if row['Oxygen Bottle Number'] != fillValue:
                df = compileDict[source]
                if not isinstance(df, str):
                    cell = df.loc[( (df['Sample Bottle #'] == str(row['Oxygen Bottle Number'])) &
                                  (df['Cast #'] == str(row.Cast)) & 
                                  (df['Niskin #'] == str(row['Niskin/Bottle Position'])) ), column]

                    if len(cell) > 0:
                        dataCell = cell.values[0]
                    else:
                        print('error retrieving oxygen values...')
                        print(str(row['Oxygen Bottle Number']), row.Cast, row['Niskin/Bottle Position'])
                        dataCell = fillValue

                else:
                    dataCell = fillValue
            else:
                dataCell = fillValue

        elif 'df_fluor' in source:
            if row['Chlorophyll Bottle Number'] != fillValue:
                df_chloro = compileDict['df_chlorophyll']
                vialNumber = df_chloro.loc[( df_chloro['Sample Bottle #'].str.match(row['Chlorophyll Bottle Number']) &
                                           (df_chloro['Cast #'] == str(row.Cast)) &
                                           (df_chloro['Niskin #'] == str(row['Niskin/Bottle Position'])) ), 'Chlorophyll Vial'].values[0]
                df = compileDict[source]
                if not isinstance(df, str):
                    df['bottle #'] = df['bottle #'].astype(str)
                    cell = df.loc[df['bottle #'] == vialNumber, column]
                    if len(cell) > 0:
                        dataCell = cell.values[0]
                    else:
                        print('error retrieving chlorphyll values...')
                        dataCell = fillValue
                else:
                    dataCell = fillValue
            else:
                dataCell = fillValue
       
        elif 'df_nuts' in source:
            if row['Nutrient Bottle Number'] != fillValue:
                df = compileDict[source]
                if not isinstance(df, str):
                    df['bottle #'] = df['bottle #'].astype(str)
                    cell = df.loc[df['bottle #'] == str(row['Nutrient Bottle Number']).split('-')[1], column]
                    if len(cell) > 0:
                        dataCell = cell.values[0]
                    else:
                        print('error retriving nutrient values...')
                        print(str(row['Nutrient Bottle Number']))
                        print(df['bottle #'])
                        dataCell = fillValue
                else:
                    dataCell = fillValue
            else:
                dataCell = fillValue
            
        elif 'df_sal' in source:
            if row['Salinity Bottle Number'] != fillValue:
                df = compileDict[source]
                if not isinstance(df, str):
                    df['bottle #'] = df['bottle #'].astype(str)
                    salBottleNumber = str(row['Salinity Bottle Number']).split('-')[1]
                    if salBottleNumber.startswith("0"):
                        salBottleNumber = salBottleNumber[1:]
                    cell = df.loc[df['bottle #'] == salBottleNumber, column]
                    if len(cell) > 0:
                        dataCell = cell.values[0]
                    else:
                        print('error retrieving salinity values...')
                        print(str(row['Salinity Bottle Number']))
                        dataCell = fillValue
                else:
                    dataCell = fillValue
            else:
                dataCell = fillValue
            
        elif 'df_DIC' in source:
            NA_DIC_vars = ['Discrete Alkalinity [umol/kg]','Discrete pH [Total scale]',
                           'pH Analysis Temp [deg C]','Calculated DIC [umol/kg]',
                          'Calculated pCO2 [uatm]']
            if any(var in column for var in NA_DIC_vars):
                dataCell = fillValue
            else:
                if row['DIC Bottle Number'] != fillValue:
                    df = compileDict[source]
                    if not isinstance(df, str):
                        #bottleString = row['DIC Bottle Number'].replace("-", " ")
                        bottleString = row['DIC Bottle Number']
                        cell = df.loc[df['bottle #'].str.contains(bottleString), column]
                        if len(cell) > 0:
                            dataCell = cell.values[0]
                        else:
                            print('error retrieving carbon values...')
                            print(str(row['DIC Bottle Number']))
                            print(df['bottle #'])
                            dataCell = fillValue
                    else:
                        dataCell = fillValue
                else:
                    dataCell = fillValue
                       
        dataRow.append(dataCell)
    dataRows.append(dataRow)
        

  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask]

  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask]

  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask]

  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()
  df_ROV = df_ROV.loc[extractMask].mean()


In [24]:
### Sort list by Cast Number and Niskin
dataRows = sorted(dataRows, key=operator.itemgetter(6,11))  


In [25]:
# append CTD parameter flags if discrete sample is present
# discrete samples for salinity, oxygen, chlorophyll, pH

discreteCTDmatch = {'Discrete Salinity [psu]':['CTD Conductivity 1 Flag','CTD Conductivity 2 Flag'],
                   'Discrete Oxygen [mL/L]':['CTD Oxygen Flag'],
                   'Discrete Chlorophyll [ug/L]':['CTD Fluorescence Flag'],
                   'Discrete pH [Total scale]':['CTD pH Flag'],
                   'Calculated pH':['CTD pH Flag']}

for row in dataRows:
    for key in discreteCTDmatch:
        discreteIndex = headers.index(key)
        if fillValue not in str(row[discreteIndex]):
            for flag in discreteCTDmatch[key]:
                flagIndex = headers.index(flag)
                if fillValue not in row[flagIndex]:
                    newFlag = list(row[flagIndex])
                    newFlag[9] = '1'
                    newFlagString = "".join(newFlag)
                    row[flagIndex] = newFlagString
    

In [26]:
with open(discreteSummaryFile, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(dataRows)

In [27]:
### Output list of CTD files for re-naming list in README
ctdFiles = []
for key, value in compileDict['metadataDict'].items():
    if fillValue not in key:
        ctdFiles.append(compileDict['metadataDict'][key]['CTD File'])
        

In [28]:
### Output all Notes from CastList and SampleList for README

notes = []
for index, row in df_samples.iterrows():
    if fillValue not in row.Notes:
        cruise = compileDict['metadataDict'][row.Cast]['Cruise']
        noteLine = cruise + ', ' + row.Cast + ', ' + 'Niskin ' + str(row['Niskin/Bottle Position']) + ': ' + row.Notes
        notes.append(noteLine)

for key,value in compileDict['metadataDict'].items():
    if fillValue not in compileDict['metadataDict'][key]['Notes']:
        cruise = compileDict['metadataDict'][key]['Cruise']
        note = compileDict['metadataDict'][key]['Notes']
        noteLine = cruise + ', ' + key + ', ' + note
        notes.append(noteLine)


In [29]:
f = open(READMEfile, "w")
f.write('File Mapping:\n')
for line in ctdFiles:
    f.write(line + "\n")

f.write('Summary Notes:\n')
for line in notes:
    f.write(line + "\n")

f.close()