In [1]:
import numpy as np  # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re

In [2]:
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/Nina/All 2016 SoG bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/Nina/2015-2018 Abs phyto groupsCorrected.xlsx'

pathOut='/ocean/eolson/MEOPAR/obs/Nina/bottlePhytoMerged2016.csv'

In [3]:
#formatting function to get year-(3digit) format
def fmtCruise(istr):
    if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): 
        sp=re.split('-',istr)
        rstr=sp[0]+'-0'+sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
        rstr=istr
    else:
        raise ValueError('Input had unexpected format:',istr)
    return rstr

In [4]:
# get names of sheets in notebook
with pd.ExcelFile(pathBottle) as xl:
    sheets=xl.sheet_names
print(sheets)

['2016-05', '2016-47', '2016-07', '2016-62', '2016-010', '2016-071']


In [5]:
# load each sheet in the 2015 bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,
                      na_values=(-99,-99.9)) # read each sheet; include additional na values
    df0['Cruise']=fmtCruise(sheet)  # create and populate Cruise column based on sheet name
    dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))

Reading sheet 2016-05
Reading sheet 2016-47
Reading sheet 2016-07
Reading sheet 2016-62
Reading sheet 2016-010
Reading sheet 2016-071
removed empty columns: {'Oxygen:Dissolved.1', 'Flag:Ammonium', 'Ammonium'}


In [6]:
# list the column names in the resulting table
print(dfbot.keys())

Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',
       'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',
       'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'Pressure',
       'Temperature:Secondary', 'Transmissivity', 'Fluorescence:URU:Seapoint',
       'PAR', 'PAR:Reference', 'pH:SBE:Nominal', 'Salinity:T1:C1',
       'Oxygen:Dissolved:SBE', 'Temperature:Draw', 'Salinity:Bottle',
       'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',
       'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',
       'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite',
       'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',
       'Flag:Phosphate', 'Cruise', 'PAR1', 'Sample_Number.1',
       'Number_of_bin_records', 'YYYY/MM/DD HH:MM:SS', 'ADM:MISSION',
       'ADM:PROJECT', 'Pressure [decibar]',
       'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',
       'Fluorescence:URU:Seapoint [mg/m^3]', 

In [7]:
temp=dfbot.loc[0:10,['Oxygen:Dissolved','Oxygen:Dissolved.1',
                'Oxygen:Dissolved [mL/L]','Oxygen:Dissolved [umol/kg]']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [8]:
[r['Oxygen:Dissolved [mL/L]'] if not pd.isna(r['Oxygen:Dissolved [mL/L]']) else  r['Oxygen:Dissolved'] if not pd.isna(r['Oxygen:Dissolved']) else r['Oxygen:Dissolved.1'] for i,r in temp.iterrows()]

[1.535, 4.964, 5.424, 5.293, 5.561, 6.269, 6.406, 8.517, 11.634, 4.374, 4.397]

In [9]:
def subval(idf,colList):
    # first value in colList should be the column you are going to keep
    # follow with other columns that will be used to fill in when that column is NaN
    # in order of precedence
    if len(colList)==2:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
                         else  r[colList[1]] for i,r in idf.iterrows()]
    elif len(colList)==3:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
                         else  r[colList[1]] if not pd.isna(r[colList[1]]) \
                         else r[colList[2]] for i,r in idf.iterrows()]
    else:
        raise NotImplementedError('Add to code to handle this case')
    return idf

In [10]:
# there are some duplicate columns here; handle them:
print(np.max(np.abs(dfbot['Sample_Number']-dfbot['Sample_Number.1'])),
      ': zero means no difference between Sample_Number and Sample_number.1 columns containing data')
# check there are no rows missing a value in 'Sample_Number':
print(np.sum(pd.isna(dfbot['Sample_Number'])),
      ': zero means no missing values in Sample_Number')
# -> extra copy is repeat so drop it
# 'ADM:MISSION' contains cruise id so drop it
# 'YYYY/MM/DD HH:MM:SS' data should be in 'FIL:START TIME YYYY/MM/DD HH:MM:SS'; move it there:
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))
dfbot=subval(dfbot,('Temperature:Secondary [deg C (ITS90)]','Temperature:Secondary'))
dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]','Fluorescence:URU:Seapoint'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))
dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))
dfbot=subval(dfbot,('Salinity:T1:C1 [PSS-78]','Salinity:T1:C1'))
dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))
dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))
dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]','Oxygen:Dissolved:SBE'))
dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]','Oxygen:Dissolved'))
dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]','Nitrate_plus_Nitrite'))
dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))
dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR1'))

0.0 : zero means no difference between Sample_Number and Sample_number.1 columns containing data
0 : zero means no missing values in Sample_Number


In [11]:
# drop repetetive/unecessary columns:
dfbot.drop(labels=['Sample_Number.1','ADM:MISSION','YYYY/MM/DD HH:MM:SS','Transmissivity',
                   'Pressure','Temperature:Secondary','Fluorescence:URU:Seapoint','PAR',
                   'PAR:Reference','Salinity:T1:C1','Chlorophyll:Extracted',
                   'Oxygen:Dissolved:SBE','Oxygen:Dissolved',
                   'Nitrate_plus_Nitrite','Silicate','PAR1'],axis=1,inplace=True)

In [12]:
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    try:
        x =int(val)
    except ValueError:
        x=np.nan
    return x

In [13]:
# load the 2015 phytoplankton data with the following options:
#   sheet_name='2015 CHEMTAX abs results'  -> choose the 2015 sheet
#   usecols='A:I,T:AC'   -> read only columns A:I and T:AC from the Excel sheet
#   skiprows=2     -> start reading at the 3rd row of the sheet, 
#                     which contains the column headings
#   converters={'Index': convertIndex,}   -> apply the function defined above to the Index column
#   verbose = True   -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2016 CHEMTAX abs results',usecols='A:I,U:AD',
                      skiprows=2,converters={'Index': convertIndex,},
                      verbose=True)

Reading sheet 2016 CHEMTAX abs results


In [14]:
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

Unnamed: 0,Bin #,Index,Subgroup,Cruise,Month,Station,Sample#,rep,depth,Diatoms-1.1,Diatoms-2.1,Prasinophytes.1,Cryptophytes.1,Dinoflagellates-1.1,Haptophytes.1,Dictyo.1,Raphido.1,Cyanobacteria.1,TchlA.1
48,5,49.0,1,2016-05,April,28,304,B,0,,,,,,,,,,
49,5,50.0,1,2016-05,April,SC04,308,A,0,7.74952,0.160529,0.16922,0.00871706,0.00373486,0,0.00494186,0,0,8.09666
50,5,51.0,1,2016-05,April,SC04,308,B,0,,,,,,,,,,
51,Absolute Pigment Compositions - Bin # 7,,,,,From Sheet: OutR6,,,,,,,,,,,,,
52,Tchl_a,,,,,,,,,,,,,,,,,,
53,Bin #,,Subgroup,Cruise,Month,Station,Sample#,rep,depth,Diatoms-1,Diatoms-2,Prasinophytes,Cryptophytes,Dinoflagellates-1,Haptophytes,Dictyo,Raphido,Cyanobacteria,TchlA
54,7,52.0,2,2016-47,June,14,438,A,0,1.10234,0.558928,0.0906997,0.177926,0.13286,0.0825219,0.0351295,0,0.0221866,2.20259
55,7,53.0,2,2016-47,June,14,438,B,0,,,,,,,,,,
56,7,54.0,2,2016-47,June,11,449,A,0,0.428352,0.255826,0.100261,0.0965965,0.11901,0.149184,0.0185796,0,0.0150108,1.18282
57,7,55.0,2,2016-47,June,11,449,B,0,,,,,,,,,,


In [15]:
# now, drop any rows from the table that have NaN values in either of the columns
#  'Index' or 'TchlA (ug/L)'
# This is why we applied a function to the Index column to make sure all 
#  non-numeric Index values would have a consistent NaN entry, making them easy to identify
#  and remove
dfPhyto.dropna(subset=['Index', 'TchlA.1'],how='any',inplace=True)

In [16]:
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)

In [17]:
# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]

In [18]:
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

Unnamed: 0,Bin #,Index,Subgroup,Cruise,Month,Station,Sample#,rep,depth,Diatoms-1.1,Diatoms-2.1,Prasinophytes.1,Cryptophytes.1,Dinoflagellates-1.1,Haptophytes.1,Dictyo.1,Raphido.1,Cyanobacteria.1,TchlA.1
48,14,96.0,3,2016-007,June,42,165,A,0,2.5736,0.110744,0.217416,0.305346,0.269703,0.254544,0.606853,0.0456593,0.00717568,4.39105
49,14,98.0,3,2016-007,June,39,187,A,0,0.787673,0.0461115,0.0921613,0.178211,0.126162,0.149745,0.185453,0.0464751,0.0209057,1.6329
50,14,100.0,3,2016-007,June,27,204,A,0,0.275825,0.252672,0.0394,0.0965363,0.0735499,0.171149,0.382332,0.0,0.0462792,1.33774
51,14,102.0,3,2016-007,June,2,220,A,0,0.510704,0.833001,0.0613842,0.23729,0.257099,0.14984,1.03367,0.0,0.0420515,3.12504
52,14,104.0,3,2016-007,June,3,240,A,0,0.0147476,0.483363,0.0727701,0.124406,0.120476,0.191926,0.579239,0.0912153,0.0237403,1.70188
53,14,106.0,3,2016-007,June,BS11,248,A,0,2.17935,0.161141,0.118986,0.116712,0.0760726,1.27153,0.0,0.0649914,0.0336102,4.0224
54,14,108.0,3,2016-007,June,6,262,A,0,0.0,0.00500107,0.1578,0.237657,0.0648448,0.780472,0.0,0.00572125,0.021938,1.27343
55,14,110.0,3,2016-007,June,9,275,A,0,0.0,0.168997,0.133203,0.26119,0.0583605,0.527089,0.0770829,0.0042515,0.029262,1.25944
56,14,112.0,3,2016-007,June,12,292,A,0,0.0,0.262245,0.171069,0.313105,0.0966463,0.650261,0.352208,0.0137084,0.0162181,1.87546
57,14,114.0,3,2016-007,June,14,306,A,0,0.0,0.0,0.174892,0.427669,0.107881,1.25327,0.0,0.0,0.00934217,1.97305


In [19]:
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns; 
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)

In [20]:
dfPhyto

Unnamed: 0,Bin #,Index,Subgroup,Cruise,Month,Station,Sample#,rep,depth,Diatoms-1,Diatoms-2,Prasinophytes,Cryptophytes,Dinoflagellates-1,Haptophytes,Dictyo,Raphido,Cyanobacteria,TchlA
0,5,1.0,1,2016-005,April,59,23,A,0,5.1916,0.0946848,0.0488779,0.0387973,0.00389556,0,0.00372726,0.00292029,0.00253688,5.38704
1,5,3.0,1,2016-005,April,102,37,A,0,5.37123,1.15238,0.40413,0.481215,0.158845,0.254655,0.0502024,0.0705506,0,7.9432
2,5,5.0,1,2016-005,April,75,51,A,0,16.161,0.106123,0.412824,0.321249,0.0889,0,0,0,0,17.0901
3,5,7.0,1,2016-005,April,72,64,A,0,2.09709,0.463744,0.365898,0.319666,0.0865536,0.0616627,0.00386904,0.0241544,0,3.42264
4,5,9.0,1,2016-005,April,69,76,A,0,4.59361,0.0264039,0.191864,0.129161,0.0562811,0,0.000159775,0,0,4.99748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,16,230.0,6,2016-071,Nov,9,250,A,0,0.166163,0.00989967,0.0879108,0.00274454,0.000365983,0,0.00747525,0,0,0.274559
116,16,232.0,6,2016-071,Nov,6,264,A,0,0.192053,0.021066,0.140119,0.067455,0.000266122,0,0.00722367,0,0.00478627,0.432969
117,16,234.0,6,2016-071,Nov,3,281,A,0,0.146044,0.008746,0.0774312,0.0189927,0.0204091,0,0.00622126,0,0,0.277844
118,16,236.0,6,2016-071,Nov,2,297,A,0,0.101162,0,0.193608,0.194109,0.0462191,0.0261594,0.0153874,0.0639245,0.00386339,0.644433


In [21]:
# This is the important step- join the two tables ('left' and 'right'), 
#  matching the cruise IDs and sample numbers
#   how='outer'  -> all rows from both the left and the right tables will be included, 
#                   even if they cannot be matched; this makes it easy to check for 
#                   unmatched data later
#   left_on  specifies the name of the column to match in the left table (dfbot) 
#   right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto,  how='outer', 
                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])

In [22]:
# show the column names in the resulting table
dfout.keys()

Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',
       'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',
       'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'pH:SBE:Nominal',
       'Temperature:Draw', 'Salinity:Bottle', 'Flag:Salinity:Bottle',
       'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',
       'Flag:Oxygen:Dissolved', 'Flag:Nitrate_plus_Nitrite', 'Flag:Silicate',
       'Phosphate', 'Flag:Phosphate', 'Cruise', 'Number_of_bin_records',
       'ADM:PROJECT', 'Pressure [decibar]',
       'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',
       'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',
       'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]',
       'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',
       'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]',
       'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',
       'Silicate [umol/L]', 'P

### Checks

In [23]:
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])


In [24]:
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']

In [25]:
np.unique(temp.icount)

array([1])

In [26]:
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,icount
Cruise,Sample#,Unnamed: 2_level_1


In [27]:
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

Unnamed: 0_level_0,Unnamed: 1_level_0,icount
Cruise,Sample#,Unnamed: 2_level_1


In [28]:
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,icount
Cruise,Sample_Number,Unnamed: 2_level_1


In [29]:
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

Unnamed: 0_level_0,Unnamed: 1_level_0,icount
Cruise,Sample#,Unnamed: 2_level_1


In [30]:
# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)

(1420, 120, 1420)

In [31]:
# Check that the number of cells with data in the 'Cyanobacteria' column is 
#  the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)

(120, 120)

In [32]:
# If there were data rows from the phytoplankton table that were not matched to 
#  rows from the bottle table, their indices from the phytoplankton table would be 
#  displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])

Series([], Name: Index, dtype: float64)


In [33]:
# drop repetetive/unecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep',
                   'depth',],axis=1,inplace=True)

In [34]:
# truncate phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
       'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
       'TchlA'):
    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column

In [35]:
dfout['Cyanobacteria']

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
1415      NaN
1416      NaN
1417      NaN
1418    0.013
1419      NaN
Name: Cyanobacteria, Length: 1420, dtype: float64

In [36]:
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)

In [37]:
dfout.keys()

Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',
       'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',
       'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'pH:SBE:Nominal',
       'Temperature:Draw', 'Salinity:Bottle', 'Flag:Salinity:Bottle',
       'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',
       'Flag:Oxygen:Dissolved', 'Flag:Nitrate_plus_Nitrite', 'Flag:Silicate',
       'Phosphate', 'Flag:Phosphate', 'Cruise', 'Number_of_bin_records',
       'ADM:PROJECT', 'Pressure [decibar]',
       'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',
       'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',
       'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]',
       'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',
       'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]',
       'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',
       'Silicate [umol/L]', 'P