# Merge Louise's bottle data with data from ECO2A cruise
Created by Ivan Lima on Mon Mar  7 2022 15:57:46 -0500

In [1]:
import pandas as pd
import numpy as np
import os, datetime, warnings
# Stamp the notebook with the wall-clock time of this run.
last_updated = datetime.datetime.now().ctime()
print('Last updated on {}'.format(last_updated))

Last updated on Wed Apr  6 15:09:53 2022


## Read ECO2A data

In [2]:
# Load the ECO2A cruise file; values of -999 are read as missing (NaN).
df_ecoa2 = pd.read_csv(
    'data/eco2a_satellite_data.csv',
    index_col=0,
    parse_dates=['Date'],
    na_values=-999,
)

# Remove bad/questionable data points: keep only rows where both the DIC and
# the TA quality flags are 2 or 6.
accepted_flags = [2, 6]
flags_ok = df_ecoa2.DIC_FLAG.isin(accepted_flags) & df_ecoa2.TA_FLAG.isin(accepted_flags)
df_ecoa2 = df_ecoa2[flags_ok]

df_ecoa2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1032 entries, 0 to 2479
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   EXPOCODE       1032 non-null   object        
 1   Accession      1032 non-null   int64         
 2   Cruise_ID      1032 non-null   object        
 3   Date           1032 non-null   datetime64[ns]
 4   Year           1032 non-null   int64         
 5   Month          1032 non-null   int64         
 6   Day            1032 non-null   int64         
 7   Time_UTC       1032 non-null   object        
 8   Depth          1032 non-null   float64       
 9   Temperature    1032 non-null   float64       
 10  Salinity       1032 non-null   float64       
 11  Oxygen         941 non-null    float64       
 12  DIC            1032 non-null   float64       
 13  TA             1032 non-null   float64       
 14  DIC_FLAG       1032 non-null   int64         
 15  TA_FLAG        1032 n

## Read Louise's bottle data

In [3]:
# Columns to keep from the bottle file (the result follows this column order).
bot_cols = [
    'EXPOCODE', 'Accession', 'Cruise_ID', 'Date', 'Year', 'Month', 'Day', 'Time_UTC',
    'Latitude', 'Longitude', 'Depth', 'Salinity', 'Temperature', 'Oxygen', 'DIC',
    'DIC_FLAG', 'TA', 'TA_FLAG', 'pCO2_yearave', 'pCO2_monthave', 'bottom_depth',
    'ADT', 'UGOS', 'VGOS', 'SLA', 'UGOSA', 'VGOSA', 'SST', 'SST_hires', 'Chl', 'KD490',
]

# Read the bottle data (first CSV column becomes the index, 'Date' is parsed
# as datetime) and restrict it to the columns listed above.
df_bottle = pd.read_csv(
    'data/bottle_satellite_data.csv',
    index_col=0,
    parse_dates=['Date'],
)[bot_cols]

df_bottle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2748 entries, 0 to 3039
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   EXPOCODE       1473 non-null   object        
 1   Accession      101 non-null    float64       
 2   Cruise_ID      2029 non-null   object        
 3   Date           2748 non-null   datetime64[ns]
 4   Year           2748 non-null   int64         
 5   Month          2748 non-null   int64         
 6   Day            2748 non-null   int64         
 7   Time_UTC       2668 non-null   object        
 8   Latitude       2748 non-null   float64       
 9   Longitude      2748 non-null   float64       
 10  Depth          2748 non-null   float64       
 11  Salinity       2748 non-null   float64       
 12  Temperature    2748 non-null   float64       
 13  Oxygen         2748 non-null   float64       
 14  DIC            2748 non-null   float64       
 15  DIC_FLAG       2454 n

## Concatenate ECO2A cruise data and Louise's bottle data  

In [4]:
# Stack the two datasets row-wise (bottle rows first, then ECO2A rows) and
# replace the original index labels with a fresh 0..n-1 range.
frames = [df_bottle, df_ecoa2]
df_all = pd.concat(frames, axis=0, ignore_index=True)

df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3780 entries, 0 to 3779
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   EXPOCODE       2505 non-null   object        
 1   Accession      1133 non-null   float64       
 2   Cruise_ID      3061 non-null   object        
 3   Date           3780 non-null   datetime64[ns]
 4   Year           3780 non-null   int64         
 5   Month          3780 non-null   int64         
 6   Day            3780 non-null   int64         
 7   Time_UTC       3700 non-null   object        
 8   Latitude       3780 non-null   float64       
 9   Longitude      3780 non-null   float64       
 10  Depth          3780 non-null   float64       
 11  Salinity       3780 non-null   float64       
 12  Temperature    3780 non-null   float64       
 13  Oxygen         3689 non-null   float64       
 14  DIC            3780 non-null   float64       
 15  DIC_FLAG       3486 n

## Clean merged dataset & write it to CSV file

In [5]:
# Clean the merged dataset, add log-transformed Chl/KD490 columns, and write
# the result to CSV. The cell rebinds `df_all`; every step below is safe to
# re-run on an already-cleaned frame.

df_all['bottom_depth'] = df_all.bottom_depth.abs()  # convert negative to positive values

# Remove points in very shallow water (Chesapeake Bay & Buzzards Bay).
# .copy() yields an independent frame so the column assignments further down
# do not trigger SettingWithCopyWarning on a filtered slice.
df_all = df_all[df_all.bottom_depth > 8].copy()

# Flag anomalous O2 values: Oxygen above 500 anywhere, or above 300 where
# Depth exceeds 200. Rows with missing Oxygen compare as False and are kept.
df_all['outlier'] = (df_all.Oxygen > 500) | ((df_all.Oxygen > 300) & (df_all.Depth > 200))
df_all = df_all[~df_all.outlier].copy()

# Remove record with bad temperature value (index label 1736 of the merged
# frame). errors='ignore' avoids a KeyError when the cell is re-run after the
# row has already been dropped.
# NOTE(review): the hard-coded label depends on the concatenation order and
# input files staying the same -- confirm if either input changes.
df_all = df_all.drop(index=1736, errors='ignore')

df_all['log_Chl'] = np.log(df_all.Chl)      # log-transformed Chl
df_all['log_KD490'] = np.log(df_all.KD490)  # log-transformed KD490

# write cleaned data to CSV file
df_all.to_csv('data/bottle_satellite_data_clean.csv')

df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3623 entries, 0 to 3779
Data columns (total 34 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   EXPOCODE       2488 non-null   object        
 1   Accession      1124 non-null   float64       
 2   Cruise_ID      3044 non-null   object        
 3   Date           3623 non-null   datetime64[ns]
 4   Year           3623 non-null   int64         
 5   Month          3623 non-null   int64         
 6   Day            3623 non-null   int64         
 7   Time_UTC       3543 non-null   object        
 8   Latitude       3623 non-null   float64       
 9   Longitude      3623 non-null   float64       
 10  Depth          3623 non-null   float64       
 11  Salinity       3623 non-null   float64       
 12  Temperature    3623 non-null   float64       
 13  Oxygen         3533 non-null   float64       
 14  DIC            3623 non-null   float64       
 15  DIC_FLAG       3329 n