# Using pandas dataframes
This notebook demonstrates how to load data into a pandas dataframe and do basic operations with the dataframe data.

## Loading BB3 Cruise data into a pandas df

In [2]:
# === using `import ... as ...`
# you can import pandas as usual
import pandas
# and use it
print(f"pandas.__version: {pandas.__version__}")

# but people often give pandas another name - `pd` when imported
# to do this you can use `import ... as ...`
import pandas as pd
# now we can make the same code more succinct
print(f"pd.__version__: {pd.__version__}")

# pd and pandas are exactly the same *object*.
print("pd =?= pandas")
print(f"{pd == pandas}")

# if we add something to the pd object
pd.my_randomly_named_attribute = "This is Tylar's special string object"
# the two objects are still equal
print(f"Did adding the attribute change `pd` but not `pandas`? {pd == pandas}")
# we can use the attribute we put on `pd` on the `pandas`
print(f"pandas.my_randomly_named_attribute: {pandas.my_randomly_named_attribute}")

# you can even import it again with a different name
import pandas as whatever_i_feel_like
print(f"whatever_i_feel_like.__version__: {whatever_i_feel_like.__version__}")
# and the new import is still the exact same object as `pd` and `pandas`
print(f"whatever_i_feel_like.my_randomly_named_attribute: {pandas.my_randomly_named_attribute}")
# You can get creative with this but... please don't.
# Stick to the original name or common usages like `pandas as pd`
# Another common one is `numpy as np`
import numpy as np
print(f"np.__version__: {np.__version__}")
# and xarray
import xarray as xr
print(f"xr.__version__: {xr.__version__}")


pandas.__version: 1.3.3
pd.__version__: 1.3.3
pd =?= pandas
True
Did adding the attribute change `pd` but not `pandas`? True
pandas.my_randomly_named_attribute: This is Tylar's special string object
whatever_i_feel_like.__version__: 1.3.3
whatever_i_feel_like.my_randomly_named_attribute: This is Tylar's special string object
np.__version__: 1.21.3


  "Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. "


xr.__version__: 0.19.0


In [3]:
import os

import pandas
from IPython.display import display

FILEPATH = "../../data/WS19266_BB3.raw"

# read the tab-separated-values file (`.raw` = `.tsv`)
bb3_df = pandas.read_csv(
    FILEPATH, 
    sep='\t',
    on_bad_lines='skip',  # default is 'error'. can also use 'warn' and 'skip'
    # usecols=["date","time","470nm", "470nm_data","532nm", "532nm_data", "650nm", "650nm_data", "mystery_column"],
    skiprows=1
)

# now you can use the pandas dataframe
SEPARATOR = "="*302  # this creates big string like `=====================` with 302 `=` characters
display(bb3_df.describe())
display(SEPARATOR)
display(bb3_df.info())
display(SEPARATOR)
display(bb3_df)

Unnamed: 0,470,2153,532,2043,650,4130,536
count,30905.0,30784.0,30630.0,30477.0,30270.0,30061.0,29786.0
mean,1333.423,3012.149,1561.36,1326.214,936.5565,1918.062,528.212348
std,54425.26,240812.6,62115.11,30771.09,37766.43,34763.98,599.449778
min,0.0,-268.0,0.0,-2049.0,0.0,-267.0,-65.0
25%,470.0,126.0,532.0,151.0,650.0,182.0,517.0
50%,470.0,305.0,532.0,385.0,650.0,522.0,521.0
75%,470.0,1040.0,532.0,1106.0,650.0,3761.0,524.0
max,4704130.0,41304130.0,5324130.0,4130130.0,6503842.0,4134130.0,52521.0




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31073 entries, 0 to 31072
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   07/29/19  31056 non-null  object 
 1   12:06:15  31007 non-null  object 
 2   470       30905 non-null  float64
 3   2153      30784 non-null  float64
 4   532       30630 non-null  float64
 5   2043      30477 non-null  float64
 6   650       30270 non-null  float64
 7   4130      30061 non-null  float64
 8   536       29786 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.1+ MB


None



Unnamed: 0,07/29/19,12:06:15,470,2153,532,2043,650,4130,536
0,07/29/19,12:06:16,470.0,2151.0,532.0,2034.0,650.0,4130.0,536.0
1,07/29/19,12:06:17,470.0,2121.0,532.0,1994.0,650.0,4130.0,535.0
2,07/29/19,12:06:18,470.0,2119.0,532.0,1992.0,650.0,4130.0,535.0
3,07/29/19,12:06:19,470.0,2117.0,532.0,1991.0,650.0,4130.0,535.0
4,07/29/19,12:06:20,470.0,2115.0,532.0,1993.0,650.0,4130.0,535.0
...,...,...,...,...,...,...,...,...,...
31068,09/29/19,05:10:20,470.0,2045.0,532.0,1537.0,650.0,4130.0,519.0
31069,09/29/19,05:10:21,470.0,1405.0,532.0,1480.0,650.0,4130.0,519.0
31070,09/29/19,17:36:11,470.0,120.0,532.0,87.0,650.0,853.0,527.0
31071,09/29/19,17:36:12,470.0,121.0,532.0,79.0,650.0,847.0,526.0


In [5]:
# pickle the data into a file 
import pickle
with open("bbd_df.pickle", "wb") as file_obj:
    pickle.dump(bb3_df, file_obj)

## Loading sat image data into pandas

In [6]:
import pandas
FILEPATH =  "../../data/MODA_OC_py_data/A2007143182500.L2_LAC_OC.x.nc"

# The xarray library handles arbitrary-dimensional netCDF data, and retains metadata. 
# Xarray provides a simple method of opening netCDF files, and converting them to pandas dataframes.
import xarray

# create an xarray.Dataset from `.nc` file
img_dataset = xarray.open_dataset(FILEPATH)

# Here we use `display()` instead of `print()` to get a prettier output.
# (You can't do this outside of a jupyter notebook.)
from IPython.display import display
display(img_dataset)

# but the xr.DataSet breaks when we try to convert it to a dataframe.
# uncomment the next line and try it yourself:
img_df = img_dataset.to_dataframe()  # throws `ValueError: no valid index for a 0-dimensional object`

# === convert the `xr.Dataset` to a `pandas.DataFrame`
# this file has a hierarchy of groups so it is more complicated than just using
# `img_dataset.to_dataframe()` as usual.
# [SO q/a ref](https://stackoverflow.com/a/54813257/1483986)

Cannot find the ecCodes library


ValueError: no valid index for a 0-dimensional object

In [20]:
!FILEPATH="../../data/MODA_OC_py_data/A2007143182500.L2_LAC_OC.x.nc"
!ncdump -h $FILEPATH

/bin/bash: ncdump: command not found


In [8]:
FILEPATH =  "../../data/MODA_OC_py_data/A2007143182500.L2_LAC_OC.x.nc"

import xarray
from IPython.display import display

# TODO: figure out how to get group names from within python
NC_GROUP_TO_OPEN = "geophysical_data"  # get this using bash `ncdump -h` or similar
LAT_LON_GROUP = "navigation_data"

img_dataset = xarray.open_dataset(FILEPATH, group=NC_GROUP_TO_OPEN)
print(img_dataset.variables)
display(img_dataset)

img_df = img_dataset.to_dataframe()

# now you can use the pandas dataframe
display(img_df.describe())
# print("="*302)

Frozen({'aot_869': <xarray.Variable (number_of_lines: 1353, pixels_per_line: 664)>
[898392 values with dtype=float32]
Attributes:
    long_name:      Aerosol optical thickness at 869 nm
    standard_name:  atmosphere_absorption_optical_thickness_due_to_ambient_ae...
    valid_min:      0
    valid_max:      30000, 'angstrom': <xarray.Variable (number_of_lines: 1353, pixels_per_line: 664)>
[898392 values with dtype=float32]
Attributes:
    long_name:      Aerosol Angstrom exponent, 443 to 865 nm
    standard_name:  aerosol_angstrom_exponent
    valid_min:      -30000
    valid_max:      5000, 'Rrs_412': <xarray.Variable (number_of_lines: 1353, pixels_per_line: 664)>
[898392 values with dtype=float32]
Attributes:
    long_name:         Remote sensing reflectance at 412 nm
    units:             sr^-1
    standard_name:     surface_ratio_of_upwelling_radiance_emerging_from_sea_...
    valid_min:         -30000
    valid_max:         25000
    solar_irradiance:  1711.8177, 'Rrs_443': <xarr

Unnamed: 0,aot_869,angstrom,Rrs_412,Rrs_443,Rrs_469,Rrs_488,Rrs_531,Rrs_547,Rrs_555,Rrs_645,...,Rrs_678,chlor_a,chl_ocx,Kd_490,pic,poc,ipar,nflh,par,l2_flags
count,87628.0,87628.0,87628.0,87628.0,87628.0,87628.0,87628.0,87628.0,87628.0,87628.0,...,87628.0,87604.0,87556.0,87617.0,44964.0,87608.0,87628.0,79188.0,218457.0,898392.0
mean,0.195771,0.893121,0.009778,0.008267,0.007764,0.006362,0.00278,0.00207,0.001676,-4e-05,...,0.000202,0.20814,0.181997,0.033976,0.000811,46.941074,0.002049,0.114106,56.96777,1058904000.0
std,0.084263,0.411226,0.00313,0.002406,0.002271,0.002412,0.002639,0.00256,0.002345,0.000463,...,0.000351,1.225809,1.22901,0.084339,0.003873,62.733784,3.4e-05,0.110101,4.294397,140048100.0
min,0.0088,-0.1522,-0.004886,-0.004868,-0.004682,-0.004862,-0.004286,-0.004196,-0.00491,-0.00683,...,-0.006556,0.001511,0.001,0.0166,1.2e-05,0.199707,0.00189,-0.48326,0.743996,0.0
25%,0.1197,0.5388,0.007486,0.0066,0.00645,0.005124,0.001628,0.001018,0.00075,-0.000256,...,4.8e-05,0.066973,0.032221,0.0184,1.4e-05,23.0,0.002034,0.053005,56.416,1073743000.0
50%,0.1903,0.8003,0.009082,0.00777,0.007505,0.006084,0.00228,0.00157,0.001218,-9.6e-05,...,0.00014,0.099953,0.058755,0.0236,7.6e-05,29.600098,0.00205,0.10596,58.239998,1074791000.0
75%,0.2594,1.1819,0.012132,0.009834,0.008754,0.007048,0.002884,0.002086,0.001676,8.2e-05,...,0.000274,0.141229,0.125688,0.0342,0.00023,45.399902,0.00207,0.155685,59.192001,1074791000.0
max,0.4847,2.1474,0.032446,0.02792,0.03231,0.034136,0.028008,0.025212,0.022602,0.012668,...,0.011154,217.868942,217.868942,6.4,0.130534,5652.200195,0.002113,4.904295,61.619999,1616429000.0


In [9]:
# pickle the data into a file 
import pickle
with open("A2007143182500.L2_LAC_OC.x.nc.pickle", "wb") as file_obj:
    pickle.dump(img_df, file_obj)

## GEE data into dataframe

In [None]:
# TODO: cp from https://mygeoblog.com/2017/01/13/your-gee-data-in-pandas/

## streaming sat data from `rio-tiler` into dataframe

In [13]:
# TODO: is there an easy way to get from data loaded using `rio-tiler` into a `pandas.dataframe`? 