# Using pandas dataframes
This notebook demonstrates how to load data into a pandas dataframe and do basic operations with the dataframe data.

## Loading BB3 Cruise data into a pandas df

In [6]:
# === using `import ... as ...`
# you can import pandas as usual
import pandas
# and use it
print(f"pandas.__version: {pandas.__version__}")

# but people often give pandas another name - `pd` when imported
# to do this you can use `import ... as ...`
import pandas as pd
# now we can make the same code more succinct
print(f"pd.__version__: {pd.__version__}")

# pd and pandas are exactly the same *object*.
print("pd =?= pandas")
print(f"{pd == pandas}")

# if we add something to the pd object
pd.my_randomly_named_attribute = "This is Tylar's special string object"
# the two objects are still equal
print(f"Did adding the attribute change `pd` but not `pandas`? {pd == pandas}")
# we can use the attribute we put on `pd` on the `pandas`
print(f"pandas.my_randomly_named_attribute: {pandas.my_randomly_named_attribute}")

# you can even import it again with a different name
import pandas as whatever_i_feel_like
print(f"whatever_i_feel_like.__version__: {whatever_i_feel_like.__version__}")
# and the new import is still the exact same object as `pd` and `pandas`
print(f"whatever_i_feel_like.my_randomly_named_attribute: {pandas.my_randomly_named_attribute}")
# You can get creative with this but... please don't.
# Stick to the original name or common usages like `pandas as pd`
# Another common one is `numpy as np`
import numpy as np
print(f"np.__version__: {np.__version__}")
# and xarray
import xarray as xr
print(f"xr.__version__: {xr.__version__}")


pandas.__version: 1.3.3
pd.__version__: 1.3.3
pd =?= pandas
True
Did adding the attribute change `pd` but not `pandas`? True
pandas.my_randomly_named_attribute: This is Tylar's special string object
whatever_i_feel_like.__version__: 1.3.3
whatever_i_feel_like.my_randomly_named_attribute: This is Tylar's special string object
np.__version__: 1.21.2


  "Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. "


xr.__version__: 0.19.0


In [5]:
pip install "xarray[io]"

Collecting netCDF4
  Downloading netCDF4-1.5.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.7 MB/s eta 0:00:01
[?25hCollecting cfgrib
  Downloading cfgrib-0.9.9.1-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.9 MB/s  eta 0:00:01
[?25hCollecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 73.5 MB/s eta 0:00:01
[?25hCollecting rasterio
  Downloading rasterio-1.2.10-cp37-cp37m-manylinux1_x86_64.whl (19.3 MB)
[K     |████████████████████████████████| 19.3 MB 41.7 MB/s eta 0:00:01
[?25hCollecting cftime
  Downloading cftime-1.5.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (220 kB)
[K     |████████████████████████████████| 220 kB 51.8 MB/s eta 0:00:01
[?25hCollecting pydap
  Downloading Pydap-3.2.2-py3-none-any.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 38.0 MB/s eta 0:00:01
Coll

In [15]:
# Loading Sat data into pandas (is df the right data structure for this?)
import os

import pandas
FILEPATH = "../../data/WS19266_BB3.raw"

# read the tab-separated-values file (`.raw` = `.tsv`)
bb3_df = pandas.read_csv(
    FILEPATH, 
    sep='\t',
    on_bad_lines='skip',  # default is 'error'. can also use 'warn' and 'skip'
    # usecols=["date","time","470nm", "470nm_data","532nm", "532nm_data", "650nm", "650nm_data", "mystery_column"],
    skiprows=1
)

# now you can use the pandas dataframe
SEPARATOR = "="*302  # this creates big string like `=====================` with 302 `=` characters
print(bb3_df.describe())
print(SEPARATOR)
print(bb3_df.info())
print(SEPARATOR)
print(bb3_df)

                470          2153           532          2043           650  \
count  3.090500e+04  3.078400e+04  3.063000e+04  3.047700e+04  3.027000e+04   
mean   1.333423e+03  3.012149e+03  1.561360e+03  1.326214e+03  9.365565e+02   
std    5.442526e+04  2.408126e+05  6.211511e+04  3.077109e+04  3.776643e+04   
min    0.000000e+00 -2.680000e+02  0.000000e+00 -2.049000e+03  0.000000e+00   
25%    4.700000e+02  1.260000e+02  5.320000e+02  1.510000e+02  6.500000e+02   
50%    4.700000e+02  3.050000e+02  5.320000e+02  3.850000e+02  6.500000e+02   
75%    4.700000e+02  1.040000e+03  5.320000e+02  1.106000e+03  6.500000e+02   
max    4.704130e+06  4.130413e+07  5.324130e+06  4.130130e+06  6.503842e+06   

               4130           536  
count  3.006100e+04  29786.000000  
mean   1.918062e+03    528.212348  
std    3.476398e+04    599.449778  
min   -2.670000e+02    -65.000000  
25%    1.820000e+02    517.000000  
50%    5.220000e+02    521.000000  
75%    3.761000e+03    524.000000  


## Loading sat image data into pandas

In [19]:
import pandas
FILEPATH =  "../../data/MODA_OC_py_data/A2007143182500.L2_LAC_OC.x.nc"

# The xarray library handles arbitrary-dimensional netCDF data, and retains metadata. 
# Xarray provides a simple method of opening netCDF files, and converting them to pandas dataframes.
import xarray

# create an xarray.Dataset from `.nc` file
img_dataset = xarray.open_dataset(FILEPATH)

# Here we use `display()` instead of `print()` to get a prettier output.
# (You can't do this outside of a jupyter notebook.)
from IPython.display import display
display(img_dataset)

# but the xr.DataSet breaks when we try to convert it to a dataframe.
# uncomment the next line and try it yourself:
# img_df = img_dataset.to_dataframe()  # throws `ValueError: no valid index for a 0-dimensional object`

# === convert the `xr.Dataset` to a `pandas.DataFrame`
# this file has a hierarchy of groups so it is more complicated than just using
# `img_dataset.to_dataframe()` as usual.
# [SO q/a ref](https://stackoverflow.com/a/54813257/1483986)

In [20]:
!FILEPATH="../../data/MODA_OC_py_data/A2007143182500.L2_LAC_OC.x.nc"
!ncdump -h $FILEPATH

/bin/bash: ncdump: command not found


In [26]:
NC_GROUP_TO_OPEN = "TODO: PUT_THE_GROUP_HERE"  # get this using bash `ncdump -h` or similar
xr.open_dataset(FILEPATH, group=NC_GROUP_TO_OPEN)

# img_df = img_dataset.to_dataframe()

# now you can use the pandas dataframe
# print(img_df.describe())
# print("="*302)

# print(img_df.variables)
# print(img_df)


OSError: [Errno group not found: TODO: PUT_THE_GROUP_HERE] 'TODO: PUT_THE_GROUP_HERE'