This notebook downloads a full day's woth of rainfall data from the CASA repository and converts it from individual NetCDF files into one pandas dataframe.

In [1]:
! pip install xarray pandas pysftp

Collecting xarray
  Downloading xarray-2023.12.0-py3-none-any.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pysftp
  Downloading pysftp-0.2.9.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting numpy>=1.22 (from xarray)
  Using cached numpy-1.26.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting paramiko>=1.17 (from pysftp)
  Downloading paramiko-3.4.0-py3-none-any.whl.metadata (4.4 kB)
Collecting bcrypt>=3.2 (from paramiko>=1.17->pysftp)
  Downloading bcrypt-4.1.2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting cryptography>=3.3 (from paramiko>=1.17->pysftp)
  Downloading cryptography-41.0.7

In [1]:
import os
import pysftp
import xarray as xr
import pandas as pd
import gzip
import shutil

In [2]:
def download_directory(ssh_host, ssh_username, ssh_pk, remote_dir, local_dir):
    # Connect to the SFTP server
    # The SFTP server is connected to the NAS that holds the CASA data repository
    cnopts = pysftp.CnOpts()
    cnopts.hostkeys = None
    with pysftp.Connection(ssh_host, username=ssh_username, private_key=ssh_pk, cnopts=cnopts) as sftp:
        # Change to the remote directory
        sftp.chdir(remote_dir)

        # List the contents of the remote directory
        remote_files = sftp.listdir()

        # Recursively download each file
        for file_name in remote_files:
            remote_path = os.path.join(remote_dir, file_name)
            local_path = os.path.join(local_dir, file_name)

            # If it's a directory, create the local directory
            if sftp.isdir(remote_path):
                os.makedirs(local_path, exist_ok=True)
                download_directory(ssh_host, ssh_username, ssh_pk, remote_path, local_path)
            else:
                # Download the file
                sftp.get(remote_path, local_path)

In [2]:
# Example usage
ssh_host = ""
ssh_username = ""
ssh_pk = ""
remote_directory = "/mnt/casa-ssd-pool/casa/qpe/20180908"
local_directory_gz = "/work/pi_mzink_umass_edu/SPRITE/UsableNotebooks/netcdf/20180908_gz"
local_directory = "/work/pi_mzink_umass_edu/SPRITE/UsableNotebooks/netcdf/20180908"

if not os.path.exists(local_directory):
    os.makedirs(local_directory)

if not os.path.exists(local_directory_gz):
    os.makedirs(local_directory_gz)

In [4]:
download_directory(ssh_host, ssh_username, ssh_pk, remote_directory, local_directory_gz)

In [5]:
def netcdf_to_dataframe(directory_path):
    # Create an empty list to store individual DataFrames
    dfs = []

    # Iterate through each NetCDF file in the directory
    for file_name in os.listdir(directory_path):
        local_file = os.path.splitext(file_name)[0]
        with (
            gzip.open(os.path.join(directory_path, file_name), "rb") as f_in,
            open(os.path.join(directory_path, local_file), "wb") as f_out,
        ):
            shutil.copyfileobj(f_in, f_out)

        file_path = os.path.join(directory_path, local_file)

        # Open the NetCDF file using xarray
        ds = xr.open_dataset(file_path)

        # Convert the xarray dataset to a Pandas DataFrame
        df = ds.to_dataframe()

        # Append the DataFrame to the list
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(dfs)

    return final_df

In [3]:
def limited_netcdf_to_dataframe(directory_path_gz, unzip_directory_path, number_of_files=10):
    # Create an empty list to store individual DataFrames
    dfs = []

    # Iterate through each NetCDF file in the directory
    for file_name in os.listdir(directory_path_gz)[:number_of_files]:
        local_file = os.path.splitext(file_name)[0]
        with (
            gzip.open(os.path.join(directory_path_gz, file_name), "rb") as f_in,
            open(os.path.join(unzip_directory_path, local_file), "wb") as f_out,
        ):
            shutil.copyfileobj(f_in, f_out)

        file_path = os.path.join(unzip_directory_path, local_file)

        # Open the NetCDF file using xarray
        ds = xr.open_dataset(file_path)

        # Convert the xarray dataset to a Pandas DataFrame
        df = ds.to_dataframe()

        # Append the DataFrame to the list
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(dfs)

    return final_df

In [4]:
# result_dataframe = netcdf_to_dataframe(local_directory) # run out of memory
result_dataframe = limited_netcdf_to_dataframe(local_directory_gz, local_directory, number_of_files=10)

In [5]:
result_dataframe.info()
result_dataframe.shape

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 128100 entries, (0.5, 31.775, -97.99) to (0.5, 33.6, -96.244995)
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   RRdata  128100 non-null  float32
dtypes: float32(1)
memory usage: 1.1 MB


(128100, 1)

# This is for later

In [21]:
# Draft of a custom dataset class for PyTorch to load the NetCDF files to model


class NetCDFDataset(Dataset):
    def __init__(self, file_path):
        self.file_path = file_path

        # Open the NetCDF file
        os.chdir(local_directory)
        netcdfiles = os.listdir()
        with nc.Dataset(file_path, "r") as nc_file:
            self.data = nc_file.variables["RRdata"][:]
            self.labels = nc_file.variables["labels"][:]  # Assuming you have labels in your NetCDF file

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = {"input": self.data[idx], "label": self.labels[idx]}
        return sample

NameError: name 'Dataset' is not defined