In [None]:
# Prerequisite: in the root of the repo run "pip install --editable ."
# (presumably so that the `src` package imported further down resolves — confirm).

# OPTIONAL: load IPython's "autoreload" extension, which re-imports modules
# that were edited on disk without having to restart the kernel.
%load_ext autoreload

# OPTIONAL: mode 2 reloads all modules before each cell runs, so that as you
# change code in src, the new version gets loaded automatically.
%autoreload 2

# NOTE(review): looks like a notebook-template leftover — the placeholders
# sample_func_dir / sample_func_file do not appear in the cells visible here;
# confirm and remove. Original instruction: replace sample_func_dir by the name
# of the directory in src/ and sample_func_file by the file name in src/sample_func_dir.

# How to import dataframes

This notebook gives an overview of the implemented methods for importing dataframes from external files.

### Import a dataframe from a zip-archive containing pickled dataframe objects

This method imports a dataframe from a pickled dataframe object (`.pkl`) located within a zip-archive without unpacking the zip-archive. The function `src.data.import_data.import_df_from_zip_pkl` takes the path to the zip-archive and, optionally, the index of the dataframe you want (defaults to 0, i.e., the first dataframe in the timeseries), and returns that dataframe.

In [None]:
# import the function
from src.data.import_data import import_df_from_zip_pkl

In [None]:
# Location of the zip-archive holding the pickled dataframes
# (relative to the directory this notebook is run from).
path_to_zip_pkl = '../data/raw/synthetic_weather.zip'

In [None]:
# Load the dataframe with index 10 straight from the archive
# (takes roughly 0.5 seconds).
df = import_df_from_zip_pkl(
    path_to_zip_pkl,
    index=10,
    verbose=True,
)

In [None]:
# Show the last 10 minutes of the day.
# `.iloc` makes the "last 10 rows" selection explicitly positional; a plain
# `[-10:]` slice on a Series depends on the index type for its meaning.
print(df['minute_of_day'].iloc[-10:])

In [None]:
# Print the built-in help (signature and docstring) of the pickle-based importer.
help(import_df_from_zip_pkl)

### Normalized Data (PI)

To import the normalized data: set `minofday` to `False`!

In [None]:
# Archive with the normalized (PI) data; load its first dataframe with
# `minofday` switched off, as required for this dataset.
path_to_zip_pkl_pi = '../data/raw/synthetic_soil_weather_pi_daily.zip'
df = import_df_from_zip_pkl(
    path_to_zip_pkl_pi,
    index=0,
    verbose=True,
    minofday=False,
)

In [None]:
# Preview the first rows of the normalized (PI) dataframe loaded above.
df.head()

### Import a dataframe from a zip-archive containing csv-files

This method imports a dataframe from a csv-file (`.csv`) located within a zip-archive without unpacking the zip-archive. The function `src.data.import_data.import_df_from_zip_csv` takes the path to the zip-archive and, optionally, the index of the dataframe you want (defaults to 0, i.e., the first dataframe in the timeseries), and returns that dataframe. Parsing the datetime takes a lot of time (30-60 seconds per csv-file)!

This function will probably not be needed anymore since we have a better method (`src.data.import_data.import_df_from_zip_pkl`).

In [None]:
# import the function
from src.data.import_data import import_df_from_zip_csv

In [None]:
# Location of the zip-archive holding the csv-files
# (relative to the directory this notebook is run from).
path_to_zip_csv = '../data/raw/old/synthetic_soil.zip'

In [None]:
# Fast variant: load the dataframe with index 15 and skip datetime parsing
# (takes roughly 2-3 seconds).
df = import_df_from_zip_csv(
    path_to_zip_csv,
    index=15,
    datetime=False,
    verbose=True,
)

In [None]:
# Slow variant: load the dataframe with index 15 and parse the datetime
# (takes roughly 30-60 seconds).
df = import_df_from_zip_csv(
    path_to_zip_csv,
    index=15,
    datetime=True,
    verbose=True,
)

In [None]:
# Print the built-in help (signature and docstring) of the zip/csv importer.
help(import_df_from_zip_csv)

### Import a dataframe from a directory containing csv-files

This method imports a dataframe from a csv-file (`.csv`) located within a directory. The function `src.data.import_data.import_df_from_dir_csv` takes the path to the directory and, optionally, the index of the dataframe you want (defaults to 0, i.e., the first dataframe in the timeseries), and returns that dataframe. Parsing the datetime takes a lot of time (30-60 seconds per csv-file)!

This function will probably not be needed anymore since we have a better method (`src.data.import_data.import_df_from_zip_pkl`).

In [None]:
# import the function
from src.data.import_data import import_df_from_dir_csv

In [None]:
# Location of the directory holding the csv-files
# (relative to the directory this notebook is run from).
path_to_dir_csv = '../data/raw/old/synthetic_soil'

In [None]:
# Fast variant: load the dataframe with index 15 from the directory and skip
# datetime parsing (takes roughly 2-3 seconds).
df = import_df_from_dir_csv(
    path_to_dir_csv,
    index=15,
    datetime=False,
    verbose=True,
)

In [None]:
# Slow variant: load the dataframe with index 15 from the directory and parse
# the datetime (takes roughly 30-60 seconds).
df = import_df_from_dir_csv(
    path_to_dir_csv,
    index=15,
    datetime=True,
    verbose=True,
)

In [None]:
# Print the built-in help (signature and docstring) of the directory/csv importer.
help(import_df_from_dir_csv)

### Import the dataset info 

Import the dataset info from the csv-file (`synthetic_xxx_info.csv`) located within a zip-archive, without unpacking the archive.

In [None]:
# import the function
from src.data.import_data import import_df_info_from_zip

In [None]:
# Read the dataset info table directly out of the zip-archive.
path_to_zip = '../data/raw/synthetic_basic.zip'
df_info = import_df_info_from_zip(
    path_to_zip,
    verbose=True,
)

In [None]:
# Preview the first 10 rows of the dataset info table.
df_info.head(10)