In [1]:
# Prerequisite: in the root of the repo run "pip install --editable ."
# (presumably this is what makes the `src` package used below importable)

# OPTIONAL: load the "autoreload" extension so that edited modules can be
# reloaded without restarting the kernel
%load_ext autoreload

# OPTIONAL: always reload modules before a cell is executed, so that changes to
# the code in src get picked up automatically
%autoreload 2

# Replace sample_func_dir by the name of the directory in src/ and replace
# sample_func_file by the file name in src/sample_func_dir

# How to import dataframes

This notebook gives you an overview of the implemented methods for importing dataframes from external files.

### Import a dataframe from a zip-archive containing pickled dataframe objects

This method imports a dataframe from a pickled dataframe object (`.pkl`) located within a zip-archive without unpacking the zip-archive. The function `src.data.import_data.import_df_from_zip_pkl` returns a dataframe; it requires the path to the zip-archive and optionally takes the index of the desired timeseries (defaults to 0, i.e., the first timeseries in the dataset).

In [2]:
# import the function
from src.data.import_data import import_df_from_zip_pkl

In [3]:
# location of the zip-archive that holds the pickled dataframes
path_to_zip_pkl = "../data/raw/synthetic_soil.zip"

In [4]:
# load the timeseries with index 15 from the zip-archive; verbose=True prints
# the time needed for the import (roughly 0.5-1 seconds)
df = import_df_from_zip_pkl(
    path_to_zip_pkl,
    index=15,
    verbose=True,
)

time for importing dataframe: 0.77 seconds


In [5]:
# print the final ten entries of the 'minute_of_day' column, i.e. the last
# 10 minutes of the day
print(df['minute_of_day'].tail(10))

datetime
2019-12-31 23:50:00-05:00    1430
2019-12-31 23:51:00-05:00    1431
2019-12-31 23:52:00-05:00    1432
2019-12-31 23:53:00-05:00    1433
2019-12-31 23:54:00-05:00    1434
2019-12-31 23:55:00-05:00    1435
2019-12-31 23:56:00-05:00    1436
2019-12-31 23:57:00-05:00    1437
2019-12-31 23:58:00-05:00    1438
2019-12-31 23:59:00-05:00    1439
Freq: T, Name: minute_of_day, dtype: int64


In [6]:
# show the signature and the docstring (arguments, return value) of the function
help(import_df_from_zip_pkl)

Help on function import_df_from_zip_pkl in module src.data.import_data:

import_df_from_zip_pkl(path_to_zip, index=0, verbose=False)
    Import a timeseries from a zipped pickled dataframe
    
    Extracts a dataframe file from the pickle (compressed using gzip)
    which is saved within a zipped folder
    
        Args:
            path_to_zip (str): path to the zip-file containing the pickled
                dataframes
            index (int, optional): index (0-49) of the desired timeseries,
                defaults to 0, ie, the first timeseries in the dataset
            verbose (bool, optional): print output if true, defaults to False
    
        Returns:
            data_frame (Pandas DataFrame): unpickled dataframe



### Import a dataframe from a zip-archive containing csv-files

This method imports a dataframe from a csv-file (`.csv`) located within a zip-archive without unpacking the zip-archive. The function `src.data.import_data.import_df_from_zip_csv` returns a dataframe; it requires the path to the zip-archive and optionally takes the index of the desired timeseries (defaults to 0, i.e., the first timeseries in the dataset). Parsing the datetime takes a lot of time (30-60 seconds per csv-file)!

This function will probably not be needed anymore since we have a better method (`src.data.import_data.import_df_from_zip_pkl`).

In [7]:
# import the function
from src.data.import_data import import_df_from_zip_csv

In [8]:
# location of the zip-archive that holds the csv-files
path_to_zip_csv = "../data/raw/old/synthetic_soil.zip"

In [9]:
# load the timeseries with index 15 and skip the datetime parsing
# (takes a few seconds; verbose=True prints the measured time)
df = import_df_from_zip_csv(
    path_to_zip_csv,
    index=15,
    datetime=False,
    verbose=True,
)

time for importing dataframe: 4.28 seconds


In [10]:
# load the timeseries with index 15 and parse the time-strings to datetime
# (slow: ~30-60 seconds; verbose=True prints the measured time)
df = import_df_from_zip_csv(
    path_to_zip_csv,
    index=15,
    datetime=True,
    verbose=True,
)

time for importing dataframe: 60.18 seconds


In [11]:
# show the signature and the docstring (arguments, return value) of the function
help(import_df_from_zip_csv)

Help on function import_df_from_zip_csv in module src.data.import_data:

import_df_from_zip_csv(path_to_zip, index=0, datetime=True, verbose=False)
    Import a timeseries from a zipped datafile
    
    Extracts a csv file from the zip-archive and transforms it to a formatted
    dataframe.
    
        Args:
            path_to_zip (str): path to the zip-file containing the dataset
            index (int, optional): index (0-49) of the desired timeseries,
                defaults to 0, ie, the first timeseries in the dataset
            datetime (bool, optional): parse time-string to datetime, defaults
                to True
            verbose (bool, optional): print output if true, defaults to False
    
        Returns:
            data_frame (Pandas DataFrame): dataframe converted from csv



### Import a dataframe from a directory containing csv-files

This method imports a dataframe from a csv-file (`.csv`) located within a directory. The function `src.data.import_data.import_df_from_dir_csv` returns a dataframe; it requires the path to the directory and optionally takes the index of the desired timeseries (defaults to 0, i.e., the first timeseries in the dataset). Parsing the datetime takes a lot of time (30-60 seconds per csv-file)!

This function will probably not be needed anymore since we have a better method (`src.data.import_data.import_df_from_zip_pkl`).

In [12]:
# import the function
from src.data.import_data import import_df_from_dir_csv

In [13]:
# location of the directory that holds the csv-files
path_to_dir_csv = "../data/raw/old/synthetic_soil"

In [14]:
# load the timeseries with index 15 and skip the datetime parsing
# (takes a few seconds; verbose=True prints the measured time)
df = import_df_from_dir_csv(
    path_to_dir_csv,
    index=15,
    datetime=False,
    verbose=True,
)

time for importing dataframe: 3.99 seconds


In [15]:
# load the timeseries with index 15 and parse the time-strings to datetime
# (slow: ~30-60 seconds; verbose=True prints the measured time)
df = import_df_from_dir_csv(
    path_to_dir_csv,
    index=15,
    datetime=True,
    verbose=True,
)

time for importing dataframe: 59.25 seconds


In [16]:
# show the signature and the docstring (arguments, return value) of the function
help(import_df_from_dir_csv)

Help on function import_df_from_dir_csv in module src.data.import_data:

import_df_from_dir_csv(path_to_dir, index=0, datetime=True, verbose=False)
    Import a timeseries from a data directory
    
    Takes csv file from the destination directory and returns a formatted
    dataframe.
    
        Args:
            path_to_dir (str): path to the directory containing the csv-files
            index (int, optional): index (0-49) of the desired timeseries,
                defaults to 0, ie, the first timeseries in the dataset
            datetime (bool, optional): parse time-string to datetime, defaults
                to True
            verbose (bool, optional): print output if true, defaults to False
    
        Returns:
            data_frame (Pandas DataFrame): dataframe converted from csv



### Import the dataset info 

Import the dataset info from the csv-file ("synthetic_xxx_info.csv") from within a zip-archive without unzipping.

In [7]:
# import the function
from src.data.import_data import import_df_info_from_zip

In [8]:
# path to the zip-archive that contains the dataset-info csv-file
path_to_zip = '../data/raw/synthetic_basic.zip' 
# NOTE(review): the recorded output of this cell is
# "BadGzipFile: Not a gzipped file (b'Pl')" -- presumably the function reads the
# info csv-file with gzip decompression although the file is not
# gzip-compressed; TODO confirm in src/data/import_data.py and fix it there
df_info = import_df_info_from_zip(path_to_zip)

BadGzipFile: Not a gzipped file (b'Pl')

In [10]:
import zipfile as zf