In [1]:
def csv_to_feather_function(input_file):
    """Convert a single CSV file to a Feather file.

    The CSV is loaded with ``vaex.from_csv(..., convert=True)`` and exported
    with ``export_feather``. The ``.csv.yaml`` and ``.csv.hdf5`` files next to
    the input are deleted afterwards if they exist.

    The output directory is the empty path, so the Feather file is written
    relative to the current working directory, named after the input file with
    its suffix replaced by ``.feather``.

    Args:
        input_file: Path (``str`` or ``pathlib.Path``) of the CSV file to convert.

    Returns:
        None.
    """
    import logging
    import os
    from pathlib import Path

    import vaex

    # Log level is taken from the POLUS_LOG environment variable (default INFO)
    POLUS_LOG = getattr(logging, os.environ.get('POLUS_LOG', 'INFO'))

    # Initialize the logger
    logging.basicConfig(format='%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s',
                        datefmt='%d-%b-%y %H:%M:%S')
    logger = logging.getLogger("main")
    logger.setLevel(POLUS_LOG)

    def csv2feather(input_path: Path,
                    output_path: Path
                    ) -> None:
        """Convert one csv file to an Apache Feather file.

        Args:
            input_path: Path of the csv file to read.
            output_path: Path of the feather file to write.
        """
        logger.debug(f'input_path: {input_path}')

        # Count the columns from the first line of the csv
        with open(input_path, 'r') as fr:
            ncols = len(fr.readline().split(','))

        # Rows per chunk so that a chunk holds about 2**24 values, and at least 1 row
        chunk_size = max(2**24 // ncols, 1)

        # Load the file; convert=True lets vaex convert it in chunks
        logger.info('reading file...')
        csv = vaex.from_csv(input_path, convert=True, chunk_size=chunk_size)

        # Write the feather file, closing the dataframe even if the export fails
        try:
            logger.info('writing file...')
            csv.export_feather(output_path)
        finally:
            csv.close()

        # Remove intermediate files; a missing intermediate is not an error
        # because the conversion itself has already succeeded at this point.
        logger.info('removing intermediates...')
        for suffix in ('.csv.yaml', '.csv.hdf5'):
            intermediate = input_path.with_suffix(suffix)
            try:
                os.unlink(intermediate)
            except FileNotFoundError:
                logger.debug(f'intermediate not found, skipping: {intermediate}')
        logger.info('done.')

    def main(input_file,
             outDir: Path,
             ) -> None:
        """Build the output path for ``input_file`` inside ``outDir`` and convert it.

        Args:
            input_file: Path of the csv file to convert.
            outDir: Directory that receives the feather file.
        """
        input_file = Path(input_file)
        logger.debug(f'input_file: {input_file} ({type(input_file)})')

        # Swap only the final suffix, so a '.csv' elsewhere in the name is left
        # alone and a non-csv input never maps onto its own file name.
        output_file = Path(outDir).joinpath(input_file.with_suffix('.feather').name)

        logger.info(f'Converting file: {input_file.name}')
        csv2feather(input_file, output_file)

    # No `__name__ == "__main__"` guard here: inside a function it would turn
    # the call into a silent no-op whenever this code is imported from a module.
    logger.info('input_file = {}'.format(input_file))

    #: Output collection (empty path: relative to the current working directory)
    outDir = ""

    main(input_file, outDir)

In [2]:
# Load UCI census train and test data into dataframes.
import pandas as pd
# Column names applied to both files through `names=`.
features = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Target",
]

# Parsing options shared by the train and test reads.
_read_options = dict(
    names=features,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)

train_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    **_read_options)

# The test file is read with its first row skipped.
test_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    skiprows=[0],
    **_read_options)

In [3]:
import vaex
from pathlib import Path
def csv_to_vaex_table(train_data, test_data):
    """Round-trip the train/test dataframes through csv and feather files into vaex tables.

    Each dataframe is written to a csv file in the current working directory,
    each csv is converted with ``csv_to_feather_function``, and each resulting
    feather file is opened with ``vaex.open``.

    Args:
        train_data: Dataframe written to ``train_data.csv``.
        test_data: Dataframe written to ``test_data.csv``.

    Returns:
        Tuple ``(train, test)`` of the objects returned by ``vaex.open``.
    """
    # (dataframe, csv file name, feather file name) for each dataset
    datasets = (
        (train_data, "train_data.csv", "train_data.feather"),
        (test_data, "test_data.csv", "test_data.feather"),
    )

    # Write every csv first, then convert them all, then open them all
    for frame, csv_name, _ in datasets:
        frame.to_csv(csv_name, index=False)

    for _, csv_name, _ in datasets:
        csv_to_feather_function(Path(csv_name))

    train_table, test_table = (
        vaex.open(feather_name) for _, _, feather_name in datasets
    )
    return train_table, test_table

#: Convert dataframes to vaex tables
#train_data, test_data = csv_to_vaex_table(train_data, test_data)

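## To use vaex tables instead of the pandas dataframes, uncomment the call above: it rebinds train_data and test_data, which are the tables passed to ProtoFromDataFrames on lines 9 and 10 of the next cell.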

In [4]:
# Calculate the feature statistics proto from the datasets and stringify it for use in facets overview.

# This code assumes that the facets-overview package has been installed through pip,
# along with a tensorflow (or tensorflow-gpu) package.
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
import base64

# Build one feature-statistics proto covering both named datasets.
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames(
    [
        {'name': 'train', 'table': train_data},
        {'name': 'test', 'table': test_data},
    ]
)

# Serialize the proto and base64-encode it into a text string.
protostr = str(base64.b64encode(proto.SerializeToString()), "utf-8")

In [5]:
# Display the facets overview visualization for this data
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML

# Page fragment that loads the facets web component and hands it the
# base64-encoded proto string through the `{protostr}` placeholder.
HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))