In [5]:
import lib.io_ as io
import pandas as pd
import os
from pyarrow import CompressedOutputStream as COS
from pyarrow import Table as Table
from pyarrow import _csv as csv
from pyarrow import dataset as dataset
from time import monotonic

import config
# Snapshot every attribute of the config module whose name does not start
# with a double underscore, so settings can be looked up by string key.
ENV = dict(
    (key, setting)
    for key, setting in vars(config).items()
    if not key.startswith("__")
)

# Enumerate the input files under MLHD_ROOT and show one sample path.
paths = io.generate_paths(ENV['MLHD_ROOT'])
paths[50]

'/data/mlhd-zstd/8e/8e6492e2-b4aa-44b0-9573-37a9aad9b8c0.csv.zst'

In [6]:
%%time
# Load the sample file (index 50 of the generated paths) that both writers
# below are benchmarked on, and preview its first rows.
df = io.load_path(paths[50])
df.head(n=5)

CPU times: user 96.4 ms, sys: 32.1 ms, total: 129 ms
Wall time: 66.8 ms


Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID
0,1242583679,ba85753c-671a-409e-b813-1e3be41e2a2e,0a2d7193-fc4b-418c-8581-08d2695f884a,0d7960f0-ee33-4868-8e9e-7c705558b6e5
1,1242583969,f660d7e2-a3bd-4456-a7be-86ec139c1016,6ecac165-0267-3822-b995-cda1282ea5b5,6afc8617-d545-4629-8151-9ee9d71c4742
2,1242584191,371f152d-1643-4b54-b32b-dd13d4c23442,59c844ca-4c11-452d-8330-c107892319eb,e2477990-d9b1-43b6-b8db-be03e43559c4
3,1242584530,98fb7792-01fa-4ed1-a15d-20077a47210f,,1fdcf214-b4d5-4490-8626-5afeda04b73d
4,1242584820,d50a4b89-ff1f-4659-9fde-f76f8d5b3c89,,657cf27c-8f4a-4758-aaaa-bd5686d7e103


In [7]:
%%time
# Writing to CSV using pandas

def write_frame(df_input, original_path):
    """
    Write a dataframe with pandas as a zstd-compressed, tab-separated file.

    The destination is derived from ``original_path``: every occurrence of
    ENV["MLHD_ROOT"] is swapped for ENV["WRITE_ROOT"], and 'txt.gz' is
    swapped for 'csv.zst'. The destination's parent directory is created
    when it does not exist yet.

    Parameters
    ----------
    df_input : dataframe to serialise (written without index or header).
    original_path : path of the source file the dataframe was loaded from.

    Returns
    -------
    The path the file was written to.
    """
    # Mirror the source location underneath the output root.
    destination = (
        original_path
        .replace(ENV["MLHD_ROOT"], ENV["WRITE_ROOT"])
        .replace('txt.gz', 'csv.zst')
    )

    # Ensure the target folder exists before pandas opens the file.
    parent_dir, _ = os.path.split(destination)
    os.makedirs(parent_dir, exist_ok=True)

    zstd_settings = {'method': 'zstd', 'level': 10}
    df_input.to_csv(
        destination,
        sep='\t',
        header=False,
        index=False,
        compression=zstd_settings,
    )

    return destination

# Time a single pandas-based write; the '_pandas' suffix keeps this output
# file distinct from any other writer's output for the same source path.
start_pandas = monotonic()
write_frame(df_input=df, original_path=f"{paths[50]}_pandas")
end_pandas = monotonic()

CPU times: user 553 ms, sys: 16.3 ms, total: 569 ms
Wall time: 581 ms


In [8]:
%%time
# Writing to CSV using pyarrow._csv.write_csv

# Same on-disk layout as the pandas writer: tab-delimited rows, no header line.
write_options = csv.WriteOptions(include_header=False, delimiter='\t')

def write_frame_arrow(df_input, original_path):
    """
    Write a dataframe with pyarrow as a zstd-compressed, tab-separated file.

    The destination is derived from ``original_path`` exactly as in
    ``write_frame``: ENV["MLHD_ROOT"] is replaced with ENV["WRITE_ROOT"] and
    'txt.gz' is replaced with 'csv.zst'. The destination's parent directory
    is created when it does not exist yet.

    Parameters
    ----------
    df_input : pandas dataframe to serialise; it is converted to an Arrow
        table internally and is not modified.
    original_path : path of the source file the dataframe was loaded from.

    Returns
    -------
    The path the file was written to (same contract as ``write_frame``).
    """
    # Replace MLHD_ROOT with path to new MLHD folder.
    write_path = original_path.replace(ENV["MLHD_ROOT"], ENV["WRITE_ROOT"])
    write_path = write_path.replace('txt.gz', 'csv.zst')

    # Make directory inside WRITE_ROOT if it doesn't exist
    os.makedirs(os.path.dirname(write_path), exist_ok=True)

    # preserve_index=False mirrors index=False in the pandas writer, so a
    # non-default index is never emitted as an extra column.
    table = Table.from_pandas(df_input, preserve_index=False)

    # NOTE(review): no compression level is passed here, whereas write_frame
    # requests zstd level 10 -- the two outputs may not be compressed at the
    # same level; confirm before treating the timings as like-for-like.
    with COS(write_path, "zstd") as out:
        csv.write_csv(table, out, write_options=write_options)

    return write_path

# Time a single pyarrow-based write; the '_arrow' suffix keeps this output
# file distinct from any other writer's output for the same source path.
start_arrow = monotonic()
write_frame_arrow(df_input=df, original_path=f"{paths[50]}_arrow")
end_arrow = monotonic()

CPU times: user 60.6 ms, sys: 12.4 ms, total: 73 ms
Wall time: 71.2 ms


In [9]:
# Rebuild both output locations with the same substitutions the writer
# functions apply, rather than hard-coding them, so this cell keeps working
# when the configured roots or the sample index change.
pandas_output_path = (
    (paths[50] + '_pandas')
    .replace(ENV["MLHD_ROOT"], ENV["WRITE_ROOT"])
    .replace('txt.gz', 'csv.zst')
)
arrow_output_path = (
    (paths[50] + '_arrow')
    .replace(ENV["MLHD_ROOT"], ENV["WRITE_ROOT"])
    .replace('txt.gz', 'csv.zst')
)

# Read both files back with identical parsing options.
csv_pandas = pd.read_csv(pandas_output_path, sep='\t', header=None, compression='zstd')
csv_arrow = pd.read_csv(arrow_output_path, sep='\t', header=None, compression='zstd')

# An empty result means the two writers produced the same table.
csv_arrow.compare(csv_pandas)

In [10]:
# Keep the raw durations for the percentage; rounding to two decimals first
# distorts the ratio when a run takes only a few hundredths of a second.
elapsed_pandas = end_pandas - start_pandas
elapsed_arrow = end_arrow - start_arrow

# Rounded values are for display only.
time_pandas = round(elapsed_pandas, 2)
time_arrow = round(elapsed_arrow, 2)
print(f'Time Taken by pandas.to_csv(): {time_pandas}s')
print(f'Time Taken by arrow.: {time_arrow}s')

# Guard the division: a zero baseline would raise ZeroDivisionError.
if elapsed_pandas > 0:
    print(f'Time Improvement: {round(((elapsed_pandas - elapsed_arrow)/(elapsed_pandas))*100, 2)}%')
else:
    print('Time Improvement: n/a (pandas duration measured as zero)')

Time Taken by pandas.to_csv(): 0.58s
Time Taken by arrow.: 0.07s
Time Improvement: 87.93%
