# Big Data File Read/Write
### by [Jason DeBacker](https://jasondebacker.com), October 2024

This notebook compares how different file formats perform when storing, reading, and writing a large data file: the size of the file on disk, the time to read it, and the time to write it.  We will compare the following file types:
- CSV
- Parquet
- Feather
- HDF5
- Pickle

In [22]:
# imports
import pandas as pd
import os
import time
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Paths: the data directory sits under the current working directory and
# holds the source Stata (.dta) file used for the write benchmark.
CUR_DIR = os.getcwd()
DATA_DIR = os.path.join(CUR_DIR, 'data')
DATA_FILE = os.path.join(
    CUR_DIR, 'data', 'sitc_country_country_product_year_4_2022.dta'
)

In [4]:
# Results container: one (initially empty) dict per file format, keyed by the
# file extension; later cells fill in 'size', 'read_time' and 'write_time'.
file_dict = {
    extension: {}
    for extension in ('csv', 'parquet', 'feather', 'hdf', 'pkl')
}

In [5]:
# First, see how large the file is when the same data is saved in different formats
# return file size in bytes
def get_file_size(file_path):
    """Return the size of the file at ``file_path`` in bytes."""
    file_stats = os.stat(file_path)
    return file_stats.st_size

# Record the on-disk size of the saved copy of the data in each format.
# NOTE: this expects trade_data_2022.<ext> to already exist in DATA_DIR for
# every key of file_dict.
for file_format, results in file_dict.items():
    data_path = os.path.join(DATA_DIR, f'trade_data_2022.{file_format}')
    results['size'] = get_file_size(data_path)

In [10]:
# Plot file size for the different file formats.
# pd.DataFrame(file_dict) has one column per format, so transpose to get one
# row per format and turn the format name into a regular column.
file_df = (
    pd.DataFrame(file_dict)
    .T
    .reset_index()
    .rename(columns={'index': 'file_format'})
)
fig = px.bar(
    file_df,
    x='file_format',
    y='size',
    title='File size for different file formats',
    # label the axes with units (sizes come from os.path.getsize, i.e. bytes)
    labels={'file_format': 'File format', 'size': 'File size (bytes)'},
)
fig.show()

In [8]:
# Second, how long to read the data in each format
# return time in seconds
def read_data(file_path, file_format):
    """Time how long pandas takes to read a file in the given format.

    The loaded data is discarded; only the elapsed time is returned.

    Args:
        file_path (str): path to the file to read.
        file_format (str): one of 'csv', 'parquet', 'feather', 'hdf', 'pkl'.

    Returns:
        float: elapsed time in seconds.

    Raises:
        ValueError: if ``file_format`` is not one of the supported formats.
    """
    readers = {
        'csv': pd.read_csv,
        'parquet': pd.read_parquet,
        'feather': pd.read_feather,
        'hdf': pd.read_hdf,
        # NOTE: unpickling executes arbitrary code; only read trusted files
        'pkl': pd.read_pickle,
    }
    # Fail loudly instead of returning a meaningless near-zero timing
    if file_format not in readers:
        raise ValueError(f'Invalid file format: {file_format!r}')
    reader = readers[file_format]
    # perf_counter is monotonic and high resolution, unlike time.time()
    start = time.perf_counter()
    reader(file_path)
    return time.perf_counter() - start

# Time reading the saved copy of the data in each format and store the result
# under 'read_time' for that format.
for file_format, results in file_dict.items():
    data_path = os.path.join(DATA_DIR, f'trade_data_2022.{file_format}')
    results['read_time'] = read_data(data_path, file_format)

In [16]:
# Plot read time for the different file formats.
# Rebuild the frame from file_dict so it includes the 'read_time' entries:
# transpose to one row per format and expose the format name as a column.
file_df = (
    pd.DataFrame(file_dict)
    .T
    .reset_index()
    .rename(columns={'index': 'file_format'})
)
fig = px.bar(
    file_df,
    x='file_format',
    y='read_time',
    title='Read times for different file formats',
    # label the axes with units (read_data returns seconds)
    labels={'file_format': 'File format', 'read_time': 'Read time (seconds)'},
)
fig.show()


In [14]:
# Third, how long to write the data in each format
# return time in seconds
def write_data(data, file_path, file_format):
    """Time how long pandas takes to write ``data`` in the given format.

    Args:
        data (pandas.DataFrame): the data to write.
        file_path (str): destination path; an existing file is overwritten.
        file_format (str): one of 'csv', 'parquet', 'feather', 'hdf', 'pkl'.

    Returns:
        float: elapsed time in seconds.

    Raises:
        ValueError: if ``file_format`` is not one of the supported formats.
    """
    supported_formats = ('csv', 'parquet', 'feather', 'hdf', 'pkl')
    # Fail loudly instead of returning a meaningless near-zero timing
    if file_format not in supported_formats:
        raise ValueError(f'Invalid file format: {file_format!r}')
    # perf_counter is monotonic and high resolution, unlike time.time()
    start = time.perf_counter()
    if file_format == 'csv':
        data.to_csv(file_path)
    elif file_format == 'parquet':
        data.to_parquet(file_path)
    elif file_format == 'feather':
        data.to_feather(file_path)
    elif file_format == 'hdf':
        # to_hdf defaults to append mode ('a'); use 'w' so a leftover file is
        # replaced, matching the overwrite behavior of the other writers
        data.to_hdf(file_path, key='my_data', mode='w')
    else:  # 'pkl'
        data.to_pickle(file_path)
    return time.perf_counter() - start

# Load the source data from the Stata file, then time writing it out once per
# format. Output goes to trade_data_2022_test.<ext> files in DATA_DIR.
df = pd.read_stata(DATA_FILE)
for file_format in ('csv', 'parquet', 'feather', 'hdf', 'pkl'):
    test_path = os.path.join(DATA_DIR, f'trade_data_2022_test.{file_format}')
    file_dict[file_format]['write_time'] = write_data(df, test_path, file_format)

In [17]:
# Plot write time for the different file formats.
# Rebuild the frame from file_dict so it includes the 'write_time' entries:
# transpose to one row per format and expose the format name as a column.
file_df = (
    pd.DataFrame(file_dict)
    .T
    .reset_index()
    .rename(columns={'index': 'file_format'})
)
fig = px.bar(
    file_df,
    x='file_format',
    y='write_time',
    title='Write times for different file formats',
    # label the axes with units (write_data returns seconds)
    labels={'file_format': 'File format', 'write_time': 'Write time (seconds)'},
)
fig.show()

In [24]:
# Plot file size, read time and write time together as a grouped bar chart.
# Size and time are in different units, so size goes on the primary y-axis
# and both timings share the secondary y-axis.
file_df = (
    pd.DataFrame(file_dict)
    .T
    .reset_index()
    .rename(columns={'index': 'file_format'})
)

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# One bar trace per metric: (column, legend name, use secondary y-axis?).
# Each trace gets its own offsetgroup so the bars for a format sit side by
# side even though they are attached to different y-axes.
metric_traces = [
    ('size', "Size", False),
    ('read_time', "Read Time", True),
    ('write_time', "Write Time", True),
]
for offset_group, (column, legend_name, on_secondary) in enumerate(metric_traces):
    fig.add_trace(
        go.Bar(
            x=file_df['file_format'],
            y=file_df[column],
            name=legend_name,
            offsetgroup=offset_group,
        ),
        secondary_y=on_secondary,
    )

# Add figure title
fig.update_layout(
    title_text="File size, read and write times for different file formats",
    barmode='group'
)

# Set x-axis title
fig.update_xaxes(title_text="File Format")

# Set y-axes titles, including units so the two scales can be told apart
fig.update_yaxes(title_text="Size (bytes)", secondary_y=False)
fig.update_yaxes(title_text="Time (seconds)", secondary_y=True)

# Show the figure
fig.show()

In [26]:
# Clean up the temporary files created by the write benchmark.
# Files that are already gone are skipped so this cell can be re-run (or run
# after a partially failed write step) without raising FileNotFoundError.
for file_format in ['csv', 'parquet', 'feather', 'hdf', 'pkl']:
    test_path = os.path.join(DATA_DIR, f'trade_data_2022_test.{file_format}')
    if os.path.isfile(test_path):
        os.remove(test_path)