In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

def filesize_mb(file):
    ''' helper function to display file size in Megabytes'''
    stats = Path(file).stat()
    return f'{stats.st_size / 1_000_000:.0f} MB'

get file size of household_power_consumption.csv

In [2]:
filesize_mb('household_power_consumption.csv')

'133 MB'

prepare the dataframe

In [3]:
df = pd.read_csv('household_power_consumption.csv', delimiter=';')

  df = pd.read_csv('household_power_consumption.csv', delimiter=';')


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    object 
 3   Global_reactive_power  object 
 4   Voltage                object 
 5   Global_intensity       object 
 6   Sub_metering_1         object 
 7   Sub_metering_2         object 
 8   Sub_metering_3         float64
dtypes: float64(1), object(8)
memory usage: 142.5+ MB


perform some simple cleaning

In [5]:
df.replace('?', np.nan, inplace=True)
df['Global_active_power'] = pd.to_numeric(df['Global_active_power'], errors='coerce')
df['Global_reactive_power'] = pd.to_numeric(df['Global_reactive_power'], errors='coerce')
df['Voltage'] = pd.to_numeric(df['Voltage'], errors='coerce')
df['Global_intensity'] = pd.to_numeric(df['Global_intensity'], errors='coerce')
df['Sub_metering_1'] = pd.to_numeric(df['Sub_metering_1'], errors='coerce')
df['Sub_metering_2'] = pd.to_numeric(df['Sub_metering_2'], errors='coerce')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    float64
 3   Global_reactive_power  float64
 4   Voltage                float64
 5   Global_intensity       float64
 6   Sub_metering_1         float64
 7   Sub_metering_2         float64
 8   Sub_metering_3         float64
dtypes: float64(7), object(2)
memory usage: 142.5+ MB


## Benchmarks

write and read CSV

Write the cleaned DataFrame to a CSV file and time it

In [7]:
%%timeit
df.to_csv('hh.csv')

3.93 s ± 39.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


file size is significantly smaller after cleaning

In [9]:
filesize_mb('hh.csv')

'126 MB'

In [10]:
%%timeit
df = pd.read_csv('hh.csv')

592 ms ± 3.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


write and read JSON

In [11]:
%%timeit
df.to_json('hh.json')

1.13 s ± 6.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
filesize_mb('hh.json')

'300 MB'

In [13]:
%%timeit
df = pd.read_json('hh.json')

8.82 s ± 251 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


write and read Excel

Excel fails to write for a DataFrame of this size

In [166]:
df.to_excel('hh.xlxs')

ValueError: This sheet is too large! Your sheet size is: 2075259, 9 Max sheet size is: 1048576, 16384

write and read Feather

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    float64
 3   Global_reactive_power  float64
 4   Voltage                float64
 5   Global_intensity       float64
 6   Sub_metering_1         float64
 7   Sub_metering_2         float64
 8   Sub_metering_3         float64
dtypes: float64(7), object(2)
memory usage: 142.5+ MB


In [15]:
%%timeit
df.to_feather('hh.feather')

111 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
filesize_mb('hh.feather')

'42 MB'

In [17]:
%%timeit
df = pd.read_feather('hh.feather')

62.8 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


write and read Parquet

In [18]:
%%timeit
df.to_parquet('hh.parquet')

319 ms ± 2.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
filesize_mb('hh.parquet')

'10 MB'

In [20]:
%%timeit
df = pd.read_parquet('hh.parquet')

59.4 ms ± 487 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
