## Initialization

In [1]:
import platform
import cpuinfo
import sys

import polars as pl
import pyarrow as pa
import pandas as pd
import numba as nb

import xlsx2csv
import openpyxl
import pyxlsb

import ordered_set
import recordclass
import tqdm as _tqdm
import lxml

import memory_profiler

from rxls import xl_scan
from tqdm import tqdm

# Query the CPU details once; the two frequency fields are reused in the table below.
cpu: dict = cpuinfo.get_cpu_info()

# Product name -> version string, listed in the order the rows are printed.
environment = {
    'Platform': platform.platform(),
    'CPU': platform.processor(),
    'CPU HZ': f'{cpu["hz_advertised_friendly"]} (actual: {cpu["hz_actual_friendly"]})',
    'Python': sys.version,
    'PyArrow': pa.__version__,
    'Polars': pl.__version__,
    'Pandas': pd.__version__,
    'Numba': nb.__version__,
    'XLSX2CSV': xlsx2csv.__version__,
    'OpenPyXL': openpyxl.__version__,
    'PyXLSB': pyxlsb.__version__,
    'tqdm': _tqdm.__version__,
    'ordered-set': ordered_set.__version__,
    'recordclass': recordclass.__version__,
    'lxml': lxml.__version__,
    'cpuinfo': '.'.join(map(str, cpuinfo.CPUINFO_VERSION)),
    'memory_profiler': memory_profiler.__version__,
}

# Full-width rule used as both the first and the last line of the table.
banner_rule = f'[{"=":=^120}]'

print(
    banner_rule,
    f'|{" benchmark environment used ":^120}|',
    f'[{" product ":=^24}][{" version ":=^94}]',
    *[
        # Pad each value with one space per side, then centre it in its column.
        '|' + format(f' {name} ', '^24') + '||' + format(f' {version} ', '^94') + '|'
        for name, version in environment.items()
    ],
    banner_rule,
    sep='\n'
)

|                                               benchmark environment used                                               |
|        Platform        ||                                  Windows-10-10.0.19045-SP0                                   |
|          CPU           ||                      AMD64 Family 23 Model 24 Stepping 1, AuthenticAMD                       |
|         CPU HZ         ||                               2.0960 GHz (actual: 2.1000 GHz)                                |
|         Python         ||       3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)]       |
|        PyArrow         ||                                            13.0.0                                            |
|         Polars         ||                                            0.19.3                                            |
|         Pandas         ||                                            2.1.0                                             |
|         Numba 

## DataSource:

https://www.learningcontainer.com/download/sample-sales-data-excel-xls/

**I downloaded this file and saved it with MS Excel in both .xlsx and .xlsb formats under the same name.**

In [2]:
# The same sample workbook saved in two formats; both paths are relative to the working directory.
XLSX_FILE = 'Sample-sales-data-excel.xlsx'
XLSB_FILE = 'Sample-sales-data-excel.xlsb'

### RXLS. Scan both xlsx and xlsb (*plain = as in pyxlsb (`inferring='no'`)*)

In [3]:
def scan_rxls_xlsx_plain(path: str = XLSX_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsx workbook at `path` with RXLS in "plain" mode.

    "Plain" means `inferring='no'` is passed to `xl_scan`. `tq` is forwarded
    as `with_tqdm`, and `head=True` is always passed.
    """
    scan_options = {'head': True, 'with_tqdm': tq, 'inferring': 'no'}
    return xl_scan(path, **scan_options)

def scan_rxls_xlsb_plain(path: str = XLSB_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsb workbook at `path` with RXLS in "plain" mode.

    "Plain" means `inferring='no'` is passed to `xl_scan`. `tq` is forwarded
    as `with_tqdm`, and `head=True` is always passed.
    """
    scan_options = {'head': True, 'with_tqdm': tq, 'inferring': 'no'}
    return xl_scan(path, **scan_options)

def scan_rxls_xlsx(path: str = XLSX_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsx workbook at `path` with RXLS using its default inference.

    No `inferring` argument is passed, so `xl_scan` uses its own default.
    `tq` is forwarded as `with_tqdm`, and `head=True` is always passed.
    """
    scan_options = {'head': True, 'with_tqdm': tq}
    return xl_scan(path, **scan_options)

def scan_rxls_xlsb(path: str = XLSB_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsb workbook at `path` with RXLS using its default inference.

    No `inferring` argument is passed, so `xl_scan` uses its own default.
    `tq` is forwarded as `with_tqdm`, and `head=True` is always passed.
    """
    scan_options = {'head': True, 'with_tqdm': tq}
    return xl_scan(path, **scan_options)

In [7]:
scan_rxls_xlsx_plain().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""CA-2016-152156…","""42682""","""42685""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420""","""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …","""261.9599999999…","""2""","""0""","""41.91360000000…"
"""2""","""CA-2016-152156…","""42682""","""42685""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420""","""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…","""731.9399999999…","""3""","""0""","""219.5819999999…"
"""3""","""CA-2016-138688…","""42533""","""42537""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""","""90036""","""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …","""14.62""","""2""","""0""","""6.871399999999…"
"""4""","""US-2015-108966…","""42288""","""42295""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311""","""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…","""957.5774999999…","""5""","""0.45""","""-383.031000000…"
"""5""","""US-2015-108966…","""42288""","""42295""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311""","""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …","""22.36800000000…","""2""","""0.2""","""2.516399999999…"


In [8]:
scan_rxls_xlsb_plain().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1.0""","""CA-2016-152156…","""42682""","""42685""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420.0""","""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …","""261.96""","""2.0""","""0.0""","""41.9136"""
"""2.0""","""CA-2016-152156…","""42682""","""42685""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420.0""","""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…","""731.94""","""3.0""","""0.0""","""219.582"""
"""3.0""","""CA-2016-138688…","""42533""","""42537""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""","""90036.0""","""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …","""14.62""","""2.0""","""0.0""","""6.8714"""
"""4.0""","""US-2015-108966…","""42288""","""42295""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311.0""","""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…","""957.5775""","""5.0""","""0.45""","""-383.031"""
"""5.0""","""US-2015-108966…","""42288""","""42295""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311.0""","""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …","""22.368""","""2.0""","""0.2""","""2.5164"""


In [9]:
# NOTE: Known bug: numeric columns (e.g. Sales and Profit, and even Row ID) are scanned as utf-8 strings here, but it's not critical...
scan_rxls_xlsx().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
str,str,date,date,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""CA-2016-152156…",2016-11-08,2016-11-11,"""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420""","""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …","""261.9599999999…","""2""","""0""","""41.91360000000…"
"""2""","""CA-2016-152156…",2016-11-08,2016-11-11,"""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420""","""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…","""731.9399999999…","""3""","""0""","""219.5819999999…"
"""3""","""CA-2016-138688…",2016-06-12,2016-06-16,"""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""","""90036""","""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …","""14.62""","""2""","""0""","""6.871399999999…"
"""4""","""US-2015-108966…",2015-10-11,2015-10-18,"""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311""","""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…","""957.5774999999…","""5""","""0.45""","""-383.031000000…"
"""5""","""US-2015-108966…",2015-10-11,2015-10-18,"""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311""","""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …","""22.36800000000…","""2""","""0.2""","""2.516399999999…"


In [10]:
# NOTE: All columns have correct datatypes
scan_rxls_xlsb().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
i64,str,date,date,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…",2016-11-08,2016-11-11,"""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…",2016-11-08,2016-11-11,"""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582
3,"""CA-2016-138688…",2016-06-12,2016-06-16,"""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""",90036,"""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …",14.62,2,0.0,6.8714
4,"""US-2015-108966…",2015-10-11,2015-10-18,"""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""",33311,"""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…",957.5775,5,0.45,-383.031
5,"""US-2015-108966…",2015-10-11,2015-10-18,"""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""",33311,"""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …",22.368,2,0.2,2.5164


### XLSX2CSV case: xlsx -> csv -> polars.read_csv()

In [4]:
def scan_xlsx2csv(path: str = XLSX_FILE) -> pl.DataFrame:
    """Read the .xlsx workbook at `path` via polars with the 'xlsx2csv' engine."""
    frame = pl.read_excel(path, engine='xlsx2csv')
    return frame

In [11]:
# NOTE: Numeric columns are typed correctly, but the date columns are read as strings (MM-DD-YY), not as ISO dates.
scan_xlsx2csv().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
i64,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…","""11-08-16""","""11-11-16""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…","""11-08-16""","""11-11-16""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582
3,"""CA-2016-138688…","""06-12-16""","""06-16-16""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""",90036,"""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …",14.62,2,0.0,6.8714
4,"""US-2015-108966…","""10-11-15""","""10-18-15""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""",33311,"""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…",957.5775,5,0.45,-383.031
5,"""US-2015-108966…","""10-11-15""","""10-18-15""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""",33311,"""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …",22.368,2,0.2,2.5164


### OpenPyXL: Scan xlsx only

In [5]:
def scan_openpyxl(path: str = XLSX_FILE) -> pl.DataFrame:
    """Read the .xlsx workbook at `path` via polars with the 'openpyxl' engine."""
    frame = pl.read_excel(path, engine='openpyxl')
    return frame

In [12]:
# NOTE: The Postal Code column is read as str and its values are missing (null) in the output.
scan_openpyxl().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
i64,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…",2016-11-08 00:00:00,2016-11-11 00:00:00,"""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…",2016-11-08 00:00:00,2016-11-11 00:00:00,"""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582
3,"""CA-2016-138688…",2016-06-12 00:00:00,2016-06-16 00:00:00,"""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""",,"""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …",14.62,2,0.0,6.8714
4,"""US-2015-108966…",2015-10-11 00:00:00,2015-10-18 00:00:00,"""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""",,"""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…",957.5775,5,0.45,-383.031
5,"""US-2015-108966…",2015-10-11 00:00:00,2015-10-18 00:00:00,"""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""",,"""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …",22.368,2,0.2,2.5164


### PyXLSB: Scan xlsb only

In [6]:
def scan_pyxlsb(path: str = XLSB_FILE, tq: bool = False) -> pl.DataFrame:
    """Read the first sheet of the .xlsb workbook at `path` with pyxlsb.

    The first row yielded by the sheet is used as the header; a header cell
    with a falsy value is named ``'Unnamed: <column index>'``. Every other
    cell is converted with ``str()``, except empty cells (``None``), which are
    kept as nulls instead of becoming the literal string ``'None'``.

    Args:
        path: Path to the .xlsb workbook.
        tq: When true, wrap the row iterator in ``tqdm`` to show progress.

    Returns:
        A polars DataFrame in which every column is Utf8. An empty DataFrame
        is returned when the sheet yields no rows at all.
    """
    # Both the workbook and the worksheet are context managers; closing the
    # sheet too releases its underlying file handle.
    with pyxlsb.open_workbook(path) as wb, wb.get_sheet(1) as sheet:
        # True is passed positionally as in the original call
        # (presumably pyxlsb's `sparse` flag — TODO confirm).
        row_it = sheet.rows(True)

        first_row = next(row_it, None)
        if first_row is None:
            # Empty sheet: there is no header row to build a schema from.
            return pl.DataFrame()

        head = [
            str(cell.v) if cell.v else f'Unnamed: {idx}'
            for idx, cell in enumerate(first_row)
        ]
        # NOTE: pyxlsb does not recognize dates automatically; converting them
        # manually would require parsing xl/styles.bin.
        data_it = (
            [None if cell.v is None else str(cell.v) for cell in row] for row in row_it
        )
        if tq:
            data_it = tqdm(data_it)

        # Pin every column to Utf8 so that nulls in the leading rows cannot
        # mislead polars' row-based schema inference.
        schema = [(name, pl.Utf8) for name in head]
        return pl.DataFrame(list(data_it), schema=schema, orient='row')

In [48]:
scan_pyxlsb().head()

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1.0""","""CA-2016-152156…","""42682.0""","""42685.0""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420.0""","""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …","""261.96""","""2.0""","""0.0""","""41.9136"""
"""2.0""","""CA-2016-152156…","""42682.0""","""42685.0""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""","""42420.0""","""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…","""731.9399999999…","""3.0""","""0.0""","""219.5819999999…"
"""3.0""","""CA-2016-138688…","""42533.0""","""42537.0""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""","""90036.0""","""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …","""14.62""","""2.0""","""0.0""","""6.871399999999…"
"""4.0""","""US-2015-108966…","""42288.0""","""42295.0""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311.0""","""South""","""FUR-TA-1000057…","""Furniture""","""Tables""","""Bretford CR450…","""957.5775""","""5.0""","""0.45""","""-383.031000000…"
"""5.0""","""US-2015-108966…","""42288.0""","""42295.0""","""Standard Class…","""SO-20335""","""Sean O'Donnell…","""Consumer""","""United States""","""Fort Lauderdal…","""Florida""","""33311.0""","""South""","""OFF-ST-1000076…","""Office Supplie…","""Storage""","""Eldon Fold 'N …","""22.36800000000…","""2.0""","""0.2""","""2.516399999999…"


## Benchmarks

### Timeit

*I ran these tests without restarting the kernel between them.*

In [8]:
%%timeit
scan_xlsx2csv().head()

2.82 s ± 266 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
scan_openpyxl().head()

2.86 s ± 64.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
scan_pyxlsb().head()

2.58 s ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
scan_rxls_xlsx().head()

2.55 s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit
scan_rxls_xlsx_plain().head()

2.6 s ± 87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
scan_rxls_xlsb().head()

2.75 s ± 350 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
scan_rxls_xlsb_plain().head()

2.51 s ± 42.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Memory Profiler

*For these tests, I restarted the kernel between runs for better consistency.*

In [7]:
%load_ext memory_profiler

In [8]:
%memit scan_rxls_xlsb()

peak memory: 202.35 MiB, increment: 23.70 MiB


In [8]:
%memit scan_rxls_xlsx()

peak memory: 200.18 MiB, increment: 21.16 MiB


In [8]:
%memit scan_rxls_xlsb_plain()

peak memory: 194.72 MiB, increment: 15.71 MiB


In [8]:
%memit scan_rxls_xlsx_plain()

peak memory: 195.05 MiB, increment: 16.29 MiB


In [9]:
%memit scan_xlsx2csv()

peak memory: 206.03 MiB, increment: 18.24 MiB


In [8]:
%memit scan_openpyxl()

peak memory: 268.98 MiB, increment: 90.93 MiB


In [8]:
%memit scan_pyxlsb()

peak memory: 200.15 MiB, increment: 21.14 MiB


## Results:

### TimeIt:

- XLSB reading:
    1) RXLS `(plain: 2.51 s ± 42.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`
    2) PyXLSB `(2.58 s ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`
    3) RXLS `(basic: 2.75 s ± 350 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`


- XLSX reading:
    1) RXLS `(2.55 s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))` *plain ~ basic*
    2) XLSX2CSV `(2.82 s ± 266 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`
    3) OpenPyXL `(2.86 s ± 64.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`

### Memory profiler (ranked by memory increment)

- XLSB reading:
    1) RXLS `(plain: peak memory: 194.72 MiB, increment: 15.71 MiB)`
    2) PyXLSB `(peak memory: 200.15 MiB, increment: 21.14 MiB)`
    3) RXLS `(basic: peak memory: 202.35 MiB, increment: 23.70 MiB)`


- XLSX reading:
    1) RXLS `(plain: peak memory: 195.05 MiB, increment: 16.29 MiB)`
    2) XLSX2CSV `(peak memory: 206.03 MiB, increment: 18.24 MiB)`
    3) RXLS `(basic: peak memory: 200.18 MiB, increment: 21.16 MiB)`
    4) OpenPyXL `(peak memory: 268.98 MiB, increment: 90.93 MiB)` 