## Initialization

In [1]:
import platform
import cpuinfo
import sys

import polars as pl
import pyarrow as pa
import pandas as pd
import numba as nb

import xlsx2csv
import openpyxl
import pyxlsb

import ordered_set
import recordclass
import tqdm as _tqdm
import lxml

import memory_profiler

from rxls import xl_scan
from tqdm import tqdm

# Hardware details as reported by py-cpuinfo (a plain dict).
cpu: dict = cpuinfo.get_cpu_info()

# Print a fixed-width table (nominally 122 characters per line) with the platform
# and the version of every package involved, so the timings recorded further down
# can be tied to a concrete environment.
print(
    f'[{"=":=^120}]',
    f'|{" benchmark environment used ":^120}|',
    f'[{" product ":=^24}][{" version ":=^94}]',
    *[
        f'|{f" {name} ":^24}||{f" {version} ":^94}|' for name, version in {
            'Platform': platform.platform(),
            'CPU': platform.processor(),
            # py-cpuinfo may omit the frequency fields on platforms where it cannot
            # determine them; fall back to "n/a" instead of raising KeyError.
            'CPU HZ': f'{cpu.get("hz_advertised_friendly", "n/a")} (actual: {cpu.get("hz_actual_friendly", "n/a")})',
            'Python': sys.version,
            'PyArrow': pa.__version__,
            'Polars': pl.__version__,
            'Pandas': pd.__version__,
            'Numba': nb.__version__,
            'XLSX2CSV': xlsx2csv.__version__,
            'OpenPyXL': openpyxl.__version__,
            'PyXLSB': pyxlsb.__version__,
            'tqdm': _tqdm.__version__,
            'ordered-set': ordered_set.__version__,
            'recordclass': recordclass.__version__,
            'lxml': lxml.__version__,
            # CPUINFO_VERSION is a tuple of version components, hence the join.
            'cpuinfo': '.'.join(map(str, cpuinfo.CPUINFO_VERSION)),
            'memory_profiler': memory_profiler.__version__
        }.items()
    ],
    f'[{"=":=^120}]',
    sep='\n'
)

|                                               benchmark environment used                                               |
|        Platform        ||                                  Windows-10-10.0.19045-SP0                                   |
|          CPU           ||                      AMD64 Family 23 Model 24 Stepping 1, AuthenticAMD                       |
|         CPU HZ         ||                               2.0960 GHz (actual: 2.1000 GHz)                                |
|         Python         ||       3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)]       |
|        PyArrow         ||                                            13.0.0                                            |
|         Polars         ||                                            0.19.3                                            |
|         Pandas         ||                                            2.1.0                                             |
|         Numba 

## DataSource:

Our World In Data: COVID-19

https://ourworldindata.org/explorers/coronavirus-data-explorer?zoomToSelection=true&time=2020-03-01..latest&facet=none&country=USA~GBR~CAN~DEU~ITA~IND&pickerSort=asc&pickerMetric=location&Metric=Confirmed+cases&Interval=7-day+rolling+average&Relative+to+Population=true&Color+by+test+positivity=false

**I downloaded this dataset and saved it with MS Excel in both .xlsx and .xlsb formats.**

In [2]:
# Default workbook paths used by the scan_* helpers below (relative to the
# notebook's working directory): the same dataset saved as .xlsx and as .xlsb.
XLSX_FILE = 'covid.xlsx'
XLSB_FILE = 'covid.xlsb'

In [3]:
from IPython.core.display import display_html

def show_df(df: pl.DataFrame) -> None:
    """Print the frame's dimensions, then render a preview of its first rows.

    Args:
        df: frame to summarise; its ``height``, ``width`` and ``head()`` are used.
    """
    preview = df.head()
    print(f'rows: {df.height} columns: {df.width}')
    display_html(preview)

### RXLS. Scan both xlsx and xlsb (*plain = as in pyxlsb (`inferring='no'`)*)

In [4]:
def scan_rxls_xlsx_plain(path: str = XLSX_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsx workbook through rxls with ``inferring='no'``.

    Args:
        path: workbook to read; defaults to ``XLSX_FILE``.
        tq: forwarded to ``xl_scan`` as ``with_tqdm``.

    Returns:
        The result of ``xl_scan`` (annotated as a polars DataFrame).
    """
    options = dict(head=True, with_tqdm=tq, inferring='no')
    return xl_scan(path, **options)

def scan_rxls_xlsb_plain(path: str = XLSB_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsb workbook through rxls with ``inferring='no'``.

    Args:
        path: workbook to read; defaults to ``XLSB_FILE``.
        tq: forwarded to ``xl_scan`` as ``with_tqdm``.

    Returns:
        The result of ``xl_scan`` (annotated as a polars DataFrame).
    """
    options = dict(head=True, with_tqdm=tq, inferring='no')
    return xl_scan(path, **options)

def scan_rxls_xlsx(path: str = XLSX_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsx workbook through rxls, leaving ``inferring`` at its default.

    Args:
        path: workbook to read; defaults to ``XLSX_FILE``.
        tq: forwarded to ``xl_scan`` as ``with_tqdm``.

    Returns:
        The result of ``xl_scan`` (annotated as a polars DataFrame).
    """
    options = dict(head=True, with_tqdm=tq)
    return xl_scan(path, **options)

def scan_rxls_xlsb(path: str = XLSB_FILE, tq: bool = False) -> pl.DataFrame:
    """Scan the .xlsb workbook through rxls, leaving ``inferring`` at its default.

    Args:
        path: workbook to read; defaults to ``XLSB_FILE``.
        tq: forwarded to ``xl_scan`` as ``with_tqdm``.

    Returns:
        The result of ``xl_scan`` (annotated as a polars DataFrame).
    """
    options = dict(head=True, with_tqdm=tq)
    return xl_scan(path, **options)

In [9]:
# Preview the .xlsx scan done with inferring='no'; every column is str in the output below.
show_df(scan_rxls_xlsx_plain())
# Recorded run time: 4m 35.5s (341376 rows, 67 columns)

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""43833""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43834""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43835""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43836""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43837""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


In [10]:
# Preview the .xlsb scan done with inferring='no'; every column is str in the output below.
show_df(scan_rxls_xlsb_plain())
# Recorded run time: 11m 1.5s (341376 rows, 67 columns)

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""43833""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43834""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43835""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43836""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43837""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


In [11]:
# Preview the .xlsx scan with no `inferring` override; the output below has typed
# columns (date, i64, f64) instead of plain strings.
show_df(scan_rxls_xlsx())
# Recorded run time: 4m 30.7s (341376 rows, 67 columns)

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,date,i64,i64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,f64,i64,f64,i64,f64,i64,i64,f64,f64,i64,f64,f64,f64,str,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64
"""AFG""","""Asia""","""Afghanistan""",2020-01-03,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-04,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-05,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-06,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-07,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,


In [12]:
# NOTE: for the .xlsb file only `date` and a handful of columns get a non-string
# dtype (see the schema row in the output below); most numeric columns stay str.
# This is not critical for the benchmark.
show_df(scan_rxls_xlsb())
# Recorded run time: 10m 29.5s (341376 rows, 67 columns)

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,date,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str,str,f64,str,str,str,str,f64,str,str
"""AFG""","""Asia""","""Afghanistan""",2020-01-03,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-04,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-05,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-06,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-07,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,


### XLSX2CSV case: xlsx -> csv -> polars.read_csv()

In [5]:
def scan_xlsx2csv(path: str = XLSX_FILE) -> pl.DataFrame:
    """Load the .xlsx workbook with polars' ``read_excel`` using the xlsx2csv engine.

    Args:
        path: workbook to read; defaults to ``XLSX_FILE``.

    Returns:
        The frame produced by ``pl.read_excel``.
    """
    frame = pl.read_excel(path, engine='xlsx2csv')
    return frame

In [13]:
# NOTE: This works, but the `date` column comes back as a string (e.g. "01-03-20")
# instead of a date dtype, and the strings are not ISO-formatted dates.
show_df(scan_xlsx2csv())
# Recorded run time: 4m 41.7s  (341376 rows, 67 columns)

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""01-03-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-04-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-05-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-06-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-07-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,


### OpenPyXL: Scan xlsx only

In [6]:
def scan_openpyxl(path: str = XLSX_FILE) -> pl.DataFrame:
    """Load the .xlsx workbook with polars' ``read_excel`` using the openpyxl engine.

    Args:
        path: workbook to read; defaults to ``XLSX_FILE``.

    Returns:
        The frame produced by ``pl.read_excel``.
    """
    frame = pl.read_excel(path, engine='openpyxl')
    return frame

In [14]:
# NOTE: the median_age and diabetes_prevalence columns come back empty (null)
# in the rows shown below, although the other readers return values for them.
show_df(scan_openpyxl())
# Recorded run time: 22m 51.3s

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,datetime[μs],str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""",2020-01-03 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-04 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-05 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-06 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-07 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


### PyXLSB: Scan xlsb only

In [7]:
def scan_pyxlsb(path: str = XLSB_FILE, tq: bool = False) -> pl.DataFrame:
    """Read the first worksheet of an .xlsb workbook with pyxlsb into an all-string frame.

    The first row supplies the column names (cells without a value become
    ``'Unnamed: <index>'``); every remaining cell is converted with ``str()``,
    with missing cells and empty strings mapped to ``None``.

    NOTE: pyxlsb does not recognise dates automatically. Converting them by hand
    is possible, but requires parsing ``xl/styles.bin``.

    Args:
        path: workbook to read; defaults to ``XLSB_FILE``.
        tq: when true, wrap the row iterator in ``tqdm`` to show progress.

    Returns:
        A polars DataFrame whose columns are all ``pl.Utf8``; an empty DataFrame
        when the sheet contains no rows at all.
    """
    def as_text(value):
        # Only a missing value or an empty string means "no data". A plain
        # truthiness test would also discard legitimate 0, 0.0 and False cells.
        return None if value is None or value == '' else str(value)

    with pyxlsb.open_workbook(path) as wb:
        # pyxlsb worksheets are context managers; close the sheet explicitly
        # instead of leaving it to garbage collection.
        with wb.get_sheet(1) as sheet:
            row_it = sheet.rows(True)

            first_row = next(row_it, None)
            if first_row is None:
                # No rows at all: there is nothing to build a header from.
                return pl.DataFrame()

            head = []
            for idx, cell in enumerate(first_row):
                label = as_text(cell.v)
                head.append(f'Unnamed: {idx}' if label is None else label)
            schema = dict.fromkeys(head, pl.Utf8)

            data_it = ([as_text(cell.v) for cell in row] for row in row_it)
            if tq:
                data_it = tqdm(data_it)

            # Materialise the rows while the sheet is still open.
            return pl.DataFrame(list(data_it), schema, orient='row')

In [16]:
# Preview the pyxlsb scan; scan_pyxlsb declares every column as pl.Utf8, so dates
# show up as serial numbers in string form (e.g. "43833.0") in the output below.
show_df(scan_pyxlsb())

rows: 341376 columns: 67


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""43833.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43834.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43835.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43836.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43837.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


## Benchmarks

### Timeit

*For these tests, I restarted the kernel between runs for better consistency.*

In [12]:
%%time
# The whole workbook is still read; .head() only limits what is displayed.
scan_xlsx2csv().head()

CPU times: total: 4min 57s
Wall time: 5min 5s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""01-03-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-04-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-05-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-06-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,
"""AFG""","""Asia""","""Afghanistan""","""01-07-20""",,0.0,,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [13]:
%%time
# The whole workbook is still read; .head() only limits what is displayed.
scan_openpyxl().head()

CPU times: total: 17min 56s
Wall time: 25min 18s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,datetime[μs],str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""",2020-01-03 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-04 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-05 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-06 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-07 00:00:00,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""",,"""2.581""","""1.337""","""1803.987""",,"""597.029""",,,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


In [14]:
%%time
scan_pyxlsb().head()

CPU times: total: 6min 2s
Wall time: 6min 11s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""43833.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43834.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43835.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43836.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43837.0""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


In [15]:
%%time
scan_rxls_xlsx().head()

CPU times: total: 4min 16s
Wall time: 4min 15s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,date,i64,i64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,f64,i64,f64,i64,f64,i64,i64,f64,f64,i64,f64,f64,f64,str,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64
"""AFG""","""Asia""","""Afghanistan""",2020-01-03,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-04,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-05,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-06,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-07,,0,,,0,,,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.422,45095.0,2.581,1.337,1803.987,,597.029,21794.0,,,37.746,0.5,64.83,0.511,41128772,,,,


In [16]:
%%time
scan_rxls_xlsx_plain().head()

CPU times: total: 4min 3s
Wall time: 4min 1s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""43833""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43834""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43835""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43836""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43837""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


In [17]:
%%time
scan_rxls_xlsb().head()

CPU times: total: 8min 18s
Wall time: 8min 27s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,date,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str,str,f64,str,str,str,str,f64,str,str
"""AFG""","""Asia""","""Afghanistan""",2020-01-03,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-04,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-05,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-06,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""",2020-01-07,,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""",21794.0,,,"""37.746""",0.5,"""64.83""","""0.511""","""41128772.0""",,,,


In [18]:
%%time
scan_rxls_xlsb_plain().head()

CPU times: total: 8min 3s
Wall time: 8min 12s


iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""AFG""","""Asia""","""Afghanistan""","""43833""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43834""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43835""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43836""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,
"""AFG""","""Asia""","""Afghanistan""","""43837""",,"""0.0""",,,"""0.0""",,,"""0.0""",,,"""0.0""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""0.0""","""54.422""","""45095.0""","""2.581""","""1.337""","""1803.987""",,"""597.029""","""21794.0""",,,"""37.746""","""0.5""","""64.83""","""0.511""","""41128772.0""",,,,


### Memory Profiler

*For these tests, I restarted the kernel before each measurement, for better consistency (which is why every cell below shows the same execution count).*

In [8]:
%load_ext memory_profiler

In [9]:
%memit scan_rxls_xlsb()

peak memory: 1062.39 MiB, increment: 884.97 MiB


In [9]:
%memit scan_rxls_xlsx()

peak memory: 1197.21 MiB, increment: 1018.47 MiB


In [9]:
%memit scan_rxls_xlsb_plain()

peak memory: 1295.19 MiB, increment: 1117.53 MiB


In [9]:
%memit scan_rxls_xlsx_plain()

peak memory: 1347.00 MiB, increment: 1168.97 MiB


In [9]:
%memit scan_xlsx2csv()

peak memory: 713.02 MiB, increment: 533.98 MiB


In [9]:
%memit scan_openpyxl()

peak memory: 4611.20 MiB, increment: 4433.22 MiB


In [9]:
%memit scan_pyxlsb()

peak memory: 2666.74 MiB, increment: 2488.54 MiB


## Results 

### Middle-size: https://www.learningcontainer.com/download/sample-sales-data-excel-xls/

*(see previous commit of this file for details)*

### TimeIt:

- XLSB reading:
    1) RXLS `(plain: 2.51 s ± 42.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`
    2) PyXLSB `(2.58 s ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`
    3) RXLS `(basic: 2.75 s ± 350 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`


- XLSX reading:
    1) RXLS `(2.55 s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))` *plain ~ basic*
    2) XLSX2CSV `(2.82 s ± 266 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`
    3) OpenPyXL `(2.86 s ± 64.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each))`

### Memory profiler

- XLSB reading:
    1) RXLS `(plain: peak memory: 194.72 MiB, increment: 15.71 MiB)`
    2) PyXLSB `(peak memory: 200.15 MiB, increment: 21.14 MiB)`
    3) RXLS `(basic: peak memory: 202.35 MiB, increment: 23.70 MiB)`


- XLSX reading:
    1) RXLS `(plain: peak memory: 195.05 MiB, increment: 16.29 MiB)`
    2) XLSX2CSV `(peak memory: 206.03 MiB, increment: 18.24 MiB)`
    3) RXLS `(basic: peak memory: 200.18 MiB, increment: 21.16 MiB)`
    4) OpenPyXL `(peak memory: 268.98 MiB, increment: 90.93 MiB)` 

### Large size (OWID)

### Time

* XLSB reading:
    1. PyXLSB *(my workaround :) )*: `CPU times: total: 6min 2s; Wall time: 6min 11s`
    2. RXLS: `plain: CPU times: total: 8min 3s; Wall time: 8min 12s`

* XLSX reading:
    1. RXLS: `plain: CPU times: total: 4min 3s; Wall time: 4min 1s`
    2. RXLS: `basic: CPU times: total: 4min 16s; Wall time: 4min 15s`
    3. XLSX2CSV: `CPU times: total: 4min 57s; Wall time: 5min 5s`
    4. OpenPyXL: `CPU times: total: 17min 56s; Wall time: 25min 18s`

### Memory profiler

* XLSB reading:
    1. RXLS: `basic: peak memory: 1062.39 MiB, increment: 884.97 MiB`
    2. RXLS: `plain: peak memory: 1295.19 MiB, increment: 1117.53 MiB`
    3. PyXLSB: `peak memory: 2666.74 MiB, increment: 2488.54 MiB`

* XLSX reading:
    1. XLSX2CSV: `peak memory: 713.02 MiB, increment: 533.98 MiB`
    2. RXLS: `basic: peak memory: 1197.21 MiB, increment: 1018.47 MiB`
    3. RXLS: `plain: peak memory: 1347.00 MiB, increment: 1168.97 MiB`
    4. OpenPyXL: `peak memory: 4611.20 MiB, increment: 4433.22 MiB`

*XLSB parsing is ~2x slower than XLSX parsing in RXLS (but there are still many opportunities for optimisation in both parsers).*

*The memory footprint of OpenPyXL is very large - it took all of my RAM for a 97 MB xlsx file.*

*Maybe some optimisations are missing in the Polars openpyxl adapter: `read-only workbooks, data-only mode` (?)*