In [1]:
%cd ..

# Relative path to the Parquet file with the flights data
# (the file name suggests roughly one million rows).
# NOTE: the `%cd ..` above moves the kernel's working directory up one level
# every time this cell runs, so re-running it without restarting the kernel
# changes what this relative path resolves to.

DATA_URI = "assets/flights_1m.parquet"

/home/pwd/Codes/hadithi/dev.io


In [2]:
# Pandas

import pandas as pd

In [3]:
%%time

# Pandas does not natively support lazy loading. So we have to eagerly load data
flights_pd = pd.read_parquet(
   DATA_URI 
)

CPU times: user 101 ms, sys: 33.9 ms, total: 135 ms
Wall time: 120 ms


In [4]:
# Polars

import polars as pl

In [5]:
%%time

# Polars supports both eager read, pl.read_parquet, but the lazy load is what sets it apart
flights_pl = pl.scan_parquet(
   DATA_URI 
)

CPU times: user 425 µs, sys: 416 µs, total: 841 µs
Wall time: 824 µs


In [6]:
# ibis

import ibis

In [8]:
%%time

# Ibis can be used as Pandas and Polars
flights_ib = ibis.read_parquet(
    DATA_URI
)

CPU times: user 651 ms, sys: 5.64 ms, total: 656 ms
Wall time: 635 ms


In [14]:
%%time

# But it power comes from using backends e.g. DuckDB, Postgres, Snowflake etc
# ibis changes the game by leaving data where it is. For this example, we would use in-memory duckdb database
connection = (ibis
              .duckdb # kind (could be snowflake, postgres etc
              .connect()  # in-memory database 
)
connection.register(DATA_URI, table_name="flights_ib") #we can register more sources
         

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 100 ms


In [15]:
%%time

# Fetch a table expression for the name "flights_ib" from the connection.
# NOTE(review): the variable is spelled `fligths_ib` (sic). Because of that it
# does not rebind the `flights_ib` created earlier with ibis.read_parquet, and a
# later cell refers to this exact spelling -- rename both together if fixing it.
fligths_ib = connection.table("flights_ib")

CPU times: user 86.2 ms, sys: 4.57 ms, total: 90.7 ms
Wall time: 68.3 ms


In [17]:
# Print the SQL that Ibis generates for this table expression.
# `ibis.to_sql` supersedes the deprecated `ibis.show_sql`; wrapping it in
# print() keeps the same plain-text output.
print(ibis.to_sql(fligths_ib))

SELECT
  t0.index,
  t0.date,
  t0.delay,
  t0.distance,
  t0.origin,
  t0.destination
FROM flights_ib AS t0
