In [55]:
# NOTE(review): `%cd ..` is relative to the current working directory, so re-running
# this cell moves up one more level each time; run it once per kernel session.
%cd ..

# NOTE(review): np and partial are not used in the cells visible here; confirm they are needed later.
import numpy as np
from functools import partial


# Relative path to the flights Parquet file (one million rows, per the file name);
# it is resolved against the working directory set by `%cd ..` above.
DATA_URI = "assets/flights_1m.parquet"

/home/pwd/Codes


In [2]:
# Pandas

import pandas as pd

In [3]:
%%time

# Pandas does not natively support lazy loading, so the data has to be loaded eagerly here.
flights_pd = pd.read_parquet(DATA_URI)

CPU times: user 101 ms, sys: 33.9 ms, total: 135 ms
Wall time: 120 ms


In [4]:
# Polars

import polars as pl

In [5]:
%%time

# Polars supports both an eager read (pl.read_parquet) and a lazy scan (pl.scan_parquet);
# the lazy scan used here is what sets it apart.
flights_pl = pl.scan_parquet(DATA_URI)

CPU times: user 425 µs, sys: 416 µs, total: 841 µs
Wall time: 824 µs


In [6]:
# ibis

import ibis

In [8]:
%%time

# Ibis can be used much like Pandas and Polars: read the Parquet file straight into an Ibis table.
flights_ib = ibis.read_parquet(DATA_URI)

CPU times: user 651 ms, sys: 5.64 ms, total: 656 ms
Wall time: 635 ms


In [14]:
%%time

# But its power comes from using backends, e.g. DuckDB, Postgres, Snowflake, etc.
# Ibis changes the game by leaving data where it is. For this example, we use an in-memory DuckDB database.
connection = (
    ibis.duckdb.connect()  # backend kind (could be snowflake, postgres, etc.); no arguments -> in-memory database
)
connection.register(DATA_URI, table_name="flights_ib")  # we can register more sources

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 100 ms


In [15]:
%%time

# Look up the table registered under the name "flights_ib" on the DuckDB connection.
# NOTE(review): the variable is spelled `fligths_ib` (transposed "ht"), so it is a different
# name from the `flights_ib` created with ibis.read_parquet; later cells use both spellings.
fligths_ib = connection.table("flights_ib")

CPU times: user 86.2 ms, sys: 4.57 ms, total: 90.7 ms
Wall time: 68.3 ms


In [18]:
# Print the SQL for the DuckDB-backed table expression, rendered in the Postgres dialect.
ibis.show_sql(fligths_ib, dialect="postgres")

SELECT
  t0.index,
  t0.date,
  t0.delay,
  t0.distance,
  t0.origin,
  t0.destination
FROM flights_ib AS t0


In [29]:
%%time
# Time evaluating the bare table expression on its own.
fligths_ib

CPU times: user 16 µs, sys: 4 µs, total: 20 µs
Wall time: 39.8 µs


In [25]:
%%time

# Show the first rows of the pandas DataFrame that was loaded eagerly above.
flights_pd.head()

CPU times: user 177 µs, sys: 44 µs, total: 221 µs
Wall time: 289 µs


Unnamed: 0,index,date,delay,distance,origin,destination
0,0,2001-02-14 15:12:00,8,342,SJC,SNA
1,1,2001-01-22 09:50:00,-10,601,PHX,RNO
2,2,2001-01-17 10:59:00,4,181,DAL,OKC
3,3,2001-01-14 16:20:00,4,630,PDX,SLC
4,4,2001-03-14 10:50:00,-3,223,BUR,LAS


In [26]:
%%time

# head() on the lazy Polars frame only extends the query; no rows are produced until .collect() is called.
flights_pl.head()

CPU times: user 16 µs, sys: 4 µs, total: 20 µs
Wall time: 25.3 µs


In [30]:
%%time
# collect() runs the lazy query and materializes the first rows as a DataFrame.
flights_pl.head().collect()

CPU times: user 5.99 ms, sys: 1.51 ms, total: 7.49 ms
Wall time: 1.59 ms


index,date,delay,distance,origin,destination
i64,datetime[ns],i64,i64,str,str
0,2001-02-14 15:12:00,8,342,"""SJC""","""SNA"""
1,2001-01-22 09:50:00,-10,601,"""PHX""","""RNO"""
2,2001-01-17 10:59:00,4,181,"""DAL""","""OKC"""
3,2001-01-14 16:20:00,4,630,"""PDX""","""SLC"""
4,2001-03-14 10:50:00,-3,223,"""BUR""","""LAS"""


In [33]:
# Print the SQL generated for head() on the table created by ibis.read_parquet (no dialect passed).
ibis.show_sql(flights_ib.head())

SELECT
  t0.index,
  t0.date,
  t0.delay,
  t0.distance,
  t0.origin,
  t0.destination
FROM _ibis_read_parquet_gukrwxqo7vd2rkakxjuqzje37y AS t0
LIMIT 5


In [36]:
# execute() runs the expression on the backend and returns the resulting rows for display.
flights_ib.head().execute()

Unnamed: 0,index,date,delay,distance,origin,destination
0,0,2001-02-14 15:12:00,8,342,SJC,SNA
1,1,2001-01-22 09:50:00,-10,601,PHX,RNO
2,2,2001-01-17 10:59:00,4,181,DAL,OKC
3,3,2001-01-14 16:20:00,4,630,PDX,SLC
4,4,2001-03-14 10:50:00,-3,223,BUR,LAS
