# Partition Pruning

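`pfeed` supports partition pruning for parquet files: the data directory uses a hive-style partitioned layout, so a query engine that filters on a partition key (for example `product_type`) can skip the partitions that cannot match instead of reading every file.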

In [1]:
import pfeed as pe

# Read pfeed's configuration and keep its data path; every query engine in
# the cells below is pointed at this directory.
config = pe.get_config()
data_path = config.data_path

## Using PyArrow

In [None]:
import pyarrow.dataset as ds

# Open the data directory as a parquet dataset with hive-style partitioning.
dataset = ds.dataset(data_path, partitioning="hive", format="parquet")

# Keep only the rows whose product_type is "PERP", then materialize the
# result as a pandas DataFrame (the cell's displayed value).
perp_filter = ds.field("product_type") == "PERP"
dataset.filter(perp_filter).to_table().to_pandas()

## Using Polars

In [39]:
import polars as pl

# Lazily scan every parquet file under the data path, with hive partitioning
# enabled in the scan.
lf = pl.scan_parquet(f'{data_path}/**/*.parquet', hive_partitioning=True)

# Filter by product type. The column is cast to String on the LazyFrame
# before it is compared with the literal.
is_perp = pl.col("product_type").cast(pl.String) == "PERP"
df = lf.filter(is_perp).collect()
df

date,resolution,product,symbol,open,high,low,close,volume,data_layer,data_domain,env,data_source,data_origin,product_type,year,month,day
datetime[μs],str,str,str,f64,f64,f64,f64,f64,str,str,str,str,str,str,i64,i64,i64
2025-01-01 00:00:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",93530.0,93590.8,93501.3,93590.5,30.284,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,1,1
2025-01-01 00:01:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",93590.5,93627.7,93571.8,93625.0,30.334,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,1,1
2025-01-01 00:02:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",93625.1,93672.8,93607.2,93672.8,32.049,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,1,1
2025-01-01 00:03:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",93672.8,93672.8,93606.1,93649.9,36.541,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,1,1
2025-01-01 00:04:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",93650.0,93650.0,93618.0,93630.1,17.922,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-02-02 23:55:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",97538.3,97648.0,97499.6,97623.5,40.119,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,2,2
2025-02-02 23:56:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",97623.5,97696.0,97600.1,97696.0,39.314,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,2,2
2025-02-02 23:57:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",97696.0,97762.9,97630.0,97732.8,67.428,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,2,2
2025-02-02 23:58:00,"""1m""","""BTC_USDT_PERP""","""BTCUSDT""",97732.9,97736.4,97670.0,97670.0,22.012,"""cleaned""","""market_data""","""BACKTEST""","""BYBIT""","""BYBIT""","""PERP""",2025,2,2


## Using DuckDB

In [2]:
import duckdb

# Filter on the partition key `product_type` (the same key the PyArrow and
# Polars cells use) so the query actually exercises partition pruning: per the
# DuckDB hive-partitioning docs, predicates on partition keys are pushed down
# to the file listing. `volume` is a regular data column, so that predicate
# only filters rows inside the files that remain.
# NOTE(review): re-run this cell to refresh the displayed output; the row count
# may change if the data path holds product types other than PERP.
query = f"""
SELECT * FROM '{data_path}/**/*.parquet'
WHERE product_type = 'PERP'
  AND volume > 100
"""
df = duckdb.sql(query).df()
df

Unnamed: 0,date,resolution,product,symbol,open,high,low,close,volume,data_domain,data_layer,data_origin,data_source,day,env,month,product_type,year
0,2025-01-01 00:10:00,1m,BTC_USDT_PERP,BTCUSDT,93574.1,93602.4,93494.1,93500.0,114.178,market_data,cleaned,BYBIT,BYBIT,01,BACKTEST,01,PERP,2025
1,2025-01-01 00:19:00,1m,BTC_USDT_PERP,BTCUSDT,93785.8,93791.3,93755.0,93762.1,130.847,market_data,cleaned,BYBIT,BYBIT,01,BACKTEST,01,PERP,2025
2,2025-01-01 00:47:00,1m,BTC_USDT_PERP,BTCUSDT,93850.0,93934.2,93845.0,93909.6,142.830,market_data,cleaned,BYBIT,BYBIT,01,BACKTEST,01,PERP,2025
3,2025-01-01 00:50:00,1m,BTC_USDT_PERP,BTCUSDT,93929.0,94051.0,93914.9,94037.9,148.347,market_data,cleaned,BYBIT,BYBIT,01,BACKTEST,01,PERP,2025
4,2025-01-01 00:54:00,1m,BTC_USDT_PERP,BTCUSDT,94150.0,94278.7,94145.4,94205.0,265.202,market_data,cleaned,BYBIT,BYBIT,01,BACKTEST,01,PERP,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760,2025-02-02 23:40:00,1m,BTC_USDT_PERP,BTCUSDT,97501.8,97590.0,97300.4,97487.3,131.160,market_data,cleaned,BYBIT,BYBIT,02,BACKTEST,02,PERP,2025
761,2025-02-02 23:42:00,1m,BTC_USDT_PERP,BTCUSDT,97421.4,97479.6,97291.1,97434.9,107.282,market_data,cleaned,BYBIT,BYBIT,02,BACKTEST,02,PERP,2025
762,2025-02-02 23:44:00,1m,BTC_USDT_PERP,BTCUSDT,97320.1,97367.5,97202.6,97288.1,156.376,market_data,cleaned,BYBIT,BYBIT,02,BACKTEST,02,PERP,2025
763,2025-02-02 23:46:00,1m,BTC_USDT_PERP,BTCUSDT,97459.0,97628.1,97400.0,97581.7,119.197,market_data,cleaned,BYBIT,BYBIT,02,BACKTEST,02,PERP,2025
