In [1]:
import duckdb
import pyarrow.parquet as pq
import pandas as pd
import s3fs

In [2]:
fs = s3fs.S3FileSystem(anon=False, endpoint_url="http://localhost:9000",
                 key='catedra', secret='catedrapass', use_ssl=False)

In [5]:
df = pd.read_parquet('s3://randata/month=5/day=30/0.parquet', engine='pyarrow', filesystem=fs)
df.head()

Unnamed: 0,ts,cantidad,hour,valor,tipo
0,2023-05-30,70,0,0.289176,aaa
1,2023-05-30,62,0,0.852335,ddd
2,2023-05-30,65,0,0.854607,ddd
3,2023-05-30,61,0,0.792608,ddd
4,2023-05-30,58,0,0.794051,bbb


In [4]:
dataset = pq.ParquetDataset('s3://randata/', filesystem=fs)
dataset.files[:5]

ArrowInvalid: GetFileInfo() yielded path 'randata/month=6/day=5/hour=9/0.parquet', which is outside base dir 's3://randata/'

In [61]:
# dataset = pq.ParquetDataset('minio_data/randata/',
#                         filters=[('valor','>',0.99),('tipo','=','aaa')])
# dataset.read()

In [4]:
duckdb.sql("""INSTALL httpfs;
LOAD httpfs;
SET s3_region='us-east-1';
SET s3_url_style='path';
SET s3_endpoint='localhost:9000';
SET s3_access_key_id='catedra' ;
SET s3_use_ssl=false;
SET s3_secret_access_key='catedrapass';""")

In [9]:
duckdb.sql("""
    SELECT * 
    FROM parquet_scan('s3://flujovehicular/flujo.parquet')
    WHERE CANTIDAD > 13000
        AND SENTIDO = 'Egreso';
""")

┌─────────────────┬─────────────────────┬──────────┬─────────┬──────────┬────────────┐
│ CODIGO_LOCACION │        HORA         │ CANTIDAD │ SENTIDO │ LATITUD  │  LONGITUD  │
│     varchar     │      timestamp      │  int32   │ varchar │  float   │   float    │
├─────────────────┼─────────────────────┼──────────┼─────────┼──────────┼────────────┤
│ 48Q39G00+       │ 2020-03-10 17:00:00 │    13041 │ Egreso  │ -34.6333 │ -58.468594 │
│ 48Q39G00+       │ 2020-03-06 17:00:00 │    13041 │ Egreso  │ -34.6333 │ -58.468594 │
└─────────────────┴─────────────────────┴──────────┴─────────┴──────────┴────────────┘

In [12]:
duckdb.sql("""
    CREATE TABLE randat_mem AS
    SELECT * 
    FROM parquet_scan('s3://randata/*/*/*.parquet', hive_partitioning=1);
""")

In [15]:
duckdb.sql("""
    SELECT count(*) from randat_mem WHERE valor>0.99;
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         2184 │
└──────────────┘

In [16]:
duckdb.sql("""
    SELECT count(*) from parquet_scan('s3://randata/*/*/*.parquet', hive_partitioning=1) WHERE valor>0.99;
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         2324 │
└──────────────┘

In [17]:
duckdb.sql("""
    SELECT * 
    FROM parquet_scan('s3://randata/*/*/*.parquet', hive_partitioning=1)
    WHERE valor > 0.99 AND tipo = 'aaa';
""")

┌────────────┬──────────┬────────────┬─────────┬───────┬───────┐
│     ts     │ cantidad │   valor    │  tipo   │  day  │ month │
│    date    │  int32   │   float    │ varchar │ int64 │ int64 │
├────────────┼──────────┼────────────┼─────────┼───────┼───────┤
│ 2023-10-01 │       70 │  0.9987942 │ aaa     │     1 │    10 │
│ 2023-10-01 │       82 │ 0.99181855 │ aaa     │     1 │    10 │
│ 2023-10-01 │       46 │  0.9900786 │ aaa     │     1 │    10 │
│ 2023-10-01 │        5 │ 0.99440426 │ aaa     │     1 │    10 │
│ 2023-10-01 │       79 │  0.9978182 │ aaa     │     1 │    10 │
│ 2023-10-01 │       84 │ 0.99124575 │ aaa     │     1 │    10 │
│ 2023-10-01 │       37 │  0.9923211 │ aaa     │     1 │    10 │
│ 2023-10-02 │       47 │  0.9902418 │ aaa     │     2 │    10 │
│ 2023-10-02 │       53 │ 0.99719626 │ aaa     │     2 │    10 │
│ 2023-10-02 │       55 │ 0.99125504 │ aaa     │     2 │    10 │
│     ·      │        · │      ·     │  ·      │     · │     · │
│     ·      │        · │