# `QuantumPioneer/databases` Demo
This notebook demonstrates how to interact with the `QuantumPioneer` databases.

See inline comments for more information.

In [1]:
# both pandas and polars can read parquet files - pick whichever you prefer!
# there are advantages and disadvantages to both
import pandas as pd
import polars as pl

# this library interacts with the parquet format directly, and both pandas and polars can use it too
import pyarrow.parquet as pq

# schema = layout of the database (what are the datatypes, etc.)
# the schemas for the QuantumPioneer databases are stored in databases.schema and vary depending on the type of data
from databases.schema import DLPNO_SCHEMA

In [2]:
# simplest approach: open the file and read the entire dataset in one go
# (practical for the DLPNO data because it is small, but this is slow with pandas)
df = pd.read_parquet(
    "dlpno.parquet",
    # optional: without an explicit schema pandas infers one itself - the
    # inferred schema is correct, but reading is slower
    schema=DLPNO_SCHEMA,
)
df.head(n=4)

Unnamed: 0,source,route_section,charge,multiplicity,energy,run_time,input_coordinates,dipole_au
0,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-514.989394,2496.0,"[[-2.360469, -1.761878, -0.109737], [-1.560797...",
1,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-439.242168,1018.0,"[[3.029253, 0.698307, -0.169511], [1.874487, -...",
2,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-649.692914,1001.0,"[[2.473871, -0.648153, 0.590555], [1.045148, -...",
3,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-447.794979,705.0,"[[2.92968, 0.128456, -0.032012], [1.868792, -0...",


In [3]:
# the result is an ordinary DataFrame, so all the usual pandas manipulations apply,
# e.g. pulling two fields out of the first row as a plain Python list
df.iloc[0].loc[["route_section", "input_coordinates"]].tolist()

['uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J cc-pvqz/c cc-pvqz-f12-cabs RIJCOSX NormalSCF NormalPNO',
 array([array([-2.360469, -1.761878, -0.109737]),
        array([-1.560797, -0.685955, -0.252496]),
        array([-2.127112,  0.659518, -0.191565]),
        array([-1.340214,  1.733086,  0.021883]),
        array([0.088651, 1.576148, 0.198928]),
        array([0.708166, 0.380756, 0.012916]),
        array([2.183807, 0.285345, 0.177394]),
        array([ 2.941005, -0.518201, -0.426811]),
        array([2.941055, 1.179135, 1.172935]),
        array([2.451954, 2.043206, 1.840821]),
        array([-0.081235, -0.795185, -0.422791]),
        array([ 0.466511, -1.882603, -0.997212]),
        array([-1.96053 , -2.775147, -0.038008]),
        array([-3.444336, -1.643877, -0.036627]),
        array([-3.212525,  0.765135, -0.263067]),
        array([-1.77051 ,  2.732335,  0.109103]),
        array([0.668645, 2.451011, 0.487481]),
        array([4.025992, 0.932871, 1.216651]),
        array([ 

In [4]:
# passing columns=... loads only the columns you care about, which reduces
# the amount of memory consumed
df = pd.read_parquet(
    "dlpno.parquet",
    columns=["source", "energy"],
)
df.head(n=4)

Unnamed: 0,source,energy
0,/data1/groups/co2_capture/reactant_product_cal...,-514.989394
1,/data1/groups/co2_capture/reactant_product_cal...,-439.242168
2,/data1/groups/co2_capture/reactant_product_cal...,-649.692914
3,/data1/groups/co2_capture/reactant_product_cal...,-447.794979


In [5]:
# you can also filter rows _when reading_ the database to further reduce memory consumption (and speed things up)
# these statements can be complex, but the pandas docs explain it well:
# https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
# structure: the tuples inside each inner list are combined with AND, and the inner lists are combined with OR
# NOTE: rows that MATCH the filter are kept - rows that match none of the groups are dropped
df = pd.read_parquet(
    "dlpno.parquet",
    # KEEP only rows where...
    filters=[
        [
            (  # multiplicity is equal to 1
                "multiplicity",
                "=",
                1,
            ),  # AND
            (  # energy is less than -500
                "energy",
                "<",
                -500,
            ),
        ],
        [  # OR
            (  # source is NOT one of these two specific files
                "source",
                "not in",
                (
                    "/data1/groups/co2_capture/reactant_product_calculation/ts_nho_round1/output/DLPNO_sp_f12/outputs/outputs_146/146857.log",
                    "/data1/groups/co2_capture/reactant_product_calculation/ts_nho_round1/output/DLPNO_sp_f12/outputs/outputs_146/146989.log",
                ),
            ),
        ],
    ],
)
# rows with multiplicity 2 still appear below because they satisfy the second (OR) group
df.head(4)

Unnamed: 0,source,route_section,charge,multiplicity,energy,run_time,input_coordinates
0,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-649.692914,1001.0,"[[2.473871, -0.648153, 0.590555], [1.045148, -..."
1,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-447.794979,705.0,"[[2.92968, 0.128456, -0.032012], [1.868792, -0..."
2,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-523.491165,4319.0,"[[0.077567, 3.346077, -0.220834], [0.117832, 2..."
3,/data1/groups/co2_capture/reactant_product_cal...,uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...,0,2,-617.293924,4085.0,"[[3.983572, -0.333946, -0.80899], [2.850692, 0..."


In [6]:
# filters may reference a column that is not loaded: here rows are selected by
# `run_time`, but only `source` and `charge` end up in the DataFrame
df = pd.read_parquet(
    "dlpno.parquet",
    columns=["source", "charge"],
    filters=[("run_time", "<", 100)],
)
df.head(n=4)

Unnamed: 0,source,charge
0,/data1/groups/co2_capture/reactant_product_cal...,0
1,/data1/groups/co2_capture/reactant_product_cal...,0
2,/data1/groups/co2_capture/reactant_product_cal...,0
3,/data1/groups/co2_capture/reactant_product_cal...,0


In [7]:
# everything shown above can be done with polars too - in my experience it is
# faster and uses less memory than pandas
df = pl.read_parquet(
    "dlpno.parquet",
)
df.head(n=4)

source,route_section,charge,multiplicity,energy,run_time,input_coordinates
str,str,u8,u8,f64,u32,list[list[f64]]
"""/data1/groups/co2_capture/reac…","""uHF UNO DLPNO-CCSD(T)-F12D cc-…",0,2,-514.989394,2496,"[[-2.360469, -1.761878, -0.109737], [-1.560797, -0.685955, -0.252496], … [-0.165135, -2.679552, -1.395038]]"
"""/data1/groups/co2_capture/reac…","""uHF UNO DLPNO-CCSD(T)-F12D cc-…",0,2,-439.242168,1018,"[[3.029253, 0.698307, -0.169511], [1.874487, -0.255186, 0.038037], … [-3.867946, -0.838983, -0.403688]]"
"""/data1/groups/co2_capture/reac…","""uHF UNO DLPNO-CCSD(T)-F12D cc-…",0,2,-649.692914,1001,"[[2.473871, -0.648153, 0.590555], [1.045148, -0.128762, -0.185433], … [-1.54149, 1.774222, 0.218295]]"
"""/data1/groups/co2_capture/reac…","""uHF UNO DLPNO-CCSD(T)-F12D cc-…",0,2,-447.794979,705,"[[2.92968, 0.128456, -0.032012], [1.868792, -0.537436, -0.010411], … [-0.532366, 3.020587, 0.22006]]"


In [8]:
# a notable difference is that polars sets `memory_map=True` by default (pandas supports it too, but there it defaults to False and has to be passed as an extra keyword argument)

In [9]:
# to pass pyarrow-style filters to polars' `read_parquet`, use the `pyarrow_options` argument
# (on its own, `read_parquet` only supports limiting the number of rows in sequential order via `n_rows`)
# NOTE: `pyarrow_options` is only forwarded to pyarrow when `use_pyarrow=True` - without it polars
# uses its native reader and the filters/schema below would not be applied
df = pl.read_parquet(
    "dlpno.parquet",
    columns=["source", "charge"],
    use_pyarrow=True,  # required for `pyarrow_options` to take effect
    pyarrow_options=dict(
        filters=[("run_time", "<", 100)],  # keep only rows with run_time < 100
        schema=DLPNO_SCHEMA,
    ),
)
df.head(4)

source,charge
str,u8
"""/data1/groups/co2_capture/reac…",0
"""/data1/groups/co2_capture/reac…",0
"""/data1/groups/co2_capture/reac…",0
"""/data1/groups/co2_capture/reac…",0


In [10]:
# ...or use polars' lazy API instead: build up the query first, then execute it in one go
df = (
    # open the file without reading any data yet (this returns a LazyFrame)
    pl.scan_parquet("dlpno.parquet")
    # register the row filter - the query has still not been run
    .filter(pl.col("run_time").lt(100))
    # keep only the two columns of interest
    .select("source", "energy")
    # execute the query and materialize the result
    .collect()
)
df.head(n=4)

source,energy
str,f64
"""/data1/groups/co2_capture/reac…",-170.225353
"""/data1/groups/co2_capture/reac…",-623.790852
"""/data1/groups/co2_capture/reac…",-134.31814
"""/data1/groups/co2_capture/reac…",-515.612084


In [11]:
# last option: talk to pyarrow directly - it accepts the same arguments as before,
# just arranged slightly differently, and is the single fastest way to read the data
table = pq.ParquetDataset(
    "dlpno.parquet",
    filters=[("run_time", "<", 100)],
    schema=DLPNO_SCHEMA,
).read(
    columns=["source", "energy"],
)
df = table.to_pandas()
df.head(n=4)

Unnamed: 0,source,energy
0,/data1/groups/co2_capture/reactant_product_cal...,-170.225353
1,/data1/groups/co2_capture/reactant_product_cal...,-623.790852
2,/data1/groups/co2_capture/reactant_product_cal...,-134.31814
3,/data1/groups/co2_capture/reactant_product_cal...,-515.612084
