Skip to content

Commit

Permalink
Merge pull request #296 from SCM-NV/unrestricted
Browse files Browse the repository at this point in the history
ENH: Make the CP2K orbital parser more robust for unrestricted calculations
  • Loading branch information
BvB93 committed May 9, 2022
2 parents 1d254eb + b35fc8d commit 13e9f0f
Show file tree
Hide file tree
Showing 15 changed files with 421 additions and 462 deletions.
16 changes: 7 additions & 9 deletions .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ on:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

defaults:
run:
shell: bash

jobs:
Test:
Expand Down Expand Up @@ -79,13 +83,11 @@ jobs:
update-conda: true

- name: Install dependencies
shell: bash
run: |
case "${{ matrix.special }}" in
"pre-release")
conda create -n test -c conda-forge python=${{ matrix.version }} nbsphinx jupyter pandoc
source $CONDA/bin/activate test
pip install rdkit-pypi
pip install --pre -e .[test] --upgrade --force-reinstall
pip uninstall plams -y; pip install git+https://github.com/SCM-NV/PLAMS@master --upgrade
pip install git+https://github.com/NLeSC/noodles@master --upgrade
Expand All @@ -98,7 +100,6 @@ jobs:
*)
conda create -n test -c conda-forge python=${{ matrix.version }} nbsphinx jupyter pandoc
source $CONDA/bin/activate test
pip install rdkit-pypi
pip install -e .[test]
;;
esac
Expand All @@ -112,7 +113,6 @@ jobs:
run: conda list -n test

- name: Test with pytest
shell: bash
run: |
source $CONDA/bin/activate test
case "${{ matrix.special }}" in
Expand All @@ -134,10 +134,8 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Set up Python 3.10 on ubuntu-latest
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: "3.10"

- name: Install linters
run: pip install pydocstyle pycodestyle mypy "numpy>=1.21" pytest more_itertools types-PyYAML types-setuptools "pyparsing>=3.0.8"
Expand Down
28 changes: 7 additions & 21 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,11 @@ plugins = "numpy.typing.mypy_plugin"
show_error_codes = true

[[tool.mypy.overrides]]
module = "scm.plams.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "scm.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "noodles.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "rdkit.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "pandas.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "pyparsing.*"
module = [
"scm.plams.*",
"scm.*",
"noodles.*",
"rdkit.*",
"pandas.*",
]
ignore_missing_imports = true
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def readme():

tests_require = tests_no_optional_require.copy()
tests_require += docs_require
tests_require.append('rdkit-pypi')

setup(
name='qmflows',
Expand Down
36 changes: 31 additions & 5 deletions src/qmflows/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,23 @@ class CGF(NamedTuple):
class InfoMO(NamedTuple):
"""Energies and coefficients of the molecular orbitals."""

eigenvalues: np.ndarray # Orbitals eigenvalues
eigenvectors: np.ndarray # Orbitals eigenvectors
#: Orbital eigenvalues.
eigenvalues: np.ndarray

#: Orbital eigenvectors.
eigenvectors: np.ndarray

def __array__(self, dtype: "None | np.dtype" = None) -> np.ndarray:
"""Convert this instance into a structured array."""
struc_dtype = np.dtype([
("eigenvalues", "f8"),
("eigenvectors", "f8", (self.eigenvectors.shape[0],)),
])

ret = np.empty(self.eigenvalues.shape[0], dtype=struc_dtype)
ret["eigenvalues"] = self.eigenvalues
ret["eigenvectors"] = np.abs(self.eigenvectors.T)
return ret if dtype is None else ret.astype(dtype)


class MO_metadata(NamedTuple):
Expand All @@ -80,10 +95,21 @@ class MO_metadata(NamedTuple):
"""

nOccupied: int
nOrbitals: int
#: The number of occupied orbitals.
#: Has either 1 or 2 elements depending on whether they're spin-orbitals or not.
nOccupied: "list[int]"

#: The number of orbitals.
#: Has either 1 or 2 elements depending on whether they're spin-orbitals or not.
nOrbitals: "list[int]"

#: The number of basis functions.
nOrbFuns: int
nspinstates: int = 1

@property
def nspinstates(self) -> int:
"""The number of spin states."""
return len(self.nOrbitals)


class ParseWarning(NamedTuple):
Expand Down
210 changes: 210 additions & 0 deletions src/qmflows/parsers/_cp2k_orbital_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""Functions for reading CP2K orbitals."""

import os
import re
import fnmatch
import types
from itertools import islice
from pathlib import Path
from collections import defaultdict

import numpy as np

from ..common import InfoMO, MO_metadata

__all__ = ["read_cp2k_coefficients"]


def read_cp2k_coefficients(
path_mos: "str | os.PathLike[str]",
plams_dir: "None | str | os.PathLike[str]" = None,
) -> "InfoMO | tuple[InfoMO, InfoMO]":
"""Read the MO's from the CP2K output.
First it reads the number of ``Orbitals`` and ``Orbital`` functions from the
cp2k output and then read the molecular orbitals.
Returns
-------
NamedTuple containing the Eigenvalues and the Coefficients
"""
plams_dir = Path(plams_dir) if plams_dir is not None else Path(os.getcwd())
file_out = fnmatch.filter(os.listdir(plams_dir), '*out')[0]
path_out = plams_dir / file_out

orbitals_info = read_cp2k_number_of_orbitals(path_out)
return read_log_file(path_mos, orbitals_info)


def read_log_file(
path: "str | os.PathLike[str]",
orbitals_info: MO_metadata,
) -> "InfoMO | tuple[InfoMO, InfoMO]":
"""
Read the orbitals from the Log file.
Notes
-----
IF IT IS A UNRESTRICTED CALCULATION, THERE ARE TWO SEPARATED SET OF MO
FOR THE ALPHA AND BETA
Parameters
----------
path : path-like object
Path to the file containing the MO coefficients
orbitals_info : MO_metadata
A named tuple with the number of orbital functions and the number of MOs.
Note that the latter is an upper bound to the number of MOs printed in ``path``.
Returns
-------
InfoMO or tuple[InfoMO, InfoMO]
Molecular orbitals and orbital energies.
Returns a tuple of two ``InfoMO`` objects if its an unrestricted calculation.
"""
n_orb_list = orbitals_info.nOrbitals
n_orb_funcs = orbitals_info.nOrbFuns

unrestricted = orbitals_info.nspinstates == 2
start = _find_mo_start(path, unrestricted=unrestricted)
if not unrestricted:
return read_coefficients(path, n_orb_list[0], n_orb_funcs, start)
else:
return (
read_coefficients(path, n_orb_list[0], n_orb_funcs, start[0]),
read_coefficients(path, n_orb_list[1], n_orb_funcs, start[1]),
)


def _find_mo_start(path: "str | os.PathLike[str]", unrestricted: bool) -> "int | tuple[int, int]":
"""Find the start of the final-most MO range in ``path``.
Returns
-------
int or tuple[int, int]
The start of the MO range (excluding the header).
Returns a two-tuple for unrestricted calculations (for the alpha and beta component)
and a single integer otherwise.
"""
with open(path, "r") as f:
# Find all headers, but skip restarts
enumerator = enumerate((h.strip("MO| \n") for h in f), start=1)
headers = [(i, h) for i, h in enumerator if "EIGENVALUES" in h]
headers = [(i, h) for i, h in headers if "AFTER SCF STEP" not in h]
err = "Failed to identify the start of the {prefix}MO range in {path!r}"

if not unrestricted:
if not len(headers):
raise ValueError(err.format(prefix="", path=os.fspath(path)))
return headers[-1][0]

alphas = [i for i, h in headers if "ALPHA" in h]
betas = [i for i, h in headers if "BETA" in h]
if not len(alphas):
raise ValueError(err.format(prefix="alpha ", path=os.fspath(path)))
if not len(betas):
raise ValueError(err.format(prefix="beta ", path=os.fspath(path)))
return alphas[-1], betas[-1]


def read_coefficients(
path: "str | os.PathLike[str]",
n_orb: int,
n_orb_funcs: int,
start: int = 0,
) -> InfoMO:
"""Read the coefficients from the plain text output.
MO coefficients are stored in Column-major order.
CP2K molecular orbitalsoutput (CP2K <8.2) looks like:
.. code-block::
MO EIGENVALUES, MO OCCUPATION NUMBERS, AND SPHERICAL MO EIGENVECTORS
5 6
-0.2590267204166110 -0.1785544120250688
2.0000000000000000 2.0000000000000000
1 1 C 2s 0.0021482361354044 0.0000000235522485
2 1 C 3s -0.0007100065367389 0.0000000102096730
3 1 C 3py -0.1899052318987045 0.0000000059435027
4 1 C 3pz 0.0000000178537720 -0.5500605729231620
5 1 C 3px 0.3686765614894165 0.0000000228716009
6 1 C 4py 0.0014072130025389 0.0000000019199413
7 1 C 4pz -0.0000000014121887 0.0293850516018881
8 1 C 4px -0.0028383911872079 0.0000000042372601
9 1 C 4d-2 0.0311183981707317 0.0000000014108937
10 1 C 4d-1 0.0000000095952723 0.0253837978837068
11 1 C 4d0 0.0005419630026689 0.0000000391888080
12 1 C 4d+1 -0.0000000210955114 0.0147105486663415
13 1 C 4d+2 0.0534202997324328 0.0000000021056315
"""
energies = np.empty(n_orb, dtype=np.float64)
coefs = np.empty((n_orb_funcs, n_orb), dtype=np.float64)

j0 = 0
j1 = 0
with open(path, "r") as f:
iterator = filter(None, (i.strip("MO| \n") for i in islice(f, start, None)))
for h in iterator:
# Each MO pair is preceded by their indices,
# a lack thereof meaning the end of the file has been reached
headers = h.split()
if not all(i.isnumeric() for i in headers):
break

j1 += len(headers)
energies[j0:j1] = next(iterator).split()
coefs[..., j0:j1] = [i.split()[4:] for i in islice(iterator, 1, 1 + n_orb_funcs)]
j0 += len(headers)

# `n_orb` is an upper bound to the actual printed number of orbitals,
# so some trimming might be required
return InfoMO(energies[:j0], coefs[:, :j0])


_ORBITAL_PATTERN = re.compile((
r"(?P<key>Number of occupied orbitals|Number of molecular orbitals|Number of orbital functions):"
r"\s+"
r"(?P<value>[0-9]+)"
))

# Map CP2K field names to qmflows `MO_metadata` field names
_ORBITAL_NAMES = types.MappingProxyType({
"Number of occupied orbitals": "nOccupied",
"Number of molecular orbitals": "nOrbitals",
"Number of orbital functions": "nOrbFuns",
})

_REQUIRED_KEYS = frozenset(_ORBITAL_NAMES.values())


def read_cp2k_number_of_orbitals(file_name: "str | os.PathLike[str]") -> MO_metadata:
"""Extract the number of (occupied) orbitals and basis functions."""
with open(file_name, "r") as f:
kwargs: "dict[str, list[int]]" = defaultdict(list)
for line in f:
match = _ORBITAL_PATTERN.search(line)
if match is None:
continue

orb_name = _ORBITAL_NAMES[match["key"]]
orb_count = int(match["value"])
kwargs[orb_name].append(orb_count)

# Check if all keys are present and return
if _REQUIRED_KEYS == kwargs.keys():
n_orb_funcs = kwargs.pop("nOrbFuns")
assert len(n_orb_funcs) == 1
return MO_metadata(**kwargs, nOrbFuns=n_orb_funcs[0])
else:
missing_fields = sorted(k for k, v in _ORBITAL_NAMES.items() if v not in kwargs)
missing = "".join(f"\n - {i!r}" for i in missing_fields)
raise ValueError(
f"Failed to parse one or more fields in {os.fspath(file_name)!r}:{missing}"
)

0 comments on commit 13e9f0f

Please sign in to comment.