Merge pull request #296 from SCM-NV/unrestricted

ENH: Make the CP2K orbital parser more robust for unrestricted calculations
SCM-NV · May 9, 2022 · 13e9f0f · 13e9f0f
2 parents 1d254eb + b35fc8d
commit 13e9f0f
Show file tree

Hide file tree

Showing 15 changed files with 421 additions and 462 deletions.
diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
@@ -8,8 +8,12 @@ on:
   workflow_dispatch:
 
 concurrency:
-    group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-    cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash
 
 jobs:
   Test:
@@ -79,13 +83,11 @@ jobs:
         update-conda: true
 
     - name: Install dependencies
-      shell: bash
       run: |
         case "${{ matrix.special }}" in
           "pre-release")
             conda create -n test -c conda-forge python=${{ matrix.version }} nbsphinx jupyter pandoc
             source $CONDA/bin/activate test
-            pip install rdkit-pypi
             pip install --pre -e .[test] --upgrade --force-reinstall
             pip uninstall plams -y; pip install git+https://github.com/SCM-NV/PLAMS@master --upgrade
             pip install git+https://github.com/NLeSC/noodles@master --upgrade
@@ -98,7 +100,6 @@ jobs:
           *)
             conda create -n test -c conda-forge python=${{ matrix.version }} nbsphinx jupyter pandoc
             source $CONDA/bin/activate test
-            pip install rdkit-pypi
             pip install -e .[test]
             ;;
         esac
@@ -112,7 +113,6 @@ jobs:
       run: conda list -n test
 
     - name: Test with pytest
-      shell: bash
       run: |
         source $CONDA/bin/activate test
         case "${{ matrix.special }}" in
@@ -134,10 +134,8 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - name: Set up Python 3.10 on ubuntu-latest
+      - name: Set up Python
         uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
 
       - name: Install linters
         run: pip install pydocstyle pycodestyle mypy "numpy>=1.21" pytest more_itertools types-PyYAML types-setuptools "pyparsing>=3.0.8"

diff --git a/pyproject.toml b/pyproject.toml
@@ -3,25 +3,11 @@ plugins = "numpy.typing.mypy_plugin"
 show_error_codes = true
 
 [[tool.mypy.overrides]]
-module = "scm.plams.*"
-ignore_missing_imports = true
-
-[[tool.mypy.overrides]]
-module = "scm.*"
-ignore_missing_imports = true
-
-[[tool.mypy.overrides]]
-module = "noodles.*"
-ignore_missing_imports = true
-
-[[tool.mypy.overrides]]
-module = "rdkit.*"
-ignore_missing_imports = true
-
-[[tool.mypy.overrides]]
-module = "pandas.*"
-ignore_missing_imports = true
-
-[[tool.mypy.overrides]]
-module = "pyparsing.*"
+module = [
+    "scm.plams.*",
+    "scm.*",
+    "noodles.*",
+    "rdkit.*",
+    "pandas.*",
+]
 ignore_missing_imports = true
diff --git a/setup.py b/setup.py
@@ -37,6 +37,7 @@ def readme():
 
 tests_require = tests_no_optional_require.copy()
 tests_require += docs_require
+tests_require.append('rdkit-pypi')
 
 setup(
     name='qmflows',

diff --git a/src/qmflows/common.py b/src/qmflows/common.py
@@ -55,8 +55,23 @@ class CGF(NamedTuple):
 class InfoMO(NamedTuple):
     """Energies and coefficients of the molecular orbitals."""
 
-    eigenvalues: np.ndarray  # Orbitals eigenvalues
-    eigenvectors: np.ndarray  # Orbitals eigenvectors
+    #: Orbital eigenvalues.
+    eigenvalues: np.ndarray
+
+    #: Orbital eigenvectors.
+    eigenvectors: np.ndarray
+
+    def __array__(self, dtype: "None | np.dtype" = None) -> np.ndarray:
+        """Convert this instance into a structured array."""
+        struc_dtype = np.dtype([
+            ("eigenvalues", "f8"),
+            ("eigenvectors", "f8", (self.eigenvectors.shape[0],)),
+        ])
+
+        ret = np.empty(self.eigenvalues.shape[0], dtype=struc_dtype)
+        ret["eigenvalues"] = self.eigenvalues
+        ret["eigenvectors"] = np.abs(self.eigenvectors.T)
+        return ret if dtype is None else ret.astype(dtype)
 
 
 class MO_metadata(NamedTuple):
@@ -80,10 +95,21 @@ class MO_metadata(NamedTuple):
 
     """
 
-    nOccupied: int
-    nOrbitals: int
+    #: The number of occupied orbitals.
+    #: Has either 1 or 2 elements depending on whether they're spin-orbitals or not.
+    nOccupied: "list[int]"
+
+    #: The number of orbitals.
+    #: Has either 1 or 2 elements depending on whether they're spin-orbitals or not.
+    nOrbitals: "list[int]"
+
+    #: The number of basis functions.
     nOrbFuns: int
-    nspinstates: int = 1
+
+    @property
+    def nspinstates(self) -> int:
+        """The number of spin states."""
+        return len(self.nOrbitals)
 
 
 class ParseWarning(NamedTuple):

diff --git a/src/qmflows/parsers/_cp2k_orbital_parser.py b/src/qmflows/parsers/_cp2k_orbital_parser.py
@@ -0,0 +1,210 @@
+"""Functions for reading CP2K orbitals."""
+
+import os
+import re
+import fnmatch
+import types
+from itertools import islice
+from pathlib import Path
+from collections import defaultdict
+
+import numpy as np
+
+from ..common import InfoMO, MO_metadata
+
+__all__ = ["read_cp2k_coefficients"]
+
+
+def read_cp2k_coefficients(
+    path_mos: "str | os.PathLike[str]",
+    plams_dir: "None | str | os.PathLike[str]" = None,
+) -> "InfoMO | tuple[InfoMO, InfoMO]":
+    """Read the MO's from the CP2K output.
+
+    First it reads the number of ``Orbitals`` and ``Orbital`` functions from the
+    cp2k output and then read the molecular orbitals.
+
+    Returns
+    -------
+        NamedTuple containing the Eigenvalues and the Coefficients
+    """
+    plams_dir = Path(plams_dir) if plams_dir is not None else Path(os.getcwd())
+    file_out = fnmatch.filter(os.listdir(plams_dir), '*out')[0]
+    path_out = plams_dir / file_out
+
+    orbitals_info = read_cp2k_number_of_orbitals(path_out)
+    return read_log_file(path_mos, orbitals_info)
+
+
+def read_log_file(
+    path: "str | os.PathLike[str]",
+    orbitals_info: MO_metadata,
+) -> "InfoMO | tuple[InfoMO, InfoMO]":
+    """
+    Read the orbitals from the Log file.
+
+    Notes
+    -----
+    IF IT IS A UNRESTRICTED CALCULATION, THERE ARE TWO SEPARATED SET OF MO
+    FOR THE ALPHA AND BETA
+
+    Parameters
+    ----------
+    path : path-like object
+        Path to the file containing the MO coefficients
+    orbitals_info : MO_metadata
+        A named tuple with the number of orbital functions and the number of MOs.
+        Note that the latter is an upper bound to the number of MOs printed in ``path``.
+
+    Returns
+    -------
+    InfoMO or tuple[InfoMO, InfoMO]
+        Molecular orbitals and orbital energies.
+        Returns a tuple of two ``InfoMO`` objects if its an unrestricted calculation.
+
+    """
+    n_orb_list = orbitals_info.nOrbitals
+    n_orb_funcs = orbitals_info.nOrbFuns
+
+    unrestricted = orbitals_info.nspinstates == 2
+    start = _find_mo_start(path, unrestricted=unrestricted)
+    if not unrestricted:
+        return read_coefficients(path, n_orb_list[0], n_orb_funcs, start)
+    else:
+        return (
+            read_coefficients(path, n_orb_list[0], n_orb_funcs, start[0]),
+            read_coefficients(path, n_orb_list[1], n_orb_funcs, start[1]),
+        )
+
+
+def _find_mo_start(path: "str | os.PathLike[str]", unrestricted: bool) -> "int | tuple[int, int]":
+    """Find the start of the final-most MO range in ``path``.
+
+    Returns
+    -------
+    int or tuple[int, int]
+        The start of the MO range (excluding the header).
+        Returns a two-tuple for unrestricted calculations (for the alpha and beta component)
+        and a single integer otherwise.
+
+    """
+    with open(path, "r") as f:
+        # Find all headers, but skip restarts
+        enumerator = enumerate((h.strip("MO| \n") for h in f), start=1)
+        headers = [(i, h) for i, h in enumerator if "EIGENVALUES" in h]
+        headers = [(i, h) for i, h in headers if "AFTER SCF STEP" not in h]
+    err = "Failed to identify the start of the {prefix}MO range in {path!r}"
+
+    if not unrestricted:
+        if not len(headers):
+            raise ValueError(err.format(prefix="", path=os.fspath(path)))
+        return headers[-1][0]
+
+    alphas = [i for i, h in headers if "ALPHA" in h]
+    betas = [i for i, h in headers if "BETA" in h]
+    if not len(alphas):
+        raise ValueError(err.format(prefix="alpha ", path=os.fspath(path)))
+    if not len(betas):
+        raise ValueError(err.format(prefix="beta ", path=os.fspath(path)))
+    return alphas[-1], betas[-1]
+
+
+def read_coefficients(
+    path: "str | os.PathLike[str]",
+    n_orb: int,
+    n_orb_funcs: int,
+    start: int = 0,
+) -> InfoMO:
+    """Read the coefficients from the plain text output.
+
+    MO coefficients are stored in Column-major order.
+    CP2K molecular orbitalsoutput (CP2K <8.2) looks like:
+
+    .. code-block::
+
+        MO EIGENVALUES, MO OCCUPATION NUMBERS, AND SPHERICAL MO EIGENVECTORS
+
+                                  5                      6
+                              -0.2590267204166110    -0.1785544120250688
+
+                               2.0000000000000000     2.0000000000000000
+
+        1      1  C  2s        0.0021482361354044     0.0000000235522485
+        2      1  C  3s       -0.0007100065367389     0.0000000102096730
+        3      1  C  3py      -0.1899052318987045     0.0000000059435027
+        4      1  C  3pz       0.0000000178537720    -0.5500605729231620
+        5      1  C  3px       0.3686765614894165     0.0000000228716009
+        6      1  C  4py       0.0014072130025389     0.0000000019199413
+        7      1  C  4pz      -0.0000000014121887     0.0293850516018881
+        8      1  C  4px      -0.0028383911872079     0.0000000042372601
+        9      1  C  4d-2      0.0311183981707317     0.0000000014108937
+        10     1  C  4d-1      0.0000000095952723     0.0253837978837068
+        11     1  C  4d0       0.0005419630026689     0.0000000391888080
+        12     1  C  4d+1     -0.0000000210955114     0.0147105486663415
+        13     1  C  4d+2      0.0534202997324328     0.0000000021056315
+
+    """
+    energies = np.empty(n_orb, dtype=np.float64)
+    coefs = np.empty((n_orb_funcs, n_orb), dtype=np.float64)
+
+    j0 = 0
+    j1 = 0
+    with open(path, "r") as f:
+        iterator = filter(None, (i.strip("MO| \n") for i in islice(f, start, None)))
+        for h in iterator:
+            # Each MO pair is preceded by their indices,
+            # a lack thereof meaning the end of the file has been reached
+            headers = h.split()
+            if not all(i.isnumeric() for i in headers):
+                break
+
+            j1 += len(headers)
+            energies[j0:j1] = next(iterator).split()
+            coefs[..., j0:j1] = [i.split()[4:] for i in islice(iterator, 1, 1 + n_orb_funcs)]
+            j0 += len(headers)
+
+    # `n_orb` is an upper bound to the actual printed number of orbitals,
+    # so some trimming might be required
+    return InfoMO(energies[:j0], coefs[:, :j0])
+
+
+_ORBITAL_PATTERN = re.compile((
+    r"(?P<key>Number of occupied orbitals|Number of molecular orbitals|Number of orbital functions):"
+    r"\s+"
+    r"(?P<value>[0-9]+)"
+))
+
+# Map CP2K field names to qmflows `MO_metadata` field names
+_ORBITAL_NAMES = types.MappingProxyType({
+    "Number of occupied orbitals": "nOccupied",
+    "Number of molecular orbitals": "nOrbitals",
+    "Number of orbital functions": "nOrbFuns",
+})
+
+_REQUIRED_KEYS = frozenset(_ORBITAL_NAMES.values())
+
+
+def read_cp2k_number_of_orbitals(file_name: "str | os.PathLike[str]") -> MO_metadata:
+    """Extract the number of (occupied) orbitals and basis functions."""
+    with open(file_name, "r") as f:
+        kwargs: "dict[str, list[int]]" = defaultdict(list)
+        for line in f:
+            match = _ORBITAL_PATTERN.search(line)
+            if match is None:
+                continue
+
+            orb_name = _ORBITAL_NAMES[match["key"]]
+            orb_count = int(match["value"])
+            kwargs[orb_name].append(orb_count)
+
+            # Check if all keys are present and return
+            if _REQUIRED_KEYS == kwargs.keys():
+                n_orb_funcs = kwargs.pop("nOrbFuns")
+                assert len(n_orb_funcs) == 1
+                return MO_metadata(**kwargs, nOrbFuns=n_orb_funcs[0])
+        else:
+            missing_fields = sorted(k for k, v in _ORBITAL_NAMES.items() if v not in kwargs)
+            missing = "".join(f"\n  - {i!r}" for i in missing_fields)
+            raise ValueError(
+                f"Failed to parse one or more fields in {os.fspath(file_name)!r}:{missing}"
+            )