diff --git a/.github/workflows/comment-bot.yml b/.github/workflows/comment-bot.yml
index b44ee7ba..2a87fe42 100644
--- a/.github/workflows/comment-bot.yml
+++ b/.github/workflows/comment-bot.yml
@@ -1,25 +1,23 @@
 name: Comment Bot
 on:
-  issue_comment:
-    types: [created]
-  pull_request_review_comment:
-    types: [created]
+  issue_comment: {types: [created]}
+  pull_request_review_comment: {types: [created]}
 jobs:
   tag:  # /tag <tagname> <commit>
     if: startsWith(github.event.comment.body, '/tag ')
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: React Seen
-      uses: actions/github-script@v2
+      uses: actions/github-script@v6
       with:
         script: |
-          const perm = await github.repos.getCollaboratorPermissionLevel({
+          const perm = await github.rest.repos.getCollaboratorPermissionLevel({
             owner: context.repo.owner, repo: context.repo.repo,
             username: context.payload.comment.user.login})
           post = (context.eventName == "issue_comment"
-            ? github.reactions.createForIssueComment
-            : github.reactions.createForPullRequestReviewComment)
+            ? github.rest.reactions.createForIssueComment
+            : github.rest.reactions.createForPullRequestReviewComment)
           if (!["admin", "write"].includes(perm.data.permission)){
             post({
               owner: context.repo.owner, repo: context.repo.repo,
@@ -40,12 +38,12 @@ jobs:
         BODY: ${{ github.event.comment.body }}
         GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
     - name: React Success
-      uses: actions/github-script@v2
+      uses: actions/github-script@v6
       with:
         script: |
           post = (context.eventName == "issue_comment"
-            ? github.reactions.createForIssueComment
-            : github.reactions.createForPullRequestReviewComment)
+            ? github.rest.reactions.createForIssueComment
+            : github.rest.reactions.createForPullRequestReviewComment)
           post({
             owner: context.repo.owner, repo: context.repo.repo,
             comment_id: context.payload.comment.id, content: "rocket"})
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4e93ce37..9e1d7ce7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,15 +2,15 @@ name: Test
 on: [push, pull_request]
 jobs:
   check:
-    if: github.event_name != 'pull_request' || github.repository_owner != 'NiftyPET'
+    if: github.event_name != 'pull_request' || !contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association)
     runs-on: ubuntu-latest
     name: Check
     steps:
-    - uses: actions/checkout@v2
-    - uses: actions/setup-python@v2
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v4
     - name: set PYSHA
       run: echo "PYSHA=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
-    - uses: actions/cache@v1
+    - uses: actions/cache@v3
       with:
         path: ~/.cache/pre-commit
         key: pre-commit|${{ env.PYSHA }}|${{ hashFiles('.pre-commit-config.yaml') }}
@@ -34,14 +34,14 @@ jobs:
         EVENT: ${{ github.event_name }}
     - run: pre-commit run -a --show-diff-on-failure
   test:
-    if: github.event_name != 'pull_request' || github.repository_owner != 'NiftyPET'
-    name: Test py${{ matrix.python }}
+    if: github.event_name != 'pull_request' || !contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association)
     runs-on: [self-hosted, python, cuda, matlab]
     strategy:
       matrix:
-        python: [3.6, 3.9]
+        python: [3.7, '3.10']
+    name: Test py${{ matrix.python }}
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
       with:
         fetch-depth: 0
     - name: Run setup-python
@@ -57,10 +57,10 @@ jobs:
     name: PyPI Deploy
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
       with:
         fetch-depth: 0
-    - uses: actions/setup-python@v2
+    - uses: actions/setup-python@v4
     - id: dist
       uses: casperdcl/deploy-pypi@v2
       with:
@@ -70,7 +70,6 @@ jobs:
         password: ${{ secrets.PYPI_TOKEN }}
         upload: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags') }}
       env:
-        PATHTOOLS: ${{ github.workspace }}/NiftyPET_tools
         HMUDIR: ${{ github.workspace }}
     - id: meta
       name: Changelog
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b5956c1f..b157f736 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_language_version:
   python: python3
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.0.1
+  rev: v4.3.0
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
@@ -31,17 +31,20 @@ repos:
   - id: flake8
     args: [-j8]
     additional_dependencies:
+    - flake8-broken-line
     - flake8-bugbear
     - flake8-comprehensions
     - flake8-debugger
+    - flake8-isort
     - flake8-string-format
 - repo: https://github.com/google/yapf
-  rev: v0.31.0
+  rev: v0.32.0
   hooks:
   - id: yapf
     args: [-i]
+    additional_dependencies: [toml]
 - repo: https://github.com/PyCQA/isort
-  rev: 5.9.3
+  rev: 5.10.1
   hooks:
   - id: isort
 - repo: https://github.com/doublify/pre-commit-clang-format
diff --git a/README.rst b/README.rst
index e51061e6..f8b58bb1 100644
--- a/README.rst
+++ b/README.rst
@@ -37,7 +37,7 @@ It's also recommended (but not required) to use `conda`.
     export HMUDIR=$HOME/mmr_hardwareumaps
     # cross-platform install
     conda install -c conda-forge python=3 \
-      ipykernel numpy scipy scikit-image matplotlib ipywidgets
+      ipykernel numpy scipy scikit-image matplotlib ipywidgets dipy nibabel pydicom
     pip install "nipet>=2"
 
 External CMake Projects
diff --git a/examples/demo.ipynb b/examples/demo.ipynb
index afde8439..438d01c6 100644
--- a/examples/demo.ipynb
+++ b/examples/demo.ipynb
@@ -12,10 +12,10 @@
     "Mathematically:\n",
     "\n",
     "$$\n",
-    "{\\bf\\theta}^{(k+1)} = {{\\bf\\theta}^{(k)} \\over \\sum_n{{\\bf H}^T{\\bf X}_n^T{\\bf A}^T{\\bf N}^T{\\bf 1}}}\n",
+    "{\\bf y}^{(k+1)} = {{\\bf y}^{(k)} \\over \\sum_n{{\\bf H}^T{\\bf X}_n^T{\\bf A}^T{\\bf N}^T{\\bf 1}}}\n",
     "    \\circ\n",
     "    \\sum_n{ {\\bf H}^T{\\bf X}_n^T{\\bf A}^T{\\bf N}^T\n",
-    "        { {\\bf m} \\over {\\bf NA}{\\bf X}_n{\\bf H}{\\bf\\theta}^{(k)} + {\\bf r} + {\\bf s} }\n",
+    "        { {\\bf m} \\over {\\bf NA}{\\bf X}_n{\\bf H}{\\bf y}^{(k)} + {\\bf r} + {\\bf s} }\n",
     "    },\n",
     "$$\n",
     "\n",
@@ -26,8 +26,8 @@
     "\n",
     "----\n",
     "\n",
-    "- Author: Casper O. da Costa-Luis [casper.dcl@{physics.org|ieee.org|kcl.ac.uk}](mailto:casper.dcl@physics.org)\n",
-    "- Date: 2019-20\n",
+    "- Author: Casper O. da Costa-Luis [casper.dcl@{physics.org|ieee.org|ucl.ac.uk|kcl.ac.uk}](mailto:casper.dcl@physics.org)\n",
+    "- Date: 2019-21\n",
     "\n",
     "----\n",
     "\n",
@@ -41,29 +41,30 @@
    "outputs": [],
    "source": [
     "# imports\n",
-    "from __future__ import print_function, division\n",
     "%matplotlib notebook\n",
     "\n",
-    "from niftypet import nipet\n",
-    "from niftypet import nimpa\n",
+    "import logging\n",
+    "from functools import partial\n",
+    "from os import path\n",
+    "from pathlib import Path\n",
     "\n",
+    "import cuvec as cu\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
-    "from scipy.ndimage.filters import gaussian_filter\n",
+    "from scipy.ndimage import gaussian_filter\n",
     "from tqdm.auto import trange\n",
-    "from brainweb import volshow\n",
     "\n",
-    "from collections import OrderedDict\n",
-    "from os import path\n",
-    "import functools\n",
-    "import logging\n",
-    "logging.basicConfig(level=logging.INFO, format=nipet.LOG_FORMAT)\n",
+    "from niftypet import nipet, nimpa\n",
     "\n",
+    "logging.basicConfig(level=logging.INFO, format=nipet.LOG_FORMAT)\n",
     "print(nipet.gpuinfo())\n",
     "# get all the scanner parameters\n",
     "mMRpars = nipet.get_mmrparams()\n",
+    "mMRpars['Cnt']['LOG'] = logging.INFO\n",
     "# conversion for Gaussian sigma/[voxel] to FWHM/[mm]\n",
-    "SIGMA2FWHMmm = (8 * np.log(2))**0.5 * np.array([mMRpars['Cnt']['SO_VX' + i] for i in 'ZYX']) * 10"
+    "SIGMA2FWHMmm = (8 * np.log(2))**0.5 * np.array([mMRpars['Cnt']['SO_VX' + i] for i in 'ZYX']) * 10\n",
+    "# image-space PSF function\n",
+    "imsmooth = partial(nimpa.imsmooth, Cnt=mMRpars['Cnt'])"
    ]
   },
   {
@@ -79,17 +80,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "folderin = \"amyloidPET_FBP_TP0\"\n",
-    "\n",
+    "folderin = Path(\"amyloidPET_FBP_TP0\")\n",
     "# automatically categorise the input data\n",
-    "#logging.getLogger().setLevel(logging.INFO)\n",
     "datain = nipet.classify_input(folderin, mMRpars)\n",
-    "\n",
     "# output path\n",
-    "opth = path.join(datain['corepath'], 'niftyout')\n",
-    "# switch on verbose mode\n",
-    "#logging.getLogger().setLevel(logging.DEBUG)\n",
-    "\n",
+    "opth = Path(datain['corepath']) / \"niftyout\"\n",
+    "# show categorised data\n",
     "datain"
    ]
   },
@@ -100,7 +96,23 @@
    "outputs": [],
    "source": [
     "# hardware mu-map (bed, head/neck coils)\n",
-    "mu_h = nipet.hdw_mumap(datain, [1,2,4], mMRpars, outpath=opth, use_stored=True)"
+    "mu_h = nipet.hdw_mumap(datain, [1, 2, 4], mMRpars, outpath=opth, use_stored=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create histogram\n",
+    "mMRpars['Cnt']['BTP'] = 0\n",
+    "mSino = nipet.mmrhist(datain, mMRpars, outpath=opth, store=True, use_stored=True)\n",
+    "if False:                                                   # enable parametric bootstrap\n",
+    "    mMRpars['Cnt']['BTP'] = 2\n",
+    "    totCnt = 3e6                                            # total counts to simulate\n",
+    "    mMRpars['Cnt']['BTPRT'] = totCnt / mSino['psino'].sum() # count level ratio relative to original\n",
+    "    mSino = nipet.mmrhist(datain, mMRpars, outpath=opth / \"BTP\" / f\"{totCnt:.3g}\", store=True)"
    ]
   },
   {
@@ -110,22 +122,23 @@
    "outputs": [],
    "source": [
     "# MR-based human mu-map\n",
-    "\n",
-    "# UTE-based object mu-map aligned (need UTE sequence or T1 for pseudo-CT)\n",
-    "#mu_o = nipet.align_mumap(\n",
-    "#    datain,\n",
-    "#    scanner_params=mMRpars,\n",
-    "#    outpath=opth,\n",
-    "#    t0=0, t1=0, # when both times are 0, will use full data\n",
-    "#    itr=2,      # number of iterations used for recon to which registering MR/UTE\n",
-    "#    petopt='ac',# what PET image to use (ac-just attenuation corrected)\n",
-    "#    musrc='ute',# source of mu-map (ute/pct)\n",
-    "#    ute_name='UTE2', # which UTE to use (UTE1/2 shorter/faster)\n",
-    "#    verbose=True,\n",
-    "#)\n",
-    "\n",
-    "#> the same as above without any faff though (no alignment)\n",
-    "mu_o = nipet.obj_mumap(datain, mMRpars, outpath=opth, store=True)"
+    "if True:                        # UTE-based object mu-map aligned (need UTE sequence or T1 for pseudo-CT)\n",
+    "    mu_o = nipet.align_mumap(\n",
+    "        datain,\n",
+    "        scanner_params=mMRpars,\n",
+    "        outpath=opth,\n",
+    "        store=True,\n",
+    "        use_stored=True,\n",
+    "        hst=mSino,\n",
+    "        t0=0,\n",
+    "        t1=0,                   # when both times are 0, will use full data\n",
+    "        itr=2,                  # number of iterations used for recon to which registering MR/UTE\n",
+    "        petopt='ac',            # what PET image to use (ac-just attenuation corrected)\n",
+    "        musrc='pct',            # source of mu-map (ute/pct)\n",
+    "        ute_name='UTE2',        # which UTE to use (UTE1/2 shorter/faster)\n",
+    "    )\n",
+    "else:                           # the same as above without any faff though (no alignment)\n",
+    "    mu_o = nipet.obj_mumap(datain, mMRpars, outpath=opth, store=True)"
    ]
   },
   {
@@ -134,14 +147,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# create histogram\n",
-    "mMRpars['Cnt']['BTP'] = 0\n",
-    "m = nipet.mmrhist(datain, mMRpars, outpath=opth, store=True, use_stored=True)\n",
-    "if False:\n",
-    "    mMRpars['Cnt']['BTP'] = 2  # enable parametric bootstrap\n",
-    "    totCnt = 3e6\n",
-    "    mMRpars['Cnt']['BTPRT'] = totCnt / m['psino'].sum()  # ratio count level relative to the original\n",
-    "    m = nipet.mmrhist(datain, mMRpars, outpath=path.join(opth, 'BTP', '%.3g' % totCnt), store=True)"
+    "try:\n",
+    "    mu = mu_h['im'] + mu_o['im'] # needs HW mu-maps\n",
+    "except (NameError, KeyError):\n",
+    "    mu = mu_o['im']\n",
+    "    mu_h = {'im': np.zeros_like(mu_o['im'])}"
    ]
   },
   {
@@ -157,10 +167,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "try:  # needs HW mu-maps\n",
-    "    volshow(mu_o['im'] + mu_h['im'], cmaps=['bone'], titles=[r\"$\\mu$-map\"])\n",
-    "except:\n",
-    "    volshow(mu_o['im'], cmaps=['bone'])"
+    "nimpa.imscroll(mu, titles=[r\"$\\mu$-map\"], cmap='bone');"
    ]
   },
   {
@@ -170,11 +177,11 @@
    "outputs": [],
    "source": [
     "# sinogram index (<127 for direct sinograms, >=127 for oblique sinograms)\n",
-    "volshow([m['psino'], m['dsino']],\n",
-    "        titles=[\"Prompt sinogram (%.3gM)\" % (m['psino'].sum() / 1e6),\n",
-    "               \"Delayed sinogram (%.3gM)\" % (m['dsino'].sum() / 1e6)],\n",
-    "        cmaps=['inferno'] * 2, xlabels=[\"\", \"bins\"], ylabels=[\"angles\"] * 2, ncols=2, colorbars=[1]*2,\n",
-    "       figsize=(9.5, 3.5), tight_layout=5);"
+    "nimpa.imscroll(\n",
+    "    {\n",
+    "        f\"Prompt sinogram ({mSino['psino'].sum() / 1e6:.3g}M)\": mSino['psino'],\n",
+    "        f\"Delayed sinogram ({mSino['dsino'].sum() / 1e6:.3g}M)\": mSino['dsino']}, cmap='inferno',\n",
+    "    fig=plt.figure(num=2, figsize=(9.5, 3.5)));"
    ]
   },
   {
@@ -188,7 +195,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## OSEM"
+    "## OSEM\n",
+    "\n",
+    "Note that since $\\bf A$ and $\\bf N$ are both diagonal matrix operators, the reconstruction equation can be slightly rewritten to reduce the number of calculations required per iteration:\n",
+    "\n",
+    "$$\n",
+    "{\\bf y}^{(k+1)} = {{\\bf y}^{(k)} \\over \\sum_n{{\\bf H}^T{\\bf X}_n^T{\\bf A}^T{\\bf N}^T{\\bf 1}}}\n",
+    "    \\circ\n",
+    "    \\sum_n{ {\\bf H}^T{\\bf X}_n^T\n",
+    "        { {\\bf m} \\over {\\bf X}_n{\\bf H}{\\bf y}^{(k)} + ({\\bf r} + {\\bf s})/{({\\bf A}^T{\\bf N}^T {\\bf 1})} }\n",
+    "    },\n",
+    "$$"
    ]
   },
   {
@@ -197,20 +214,105 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# built-in default: 14 subsets\n",
-    "recon = nipet.mmrchain(\n",
-    "    datain, mMRpars,\n",
-    "    frames=['timings', [3000, 3600]],\n",
-    "    itr=4,\n",
-    "    histo=m,\n",
-    "    mu_h=mu_h,\n",
-    "    mu_o=mu_o,\n",
-    "    psf='measured',\n",
-    "    outpath=opth,\n",
-    "    fcomment=\"niftypet-recon\",\n",
-    "    store_img=True)\n",
-    "\n",
-    "volshow(recon['im'][:, 100:-100, 100:-100], cmaps=['magma']);"
+    "# setup\n",
+    "psf = nipet.prj.mmrrec.psf_config('measured', mMRpars['Cnt'])\n",
+    "msk = nipet.get_cylinder(mMRpars['Cnt'], rad=29., xo=0., yo=0., unival=1, gpu_dim=True) > 0.9\n",
+    "\n",
+    "## Attenuation, Normalisation & Sensitivity\n",
+    "A = nipet.frwd_prj(mu, mMRpars, attenuation=True, dev_out=True) # imsmooth(mu, psf=psf)\n",
+    "N = nipet.mmrnorm.get_norm_sino(datain, mMRpars, mSino, gpu_dim=True)\n",
+    "AN = cu.asarray(A * N)\n",
+    "\n",
+    "Sn = 14                   # number of subsets\n",
+    "sinoTIdx = [None] * Sn    # subset indices\n",
+    "sen = [None] * Sn         # sensitivity images for each subset\n",
+    "sen_inv_msk = [None] * Sn # masked inverse sensitivity image\n",
+    "\n",
+    "for n in trange(Sn, unit=\"subset\"):\n",
+    "    sinoTIdx[n] = cu.asarray(nipet.prj.mmrrec.get_subsets14(n, mMRpars)[0], 'int32')\n",
+    "    sen[n] = nipet.back_prj(nimpa.isub(AN, sinoTIdx[n]), mMRpars, isub=sinoTIdx[n], dev_out=True,\n",
+    "                            sync=False)\n",
+    "    imsmooth(sen[n], psf=psf, output_array=sen[n])\n",
+    "    assert sen[n].shape == (mMRpars['Cnt']['SZ_IMX'], mMRpars['Cnt']['SZ_IMY'],\n",
+    "                            mMRpars['Cnt']['SZ_IMZ'])\n",
+    "    sen_inv_msk[n] = cu.zeros_like(sen[n])\n",
+    "    sen_inv_msk[n][msk] = np.float32(1) / sen[n][msk]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Randoms\n",
+    "\n",
+    "r = nipet.randoms(mSino, mMRpars)[0]\n",
+    "print(\"Randoms: %.3g%%\" % (r.sum() / mSino['psino'].sum() * 100))\n",
+    "\n",
+    "## Scatter\n",
+    "\n",
+    "# Estimated image from two OSEM iterations (implicitly using voxel-driven scatter model)\n",
+    "eim = nipet.mmrchain(datain, mMRpars, mu_h=mu_h, mu_o=mu_o, itr=2, histo=mSino, outpath=opth)['im']\n",
+    "# Recalculate scatter\n",
+    "s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), eim, mMRpars, histo=mSino, rsino=r)\n",
+    "\n",
+    "# normalised randoms + scatter in GPU dimensions\n",
+    "r = nipet.mmraux.remgaps(r, mMRpars['txLUT'], mMRpars['Cnt'])\n",
+    "s = nipet.mmraux.remgaps(s, mMRpars['txLUT'], mMRpars['Cnt'])\n",
+    "rs_AN = nimpa.div(r + s, AN, default=1)\n",
+    "print(\"Scatter: %.3g%%\" % (s.sum() / mSino['psino'].sum() * 100))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y = cu.ones_like(sen[0])             # reconstructed image\n",
+    "Hy, XHy, crr, bim, mul = (None,) * 5 # temporary variables\n",
+    "m = cu.asarray(nipet.mmraux.remgaps(mSino['psino'], mMRpars['txLUT'], mMRpars['Cnt']))\n",
+    "rs_AN_sub = [nimpa.isub(rs_AN, idx) for idx in sinoTIdx]\n",
+    "\n",
+    "for k in trange(4, desc=\"OSEM\"):\n",
+    "    for n in trange(Sn, unit=\"subset\", leave=False):\n",
+    "        # forward-project current reconstruction to sinogram space\n",
+    "        Hy = imsmooth(y, psf=psf, output_array=Hy, sync=False)\n",
+    "        XHy = nipet.frwd_prj(Hy, mMRpars, isub=sinoTIdx[n], attenuation=False, dev_out=True,\n",
+    "                             fullsino_out=False, output=XHy, sync=False)\n",
+    "        # add randoms and scatter estimates\n",
+    "        if False:\n",
+    "            # recalculate scatter\n",
+    "            s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), y, mMRpars, histo=mSino, rsino=r)\n",
+    "            s = nipet.mmraux.remgaps(s, mMRpars['txLUT'], mMRpars['Cnt'])\n",
+    "            rs_AN = nimpa.div(nimpa.add(nimpa.isub(r, sinoTIdx[n]), nimpa.isub(s, sinoTIdx[n])),\n",
+    "                              nimpa.isub(AN, sinoTIdx[n]), default=1)\n",
+    "            XHy = nimpa.add(XHy, rs_AN, output=XHy, sync=False)\n",
+    "        else:\n",
+    "            XHy = nimpa.add(XHy, rs_AN_sub[n], output=XHy, sync=False)\n",
+    "\n",
+    "        # measured sinogram subset\n",
+    "        crr = nimpa.isub(m, sinoTIdx[n], output=crr, sync=False)\n",
+    "        # corrections\n",
+    "        crr = nimpa.div(crr, XHy, default=1, output=crr, sync=False)\n",
+    "        # back-project corrections to image space\n",
+    "        bim = nipet.back_prj(crr, mMRpars, isub=sinoTIdx[n], dev_out=True, output=bim, sync=False)\n",
+    "        mul = imsmooth(bim, psf=psf, output_array=mul, sync=False)\n",
+    "        # apply FOV mask and normalise with scanner sensitivity\n",
+    "        mul = nimpa.mul(mul, sen_inv_msk[n], output=mul, sync=False)\n",
+    "        # update reconstructed image\n",
+    "        y = nimpa.mul(y, mul, output=y, sync=False)\n",
+    "cu.dev_sync()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nimpa.imscroll(nipet.img.mmrimg.convert2e7(y, mMRpars['Cnt']), cmap='magma', vmin=0, vmax=np.percentile(y, 99.95));"
    ]
   },
   {
@@ -226,25 +328,37 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# setup\n",
+    "psf = nipet.prj.mmrrec.psf_config('measured', mMRpars['Cnt'])\n",
+    "msk = nipet.get_cylinder(mMRpars['Cnt'], rad=29., xo=0., yo=0., unival=1, gpu_dim=True) > 0.9\n",
+    "\n",
     "## Randoms\n",
     "\n",
-    "r = nipet.randoms(m, mMRpars)[0]\n",
-    "print(\"Randoms: %.3g%%\" % (r.sum() / m['psino'].sum() * 100))\n",
+    "r = nipet.randoms(mSino, mMRpars)[0]\n",
+    "print(\"Randoms: %.3g%%\" % (r.sum() / mSino['psino'].sum() * 100))\n",
     "\n",
     "## Scatter\n",
     "\n",
-    "# One OSEM iteration estimate (implicitly using voxel-driven scatter model)\n",
-    "eim = nipet.mmrchain(datain, mMRpars, mu_h=mu_h, mu_o=mu_o, itr=1, histo=m, outpath=opth)['im']\n",
+    "# Estimated image from two OSEM iterations (implicitly using voxel-driven scatter model)\n",
+    "eim = nipet.mmrchain(datain, mMRpars, mu_h=mu_h, mu_o=mu_o, itr=2, histo=mSino, outpath=opth)['im']\n",
     "# Recalculate scatter\n",
-    "s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), eim, mMRpars, histo=m, rsino=r)\n",
-    "print(\"Scatter: %.3g%%\" % (s.sum() / m['psino'].sum() * 100))\n",
+    "s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), eim, mMRpars, histo=mSino, rsino=r)\n",
+    "print(\"Scatter: %.3g%%\" % (s.sum() / mSino['psino'].sum() * 100))\n",
+    "\n",
+    "# normalised randoms + scatter in GPU dimensions\n",
+    "r = nipet.mmraux.remgaps(r, mMRpars['txLUT'], mMRpars['Cnt'])\n",
+    "s = nipet.mmraux.remgaps(s, mMRpars['txLUT'], mMRpars['Cnt'])\n",
     "\n",
     "## Attenuation, Normalisation & Sensitivity\n",
+    "A = nipet.frwd_prj(mu, mMRpars, attenuation=True, dev_out=True)\n",
+    "N = nipet.mmrnorm.get_norm_sino(datain, mMRpars, mSino, gpu_dim=True)\n",
+    "AN = cu.asarray(A * N)\n",
+    "rs_AN = nimpa.div(r + s, AN, default=1)\n",
     "\n",
-    "A = nipet.frwd_prj(mu_h['im'] + mu_o['im'], mMRpars, attenuation=True)\n",
-    "N = nipet.mmrnorm.get_norm_sino(datain, mMRpars, m)\n",
-    "AN = A * N\n",
-    "sim = nipet.back_prj(AN, mMRpars)"
+    "sim = nipet.back_prj(AN, mMRpars, dev_out=True, sync=False)\n",
+    "imsmooth(sim, psf=psf, output_array=sim)\n",
+    "sim_inv_msk = cu.zeros_like(sim)\n",
+    "sim_inv_msk[msk] = np.float32(1) / sim[msk]"
    ]
   },
   {
@@ -253,9 +367,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "volshow(OrderedDict([(\"Prompts\", m['psino']), (\"Delayed\", m['dsino']), (\"Attenuation\", A),\n",
-    "                     (\"Scatter\", s), (\"Randoms\", r), (\"Normalisation\", N)]),\n",
-    "        cmaps=['inferno']*6, colorbars=[1]*6, ncols=3, figsize=(9.5, 6));"
+    "# \"Attenuation\": A, \"Normalisation\": N, \"Scatter\": s, \"Randoms\": r\n",
+    "nimpa.imscroll({\"Prompts\": mSino['psino'], \"Delayed\": mSino['dsino']}, cmap='inferno',\n",
+    "               fig=plt.figure(num=4, figsize=(9.5, 3.5)));"
    ]
   },
   {
@@ -265,17 +379,31 @@
    "outputs": [],
    "source": [
     "## MLEM with RM\n",
+    "y = [np.ones_like(sim)]              # reconstructed image\n",
+    "Hy, XHy, crr, bim, mul = (None,) * 5 # temporary variables\n",
+    "m = cu.asarray(nipet.mmraux.remgaps(mSino['psino'], mMRpars['txLUT'], mMRpars['Cnt']))\n",
     "\n",
-    "psf = functools.partial(gaussian_filter, sigma=2.5 / SIGMA2FWHMmm)\n",
-    "msk = nipet.img.mmrimg.get_cylinder(mMRpars['Cnt'], rad=29., xo=0., yo=0., unival=1, gpu_dim=False) <= 0.9\n",
-    "sim_inv = 1 / psf(sim)\n",
-    "sim_inv[msk] = 0\n",
-    "rs = r + s\n",
-    "ANm = AN * m['psino']\n",
-    "recon_mlem = [np.ones_like(sim)]\n",
     "for k in trange(4 * 14, desc=\"MLEM\"):\n",
-    "    fprj = AN * nipet.frwd_prj(psf(recon_mlem[-1]), mMRpars) + rs\n",
-    "    recon_mlem.append(recon_mlem[-1] * sim_inv * psf(nipet.back_prj(ANm / fprj, mMRpars)))"
+    "    # forward-project current reconstruction to sinogram space\n",
+    "    Hy = imsmooth(y[-1], psf=psf, output_array=Hy, sync=False)\n",
+    "    XHy = nipet.frwd_prj(Hy, mMRpars, dev_out=True, fullsino_out=False, output=XHy, sync=False)\n",
+    "    # add randoms and scatter estimates\n",
+    "    if False:\n",
+    "        # recalculate scatter\n",
+    "        s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), y[-1], mMRpars, histo=mSino, rsino=r)\n",
+    "        s = nipet.mmraux.remgaps(s, mMRpars['txLUT'], mMRpars['Cnt'])\n",
+    "        rs_AN = nimpa.div(r + s, AN, default=1)\n",
+    "    XHy = nimpa.add(XHy, rs_AN, output=XHy, sync=False)\n",
+    "\n",
+    "    # corrections\n",
+    "    crr = nimpa.div(m, XHy, default=1, output=crr, sync=False)\n",
+    "    # back-project corrections to image space\n",
+    "    bim = nipet.back_prj(crr, mMRpars, dev_out=True, output=bim, sync=False)\n",
+    "    mul = imsmooth(bim, psf=psf, output_array=mul, sync=False)\n",
+    "    # apply FOV mask and normalise with scanner sensitivity\n",
+    "    mul = nimpa.mul(mul, sim_inv_msk, output=mul, sync=False)\n",
+    "    # update reconstructed image\n",
+    "    y.append(nimpa.mul(y[-1], mul, sync=False))"
    ]
   },
   {
@@ -285,9 +413,11 @@
    "outputs": [],
    "source": [
     "# central slice across iterations\n",
-    "volshow(\n",
-    "    np.asanyarray(recon_mlem[1::5])[:, :, 90:-100, 110:-110],\n",
-    "    cmaps=['magma'] * len(recon_mlem[1::5]), ncols=4, figsize=(9.5, 7), frameon=False);"
+    "nimpa.imscroll(\n",
+    "    {\n",
+    "        f\"{k}\": nipet.img.mmrimg.convert2e7(y[k], mMRpars['Cnt'])[:, 90:-100, 110:-110]\n",
+    "        for k in range(7, len(y), 7)}, cmap='magma', vmin=0, vmax=np.percentile(y[-1], 99.95),\n",
+    "    fig=plt.figure(num=6, figsize=(9.5, 3.5)));"
    ]
   }
  ],
diff --git a/niftypet/__init__.py b/niftypet/__init__.py
index 8db66d3d..69e3be50 100644
--- a/niftypet/__init__.py
+++ b/niftypet/__init__.py
@@ -1 +1 @@
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)
diff --git a/niftypet/nipet/CMakeLists.txt b/niftypet/nipet/CMakeLists.txt
index 3dac1042..70b59188 100644
--- a/niftypet/nipet/CMakeLists.txt
+++ b/niftypet/nipet/CMakeLists.txt
@@ -6,6 +6,7 @@ project(mmr_auxe)
 file(GLOB SRC LIST_DIRECTORIES false "src/*.cu")
 include_directories(src)
 include_directories(${Python3_INCLUDE_DIRS})
+include_directories(${CUVEC_INCLUDE_DIRS})
 include_directories(${Python3_NumPy_INCLUDE_DIRS})
 
 add_library(${PROJECT_NAME} SHARED ${SRC})
@@ -19,6 +20,7 @@ if(SKBUILD)
 python_extension_module(${PROJECT_NAME})
 endif()
 set_target_properties(${PROJECT_NAME} PROPERTIES
+  CXX_STANDARD 11
   #VERSION ${CMAKE_PROJECT_VERSION} SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
   PREFIX ""  # remove shared lib prefix to make importable
   INTERFACE_${PROJECT_NAME}_MAJOR_VERSION ${CMAKE_PROJECT_VERSION_MAJOR})
diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 25c7e116..94ce9ff3 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -46,7 +46,7 @@
 from .img.mmrimg import align_mumap
 from .img.mmrimg import convert2dev as im_e72dev
 from .img.mmrimg import convert2e7 as im_dev2e7
-from .img.mmrimg import get_cylinder, hdw_mumap, obj_mumap, pct_mumap
+from .img.mmrimg import get_cylinder, hdw_mumap, obj_mumap
 from .img.pipe import mmrchain
 from .lm.mmrhist import dynamic_timings, mmrhist, randoms
 from .mmraux import explore_input as classify_input
diff --git a/niftypet/nipet/img/auximg.py b/niftypet/nipet/img/auximg.py
index 3cfbce4b..9e59f14b 100644
--- a/niftypet/nipet/img/auximg.py
+++ b/niftypet/nipet/img/auximg.py
@@ -76,9 +76,10 @@ def dynamic_timings(flist, offset=0):
     Arguments:
       flist: can be 1D list of time duration for each dynamic frame, e.g.:
             flist = [15, 15, 15, 15, 30, 30, 30, ...]
-        or a 2D list of lists having 2 entries:
+        or a 2D list of lists having 2 entries per definition:
         first for the number of repetitions and the other for the frame duration, e.g.:
-            flist = [[4,15], [3,15], ...].
+            flist = [[4, 15], [8, 30], ...],
+        meaning 4x15s, then 8x30s, etc.
       offset: adjusts for the start time (usually when prompts are strong enough over randoms)
     Returns (dict):
       'timings': [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
@@ -119,8 +120,8 @@ def dynamic_timings(flist, offset=0):
         tsum = 0
         # list of frame timings
         t_frames = []
-        for i in range(0, farray.shape[0]):
-            for _ in range(0, farray[i, 0]):
+        for i in range(farray.shape[0]):
+            for _ in range(farray[i, 0]):
                 # frame start time
                 t0 = tsum
                 tsum += farray[i, 1]
@@ -131,5 +132,5 @@ def dynamic_timings(flist, offset=0):
                 frms[fi] = farray[i, 1]
                 fi += 1
     else:
-        raise TypeError('Unrecognised data input.')
+        raise TypeError('Unrecognised time frame definitions.')
     return {'total': tsum, 'frames': frms, 'timings': t_frames}
diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index cff4e6c0..ed3e0c4a 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -1,5 +1,4 @@
 """Image functions for PET data reconstruction and processing."""
-
 import glob
 import logging
 import math
@@ -7,7 +6,6 @@
 import os
 import re
 import shutil
-from subprocess import run
 
 import nibabel as nib
 import numpy as np
@@ -278,17 +276,10 @@ def mudcm2nii(datain, Cnt):
     im = np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
     nimpa.array2nii(im, B, os.path.join(os.path.dirname(datain['mumapDCM']), 'muref.nii.gz'))
     # -------------------------------------------------------------------------------------
-    fmu = os.path.join(os.path.dirname(datain['mumapDCM']), 'mu_r.nii.gz')
-    if os.path.isfile(Cnt['RESPATH']):
-        run([
-            Cnt['RESPATH'], '-ref',
-            os.path.join(os.path.dirname(datain['mumapDCM']), 'muref.nii.gz'), '-flo',
-            os.path.join(os.path.dirname(datain['mumapDCM']), 'mu.nii.gz'), '-res', fmu, '-pad',
-            '0'])
-    else:
-        log.error('path to resampling executable is incorrect!')
-        raise IOError('Error launching NiftyReg for image resampling.')
-
+    opth = os.path.dirname(datain['mumapDCM'])
+    fmu = os.path.join(opth, 'mu_r.nii.gz')
+    nimpa.resample_dipy(os.path.join(opth, 'muref.nii.gz'), os.path.join(opth, 'mu.nii.gz'),
+                        fimout=fmu, intrp=1, dtype_nifti=np.float32)
     return fmu
 
 
@@ -346,23 +337,12 @@ def obj_mumap(
     for d in resdcm:
         os.remove(d)
 
-    # convert the DICOM mu-map images to nii
-    run([Cnt['DCM2NIIX'], '-f', fnii + tstmp, '-o', fmudir, datain['mumapDCM']])
-    # piles for the T1w, pick one:
-    fmunii = glob.glob(os.path.join(fmudir, '*' + fnii + tstmp + '*.nii*'))[0]
-    # fmunii = glob.glob( os.path.join(datain['mumapDCM'], '*converted*.nii*') )
-    # fmunii = fmunii[0]
+    # > convert the DICOM mu-map images to NIfTI
+    fmunii = nimpa.dcm2nii(datain['mumapDCM'], fnii + tstmp, outpath=fmudir)
 
-    # the converted nii image resample to the reference size
+    # > resampled the NIfTI converted image to the reference shape/size
     fmu = os.path.join(fmudir, comment + 'mumap_tmp.nii.gz')
-    if os.path.isfile(Cnt['RESPATH']):
-        cmd = [Cnt['RESPATH'], '-ref', fmuref, '-flo', fmunii, '-res', fmu, '-pad', '0']
-        if log.getEffectiveLevel() > logging.INFO:
-            cmd.append('-voff')
-        run(cmd)
-    else:
-        log.error('path to resampling executable is incorrect!')
-        raise IOError('Path to executable is incorrect!')
+    nimpa.resample_dipy(fmuref, fmunii, fimout=fmu, intrp=1, dtype_nifti=np.float32)
 
     nim = nib.load(fmu)
     # get the affine transform
@@ -411,7 +391,7 @@ def align_mumap(
     datain,
     scanner_params=None,
     outpath='',
-    reg_tool='niftyreg',
+    reg_tool='dipy',
     use_stored=False,
     hst=None,
     t0=0,
@@ -460,9 +440,8 @@ def align_mumap(
     # ---------------------------------------------------------------------------
     # > used stored if requested
     if use_stored:
-        fmu_stored = fnm + '-aligned-to_t'\
-                     + str(t0)+'-'+str(t1)+'_'+petopt.upper()\
-                     + fcomment
+        fmu_stored = fnm + '-aligned-to_t' + str(t0) + '-' + str(
+            t1) + '_' + petopt.upper() + fcomment
         fmupath = os.path.join(opth, fmu_stored + '.nii.gz')
 
         if os.path.isfile(fmupath):
@@ -496,8 +475,8 @@ def align_mumap(
             if 'txLUT' in scanner_params:
                 hst = mmrhist(datain, scanner_params, t0=t0, t1=t1)
             else:
-                raise ValueError('Full scanner are parameters not provided\
-                     but are required for histogramming.')
+                raise ValueError(
+                    'Full scanner are parameters not provided but are required for histogramming.')
 
     # ========================================================
     # -get hardware mu-map
@@ -565,9 +544,7 @@ def align_mumap(
         if musrc == 'ute' and ute_name in datain and os.path.exists(datain[ute_name]):
             # change to NIfTI if the UTE sequence is in DICOM files (folder)
             if os.path.isdir(datain[ute_name]):
-                fnew = os.path.basename(datain[ute_name])
-                run([Cnt['DCM2NIIX'], '-f', fnew, datain[ute_name]])
-                fute = glob.glob(os.path.join(datain[ute_name], fnew + '*nii*'))[0]
+                fute = nimpa.dcm2nii(datain[ute_name], os.path.basename(datain[ute_name]))
             elif os.path.isfile(datain[ute_name]):
                 fute = datain[ute_name]
 
@@ -580,8 +557,7 @@ def align_mumap(
                     fpet,
                     fute,
                     outpath=os.path.join(outpath, 'PET', 'positioning'),
-                    executable=Cnt['REGPATH'],
-                    omp=multiprocessing.cpu_count() / 2,                 # pcomment=fcomment,
+                    omp=multiprocessing.cpu_count() // 2,                # pcomment=fcomment,
                     rigOnly=True,
                     affDirect=False,
                     maxit=5,
@@ -597,6 +573,15 @@ def align_mumap(
                     ffwhm=15.,                                           # pillilitres
                     fthrsh=0.05,
                     verbose=verbose)
+
+            elif reg_tool == 'dipy':
+                regdct = nimpa.affine_dipy(fpet, fute, nbins=32, metric='MI',
+                                           level_iters=[10000, 1000,
+                                                        200], sigmas=[3.0, 1.0,
+                                                                      0.0], factors=[4, 2, 1],
+                                           outpath=os.path.join(outpath, 'PET', 'positioning'),
+                                           faffine=None, pickname='ref', fcomment='', rfwhm=2.,
+                                           ffwhm=2., verbose=verbose)
             else:
                 raise ValueError('unknown registration tool requested')
 
@@ -614,8 +599,7 @@ def align_mumap(
                     fpet,
                     ft1w,
                     outpath=os.path.join(outpath, 'PET', 'positioning'),
-                    executable=Cnt['REGPATH'],
-                    omp=multiprocessing.cpu_count() / 2,
+                    omp=multiprocessing.cpu_count() // 2,
                     rigOnly=True,
                     affDirect=False,
                     maxit=5,
@@ -631,6 +615,16 @@ def align_mumap(
                     ffwhm=15.,                                           # pillilitres
                     fthrsh=0.05,
                     verbose=verbose)
+
+            elif reg_tool == 'dipy':
+                regdct = nimpa.affine_dipy(fpet, ft1w, nbins=32, metric='MI',
+                                           level_iters=[10000, 1000,
+                                                        200], sigmas=[3.0, 1.0,
+                                                                      0.0], factors=[4, 2, 1],
+                                           outpath=os.path.join(outpath, 'PET', 'positioning'),
+                                           faffine=None, pickname='ref', fcomment='', rfwhm=2.,
+                                           ffwhm=2., verbose=verbose)
+
             else:
                 raise ValueError('unknown registration tool requested')
 
@@ -666,9 +660,7 @@ def align_mumap(
             # convert the DICOM mu-map images to nii
             if 'mumapDCM' not in datain:
                 raise IOError('DICOM with the UTE mu-map are not given.')
-            run([Cnt['DCM2NIIX'], '-f', fnii + tstmp, '-o', opth, datain['mumapDCM']])
-            # piles for the T1w, pick one:
-            fflo = glob.glob(os.path.join(opth, '*' + fnii + tstmp + '*.nii*'))[0]
+            fflo = nimpa.dcm2nii(datain['mumapDCM'], fnii + tstmp, outpath=opth)
         else:
             if os.path.isfile(datain['UTE']):
                 fflo = datain['UTE']
@@ -676,12 +668,13 @@ def align_mumap(
                 raise IOError('The provided NIfTI UTE path is not valid.')
 
     # > call the resampling routine to get the pCT/UTE in place
-    if reg_tool == "spm":
+    if reg_tool == 'spm':
         nimpa.resample_spm(fpet, fflo, faff_mrpet, fimout=freg, del_ref_uncmpr=True,
                            del_flo_uncmpr=True, del_out_uncmpr=True)
+    elif reg_tool == 'dipy':
+        nimpa.resample_dipy(fpet, fflo, faff=faff_mrpet, fimout=freg)
     else:
-        nimpa.resample_niftyreg(fpet, fflo, faff_mrpet, fimout=freg, executable=Cnt['RESPATH'],
-                                verbose=verbose)
+        nimpa.resample_niftyreg(fpet, fflo, faff_mrpet, fimout=freg, verbose=verbose)
 
     # -get the NIfTI of registered image
     nim = nib.load(freg)
@@ -712,9 +705,8 @@ def align_mumap(
     if store or store_npy:
         nimpa.create_dir(opth)
         if faff == '':
-            fname = fnm + '-aligned-to_t'\
-                    + str(t0)+'-'+str(t1)+'_'+petopt.upper()\
-                    + fcomment
+            fname = fnm + '-aligned-to_t' + str(t0) + '-' + str(
+                t1) + '_' + petopt.upper() + fcomment
         else:
             fname = fnm + '-aligned-to-given-affine' + fcomment
     if store_npy:
@@ -736,181 +728,6 @@ def align_mumap(
     return mu_dct
 
 
-# ================================================================================
-# PSEUDO CT MU-MAP
-# --------------------------------------------------------------------------------
-
-
-def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac', faff='', fpet='',
-              fcomment='', outpath='', store_npy=False, store=False, verbose=False):
-    '''
-    GET THE MU-MAP from pCT IMAGE (which is in T1w space)
-    * the mu-map will be registered to PET which will be reconstructed for time frame t0-t1
-    * it f0 and t1 are not given the whole LM dataset will be reconstructed
-    * the reconstructed PET can be attenuation and scatter corrected or NOT using petopt
-    '''
-    if hst is None:
-        hst = []
-
-    # constants, transaxial and axial LUTs are extracted
-    Cnt = scanner_params['Cnt']
-
-    if not os.path.isfile(faff):
-        from niftypet.nipet.prj import mmrrec
-
-        # histogram the list data if needed
-        if not hst:
-            from niftypet.nipet.lm import mmrhist
-            hst = mmrhist(datain, scanner_params, t0=t0, t1=t1)
-
-    # get hardware mu-map
-    if datain.get("hmumap", "").endswith(".npz") and os.path.isfile(datain["hmumap"]):
-        muh = np.load(datain["hmumap"], allow_pickle=True)["hmu"]
-        (log.info if verbose else log.debug)('loaded hardware mu-map from file:\n{}'.format(
-            datain['hmumap']))
-    elif outpath:
-        hmupath = os.path.join(outpath, "mumap-hdw", "hmumap.npz")
-        if os.path.isfile(hmupath):
-            muh = np.load(hmupath, allow_pickle=True)["hmu"]
-            datain['hmumap'] = hmupath
-        else:
-            raise IOError('Invalid path to the hardware mu-map')
-    else:
-        log.error('The hardware mu-map is required first.')
-        raise IOError('Could not find the hardware mu-map!')
-
-    if not {'MRT1W#', 'T1nii', 'T1bc'}.intersection(datain):
-        log.error('no MR T1w images required for co-registration!')
-        raise IOError('Missing MR data')
-    # ----------------------------------
-
-    # output dictionary
-    mu_dct = {}
-    if not os.path.isfile(faff):
-        # first recon pet to get the T1 aligned to it
-        if petopt == 'qnt':
-            # ---------------------------------------------
-            # OPTION 1 (quantitative recon with all corrections using MR-based mu-map)
-            # get UTE object mu-map (may not be in register with the PET data)
-            mudic = obj_mumap(datain, Cnt)
-            muo = mudic['im']
-            # reconstruct PET image with UTE mu-map to which co-register T1w
-            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=3, itr=itr,
-                                    fwhm=0., fcomment=fcomment + '_qntUTE',
-                                    outpath=os.path.join(outpath, 'PET',
-                                                         'positioning'), store_img=True)
-        elif petopt == 'nac':
-            # ---------------------------------------------
-            # OPTION 2 (recon without any corrections for scatter and attenuation)
-            # reconstruct PET image with UTE mu-map to which co-register T1w
-            muo = np.zeros(muh.shape, dtype=muh.dtype)
-            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
-                                    fwhm=0., fcomment=fcomment + '_NAC',
-                                    outpath=os.path.join(outpath, 'PET',
-                                                         'positioning'), store_img=True)
-        elif petopt == 'ac':
-            # ---------------------------------------------
-            # OPTION 3 (recon with attenuation correction only but no scatter)
-            # reconstruct PET image with UTE mu-map to which co-register T1w
-            mudic = obj_mumap(datain, Cnt, outpath=outpath)
-            muo = mudic['im']
-            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
-                                    fwhm=0., fcomment=fcomment + '_AC',
-                                    outpath=os.path.join(outpath, 'PET',
-                                                         'positioning'), store_img=True)
-
-        fpet = recout.fpet
-        mu_dct['fpet'] = fpet
-
-        # ------------------------------
-        # get the affine transformation
-        ft1w = nimpa.pick_t1w(datain)
-        try:
-            regdct = nimpa.coreg_spm(fpet, ft1w,
-                                     outpath=os.path.join(outpath, 'PET', 'positioning'))
-        except Exception:
-            regdct = nimpa.affine_niftyreg(
-                fpet,
-                ft1w,
-                outpath=os.path.join(outpath, 'PET', 'positioning'), # pcomment=fcomment,
-                executable=Cnt['REGPATH'],
-                omp=multiprocessing.cpu_count() / 2,
-                rigOnly=True,
-                affDirect=False,
-                maxit=5,
-                speed=True,
-                pi=50,
-                pv=50,
-                smof=0,
-                smor=0,
-                rmsk=True,
-                fmsk=True,
-                rfwhm=15.,                                           # pillilitres
-                rthrsh=0.05,
-                ffwhm=15.,                                           # pillilitres
-                fthrsh=0.05,
-                verbose=verbose)
-
-        faff = regdct['faff']
-        # ------------------------------
-
-    # pCT file name
-    if outpath == '':
-        pctdir = os.path.dirname(datain['pCT'])
-    else:
-        pctdir = os.path.join(outpath, 'mumap-obj')
-    mmraux.create_dir(pctdir)
-    fpct = os.path.join(pctdir, 'pCT_r_tmp' + fcomment + '.nii.gz')
-
-    # > call the resampling routine to get the pCT in place
-    if os.path.isfile(Cnt['RESPATH']):
-        cmd = [
-            Cnt['RESPATH'], '-ref', fpet, '-flo', datain['pCT'], '-trans', faff, '-res', fpct,
-            '-pad', '0']
-        if log.getEffectiveLevel() > logging.INFO:
-            cmd.append('-voff')
-        run(cmd)
-    else:
-        log.error('path to resampling executable is incorrect!')
-        raise IOError('Incorrect path to executable!')
-
-    # get the NIfTI of the pCT
-    nim = nib.load(fpct)
-    A = nim.get_sform()
-    pct = nim.get_fdata(dtype=np.float32)
-    pct = pct[:, ::-1, ::-1]
-    pct = np.transpose(pct, (2, 1, 0))
-    # convert the HU units to mu-values
-    mu = hu2mu(pct)
-    # get rid of negatives
-    mu[mu < 0] = 0
-
-    # return image dictionary with the image itself and other parameters
-    mu_dct['im'] = mu
-    mu_dct['affine'] = A
-    mu_dct['faff'] = faff
-
-    if store:
-        # now save to numpy array and NIfTI in this folder
-        if outpath == '':
-            pctumapdir = os.path.join(datain['corepath'], 'mumap-obj')
-        else:
-            pctumapdir = os.path.join(outpath, 'mumap-obj')
-        mmraux.create_dir(pctumapdir)
-        # > Numpy
-        if store_npy:
-            fnp = os.path.join(pctumapdir, "mumap-pCT.npz")
-            np.savez(fnp, mu=mu, A=A)
-
-        # > NIfTI
-        fmu = os.path.join(pctumapdir, 'mumap-pCT' + fcomment + '.nii.gz')
-        nimpa.array2nii(mu[::-1, ::-1, :], A, fmu)
-        mu_dct['fim'] = fmu
-        datain['mumapCT'] = fmu
-
-    return mu_dct
-
-
 # ********************************************************************************
 # GET HARDWARE MU-MAPS with positions and offsets
 # --------------------------------------------------------------------------------
@@ -1034,9 +851,7 @@ def rd_hmu(fh):
 
 
 def get_hmupos(datain, parts, Cnt, outpath=''):
-    # check if registration executable exists
-    if not os.path.isfile(Cnt['RESPATH']):
-        raise IOError('No registration executable found!')
+
     # ----- get positions from the DICOM list-mode file -----
     ihdr, csainfo = mmraux.hdr_lm(datain, Cnt)
     # pable position origin
@@ -1101,7 +916,7 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     nimpa.array2nii(np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32), B,
                     fref)
 
-    # pefine a dictionary of all positions/offsets of hardware mu-maps
+    # define a dictionary of all positions/offsets of hardware mu-maps
     hmupos = [None] * 5
     hmupos[0] = {
         'TabPosOrg': tpozyx, # prom DICOM of LM file
@@ -1142,15 +957,12 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
         A[2, 3] = -10 * ((s[0] - 1) * vs[0] - vpos[0])
         nimpa.array2nii(im[::-1, ::-1, :], A, hmupos[i]['niipath'])
 
-        # resample using nify.reg
+        # > resample using DIPY in nimpa
         fout = os.path.join(os.path.dirname(hmupos[0]['niipath']),
                             'r' + os.path.basename(hmupos[i]['niipath']).split('.')[0] + '.nii.gz')
-        cmd = [
-            Cnt['RESPATH'], '-ref', hmupos[0]['niipath'], '-flo', hmupos[i]['niipath'], '-res',
-            fout, '-pad', '0']
-        if log.getEffectiveLevel() > logging.INFO:
-            cmd.append('-voff')
-        run(cmd)
+
+        nimpa.resample_dipy(hmupos[0]['niipath'], hmupos[i]['niipath'], fimout=fout, intrp=1,
+                            dtype_nifti=np.float32)
 
     return hmupos
 
@@ -1284,31 +1096,30 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
             ft1w = datain['T1bc']
         elif os.path.isdir(datain['MRT1W']):
             # create file name for the converted NIfTI image
-            fnii = 'converted'
-            run([Cnt['DCM2NIIX'], '-f', fnii, datain['T1nii']])
-            ft1nii = glob.glob(os.path.join(datain['T1nii'], '*converted*.nii*'))
-            ft1w = ft1nii[0]
+            ft1w = nimpa.dcm2nii(datain['T1nii'], 'converted')
         else:
             raise IOError('Disaster: no T1w image!')
 
-        # putput for the T1w in register with PET
-        ft1out = os.path.join(os.path.dirname(ft1w), 'T1w_r' + '.nii.gz')
-        # pext file fo rthe affine transform T1w->PET
-        faff = os.path.join(os.path.dirname(ft1w), fcomment + 'mr2pet_affine' + '.txt')
+        # > putput for the T1w in register with PET
+        # ft1out = os.path.join(os.path.dirname(ft1w), 'T1w_r' + '.nii.gz')
+        # > pext file fo rthe affine transform T1w->PET
+        # faff = os.path.join(os.path.dirname(ft1w), fcomment + 'mr2pet_affine' + '.txt')
         # time.strftime('%d%b%y_%H.%M',time.gmtime())
         # > call the registration routine
-        if os.path.isfile(Cnt['REGPATH']):
-            cmd = [
-                Cnt['REGPATH'], '-ref', recute.fpet, '-flo', ft1w, '-rigOnly', '-speeeeed', '-aff',
-                faff, '-res', ft1out]
-            if log.getEffectiveLevel() > logging.INFO:
-                cmd.append('-voff')
-            run(cmd)
-        else:
-            raise IOError('Path to registration executable is incorrect!')
+        # cmd = [
+        #     Cnt['REGPATH'], '-ref', recute.fpet, '-flo', ft1w, '-rigOnly', '-speeeeed', '-aff',
+        #     faff, '-res', ft1out]
+        # TODO: add `-pad 0`, drop `-inter 1`?
+        # pi=50, pv=50, smof=0, smor=0,
+        # maxit=5,
+        regdct = nimpa.affine_niftyreg(recute.fpet, ft1w, rigOnly=True, speed=True,
+                                       outpath=os.path.dirname(ft1w),
+                                       fname_aff=fcomment + 'mr2pet_affine' + '.txt',
+                                       omp=multiprocessing.cpu_count() // 2,
+                                       verbose=log.getEffectiveLevel() <= logging.INFO)
 
         # pet the pCT mu-map with the above faff
-        pmudic = pct_mumap(datain, txLUT_, axLUT_, Cnt, faff=faff, fpet=recute.fpet,
+        pmudic = pct_mumap(datain, txLUT_, axLUT_, Cnt, faff=regdct['faff'], fpet=recute.fpet,
                            fcomment=fcomment)
         mup = pmudic['im']
 
@@ -1319,3 +1130,177 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         muh = muh[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
         muo = muo[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
         return [muh, muo]
+
+
+# # ================================================================================
+# # PSEUDO CT MU-MAP
+# # --------------------------------------------------------------------------------
+
+# def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac', faff='', fpet='',
+#               fcomment='', outpath='', store_npy=False, store=False, verbose=False):
+#     '''
+#     GET THE MU-MAP from pCT IMAGE (which is in T1w space)
+#     * the mu-map will be registered to PET which will be reconstructed for time frame t0-t1
+#     * it f0 and t1 are not given the whole LM dataset will be reconstructed
+#     * the reconstructed PET can be attenuation and scatter corrected or NOT using petopt
+#     '''
+#     if hst is None:
+#         hst = []
+
+#     # constants, transaxial and axial LUTs are extracted
+#     Cnt = scanner_params['Cnt']
+
+#     if not os.path.isfile(faff):
+#         from niftypet.nipet.prj import mmrrec
+
+#         # histogram the list data if needed
+#         if not hst:
+#             from niftypet.nipet.lm import mmrhist
+#             hst = mmrhist(datain, scanner_params, t0=t0, t1=t1)
+
+#     # get hardware mu-map
+#     if datain.get("hmumap", "").endswith(".npz") and os.path.isfile(datain["hmumap"]):
+#         muh = np.load(datain["hmumap"], allow_pickle=True)["hmu"]
+#         (log.info if verbose else log.debug)('loaded hardware mu-map from file:\n{}'.format(
+#             datain['hmumap']))
+#     elif outpath:
+#         hmupath = os.path.join(outpath, "mumap-hdw", "hmumap.npz")
+#         if os.path.isfile(hmupath):
+#             muh = np.load(hmupath, allow_pickle=True)["hmu"]
+#             datain['hmumap'] = hmupath
+#         else:
+#             raise IOError('Invalid path to the hardware mu-map')
+#     else:
+#         log.error('The hardware mu-map is required first.')
+#         raise IOError('Could not find the hardware mu-map!')
+
+#     if not {'MRT1W#', 'T1nii', 'T1bc'}.intersection(datain):
+#         log.error('no MR T1w images required for co-registration!')
+#         raise IOError('Missing MR data')
+#     # ----------------------------------
+
+#     # output dictionary
+#     mu_dct = {}
+#     if not os.path.isfile(faff):
+#         # first recon pet to get the T1 aligned to it
+#         if petopt == 'qnt':
+#             # ---------------------------------------------
+#             # OPTION 1 (quantitative recon with all corrections using MR-based mu-map)
+#             # get UTE object mu-map (may not be in register with the PET data)
+#             mudic = obj_mumap(datain, Cnt)
+#             muo = mudic['im']
+#             # reconstruct PET image with UTE mu-map to which co-register T1w
+#             recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=3, itr=itr,
+#                                     fwhm=0., fcomment=fcomment + '_qntUTE',
+#                                     outpath=os.path.join(outpath, 'PET',
+#                                                          'positioning'), store_img=True)
+#         elif petopt == 'nac':
+#             # ---------------------------------------------
+#             # OPTION 2 (recon without any corrections for scatter and attenuation)
+#             # reconstruct PET image with UTE mu-map to which co-register T1w
+#             muo = np.zeros(muh.shape, dtype=muh.dtype)
+#             recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
+#                                     fwhm=0., fcomment=fcomment + '_NAC',
+#                                     outpath=os.path.join(outpath, 'PET',
+#                                                          'positioning'), store_img=True)
+#         elif petopt == 'ac':
+#             # ---------------------------------------------
+#             # OPTION 3 (recon with attenuation correction only but no scatter)
+#             # reconstruct PET image with UTE mu-map to which co-register T1w
+#             mudic = obj_mumap(datain, Cnt, outpath=outpath)
+#             muo = mudic['im']
+#             recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
+#                                     fwhm=0., fcomment=fcomment + '_AC',
+#                                     outpath=os.path.join(outpath, 'PET',
+#                                                          'positioning'), store_img=True)
+
+#         fpet = recout.fpet
+#         mu_dct['fpet'] = fpet
+
+#         # ------------------------------
+#         # get the affine transformation
+#         ft1w = nimpa.pick_t1w(datain)
+#         try:
+#             regdct = nimpa.coreg_spm(fpet, ft1w,
+#                                      outpath=os.path.join(outpath, 'PET', 'positioning'))
+#         except Exception:
+#             regdct = nimpa.affine_niftyreg(
+#                 fpet,
+#                 ft1w,
+#                 outpath=os.path.join(outpath, 'PET', 'positioning'), # pcomment=fcomment,
+#                 executable=Cnt['REGPATH'],
+#                 omp=multiprocessing.cpu_count() // 2,
+#                 rigOnly=True,
+#                 affDirect=False,
+#                 maxit=5,
+#                 speed=True,
+#                 pi=50,
+#                 pv=50,
+#                 smof=0,
+#                 smor=0,
+#                 rmsk=True,
+#                 fmsk=True,
+#                 rfwhm=15.,                                           # pillilitres
+#                 rthrsh=0.05,
+#                 ffwhm=15.,                                           # pillilitres
+#                 fthrsh=0.05,
+#                 verbose=verbose)
+
+#         faff = regdct['faff']
+#         # ------------------------------
+
+#     # pCT file name
+#     if outpath == '':
+#         pctdir = os.path.dirname(datain['pCT'])
+#     else:
+#         pctdir = os.path.join(outpath, 'mumap-obj')
+#     mmraux.create_dir(pctdir)
+#     fpct = os.path.join(pctdir, 'pCT_r_tmp' + fcomment + '.nii.gz')
+
+#     # > call the resampling routine to get the pCT in place
+#     if os.path.isfile(Cnt['RESPATH']):
+#         cmd = [
+#             Cnt['RESPATH'], '-ref', fpet, '-flo', datain['pCT'], '-trans', faff, '-res', fpct,
+#             '-pad', '0']
+#         if log.getEffectiveLevel() > logging.INFO:
+#             cmd.append('-voff')
+#         run(cmd)
+#     else:
+#         log.error('path to resampling executable is incorrect!')
+#         raise IOError('Incorrect path to executable!')
+
+#     # get the NIfTI of the pCT
+#     nim = nib.load(fpct)
+#     A = nim.get_sform()
+#     pct = nim.get_fdata(dtype=np.float32)
+#     pct = pct[:, ::-1, ::-1]
+#     pct = np.transpose(pct, (2, 1, 0))
+#     # convert the HU units to mu-values
+#     mu = hu2mu(pct)
+#     # get rid of negatives
+#     mu[mu < 0] = 0
+
+#     # return image dictionary with the image itself and other parameters
+#     mu_dct['im'] = mu
+#     mu_dct['affine'] = A
+#     mu_dct['faff'] = faff
+
+#     if store:
+#         # now save to numpy array and NIfTI in this folder
+#         if outpath == '':
+#             pctumapdir = os.path.join(datain['corepath'], 'mumap-obj')
+#         else:
+#             pctumapdir = os.path.join(outpath, 'mumap-obj')
+#         mmraux.create_dir(pctumapdir)
+#         # > Numpy
+#         if store_npy:
+#             fnp = os.path.join(pctumapdir, "mumap-pCT.npz")
+#             np.savez(fnp, mu=mu, A=A)
+
+#         # > NIfTI
+#         fmu = os.path.join(pctumapdir, 'mumap-pCT' + fcomment + '.nii.gz')
+#         nimpa.array2nii(mu[::-1, ::-1, :], A, fmu)
+#         mu_dct['fim'] = fmu
+#         datain['mumapCT'] = fmu
+
+#     return mu_dct
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index df94e994..46f164b6 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -2,7 +2,6 @@
 import logging
 import os
 from numbers import Integral
-from subprocess import call
 from textwrap import dedent
 
 import numpy as np
@@ -47,6 +46,7 @@ def mmrchain(
     trim_interp=0,          # interpolation for upsampling used in PVC
     trim_memlim=True,       # reduced use of memory for machines
                             # with limited memory (slow though)
+    trim_store=True,
     pvcroi=None,            # ROI used for PVC.  If undefined no PVC
                             # is performed.
     pvcreg_tool='niftyreg', # the registration tool used in PVC
@@ -112,19 +112,14 @@ def mmrchain(
         elif (isinstance(frames[0], str) and frames[0] == 'def'
               and all(isinstance(t, list) and len(t) == 2 for t in frames[1:])):
             # get total time and list of all time frames
-            dfrms = dynamic_timings(frames)
-            t_frms = dfrms[1:]
-
+            t_frms = dynamic_timings(frames)['timings'][1:]
         # if 1D:
         elif all(isinstance(t, Integral) for t in frames):
             # get total time and list of all time frames
-            dfrms = dynamic_timings(frames)
-            t_frms = dfrms[1:]
-
+            t_frms = dynamic_timings(frames)['timings'][1:]
         else:
-            log.error('osemdyn: frames definitions are not given\
-                in the correct list format: 1D [15,15,30,30,...]\
-                or 2D list [[2,15], [2,30], ...]')
+            log.error('osemdyn: frames definitions are not given in the correct list format:'
+                      ' 1D [15,15,30,30,...] or 2D list [[2,15], [2,30], ...]')
     else:
         log.error(
             'provided dynamic frames definitions are incorrect (should be a list of definitions).')
@@ -334,16 +329,8 @@ def mmrchain(
             nimpa.create_dir(fmureg)
             # the converted nii image resample to the reference size
             fmu = os.path.join(fmureg, 'mumap_dyn_frm' + str(ifrm) + fcomment + '.nii.gz')
-            # command for resampling
-            if os.path.isfile(Cnt['RESPATH']):
-                cmd = [
-                    Cnt['RESPATH'], '-ref', fmuref, '-flo', muod['fim'], '-trans', faff_frms[ifrm],
-                    '-res', fmu, '-pad', '0']
-                if log.getEffectiveLevel() > log.INFO:
-                    cmd.append('-voff')
-                call(cmd)
-            else:
-                raise IOError('Incorrect path to NiftyReg (resampling) executable.')
+            # TODO: add `-pad 0`, drop `-inter 1`?
+            nimpa.resample_niftyreg(fmuref, muod['fim'], faff_frms[ifrm], fimout=fmu)
             # get the new mu-map from the just resampled file
             muodct = nimpa.getnii(fmu, output='all')
             muo = muodct['im']
@@ -408,6 +395,7 @@ def mmrchain(
         # trim PET and upsample
         petu = nimpa.imtrimup(dynim, affine=image_affine(datain, Cnt), scale=trim_scale,
                               int_order=trim_interp, outpath=petimg, fname=fnm, fcomment=fcomment,
+                              fcomment_pfx=fout + '_', store_img=trim_store,
                               store_img_intrmd=store_img_intrmd, memlim=trim_memlim,
                               verbose=log.getEffectiveLevel())
 
@@ -539,11 +527,11 @@ def mmrchain(
             if fout is None:
                 fpetu = os.path.join(
                     pettrim,
-                    os.path.basename(fpet) + f'_trimmed-upsampled-scale-{trim_scale}')
+                    os.path.basename(fpet) + f'_recon-trimmed-upsampled-scale-{trim_scale}')
             else:
                 fpetu = os.path.join(
                     pettrim,
-                    os.path.basename(fout) + f'_trimmed-upsampled-scale-{trim_scale}')
+                    os.path.basename(fout) + f'_recon-trimmed-upsampled-scale-{trim_scale}')
             # in case of PVC
             if pvcroi:
                 # itertive Yang (iY) added to NIfTI descritoption
diff --git a/niftypet/nipet/lm/mmrhist.py b/niftypet/nipet/lm/mmrhist.py
index 20a99f3d..b347c058 100644
--- a/niftypet/nipet/lm/mmrhist.py
+++ b/niftypet/nipet/lm/mmrhist.py
@@ -132,7 +132,7 @@ def hist(
     pvs_sgtl = np.right_shift(hstout['pvs'], 8).astype(np.float32)
     pvs_crnl = np.bitwise_and(hstout['pvs'], 255).astype(np.float32)
 
-    cmass = Cnt['SO_VXZ'] * ndi.filters.gaussian_filter(hstout['mss'], cmass_sig, mode='mirror')
+    cmass = Cnt['SO_VXZ'] * ndi.gaussian_filter(hstout['mss'], cmass_sig, mode='mirror')
     log.debug(
         'centre of mass of axial radiodistribution (filtered with Gaussian of SD ={}):  COMPLETED.'
         .format(cmass_sig))
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 60b45d0d..54584220 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 from textwrap import dedent
 
+import cuvec as cu
 import numpy as np
 import pydicom as dcm
 from miutil.fdio import hasext
@@ -1022,7 +1023,7 @@ def get_dicoms(dfile, datain, Cnt):
         else:
             f0 = csahdr.find('RadionuclideCodeSequence')
             if f0 < 0:
-                print('w> could not find isotope name.  enter manually into Cnt[' 'ISOTOPE' ']')
+                log.warning("Could not find isotope name. Enter manually into Cnt['ISOTOPE']")
                 return None
             istp_coded = re.search(r'(?<=CodeValue:)\S*', csahdr[f0:f0 + 100]).group()
             if istp_coded == 'C-111A1': Cnt['ISOTOPE'] = 'F18'
@@ -1031,7 +1032,7 @@ def get_dicoms(dfile, datain, Cnt):
             elif istp_coded == 'C-128A2': Cnt['ISOTOPE'] = 'Ge68'
             elif istp_coded == 'C-131A3': Cnt['ISOTOPE'] = 'Ga68'
             else:
-                print('w> could not find isotope name.  enter manually into Cnt[' 'ISOTOPE' ']')
+                log.warning("Could not find isotope name. Enter manually into Cnt['ISOTOPE']")
                 return None
         # ---
 
@@ -1142,18 +1143,21 @@ def putgaps(s, txLUT, Cnt, sino_no=0):
     return sino.astype(s.dtype)
 
 
-def remgaps(sino, txLUT, Cnt):
-
+def remgaps(sino, txLUT, Cnt, output=None):
     # number of sino planes (2D sinos) depends on the span used
     nsinos = sino.shape[0]
 
-    # preallocate output sino without gaps, always in float
-    s = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
+    # preallocate output sino without gaps
+    if output is None:
+        output = cu.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
+    else:
+        assert output.shape == txLUT['Naw'], nsinos
+        assert output.dtype == np.dtype('float32')
     # fill the sino with gaps
-    mmr_auxe.rgaps(s, sino.astype(np.float32), txLUT, Cnt)
+    mmr_auxe.rgaps(output, cu.asarray(sino, dtype=np.float32), txLUT, Cnt)
 
     # return in the same data type as the input sino
-    return s.astype(sino.dtype)
+    return output.astype(sino.dtype)
 
 
 def mmrinit():
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 675edd8f..683155d4 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -44,7 +44,7 @@ def trnx_prj(scanner_params, sino=None, im=None):
 
 
 def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False,
-             fullsino_out=True, output=None):
+             fullsino_out=True, output=None, sync=True):
     """
     Calculate forward projection (a set of sinograms) for the provided input image.
     Arguments:
@@ -60,6 +60,7 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
             calculations (attenuation=True), the exponential of the negative of the integrated
             mu-values along LOR path is taken at the end.
         output(CuVec, optional) -- output sinogram.
+        sync(bool) -- whether to `cudaDeviceSynchronize()` after.
     """
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
     Cnt = scanner_params['Cnt']
@@ -121,7 +122,7 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
         assert sinog.shape == out_shape
         assert sinog.dtype == np.dtype('float32')
     # --------------------
-    petprj.fprj(sinog.cuvec, cu.asarray(ims).cuvec, txLUT, axLUT, isub, Cnt, att)
+    petprj.fprj(sinog, cu.asarray(ims), txLUT, axLUT, isub, Cnt, att, sync=sync)
     # --------------------
 
     # get the sinogram bins in a full sinogram if requested
@@ -143,7 +144,7 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
 # ------------------------------------------------------------------------
 
 
-def back_prj(sino, scanner_params, isub=ISUB_DEFAULT, dev_out=False):
+def back_prj(sino, scanner_params, isub=ISUB_DEFAULT, dev_out=False, output=None, sync=True):
     '''
     Calculate forward projection for the provided input image.
     Arguments:
@@ -152,6 +153,8 @@ def back_prj(sino, scanner_params, isub=ISUB_DEFAULT, dev_out=False):
             transaxial and axial look up tables (LUT).
         isub -- array of transaxial indices of all sinograms (angles x bins) used for subsets;
             when the first element is negative, all transaxial bins are used (as in pure EM-ML).
+        output(CuVec, optional) -- output image.
+        sync(bool) -- whether to `cudaDeviceSynchronize()` after.
     '''
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
     Cnt = scanner_params['Cnt']
@@ -199,10 +202,17 @@ def back_prj(sino, scanner_params, isub=ISUB_DEFAULT, dev_out=False):
         nvz = Cnt['rSZ_IMZ']
     else:
         nvz = Cnt['SZ_IMZ']
-    bimg = cu.zeros((Cnt['SZ_IMX'], Cnt['SZ_IMY'], nvz), dtype=np.float32)
+
+    out_shape = Cnt['SZ_IMX'], Cnt['SZ_IMY'], nvz
+    if output is None:
+        bimg = cu.zeros(out_shape, dtype=np.float32)
+    else:
+        bimg = cu.asarray(output)
+        assert bimg.shape == out_shape
+        assert bimg.dtype == np.dtype('float32')
 
     # > run back-projection
-    petprj.bprj(bimg.cuvec, cu.asarray(sinog).cuvec, txLUT, axLUT, isub, Cnt)
+    petprj.bprj(bimg, cu.asarray(sinog), txLUT, axLUT, isub, Cnt, sync=sync)
 
     if not dev_out:
         # > change from GPU optimised image dimensions to the standard Siemens shape
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 0f9d0d57..a92b16eb 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -105,6 +105,7 @@ def psf_config(psf, Cnt):
         [x, y, z]: list or Numpy array of separate FWHM of the PSF for each direction
         ndarray: 3 x 2*RSZ_PSF_KRNL+1 Numpy array directly defining the kernel in each direction
     '''
+
     def _config(fwhm3, check_len=True):
         # resolution modelling by custom kernels
         if check_len:
@@ -222,18 +223,16 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         asng = np.ones(psng.shape, dtype=np.float32)
     else:
         # > check if the attenuation sino is given as an array
-        if isinstance(attnsino, np.ndarray) \
-                and attnsino.shape==(Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']):
+        if isinstance(attnsino, np.ndarray) and attnsino.shape == (Cnt['NSN11'], Cnt['NSANGLES'],
+                                                                   Cnt['NSBINS']):
             asng = mmraux.remgaps(attnsino, txLUT, Cnt)
             log.info('using provided attenuation factor sinogram')
-        elif isinstance(attnsino, np.ndarray) \
-                and attnsino.shape==(Cnt['Naw'], Cnt['NSN11']):
+        elif isinstance(attnsino, np.ndarray) and attnsino.shape == (Cnt['Naw'], Cnt['NSN11']):
             asng = attnsino
             log.info('using provided attenuation factor sinogram')
         else:
             asng = cu.zeros(psng.shape, dtype=np.float32)
-            petprj.fprj(asng.cuvec,
-                        cu.asarray(mus).cuvec, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt,
+            petprj.fprj(asng, cu.asarray(mus), txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt,
                         1)
     # > combine attenuation and normalisation
     ansng = asng * nsng
@@ -242,8 +241,8 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # ========================================================================
     # Randoms
     # -------------------------------------------------------------------------
-    if isinstance(randsino, np.ndarray) \
-            and randsino.shape==(Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']):
+    if isinstance(randsino,
+                  np.ndarray) and randsino.shape == (Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']):
         rsino = randsino
         rsng = mmraux.remgaps(randsino, txLUT, Cnt)
     else:
@@ -294,8 +293,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         sinoTIdx[n, 0] = Nprj
         sinoTIdx[n, 1:], s = get_subsets14(n, scanner_params)
         # sensitivity image
-        petprj.bprj(tmpsens.cuvec,
-                    cu.asarray(ansng[sinoTIdx[n, 1:], :]).cuvec, txLUT, axLUT, sinoTIdx[n, 1:],
+        petprj.bprj(tmpsens, cu.asarray(ansng[sinoTIdx[n, 1:], :]), txLUT, axLUT, sinoTIdx[n, 1:],
                     Cnt)
         imgsens[n] = tmpsens
     del tmpsens
@@ -433,8 +431,8 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     im_smo = None
     fsmo = None
     if fwhm > 0:
-        im_smo = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, voxsize=Cnt['SZ_VOXY'] * 10),
-                                             mode='mirror')
+        im_smo = ndi.gaussian_filter(im, fwhm2sig(fwhm, voxsize=Cnt['SZ_VOXY'] * 10),
+                                     mode='mirror')
 
         if store_img:
             fsmo = fpet.split('.nii.gz')[0] + '_smo-' + str(fwhm).replace('.', '-') + 'mm.nii.gz'
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index 2332fa85..f84c0208 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -43,8 +43,7 @@ def simulate_sino(
         raise ValueError('The shapes of the PET and CT images are inconsistent.')
 
     if simulate_3d:
-        if petim.ndim != 3 \
-                or petim.shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
+        if petim.ndim != 3 or petim.shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
             raise ValueError('The input image shape does not match the scanner image size.')
         if petim.max() > 200:
             log.warning('the PET image may have too large intensities for robust simulation.')
@@ -152,8 +151,7 @@ def simulate_recon(
     psfkernel = mmrrec.psf_config(psf, Cnt)
 
     if simulate_3d:
-        if ctim.ndim!=3 \
-                or ctim.shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
+        if ctim.ndim != 3 or ctim.shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
             raise ValueError('The CT/mu-map image does not match the scanner image shape.')
     else:
         # > 2D case with reduced rings
@@ -259,8 +257,7 @@ def simulate_recon(
             sinoTIdx[n, 1:], s = mmrrec.get_subsets14(n, scanner_params)
 
             # > sensitivity image
-            petprj.bprj(tmpsim.cuvec,
-                        cu.asarray(attsino[sinoTIdx[n, 1:], :]).cuvec, txLUT, axLUT,
+            petprj.bprj(tmpsim, cu.asarray(attsino[sinoTIdx[n, 1:], :]), txLUT, axLUT,
                         sinoTIdx[n, 1:], Cnt)
             sim[n] = tmpsim
         del tmpsim
diff --git a/niftypet/nipet/prj/src/prj_module.cu b/niftypet/nipet/prj/src/prj_module.cu
index 9309fea7..b54e7362 100644
--- a/niftypet/nipet/prj/src/prj_module.cu
+++ b/niftypet/nipet/prj/src/prj_module.cu
@@ -28,16 +28,16 @@ Copyrights: 2019
 
 //--- Available functions
 static PyObject *trnx_prj(PyObject *self, PyObject *args);
-static PyObject *frwd_prj(PyObject *self, PyObject *args);
-static PyObject *back_prj(PyObject *self, PyObject *args);
+static PyObject *frwd_prj(PyObject *self, PyObject *args, PyObject *kwargs);
+static PyObject *back_prj(PyObject *self, PyObject *args, PyObject *kwargs);
 static PyObject *osem_rec(PyObject *self, PyObject *args);
 //---
 
 //> Module Method Table
 static PyMethodDef petprj_methods[] = {
     {"tprj", trnx_prj, METH_VARARGS, "Transaxial projector."},
-    {"fprj", frwd_prj, METH_VARARGS, "PET forward projector."},
-    {"bprj", back_prj, METH_VARARGS, "PET back projector."},
+    {"fprj", (PyCFunction)frwd_prj, METH_VARARGS | METH_KEYWORDS, "PET forward projector."},
+    {"bprj", (PyCFunction)back_prj, METH_VARARGS | METH_KEYWORDS, "PET back projector."},
     {"osem", osem_rec, METH_VARARGS, "OSEM reconstruction of PET data."},
     {NULL, NULL, 0, NULL} // Sentinel
 };
@@ -229,7 +229,7 @@ static PyObject *trnx_prj(PyObject *self, PyObject *args) {
 // F O R W A R D   P R O J E C T O R
 //------------------------------------------------------------------------------
 
-static PyObject *frwd_prj(PyObject *self, PyObject *args) {
+static PyObject *frwd_prj(PyObject *self, PyObject *args, PyObject *kwargs) {
   // Structure of constants
   Cnst Cnt;
 
@@ -243,21 +243,26 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   PyObject *o_txLUT;
 
   // input image to be forward projected  (reshaped for GPU execution)
-  PyCuVec<float> *o_im;
+  PyCuVec<float> *o_im = NULL;
 
   // subsets for OSEM, first the default
   PyObject *o_subs;
 
   // output projection sino
-  PyCuVec<float> *o_prjout;
+  PyCuVec<float> *o_prjout = NULL;
 
   // flag for attenuation factors to be found based on mu-map; if 0 normal emission projection is
   // used
   int att;
+
+  bool SYNC = true; // whether to ensure deviceToHost copy on return
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOOOOi", (PyObject **)&o_prjout, (PyObject **)&o_im, &o_txLUT,
-                        &o_axLUT, &o_subs, &o_mmrcnst, &att))
+  static const char *kwds[] = {"sino", "im",  "txLUT", "axLUT", "subs",
+                               "cnst", "att", "sync",  NULL};
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&O&OOOOi|b", (char **)kwds, &asPyCuVec_f,
+                                   &o_prjout, &asPyCuVec_f, &o_im, &o_txLUT, &o_axLUT, &o_subs,
+                                   &o_mmrcnst, &att, &SYNC))
     return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -310,8 +315,7 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
 
   /* If that didn't work, throw an exception. */
   if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
-      p_aw2ali == NULL || p_s2c == NULL || !o_im || p_crs == NULL || p_subs == NULL || !o_prjout ||
-      p_li2rng == NULL) {
+      p_aw2ali == NULL || p_s2c == NULL || p_crs == NULL || p_subs == NULL || p_li2rng == NULL) {
     // axLUTs
     Py_XDECREF(p_li2rno);
     Py_XDECREF(p_li2sn);
@@ -372,8 +376,35 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
 
   //<><><><><><><<><><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
-  gpu_fprj(o_prjout->vec.data(), o_im->vec.data(), li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs,
-           Nprj, Naw, N0crs, Cnt, att);
+  //--- TRANSAXIAL COMPONENT
+  float4 *d_crs;
+  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+
+  short2 *d_s2c;
+  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
+
+  float *d_tt;
+  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
+
+  unsigned char *d_tv;
+  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
+
+  // array of subset projection bins
+  int *d_subs;
+  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
+
+  gpu_fprj(o_prjout->vec.data(), o_im->vec.data(), li2rng, li2sn, li2nos, d_s2c, aw2ali, d_crs,
+           d_subs, d_tt, d_tv, Nprj, Naw, Cnt, att, SYNC);
+
+  HANDLE_ERROR(cudaFree(d_subs));
+  HANDLE_ERROR(cudaFree(d_tv));
+  HANDLE_ERROR(cudaFree(d_tt));
+  HANDLE_ERROR(cudaFree(d_s2c));
+  HANDLE_ERROR(cudaFree(d_crs));
   //<><><><><><><><<><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
 
   // Clean up
@@ -396,7 +427,7 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
 //==============================================================================
 // B A C K   P R O J E C T O R
 //------------------------------------------------------------------------------
-static PyObject *back_prj(PyObject *self, PyObject *args) {
+static PyObject *back_prj(PyObject *self, PyObject *args, PyObject *kwargs) {
 
   // Structure of constants
   Cnst Cnt;
@@ -411,18 +442,22 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   PyObject *o_txLUT;
 
   // sino to be back projected to image (both reshaped for GPU execution)
-  PyCuVec<float> *o_sino;
+  PyCuVec<float> *o_sino = NULL;
 
   // subsets for OSEM, first the default
   PyObject *o_subs;
 
   // output backprojected image
-  PyCuVec<float> *o_bimg;
+  PyCuVec<float> *o_bimg = NULL;
 
+  bool SYNC = true; // whether to ensure deviceToHost copy on return
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOOOO", (PyObject **)&o_bimg, (PyObject **)&o_sino, &o_txLUT,
-                        &o_axLUT, &o_subs, &o_mmrcnst))
+
+  static const char *kwds[] = {"bimg", "sino", "txLUT", "axLUT", "subs", "cnst", "sync", NULL};
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&O&OOOO|b", (char **)kwds, &asPyCuVec_f,
+                                   &o_bimg, &asPyCuVec_f, &o_sino, &o_txLUT, &o_axLUT, &o_subs,
+                                   &o_mmrcnst, &SYNC))
     return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -448,7 +483,6 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   // transaxial sino LUTs:
   PyObject *pd_crs = PyDict_GetItemString(o_txLUT, "crs");
   PyObject *pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
-  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
 
   //-- get the arrays from the dictionaries
   // axLUTs
@@ -461,12 +495,10 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
 
   // sino to crystal, crystals
-  PyArrayObject *p_s2c = NULL, *p_crs = NULL, *p_aw2ali = NULL;
+  PyArrayObject *p_s2c = NULL, *p_crs = NULL;
   p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
   p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
 
-  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
   // subsets if using e.g., OSEM
   PyArrayObject *p_subs = NULL;
   p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
@@ -474,8 +506,7 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
 
   /* If that didn't work, throw an exception. */
   if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
-      p_aw2ali == NULL || p_s2c == NULL || !o_sino || p_crs == NULL || p_subs == NULL ||
-      p_li2rng == NULL || !o_bimg) {
+      p_s2c == NULL || p_crs == NULL || p_subs == NULL || p_li2rng == NULL) {
     // axLUTs
     Py_XDECREF(p_li2rno);
     Py_XDECREF(p_li2sn);
@@ -483,8 +514,6 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
     Py_XDECREF(p_li2nos);
     Py_XDECREF(p_li2rng);
 
-    // 2D sino LUT
-    Py_XDECREF(p_aw2ali);
     // sino 2 crystals
     Py_XDECREF(p_s2c);
     Py_XDECREF(p_crs);
@@ -496,7 +525,6 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
 
   int *subs_ = (int *)PyArray_DATA(p_subs);
   short *s2c = (short *)PyArray_DATA(p_s2c);
-  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
   short *li2sn;
   if (Cnt.SPN == 11) {
     li2sn = (short *)PyArray_DATA(p_li2sn);
@@ -510,7 +538,6 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   int Nprj = PyArray_DIM(p_subs, 0);
   int N0crs = PyArray_DIM(p_crs, 0);
   int N1crs = PyArray_DIM(p_crs, 1);
-  int Naw = PyArray_DIM(p_aw2ali, 0);
 
   int *subs;
   if (subs_[0] == -1) {
@@ -534,8 +561,34 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
 
   //<><><<><><><><><><><><><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
-  gpu_bprj(o_bimg->vec.data(), o_sino->vec.data(), li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs,
-           Nprj, Naw, N0crs, Cnt);
+  float4 *d_crs;
+  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+
+  short2 *d_s2c;
+  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
+
+  float *d_tt;
+  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
+
+  unsigned char *d_tv;
+  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
+
+  // array of subset projection bins
+  int *d_subs;
+  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
+
+  gpu_bprj(o_bimg->vec.data(), o_sino->vec.data(), li2rng, li2sn, li2nos, d_s2c, d_crs, d_subs,
+           d_tt, d_tv, Nprj, Cnt, SYNC);
+
+  HANDLE_ERROR(cudaFree(d_subs));
+  HANDLE_ERROR(cudaFree(d_tv));
+  HANDLE_ERROR(cudaFree(d_tt));
+  HANDLE_ERROR(cudaFree(d_s2c));
+  HANDLE_ERROR(cudaFree(d_crs));
   //<><><><><><><><><><><>><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
 
   // Clean up
@@ -544,7 +597,6 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   Py_DECREF(p_li2sn);
   Py_DECREF(p_li2sn1);
   Py_DECREF(p_li2nos);
-  Py_DECREF(p_aw2ali);
   Py_DECREF(p_s2c);
   Py_DECREF(p_crs);
   Py_DECREF(p_subs);
@@ -619,7 +671,6 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   // transaxial sino LUTs:
   PyObject *pd_crs = PyDict_GetItemString(o_txLUT, "crs");
   PyObject *pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
-  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
 
   //-- get the arrays from the dictionaries
   // output back-projection image
@@ -659,10 +710,6 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, NPY_ARRAY_IN_ARRAY);
   p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
 
-  // 2D sino index LUT:
-  PyArrayObject *p_aw2ali = NULL;
-  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
   // sino to crystal, crystals
   PyArrayObject *p_s2c = NULL, *p_crs = NULL;
   p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
@@ -673,7 +720,7 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   if (p_imgout == NULL || p_rcnmsk == NULL || p_subs == NULL || p_psng == NULL || p_rsng == NULL ||
       p_ssng == NULL || p_nsng == NULL || p_asng == NULL || p_imgsens == NULL ||
       p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
-      p_aw2ali == NULL || p_s2c == NULL || p_crs == NULL || p_krnl == NULL) {
+      p_s2c == NULL || p_crs == NULL || p_krnl == NULL) {
     //> output image
     PyArray_DiscardWritebackIfCopy(p_imgout);
     Py_XDECREF(p_imgout);
@@ -699,8 +746,6 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
     Py_XDECREF(p_li2sn);
     Py_XDECREF(p_li2sn1);
     Py_XDECREF(p_li2nos);
-    //> 2D sinogram LUT
-    Py_XDECREF(p_aw2ali);
     //> sinogram to crystal LUTs
     Py_XDECREF(p_s2c);
     Py_XDECREF(p_crs);
@@ -743,7 +788,6 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   float *li2rng = (float *)PyArray_DATA(p_li2rng);
   float *crs = (float *)PyArray_DATA(p_crs);
   short *s2c = (short *)PyArray_DATA(p_s2c);
-  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
 
   int N0crs = PyArray_DIM(p_crs, 0);
   int N1crs = PyArray_DIM(p_crs, 1);
@@ -787,7 +831,6 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   Py_DECREF(p_li2sn);
   Py_DECREF(p_li2sn1);
   Py_DECREF(p_li2nos);
-  Py_DECREF(p_aw2ali);
   Py_DECREF(p_s2c);
   Py_DECREF(p_crs);
 
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index 8511427b..8ada3d4d 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -187,35 +187,13 @@ __global__ void bprj_oblq(const float *sino, float *im, const float *tt, const u
 }
 
 //--------------------------------------------------------------------------------------------------
-void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
-              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt) {
-
+void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2nos, short2 *d_s2c,
+              float4 *d_crs, int *d_subs, float *d_tt, unsigned char *d_tv, int Nprj, Cnst Cnt,
+              bool _sync) {
   int dev_id;
   cudaGetDevice(&dev_id);
   if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
-  //--- TRANSAXIAL COMPONENT
-  float4 *d_crs;
-  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
-  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
-
-  short2 *d_s2c;
-  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
-  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
-
-  float *d_tt;
-  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
-
-  unsigned char *d_tv;
-  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
-  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
-
-  // array of subset projection bins
-  int *d_subs;
-  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
-  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
-  //---
-
   //-----------------------------------------------------------------
   // RINGS: either all or a subset of rings can be used for fast calc.
   //-----------------------------------------------------------------
@@ -266,9 +244,11 @@ void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2
   cudaMemcpyToSymbol(c_li2nos, li2nos, nil2r_c * sizeof(char));
 
   cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, 0);
+  if (_sync) {
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start, 0);
+  }
 
   if (Cnt.LOG <= LOGDEBUG) printf("i> calculating image through back projection... ");
 
@@ -313,20 +293,18 @@ void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2
     if (Cnt.LOG <= LOGDEBUG) printf("i> reduced the axial (z) image size to %d\n", nvz);
   }
 
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-  // cudaDeviceSynchronize();
-  float elapsedTime;
-  cudaEventElapsedTime(&elapsedTime, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
-
-  HANDLE_ERROR(cudaFree(d_tt));
-  HANDLE_ERROR(cudaFree(d_tv));
-  HANDLE_ERROR(cudaFree(d_subs));
-  HANDLE_ERROR(cudaFree(d_crs));
-  HANDLE_ERROR(cudaFree(d_s2c));
+  if (_sync) {
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    // cudaDeviceSynchronize();
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  } else {
+    if (Cnt.LOG <= LOGDEBUG) printf("DONE.\n");
+  }
 }
 
 //=======================================================================
diff --git a/niftypet/nipet/prj/src/prjb.h b/niftypet/nipet/prj/src/prjb.h
index d03b4e19..0c018965 100644
--- a/niftypet/nipet/prj/src/prjb.h
+++ b/niftypet/nipet/prj/src/prjb.h
@@ -7,16 +7,12 @@
 #define PRJB_H
 
 // used from Python
-void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
-              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt);
+void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2nos, short2 *d_s2c,
+              float4 *d_crs, int *d_subs, float *d_tt, unsigned char *d_tv, int Nprj, Cnst Cnt,
+              bool _sync = true);
 
 // to be used within CUDA C reconstruction
-void rec_bprj(float *d_bimg, float *d_sino, int *sub, int Nprj,
-
-              float *d_tt, unsigned char *d_tv,
-
-              float *li2rng, short *li2sn, char *li2nos,
-
-              Cnst Cnt);
+void rec_bprj(float *d_bimg, float *d_sino, int *sub, int Nprj, float *d_tt, unsigned char *d_tv,
+              float *li2rng, short *li2sn, char *li2nos, Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index 11434bc8..5fe70325 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -206,35 +206,13 @@ __global__ void fprj_oblq(float *sino, const float *im, const float *tt, const u
 }
 
 //--------------------------------------------------------------------------------------------------
-void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
-              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt,
-              char att) {
+void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2nos, short2 *d_s2c,
+              int *aw2ali, float4 *d_crs, int *d_subs, float *d_tt, unsigned char *d_tv, int Nprj,
+              int Naw, Cnst Cnt, char att, bool _sync) {
   int dev_id;
   cudaGetDevice(&dev_id);
   if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
-  //--- TRANSAXIAL COMPONENT
-  float4 *d_crs;
-  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
-  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
-
-  short2 *d_s2c;
-  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
-  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
-
-  float *d_tt;
-  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
-
-  unsigned char *d_tv;
-  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
-  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
-
-  // array of subset projection bins
-  int *d_subs;
-  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
-  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
-  //---
-
   //-----------------------------------------------------------------
   // RINGS: either all or a subset of rings can be used (span-1 feature only)
   //-----------------------------------------------------------------
@@ -289,24 +267,16 @@ void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2no
     HANDLE_ERROR(cudaGetLastError());
   }
 
-  // float *d_li2rng;  HANDLE_ERROR( cudaMalloc(&d_li2rng, N0li*N1li*sizeof(float)) );
-  // HANDLE_ERROR( cudaMemcpy( d_li2rng, li2rng, N0li*N1li*sizeof(float), cudaMemcpyHostToDevice)
-  // );
-
-  // int *d_li2sn;  HANDLE_ERROR(cudaMalloc(&d_li2sn, N0li*N1li*sizeof(int)) );
-  // HANDLE_ERROR( cudaMemcpy( d_li2sn, li2sn, N0li*N1li*sizeof(int), cudaMemcpyHostToDevice) );
-
-  // int *d_li2nos;  HANDLE_ERROR( cudaMalloc(&d_li2nos, N1li*sizeof(int)) );
-  // HANDLE_ERROR( cudaMemcpy( d_li2nos, li2nos, N1li*sizeof(int), cudaMemcpyHostToDevice) );
-
   cudaMemcpyToSymbol(c_li2rng, li2rng, nil2r_c * sizeof(float2));
   cudaMemcpyToSymbol(c_li2sn, li2sn, nil2r_c * sizeof(short2));
   cudaMemcpyToSymbol(c_li2nos, li2nos, nil2r_c * sizeof(char));
 
   cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, 0);
+  if (_sync) {
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start, 0);
+  }
 
   if (Cnt.LOG <= LOGDEBUG) printf("i> calculating sinograms via forward projection...");
 
@@ -333,21 +303,20 @@ void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2no
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-  // cudaDeviceSynchronize();
-  float elapsedTime;
-  cudaEventElapsedTime(&elapsedTime, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (_sync) {
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    // cudaDeviceSynchronize();
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  } else {
+    if (Cnt.LOG <= LOGDEBUG) printf("DONE.\n");
+  }
 
   if (nvz < SZ_IMZ) HANDLE_ERROR(cudaFree(d_im));
-  HANDLE_ERROR(cudaFree(d_tt));
-  HANDLE_ERROR(cudaFree(d_tv));
-  HANDLE_ERROR(cudaFree(d_subs));
-  HANDLE_ERROR(cudaFree(d_crs));
-  HANDLE_ERROR(cudaFree(d_s2c));
 }
 
 //=======================================================================
diff --git a/niftypet/nipet/prj/src/prjf.h b/niftypet/nipet/prj/src/prjf.h
index a11512cb..08a66294 100644
--- a/niftypet/nipet/prj/src/prjf.h
+++ b/niftypet/nipet/prj/src/prjf.h
@@ -6,9 +6,9 @@
 #ifndef PRJF_H
 #define PRJF_H
 
-void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
-              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt,
-              char att);
+void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2nos, short2 *d_s2c,
+              int *aw2ali, float4 *d_crs, int *d_subs, float *d_tt, unsigned char *d_tv, int Nprj,
+              int Naw, Cnst Cnt, char att, bool _sync = true);
 
 void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
 
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 3245b3bc..8f5dfebf 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -445,8 +445,8 @@ def vsm(
 
     # -smooth for defining the sino scatter only regions
     if fwhm_input > 0.:
-        mu_sctonly = ndi.filters.gaussian_filter(mmrimg.convert2dev(muo, Cnt),
-                                                 fwhm2sig(fwhm_input, Cnt), mode='mirror')
+        mu_sctonly = ndi.gaussian_filter(mmrimg.convert2dev(muo, Cnt), fwhm2sig(fwhm_input, Cnt),
+                                         mode='mirror')
     else:
         mu_sctonly = muo
 
@@ -466,8 +466,8 @@ def vsm(
 
     # > smooth before scaling/down-sampling the mu-map and emission images
     if fwhm_input > 0.:
-        muim = ndi.filters.gaussian_filter(muo + muh, fwhm2sig(fwhm_input, Cnt), mode='mirror')
-        emim = ndi.filters.gaussian_filter(em, fwhm2sig(fwhm_input, Cnt), mode='mirror')
+        muim = ndi.gaussian_filter(muo + muh, fwhm2sig(fwhm_input, Cnt), mode='mirror')
+        emim = ndi.gaussian_filter(em, fwhm2sig(fwhm_input, Cnt), mode='mirror')
     else:
         muim = muo + muh
         emim = em
@@ -478,7 +478,7 @@ def vsm(
     # -smooth the mu-map for mask creation.
     # the mask contains voxels for which attenuation ray LUT is found.
     if fwhm_input > 0.:
-        smomu = ndi.filters.gaussian_filter(muim, fwhm2sig(fwhm_input, Cnt), mode='mirror')
+        smomu = ndi.gaussian_filter(muim, fwhm2sig(fwhm_input, Cnt), mode='mirror')
         mumsk = np.int8(smomu > 0.003)
     else:
         mumsk = np.int8(muim > 0.001)
@@ -571,8 +571,7 @@ def vsm(
     currentspan = Cnt['SPN']
     Cnt['SPN'] = 1
     atto = cu.zeros((txLUT['Naw'], Cnt['NSN1']), dtype=np.float32)
-    petprj.fprj(atto.cuvec,
-                cu.asarray(mu_sctonly).cuvec, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
+    petprj.fprj(atto, cu.asarray(mu_sctonly), txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
     atto = mmraux.putgaps(atto, txLUT, Cnt)
     # --------------------------------------------------------------
     # > get norm components setting the geometry and axial to ones
@@ -609,9 +608,9 @@ def vsm(
         eim = eim[:, ::-1, ::-1]
         eim = np.transpose(eim, (2, 1, 0))
 
-        em_sctonly = ndi.filters.gaussian_filter(eim, fwhm2sig(.6, Cnt), mode='mirror')
+        em_sctonly = ndi.gaussian_filter(eim, fwhm2sig(.6, Cnt), mode='mirror')
         msk = np.float32(em_sctonly > 0.07 * np.max(em_sctonly))
-        msk = ndi.filters.gaussian_filter(msk, fwhm2sig(.6, Cnt), mode='mirror')
+        msk = ndi.gaussian_filter(msk, fwhm2sig(.6, Cnt), mode='mirror')
         msk = np.float32(msk > 0.01)
         msksn = mmrprj.frwd_prj(msk, txLUT, axLUT, Cnt)
 
diff --git a/niftypet/nipet/src/aux_module.cu b/niftypet/nipet/src/aux_module.cu
index 3af8972d..d6be873b 100644
--- a/niftypet/nipet/src/aux_module.cu
+++ b/niftypet/nipet/src/aux_module.cu
@@ -10,12 +10,13 @@ Copyrights: 2018
 #define PY_SSIZE_T_CLEAN
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION // NPY_API_VERSION
 
+#include "Python.h"
 #include "auxmath.h"
 #include "def.h"
 #include "norm.h"
+#include "numpy/arrayobject.h"
+#include "pycuvec.cuh"
 #include "scanner_0.h"
-#include <Python.h>
-#include <numpy/arrayobject.h>
 #include <stdlib.h>
 
 //=== START PYTHON INIT ===
@@ -386,26 +387,14 @@ static PyObject *mmr_pgaps(PyObject *self, PyObject *args) {
 
 //====================================================================================================
 static PyObject *mmr_rgaps(PyObject *self, PyObject *args) {
-
-  // output sino with gaps removed
-  PyObject *o_sng;
-
-  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-  PyObject *o_txLUT;
-
-  // Dictionary of scanner constants
-  PyObject *o_mmrcnst;
-
-  // input sino to be reformated with gaps removed
-  PyObject *o_sino;
-
-  // Structure of constants
-  Cnst Cnt;
-
-  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOO", &o_sng, &o_sino, &o_txLUT, &o_mmrcnst)) return NULL;
-  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  PyCuVec<float> *o_sng = NULL;  // output sino with gaps removed
+  PyCuVec<float> *o_sino = NULL; // input sino to be reformated with gaps removed
+  PyObject *o_txLUT;   // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_mmrcnst; // Dictionary of scanner constants
+  Cnst Cnt;            // Structure of constants
+  if (!PyArg_ParseTuple(args, "O&O&OO", &asPyCuVec_f, &o_sng, &asPyCuVec_f, &o_sino, &o_txLUT,
+                        &o_mmrcnst))
+    return NULL;
 
   /* Interpret the input objects as... PyLong_AsLong*/
   PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
@@ -427,45 +416,23 @@ static PyObject *mmr_rgaps(PyObject *self, PyObject *args) {
   PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
 
   // input sino and the above 2D LUT
-  PyArrayObject *p_sino = NULL;
-  p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
   PyArrayObject *p_aw2ali = NULL;
   p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+  if (p_aw2ali == NULL) { Py_XDECREF(p_aw2ali); }
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
 
   // number of sinogram from the shape of the sino (can be any number especially when using reduced
   // ring number)
-  int snno = (int)PyArray_DIM(p_sino, 0);
-
-  // output sino
-  PyArrayObject *p_sng = NULL;
-  p_sng = (PyArrayObject *)PyArray_FROM_OTF(o_sng, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-  if (p_sino == NULL || p_aw2ali == NULL || p_sino == NULL) {
-    Py_XDECREF(p_aw2ali);
-    Py_XDECREF(p_sino);
-
-    PyArray_DiscardWritebackIfCopy(p_sng);
-    Py_XDECREF(p_sng);
-  }
-
-  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
-  float *sino = (float *)PyArray_DATA(p_sino);
-  float *sng = (float *)PyArray_DATA(p_sng);
-
-  // sets the device on which to calculate
-  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+  int snno = o_sino->shape[0];
 
   //<><><><><><><><><><><><><><><><><><><><><><>
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
   // Run the conversion to GPU sinos
-  remove_gaps(sng, sino, snno, aw2ali, Cnt);
+  remove_gaps(o_sng->vec.data(), o_sino->vec.data(), snno, aw2ali, Cnt);
   //<><><><><><><><><><><><><><><><><><><><><><>
 
   // Clean up
   Py_DECREF(p_aw2ali);
-  Py_DECREF(p_sino);
-
-  PyArray_ResolveWritebackIfCopy(p_sng);
-  Py_DECREF(p_sng);
 
   Py_INCREF(Py_None);
   return Py_None;
diff --git a/pyproject.toml b/pyproject.toml
index 786e72f6..554a3f78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4",
-            "cuvec>=2.5.0", "ninst>=0.10.0", "numpy>=1.14", "miutil[cuda]>=0.4.0",
+            "cuvec>=2.8.0", "ninst>=0.12.0", "numpy>=1.14", "miutil[cuda]>=0.4.0",
             "scikit-build>=0.11.0", "cmake>=3.18", "ninja"]
 
 [tool.setuptools_scm]
diff --git a/setup.cfg b/setup.cfg
index 06f66b3e..8b38d2e8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,7 @@ classifiers=
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3 :: Only
     Topic :: Scientific/Engineering :: Medical Science Apps.
 [options]
@@ -39,19 +40,19 @@ setup_requires=
     setuptools>=42
     wheel
     setuptools_scm[toml]
-    cuvec>=2.5.0
+    cuvec>=2.8.0
     miutil[cuda]>=0.4.0
-    ninst>=0.10.0
+    ninst>=0.12.0
     numpy>=1.14
     scikit-build>=0.11.0
     cmake>=3.18
     ninja
 install_requires=
-    cuvec>=2.5.0
+    cuvec>=2.10.0
     miutil>=0.6.0
     nibabel>=2.4.0
-    nimpa>=2.0.0
-    ninst>=0.7.0
+    nimpa>=2.4.0
+    ninst>=0.12.0
     numpy>=1.14
     pydicom>=1.0.2
     setuptools
diff --git a/tests/test_amyloid_pvc.py b/tests/test_amyloid_pvc.py
index a7bd979f..671a7986 100644
--- a/tests/test_amyloid_pvc.py
+++ b/tests/test_amyloid_pvc.py
@@ -66,7 +66,7 @@ def muhdct(mMRpars, datain, tmp_path_factory, worker_id):
 
     opth = str(tmp_path.parent / "muhdct")
     flock = FileLock(opth + ".lock")
-    with flock.acquire(poll_intervall=0.5): # xdist, force auto-reuse via flock
+    with flock.acquire(poll_interval=0.5): # xdist, force auto-reuse via flock
         return nipet.hdw_mumap(datain, [1, 2, 4], mMRpars, outpath=opth, use_stored=True)