diff --git a/openmdao/approximation_schemes/approximation_scheme.py b/openmdao/approximation_schemes/approximation_scheme.py index a58bd83c85..65ea042d49 100644 --- a/openmdao/approximation_schemes/approximation_scheme.py +++ b/openmdao/approximation_schemes/approximation_scheme.py @@ -1,9 +1,11 @@ """Base class used to define the interface for derivative approximation schemes.""" import time from collections import defaultdict +from itertools import repeat import numpy as np from openmdao.core.constants import INT_DTYPE +from openmdao.vectors.vector import _full_slice from openmdao.utils.array_utils import get_input_idx_split import openmdao.utils.coloring as coloring_mod from openmdao.utils.general_utils import _convert_auto_ivc_to_conn_name, LocalRangeIterable @@ -134,8 +136,8 @@ def add_approximation(self, abs_key, system, kwargs): raise NotImplementedError("add_approximation has not been implemented") def _init_colored_approximations(self, system): - is_group = _is_group(system) - is_total = is_group and system.pathname == '' + is_total = system.pathname == '' + is_semi = _is_group(system) and not is_total self._colored_approx_groups = [] # don't do anything if the coloring doesn't exist yet @@ -155,17 +157,17 @@ def _init_colored_approximations(self, system): if is_total: ccol2vcol = np.empty(coloring._shape[1], dtype=INT_DTYPE) - ordered_wrt_iter = list(system._jac_wrt_iter()) colored_start = colored_end = 0 - for abs_wrt, cstart, cend, _, cinds, _ in ordered_wrt_iter: + for abs_wrt, cstart, cend, vec, cinds, _ in system._jac_wrt_iter(): if wrt_matches is None or abs_wrt in wrt_matches: colored_end += cend - cstart - ccol2jcol[colored_start:colored_end] = np.arange(cstart, cend, dtype=INT_DTYPE) + ccol2jcol[colored_start:colored_end] = range(cstart, cend) if is_total and abs_wrt in out_slices: slc = out_slices[abs_wrt] - rng = np.arange(slc.start, slc.stop) - if cinds is not None: - rng = rng[cinds] + if cinds is None or cinds is _full_slice: + rng = range(slc.start, slc.stop) + else: + rng = np.arange(slc.start, slc.stop)[cinds] ccol2vcol[colored_start:colored_end] = rng colored_start = colored_end @@ -198,7 +200,6 @@ def _init_colored_approximations(self, system): inputs = system._inputs from openmdao.core.implicitcomponent import ImplicitComponent - is_semi = is_group and not is_total use_full_cols = is_semi or isinstance(system, ImplicitComponent) for cols, nzrows in coloring.color_nonzero_iter('fwd'): @@ -259,7 +260,7 @@ def _init_approximations(self, system): if wrt in approx_wrt_idx: if vec is None: - vec_idx = None + vec_idx = repeat(None, approx_wrt_idx[wrt].shaped_array().size) else: # local index into var vec_idx = approx_wrt_idx[wrt].shaped_array(copy=True) @@ -271,15 +272,9 @@ def _init_approximations(self, system): in_idx = [list(in_idx)] vec_idx = [vec_idx] else: - if vec is None: # remote wrt - if wrt in abs2meta['input']: - vec_idx = range(abs2meta['input'][wrt]['global_size']) - else: - vec_idx = range(abs2meta['output'][wrt]['global_size']) - else: - vec_idx = LocalRangeIterable(system, wrt) - if directional: - vec_idx = [v for v in vec_idx if v is not None] + vec_idx = LocalRangeIterable(system, wrt) + if directional and vec is not None: + vec_idx = [v for v in vec_idx if v is not None] # Directional derivatives for quick deriv checking. # Place the indices in a list so that they are all stepped at the same time. 
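A minimal illustration (not part of the patch) of the two idioms the hunks above rely on: a builtin range can be assigned directly into a NumPy slice without building a temporary np.arange, and itertools.repeat(None, n) serves as a cheap per-column placeholder iterator for variables that are remote on this process.

import numpy as np
from itertools import repeat

ccol2jcol = np.empty(10, dtype=int)
ccol2jcol[2:7] = range(2, 7)        # NumPy converts the range on assignment; no intermediate array
vec_idx = repeat(None, 5)           # yields None once per column of a remote 'wrt'
assert list(vec_idx) == [None] * 5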
@@ -445,6 +440,8 @@ def _vec_ind_iter(self, vec_ind_list): entry = [[None, None]] ent0 = entry[0] for vec, vec_idxs in vec_ind_list: + if vec_idxs is None: + continue for vinds in vec_idxs: ent0[0] = vec ent0[1] = vinds @@ -452,7 +449,7 @@ def _vec_ind_iter(self, vec_ind_list): def _uncolored_column_iter(self, system, approx_groups): """ - Perform approximations and yields (column_index, column) for each jac column. + Perform approximations and yield (column_index, column) for each jac column. Parameters ---------- @@ -477,9 +474,10 @@ def _uncolored_column_iter(self, system, approx_groups): solution array corresponding to the jacobian column at the given column index """ total = system.pathname == '' - ordered_of_iter = list(system._jac_of_iter()) if total: - tot_result = np.zeros(ordered_of_iter[-1][2]) + for _, _, end, _, _ in system._jac_of_iter(): + pass + tot_result = np.zeros(end) total_or_semi = total or _is_group(system) diff --git a/openmdao/core/component.py b/openmdao/core/component.py index 615034ebe6..bf56bbd27e 100644 --- a/openmdao/core/component.py +++ b/openmdao/core/component.py @@ -1006,17 +1006,6 @@ def _update_dist_src_indices(self, abs_in2out, all_abs2meta, all_abs2idx, all_si if dist_in: offset = np.sum(sizes_in[:iproc, i]) end = offset + sizes_in[iproc, i] - else: - if src.startswith('_auto_ivc.'): - nzs = np.nonzero(vout_sizes)[0] - if nzs.size == 1: - # special case where we have a 'distributed' auto_ivc output - # that has a nonzero value in only one proc, so we can treat - # it like a non-distributed output. This happens in cases - # where an auto_ivc output connects to a variable that is - # remote on at least one proc. - offset = 0 - end = vout_sizes[nzs[0]] # total sizes differ and output is distributed, so can't determine mapping if offset is None: @@ -1776,7 +1765,7 @@ def _get_dist_nz_dresids(self): """ nzresids = [] dresids = self._dresiduals.asarray() - for of, start, end, _full_slice, dist_sizes in self._jac_of_iter(): + for of, start, end, _, dist_sizes in self._jac_of_iter(): if dist_sizes is not None: if np.any(dresids[start:end]): nzresids.append(of) diff --git a/openmdao/core/group.py b/openmdao/core/group.py index e6a6846459..33deeeb9c0 100644 --- a/openmdao/core/group.py +++ b/openmdao/core/group.py @@ -3,7 +3,7 @@ from collections import Counter, defaultdict from collections.abc import Iterable -from itertools import product, chain +from itertools import product, chain, repeat from numbers import Number import inspect from fnmatch import fnmatchcase @@ -26,11 +26,11 @@ from openmdao.utils.array_utils import array_connection_compatible, _flatten_src_indices, \ shape_to_len from openmdao.utils.general_utils import common_subpath, all_ancestors, \ - convert_src_inds, ContainsAll, shape2tuple, get_connection_owner, ensure_compatible, \ - _src_name_iter, meta2src_iter + convert_src_inds, _contains_all, shape2tuple, get_connection_owner, ensure_compatible, \ + meta2src_iter, get_rev_conns from openmdao.utils.units import is_compatible, unit_conversion, _has_val_mismatch, _find_unit, \ _is_unitless, simplify_unit -from openmdao.utils.graph_utils import get_sccs_topo, get_out_of_order_nodes, get_hybrid_graph +from openmdao.utils.graph_utils import get_sccs_topo, get_out_of_order_nodes from openmdao.utils.mpi import MPI, check_mpi_exceptions, multi_proc_exception_check import openmdao.utils.coloring as coloring_mod from openmdao.utils.indexer import indexer, Indexer @@ -186,12 +186,18 @@ class Group(System): Sorted list of pathnames of components that 
are executed prior to the optimization loop. _post_components : list of str or None Sorted list of pathnames of components that are executed after the optimization loop. - _abs_desvars : set - Set of absolute design variable names. - _abs_responses : set - Set of absolute response names. + _abs_desvars : dict or None + Dict of absolute design variable metadata. + _abs_responses : dict or None + Dict of absolute response metadata. _relevance_graph : nx.DiGraph Graph of relevance connections. Always None except in the top level Group. + _fd_rev_xfer_correction_dist : dict + If this group is using finite difference to compute derivatives, + this is the set of inputs that are upstream of a distributed response + within this group, keyed by active response. These determine if contributions + from all ranks will be added together to get the correct input values when derivatives + in the larger model are being solved using reverse mode. """ def __init__(self, **kwargs): @@ -222,6 +228,7 @@ def __init__(self, **kwargs): self._abs_desvars = None self._abs_responses = None self._relevance_graph = None + self._fd_rev_xfer_correction_dist = {} # TODO: we cannot set the solvers with property setters at the moment # because our lint check thinks that we are defining new attributes @@ -787,17 +794,15 @@ def _init_relevance(self, mode): dict The relevance dictionary. """ - abs_desvars = self.get_design_vars(recurse=True, get_sizes=False, use_prom_ivc=False) - abs_responses = self.get_responses(recurse=True, get_sizes=False, use_prom_ivc=False) - self._abs_desvars = set(_src_name_iter(abs_desvars)) - self._abs_responses = set(_src_name_iter(abs_responses)) + self._abs_desvars = self.get_design_vars(recurse=True, get_sizes=False, use_prom_ivc=False) + self._abs_responses = self.get_responses(recurse=True, get_sizes=False, use_prom_ivc=False) assert self.pathname == '', "Relevance can only be initialized on the top level System." if self._use_derivatives: - return self.get_relevant_vars(abs_desvars, - self._check_alias_overlaps(abs_responses), mode) + return self.get_relevant_vars(self._abs_desvars, + self._check_alias_overlaps(self._abs_responses), mode) - return {'@all': ({'input': ContainsAll(), 'output': ContainsAll()}, ContainsAll())} + return {'@all': ({'input': _contains_all, 'output': _contains_all}, _contains_all)} def get_relevance_graph(self, desvars, responses): """ @@ -818,20 +823,24 @@ def get_relevance_graph(self, desvars, responses): if self._relevance_graph is not None: return self._relevance_graph - conns = self._conn_global_abs_in2out - graph = get_hybrid_graph(conns) + graph = self.get_hybrid_graph() + outmeta = self._var_allprocs_abs2meta['output'] # now add design vars and responses to the graph for dv in meta2src_iter(desvars.values()): if dv not in graph: - graph.add_node(dv, type_='out') + graph.add_node(dv, type_='out', + dist=outmeta[dv]['distributed'] if dv in outmeta else None) graph.add_edge(dv.rpartition('.')[0], dv) + graph.nodes[dv]['isdv'] = True resps = set(meta2src_iter(responses.values())) for res in resps: if res not in graph: - graph.add_node(res, type_='out') - graph.add_edge(res.rpartition('.')[0], res) + graph.add_node(res, type_='out', + dist=outmeta[res]['distributed'] if res in outmeta else None) + graph.add_edge(res.rpartition('.')[0], res, isresponse=True) + graph.nodes[res]['isresponse'] = True # figure out if we can remove any edges based on zero partials we find # in components. 
By default all component connected outputs @@ -866,6 +875,46 @@ def get_relevance_graph(self, desvars, responses): self._relevance_graph = graph return graph + def get_hybrid_graph(self): + """ + Return a graph of all variables and components in the model. + + Each component is connected to each of its input and output variables, and + those variables are connected to other variables based on the connections + in the model. + + This results in a smaller graph (fewer edges) than would be the case for a pure variable + graph where all inputs to a particular component would have to be connected to all outputs + from that component. + + Returns + ------- + networkx.DiGraph + Graph of all variables and components in the model. + """ + graph = nx.DiGraph() + tgtmeta = self._var_allprocs_abs2meta['input'] + srcmeta = self._var_allprocs_abs2meta['output'] + + for tgt, src in self._conn_global_abs_in2out.items(): + if src not in graph: + dist = srcmeta[src]['distributed'] if src in srcmeta else None + graph.add_node(src, type_='out', dist=dist, + isdv=False, isresponse=False) + + dist = tgtmeta[tgt]['distributed'] if tgt in tgtmeta else None + graph.add_node(tgt, type_='in', dist=dist, + isdv=False, isresponse=False) + + # create edges to/from the component + graph.add_edge(src.rpartition('.')[0], src) + graph.add_edge(tgt, tgt.rpartition('.')[0]) + + # connect the variables src and tgt + graph.add_edge(src, tgt) + + return graph + def get_relevant_vars(self, desvars, responses, mode): """ Find all relevant vars between desvars and responses. @@ -902,20 +951,19 @@ def get_relevant_vars(self, desvars, responses, mode): for dvmeta in desvars.values(): desvar = dvmeta['source'] dvset = set(self.all_connected_nodes(graph, desvar)) - parallel_deriv_color = dvmeta['parallel_deriv_color'] - if parallel_deriv_color: + if dvmeta['parallel_deriv_color']: pd_dv_locs[desvar] = set(self.all_connected_nodes(graph, desvar, local=True)) - pd_err_chk[parallel_deriv_color][desvar] = pd_dv_locs[desvar] + pd_err_chk[dvmeta['parallel_deriv_color']][desvar] = pd_dv_locs[desvar] for resmeta in responses.values(): response = resmeta['source'] if response not in rescache: rescache[response] = set(self.all_connected_nodes(grev, response)) - parallel_deriv_color = resmeta['parallel_deriv_color'] - if parallel_deriv_color: + if resmeta['parallel_deriv_color']: pd_res_locs[response] = set(self.all_connected_nodes(grev, response, local=True)) - pd_err_chk[parallel_deriv_color][response] = pd_res_locs[response] + pd_err_chk[resmeta['parallel_deriv_color']][response] = \ + pd_res_locs[response] common = dvset.intersection(rescache[response]) @@ -1161,11 +1209,13 @@ def _get_var_offsets(self): def _get_jac_col_scatter(self): """ - Return source and target indices for a scatter from the output vector to a jacobian column. + Return source and target indices for a scatter from output vector to total jacobian column. If the transfer involves remote or distributed variables, the indices will be global. Otherwise they will be converted to local. + This is only called on the top level system. + Returns ------- ndarray @@ -1265,12 +1315,14 @@ def _final_setup(self, comm, mode): # must call this before vector setup because it determines if we need to alloc commplex self._setup_partials() + self._fd_rev_xfer_correction_dist = {} + self._problem_meta['relevant'] = self._init_relevance(mode) self._setup_vectors(self._get_root_vectors()) # Transfers do not require recursion, but they have to be set up after the vector setup. 
- self._setup_transfers() + self._setup_transfers(self._abs_desvars, self._abs_responses) # Same situation with solvers, partials, and Jacobians. # If we're updating, we just need to re-run setup on these, but no recursion necessary. @@ -1525,8 +1577,30 @@ def _set_auto_order(self, strongcomps, orders): "`allow_post_setup_reorder` to True or to manually set the execution " "order to the recommended order using `set_order`.") + def _check_nondist_sizes(self): + # verify that nondistributed variables have same size across all procs + abs2idx = self._var_allprocs_abs2idx + for io in ('input', 'output'): + sizes = self._var_sizes[io] + for abs_name, meta in self._var_allprocs_abs2meta[io].items(): + if not meta['distributed']: + vsizes = sizes[:, abs2idx[abs_name]] + unique = set(vsizes) + unique.discard(0) + if len(unique) > 1: + # sizes differ, now find which procs don't agree + rnklist = [] + for sz in unique: + rnklist.append((sz, [i for i, s in enumerate(vsizes) if s == sz])) + msg = ', '.join([f"rank(s) {r} have size {s}" for s, r in rnklist]) + self._collect_error(f"{self.msginfo}: Size of {io} '{abs_name}' " + f"differs between processes ({msg}).", + ident=('size', abs_name)) + def _top_level_post_sizes(self): # this runs after the variable sizes are known + self._check_nondist_sizes() + self._setup_global_shapes() self._resolve_ambiguous_input_meta() @@ -1767,6 +1841,7 @@ def _setup_var_data(self): self._group_inputs[n] = lst.copy() # must copy the list manually self._has_distrib_vars = False + self._has_fd_group = self._owns_approx_jac abs_in2prom_info = self._problem_meta['abs_in2prom_info'] # sort the subsystems alphabetically in order to make the ordering @@ -1777,6 +1852,8 @@ def _setup_var_data(self): self._has_output_adder |= subsys._has_output_adder self._has_resid_scaling |= subsys._has_resid_scaling self._has_distrib_vars |= subsys._has_distrib_vars + if len(subsys._subsystems_allprocs) > 0: + self._has_fd_group |= subsys._has_fd_group var_maps = subsys._get_promotion_maps() @@ -1833,7 +1910,8 @@ def _setup_var_data(self): if self._gather_full_data(): raw = (allprocs_discrete, allprocs_prom2abs_list, allprocs_abs2meta, self._has_output_scaling, self._has_output_adder, - self._has_resid_scaling, self._group_inputs, self._has_distrib_vars) + self._has_resid_scaling, self._group_inputs, self._has_distrib_vars, + self._has_fd_group) else: raw = ( {'input': {}, 'output': {}}, @@ -1844,6 +1922,7 @@ def _setup_var_data(self): False, {}, False, + False, ) gathered = self.comm.allgather(raw) @@ -1857,11 +1936,13 @@ def _setup_var_data(self): myrank = self.comm.rank for rank, (proc_discrete, proc_prom2abs_list, proc_abs2meta, - oscale, oadd, rscale, ginputs, has_dist_vars) in enumerate(gathered): + oscale, oadd, rscale, ginputs, has_dist_vars, + has_fd_group) in enumerate(gathered): self._has_output_scaling |= oscale self._has_output_adder |= oadd self._has_resid_scaling |= rscale self._has_distrib_vars |= has_dist_vars + self._has_fd_group |= has_fd_group if rank != myrank: for p, mlist in ginputs.items(): @@ -2612,23 +2693,6 @@ def is_unresolved(graph, node): return True return False - def get_rev_conn(): - """ - Return a dict mapping each connected input to a list of its connected outputs. - - Returns - ------- - dict - Dict mapping each connected input to a list of its connected outputs. 
- """ - rev = {} - for tgt, src in conn.items(): - if src in rev: - rev[src].append(tgt) - else: - rev[src] = [tgt] - return rev - def meta2node_data(meta): """ Return a dict containing select metadata for the given variable. @@ -2684,7 +2748,7 @@ def meta2node_data(meta): graph.add_edge(abs_from, name, multi=False) else: if rev_conn is None: - rev_conn = get_rev_conn() + rev_conn = get_rev_conns(self._conn_global_abs_in2out) if name in rev_conn: # connected output for inp in rev_conn[name]: inmeta = all_abs2meta_in[inp] @@ -3109,10 +3173,17 @@ def _transfer(self, vec_name, mode, sub=None): if xfer is not None: if self._has_input_scaling: vec_inputs.scale_to_norm(mode='rev') - xfer._transfer(vec_inputs, self._vectors['output'][vec_name], mode) + + xfer._transfer(vec_inputs, self._vectors['output'][vec_name], mode) + + if self._problem_meta['parallel_deriv_color'] is None: + key = (sub, 'nocolor') + if key in self._transfers['rev']: + xfer = self._transfers['rev'][key] + xfer._transfer(vec_inputs, self._vectors['output'][vec_name], mode) + + if self._has_input_scaling: vec_inputs.scale_to_phys(mode='rev') - else: - xfer._transfer(vec_inputs, self._vectors['output'][vec_name], mode) def _discrete_transfer(self, sub): """ @@ -3173,11 +3244,18 @@ def _discrete_transfer(self, sub): src_val = src_sys._discrete_outputs[src] tgt_sys._discrete_inputs[tgt] = src_val - def _setup_transfers(self): + def _setup_transfers(self, desvars, responses): """ Compute all transfers that are owned by this system. + + Parameters + ---------- + desvars : dict + Dictionary of all design variable metadata. Keyed by absolute source name or alias. + responses : dict + Dictionary of all response variable metadata. Keyed by absolute source name or alias. """ - self._vector_class.TRANSFER._setup_transfers(self) + self._vector_class.TRANSFER._setup_transfers(self, desvars, responses) if self._conn_discrete_in2out: self._vector_class.TRANSFER._setup_discrete_transfers(self) @@ -3695,12 +3773,50 @@ def _apply_linear(self, jac, rel_systems, mode, scope_out=None, scope_in=None): if jac is not None: with self._matvec_context(scope_out, scope_in, mode) as vecs: d_inputs, d_outputs, d_residuals = vecs + jac._apply(self, d_inputs, d_outputs, d_residuals, mode) + + # _fd_rev_xfer_correction_dist is used to correct for the fact that we don't + # do reverse transfers internal to an FD group. Reverse transfers + # are constructed such that derivative values are correct when transferred into + # system doutput variables, taking into account distributed inputs. + # Since the transfers are not correcting for those issues, we need to do it here. + + # If we have a distributed constraint/obj within the FD group and that con/obj is, + # active, we perform essentially an allreduce on the d_inputs vars that connect to + # outside systems so they'll include the contribution from all procs. 
+ if self._fd_rev_xfer_correction_dist and mode == 'rev': + seed_vars = self._problem_meta['seed_vars'] + if seed_vars is not None: + seed_vars = [n for n in seed_vars if n in self._fd_rev_xfer_correction_dist] + slices = self._dinputs.get_slice_dict() + inarr = self._dinputs.asarray() + data = {} + for seed_var in seed_vars: + for inp in self._fd_rev_xfer_correction_dist[seed_var]: + if inp not in data: + if inp in slices: # inp is a local input + arr = inarr[slices[inp]] + if np.any(arr): + data[inp] = arr + else: + data[inp] = None # don't send an array of zeros + else: + data[inp] = None # prevent possible MPI hangs + + if data: + myrank = self.comm.rank + for rank, d in enumerate(self.comm.allgather(data)): + if rank != myrank: + for n, val in d.items(): + if val is not None and n in slices: + inarr[slices[n]] += val + # Apply recursion else: if mode == 'fwd': self._transfer('linear', mode) - if rel_systems is not None: + if rel_systems is not None and rel_systems is not _contains_all: for s in self._solver_subsystem_iter(local_only=True): if s.pathname not in rel_systems: # zero out dvecs of irrelevant subsystems @@ -3712,7 +3828,7 @@ def _apply_linear(self, jac, rel_systems, mode, scope_out=None, scope_in=None): if mode == 'rev': self._transfer('linear', mode) - if rel_systems is not None: + if rel_systems is not None and rel_systems is not _contains_all: for s in self._solver_subsystem_iter(local_only=True): if s.pathname not in rel_systems: # zero out dvecs of irrelevant subsystems @@ -3758,12 +3874,11 @@ def _solve_linear(self, mode, rel_systems, scope_out=_UNDEFINED, scope_in=_UNDEF # ExplicitComponent jacobian defined with -1 on diagonal. d_residuals *= -1.0 - else: self._linear_solver._set_matvec_scope(scope_out, scope_in) self._linear_solver.solve(mode, rel_systems) - def _linearize(self, jac, sub_do_ln=True, rel_systems=ContainsAll()): + def _linearize(self, jac, sub_do_ln=True, rel_systems=_contains_all): """ Compute jacobian / factorization. The model is assumed to be in a scaled state. 
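A rough sketch (not the actual OpenMDAO source; details may differ from openmdao.utils.general_utils) of the idea behind the _contains_all singleton referenced above: a set-like object whose membership test always succeeds, shared as a single instance so code can use a fast identity check such as "rel_systems is not _contains_all" instead of constructing a new ContainsAll() on every call.

class ContainsAll(object):
    """Acts like a container that holds every possible name."""

    def __contains__(self, name):
        return True

_contains_all = ContainsAll()

assert 'any.sys.name' in _contains_all       # membership always succeeds
assert _contains_all is not ContainsAll()    # a fresh instance fails the identity check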
@@ -4025,6 +4140,9 @@ def _jac_of_iter(self): else: path = of + if not total and path not in self._var_abs2meta['output']: + continue + meta = abs2meta[path] if meta['distributed']: dist_sizes = sizes[:, abs2idx[path]] @@ -4099,7 +4217,7 @@ def _jac_wrt_iter(self, wrt_matches=None): elif wrt in local_outs: vec = self._outputs else: - vec = None + vec = None # remote wrt if wrt in approx_wrt_idx: sub_wrt_idx = approx_wrt_idx[wrt] size = sub_wrt_idx.indexed_src_size @@ -4107,6 +4225,8 @@ def _jac_wrt_iter(self, wrt_matches=None): else: sub_wrt_idx = _full_slice size = abs2meta[io][wrt][szname] + if vec is None: + sub_wrt_idx = repeat(None, size) end += size dist_sizes = sizes[io][:, toidx[wrt]] if meta['distributed'] else None yield wrt, start, end, vec, sub_wrt_idx, dist_sizes @@ -4163,6 +4283,9 @@ def _setup_approx_partials(self): abs2meta = self._var_allprocs_abs2meta info = self._coloring_info + total = self.pathname == '' + nprocs = self.comm.size + responses = self.get_responses(recurse=True, get_sizes=False, use_prom_ivc=False) if info['coloring'] is not None and (self._owns_approx_of is None or @@ -4178,13 +4301,32 @@ def _setup_approx_partials(self): approx._wrt_meta = {} approx._reset() + sizes_out = self._var_sizes['output'] + sizes_in = self._var_sizes['input'] + abs2idx = self._var_allprocs_abs2idx + + self._cross_keys = set() approx_keys = self._get_approx_subjac_keys() for key in approx_keys: left, right = key - if left in responses and responses[left]['alias'] is not None: - left = responses[left]['source'] - if right in responses and responses[right]['alias'] is not None: - right = responses[right]['source'] + if total: + if left in responses and responses[left]['alias'] is not None: + left = responses[left]['source'] + if right in responses and responses[right]['alias'] is not None: + right = responses[right]['source'] + elif nprocs > 1 and self._has_fd_group: + sout = sizes_out[:, abs2idx[left]] + sin = sizes_in[:, abs2idx[right]] + if np.count_nonzero(sout[sin == 0]) > 0 and np.count_nonzero(sin[sout == 0]) > 0: + # we have of and wrt that exist on different procs. Now see if they're relevant + # to each other + for rel in self._relevant.values(): + relins = rel['@all'][0]['input'] + relouts = rel['@all'][0]['output'] + if left in relouts: + if right in relins or right in relouts: + self._cross_keys.add(key) + break if key in self._subjacs_info: meta = self._subjacs_info[key] @@ -4220,7 +4362,7 @@ def _setup_approx_partials(self): approx.add_approximation(key, self, meta) - if self.pathname: + if not total: abs_outs = self._var_allprocs_abs2meta['output'] abs_ins = self._var_allprocs_abs2meta['input'] # we're taking semi-total derivs for this group. 
Update _owns_approx_of @@ -4896,8 +5038,7 @@ def _setup_iteration_lists(self): if not designvars or not responses: return - conns = self._conn_global_abs_in2out - graph = get_hybrid_graph(conns) + graph = self.get_hybrid_graph() # now add design vars and responses to the graph for dv in meta2src_iter(designvars.values()): diff --git a/openmdao/core/implicitcomponent.py b/openmdao/core/implicitcomponent.py index ee27bd4557..a690f77b87 100644 --- a/openmdao/core/implicitcomponent.py +++ b/openmdao/core/implicitcomponent.py @@ -5,6 +5,7 @@ from openmdao.core.component import Component, _allowed_types from openmdao.core.constants import _UNDEFINED, _SetupStatus +from openmdao.vectors.vector import _full_slice from openmdao.recorders.recording_iteration_stack import Recording from openmdao.utils.class_util import overrides_method from openmdao.utils.array_utils import shape_to_len @@ -12,9 +13,6 @@ from openmdao.utils.units import simplify_unit -_full_slice = slice(None) - - def _get_slice_shape_dict(name_shape_iter): """ Return a dict of (slice, shape) tuples using provided names and shapes. diff --git a/openmdao/core/problem.py b/openmdao/core/problem.py index 84345c42f9..9904f49ebc 100644 --- a/openmdao/core/problem.py +++ b/openmdao/core/problem.py @@ -24,7 +24,7 @@ from openmdao.core.explicitcomponent import ExplicitComponent from openmdao.core.system import System, _OptStatus from openmdao.core.group import Group -from openmdao.core.total_jac import _TotalJacInfo +from openmdao.core.total_jac import _TotalJacInfo, _contains_all from openmdao.core.constants import _DEFAULT_OUT_STREAM, _UNDEFINED from openmdao.jacobians.dictionary_jacobian import _CheckingJacobian from openmdao.approximation_schemes.complex_step import ComplexStep @@ -48,7 +48,7 @@ from openmdao.utils.class_util import overrides_method from openmdao.utils.reports_system import get_reports_to_activate, activate_reports, \ clear_reports, get_reports_dir, _load_report_plugins -from openmdao.utils.general_utils import ContainsAll, pad_name, LocalRangeIterable, \ +from openmdao.utils.general_utils import _contains_all, pad_name, LocalRangeIterable, \ _find_dict_meta, env_truthy, add_border, match_includes_excludes, inconsistent_across_procs from openmdao.utils.om_warnings import issue_warning, DerivativesWarning, warn_deprecation, \ OMInvalidCheckDerivativesOptionsWarning @@ -62,7 +62,6 @@ from openmdao.utils.name_maps import rel_key2abs_key, rel_name2abs_name -_contains_all = ContainsAll() CITATION = """@article{openmdao_2019, Author={Justin S. Gray and John T. Hwang and Joaquim R. R. A. @@ -1004,6 +1003,11 @@ def setup(self, check=False, logger=None, mode='auto', force_alloc_complex=False 'model_options': self.model_options, # A dict of options passed to all systems in tree 'allow_post_setup_reorder': self.options['allow_post_setup_reorder'], # see option 'singular_jac_behavior': 'warn', # How to handle singular jac conditions + 'parallel_deriv_color': None, # None unless derivatives involving a parallel deriv + # colored dv/response are currently being computed + 'seed_vars': None, # set of names of seed variables. Seed variables are those that + # have their derivative value set to 1.0 at the beginning of the + # current derivative solve. 'coloring_randgen': None, # If total coloring is being computed, will contain a random # number generator, else None. 
} @@ -1636,7 +1640,7 @@ def check_partials(self, out_stream=_DEFAULT_OUT_STREAM, includes=None, excludes def check_totals(self, of=None, wrt=None, out_stream=_DEFAULT_OUT_STREAM, compact_print=False, driver_scaling=False, abs_err_tol=1e-6, rel_err_tol=1e-6, method='fd', step=None, form=None, step_calc='abs', show_progress=False, - show_only_incorrect=False, directional=False): + show_only_incorrect=False, directional=False, sort=False): """ Check total derivatives for the model vs. finite difference. @@ -1686,6 +1690,8 @@ def check_totals(self, of=None, wrt=None, out_stream=_DEFAULT_OUT_STREAM, compac directional : bool If True, compute a single directional derivative for each 'of' in rev mode or each 'wrt' in fwd mode. + sort : bool + If True, sort the subjacobian keys alphabetically. Returns ------- @@ -1864,8 +1870,12 @@ def check_totals(self, of=None, wrt=None, out_stream=_DEFAULT_OUT_STREAM, compac resp = self.driver._responses do_steps = len(Jfds) > 1 + Jcalc_items = Jcalc.items() + if sort: + Jcalc_items = sorted(Jcalc_items, key=lambda x: x[0]) + for Jfd, step in Jfds: - for key, val in Jcalc.items(): + for key, val in Jcalc_items: if key not in data['']: data[''][key] = {} meta = data[''][key] @@ -1915,7 +1925,7 @@ def check_totals(self, of=None, wrt=None, out_stream=_DEFAULT_OUT_STREAM, compac _assemble_derivative_data(data, rel_err_tol, abs_err_tol, out_stream, compact_print, [model], {'': fd_args}, totals=total_info, lcons=lcons, - show_only_incorrect=show_only_incorrect) + show_only_incorrect=show_only_incorrect, sort=sort) if not do_steps: _fix_check_data(data) @@ -2929,7 +2939,7 @@ def _fix_check_data(data): def _assemble_derivative_data(derivative_data, rel_error_tol, abs_error_tol, out_stream, compact_print, system_list, global_options, totals=False, indep_key=None, print_reverse=False, - show_only_incorrect=False, lcons=None): + show_only_incorrect=False, lcons=None, sort=False): """ Compute the relative and absolute errors in the given derivatives and print to the out_stream. @@ -2960,6 +2970,8 @@ def _assemble_derivative_data(derivative_data, rel_error_tol, abs_error_tol, out Set to True if output should print only the subjacs found to be incorrect. lcons : list or None For total derivatives only, list of outputs that are actually linear constraints. + sort : bool + If True, sort subjacobian keys alphabetically. 
""" suppress_output = out_stream is None @@ -3244,44 +3256,48 @@ def _assemble_derivative_data(derivative_data, rel_error_tol, abs_error_tol, out out_buffer.write(f'\n MPI Rank {MPI.COMM_WORLD.rank}\n') out_buffer.write('\n') - # Raw Derivatives - if magnitudes[0].forward is not None: - if directional: - out_buffer.write(' Directional Derivative (Jfor)') - else: - out_buffer.write(' Raw Forward Derivative (Jfor)') - out_buffer.write(f"\n {Jfor}\n\n") + with np.printoptions(linewidth=240): + # Raw Derivatives + if magnitudes[0].forward is not None: + if directional: + out_buffer.write(' Directional Derivative (Jfor)') + else: + out_buffer.write(' Raw Forward Derivative (Jfor)') + Jstr = textwrap.indent(str(Jfor), ' ') + out_buffer.write(f"\n{Jstr}\n\n") - fdtype = fd_opts['method'].upper() + fdtype = fd_opts['method'].upper() - if magnitudes[0].reverse is not None: - if directional: - if totals: - out_buffer.write(' Directional Derivative (Jrev) Dot Product') + if magnitudes[0].reverse is not None: + if directional: + if totals: + out_buffer.write(' Directional Derivative (Jrev) Dot Product') + else: + out_buffer.write(' Directional Derivative (Jrev)') else: - out_buffer.write(' Directional Derivative (Jrev)') - else: - out_buffer.write(' Raw Reverse Derivative (Jrev)') - out_buffer.write(f"\n {Jrev}\n\n") + out_buffer.write(' Raw Reverse Derivative (Jrev)') + Jstr = textwrap.indent(str(Jrev), ' ') + out_buffer.write(f"\n{Jstr}\n\n") - try: - fds = derivative_info['J_fd'] - except KeyError: - fds = [0.] + try: + fds = derivative_info['J_fd'] + except KeyError: + fds = [0.] - for i in range(len(magnitudes)): - fd = fds[i] + for i in range(len(magnitudes)): + fd = fds[i] - if directional: - if totals and magnitudes[i].reverse is not None: - out_buffer.write(f' Directional {fdtype} Derivative (Jfd) ' - f'Dot Product{stepstrs[i]}\n {fd}\n\n') + if directional: + if totals and magnitudes[i].reverse is not None: + out_buffer.write(f' Directional {fdtype} Derivative (Jfd) ' + f'Dot Product{stepstrs[i]}\n {fd}\n\n') + else: + out_buffer.write(f" Directional {fdtype} Derivative (Jfd)" + f"{stepstrs[i]}\n {fd}\n\n") else: - out_buffer.write(f" Directional {fdtype} Derivative (Jfd)" - f"{stepstrs[i]}\n {fd}\n\n") - else: - out_buffer.write(f" Raw {fdtype} Derivative (Jfd){stepstrs[i]}" - f"\n {fd}\n\n") + Jstr = textwrap.indent(str(fd), ' ') + out_buffer.write(f" Raw {fdtype} Derivative (Jfd){stepstrs[i]}" + f"\n{Jstr}\n\n") out_buffer.write(' -' * 30 + '\n') @@ -3339,7 +3355,7 @@ def _assemble_derivative_data(derivative_data, rel_error_tol, abs_error_tol, out f"{ders}.\nThis can happen if a component 'compute_jacvec_product' " "or 'apply_linear'\nmethod does not properly reduce the value of a distributed " "output when computing the\nderivative of that output with respect to a serial " - "input.\nOpenMDAO 3.25 changed the convention used" + "input.\nOpenMDAO 3.25 changed the convention used " "when transferring data between distributed and non-distributed \nvariables " "within a matrix free component. 
See POEM 75 for details.") diff --git a/openmdao/core/system.py b/openmdao/core/system.py index e2c7d0f73b..31deda8486 100644 --- a/openmdao/core/system.py +++ b/openmdao/core/system.py @@ -35,7 +35,7 @@ from openmdao.utils.om_warnings import issue_warning, \ DerivativesWarning, PromotionWarning, UnusedOptionWarning, UnitsWarning from openmdao.utils.general_utils import determine_adder_scaler, \ - format_as_float_or_array, ContainsAll, all_ancestors, make_set, match_prom_or_abs, \ + format_as_float_or_array, _contains_all, all_ancestors, make_set, match_prom_or_abs, \ ensure_compatible, env_truthy, make_traceback, _is_slicer_op from openmdao.approximation_schemes.complex_step import ComplexStep from openmdao.approximation_schemes.finite_difference import FiniteDifference @@ -672,8 +672,8 @@ def _jac_wrt_iter(self, wrt_matches=None): Starting index. int Ending index. - Vector - Either the _outputs or _inputs vector. + Vector or None + Either the _outputs or _inputs vector if var is local else None. slice A full slice. ndarray or None @@ -2257,9 +2257,16 @@ def _setup_vectors(self, root_vectors): subsys._scale_factors = self._scale_factors subsys._setup_vectors(root_vectors) - def _setup_transfers(self): + def _setup_transfers(self, desvars, responses): """ Compute all transfers that are owned by this system. + + Parameters + ---------- + desvars : dict + Dictionary of all design variable metadata. Keyed by absolute source name or alias. + responses : dict + Dictionary of all response variable metadata. Keyed by absolute source name or alias. """ pass @@ -2585,9 +2592,8 @@ def _matvec_context(self, scope_out, scope_in, mode, clear=True): """ Context manager for vectors. - For the given vec_name, return vectors that use a set of - internal variables that are relevant to the current matrix-vector - product. This is called only from _apply_linear. + Return vectors that use a set of internal variables that are relevant to the current + matrix-vector product. This is called only from _apply_linear. Parameters ---------- @@ -4569,7 +4575,7 @@ def run_apply_linear(self, mode, scope_out=None, scope_in=None): If None, all are in the scope. """ with self._scaled_context_all(): - self._apply_linear(None, ContainsAll(), mode, scope_out, scope_in) + self._apply_linear(None, _contains_all, mode, scope_out, scope_in) def run_solve_linear(self, mode): """ @@ -4583,7 +4589,7 @@ def run_solve_linear(self, mode): 'fwd' or 'rev'. 
""" with self._scaled_context_all(): - self._solve_linear(mode, ContainsAll()) + self._solve_linear(mode, _contains_all) def run_linearize(self, sub_do_ln=True): """ @@ -5569,10 +5575,10 @@ def _get_input_from_src(self, name, abs_ins, conns, units=None, indices=None, if vshape is not None: val = val.reshape(vshape) else: + var_idx = self._var_allprocs_abs2idx[src] + sizes = self._var_sizes['output'][:, var_idx] if distrib and (sdistrib or dynshape or not slocal) and not get_remote: - var_idx = self._var_allprocs_abs2idx[src] # sizes for src var in each proc - sizes = self._var_sizes['output'][:, var_idx] start = np.sum(sizes[:self.comm.rank]) end = start + sizes[self.comm.rank] src_indices = src_indices.shaped_array(copy=True) @@ -6002,7 +6008,7 @@ def _get_full_dist_shape(self, abs_name, local_shape): else: io = 'input' scope = self - # io = 'output' if abs_name in self._var_allprocs_abs2meta['output'] else 'input' + meta = scope._var_allprocs_abs2meta[io][abs_name] var_idx = scope._var_allprocs_abs2idx[abs_name] global_size = np.sum(scope._var_sizes[io][:, var_idx]) @@ -6291,3 +6297,98 @@ def comm_info_iter(self): for s in self._subsystems_myproc: yield from s.comm_info_iter() + + def dist_size_iter(self, io, top_comm): + """ + Yield names and distributed ranges of all local and remote variables in this system. + + Parameters + ---------- + io : str + Either 'input' or 'output'. + top_comm : MPI.Comm or None + The top-level MPI communicator. + + Yields + ------ + tuple + A tuple of the form ((abs_name, rank), start, end). + """ + sizes = self._var_sizes + vmeta = self._var_allprocs_abs2meta + + topranks = np.arange(top_comm.size) + + myrank = self.comm.rank + toprank = top_comm.rank + + mytopranks = topranks[toprank - myrank: toprank - myrank + self.comm.size] + + for rank in range(self.comm.size): + for ivar, vname in enumerate(vmeta[io]): + sz = sizes[io][rank, ivar] + if sz > 0: + yield (vname, mytopranks[rank]), sz + + def local_range_iter(self, io): + """ + Yield names and local ranges of all local variables in this system. + + Parameters + ---------- + io : str + Either 'input' or 'output'. + + Yields + ------ + tuple + A tuple of the form (abs_name, start, end). + """ + vmeta = self._var_allprocs_abs2meta + + offset = 0 + for vname, size in zip(vmeta[io], self._var_sizes[io][self.comm.rank]): + if size > 0: + yield vname, offset, offset + size + offset += size + + def get_var_dup_info(self, name, io): + """ + Return information about how the given variable is duplicated across MPI processes. + + Parameters + ---------- + name : str + Name of the variable. + io : str + Either 'input' or 'output'. + + Returns + ------- + tuple + A tuple of the form (is_duplicated, num_zeros, is_distributed). + """ + nz = np.count_nonzero(self._var_sizes[io][:, self._var_allprocs_abs2idx[name]]) + + if self._var_allprocs_abs2meta[io][name]['distributed']: + return False, self._var_sizes[io].shape[0] - nz, True # distributed vars are never dups + + return nz > 1, self._var_sizes[io].shape[0] - nz, False + + def get_var_sizes(self, name, io): + """ + Return the sizes of the given variable on all procs. + + Parameters + ---------- + name : str + Name of the variable. + io : str + Either 'input' or 'output'. + + Returns + ------- + ndarray + Array of sizes of the variable on all procs. 
+ """ + return self._var_sizes[io][:, self._var_allprocs_abs2idx[name]] diff --git a/openmdao/core/tests/test_approx_derivs.py b/openmdao/core/tests/test_approx_derivs.py index a2c56bafb2..2b94522798 100644 --- a/openmdao/core/tests/test_approx_derivs.py +++ b/openmdao/core/tests/test_approx_derivs.py @@ -14,10 +14,10 @@ from openmdao.test_suite.components.unit_conv import SrcComp, TgtCompC, TgtCompF, TgtCompK from openmdao.test_suite.groups.parallel_groups import FanInSubbedIDVC from openmdao.test_suite.parametric_suite import parametric_suite -from openmdao.utils.assert_utils import assert_near_equal, assert_warnings, assert_check_partials, assert_warning +from openmdao.utils.assert_utils import assert_near_equal, assert_check_partials, \ + assert_check_totals, assert_warnings from openmdao.utils.general_utils import set_pyoptsparse_opt from openmdao.utils.mpi import MPI -from openmdao.utils.om_warnings import OMDeprecationWarning from openmdao.utils.testing_utils import use_tempdirs try: @@ -2571,13 +2571,12 @@ def compute_partials(self, inputs, J): prob = om.Problem(model=model) prob.setup(force_alloc_complex=True) prob.run_model() - data = prob.check_totals(method='cs', out_stream=None) - assert_near_equal(data[('pg.dc1.y', 'iv.x')]['abs error'][0], 0.0, 1e-6) - assert_near_equal(data[('pg.dc3.y', 'iv.x')]['abs error'][0], 0.0, 1e-6) + assert_check_totals(prob.check_totals(method='cs', out_stream=None)) class CheckTotalsIndices(unittest.TestCase): def test_w_indices(self): + # just checks for indexing error. Doesn't raise exception if derivs are wrong. class TopComp(om.ExplicitComponent): def setup(self): @@ -2597,7 +2596,7 @@ def compute_partials(self, inputs, partials): prob = om.Problem() model = prob.model - geom = model.add_subsystem('tcomp', TopComp()) + model.add_subsystem('tcomp', TopComp()) # setting indices here caused an indexing error later on model.add_design_var('tcomp.theta_c2_C', lower=-20., upper=20., indices=range(2, 9)) @@ -2609,5 +2608,439 @@ def compute_partials(self, inputs, partials): check = prob.check_totals(compact_print=True) +def _setup_1ivc_fdgroupwithpar_1sink(size=7): + + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x1 + x2', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + model.connect('p.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.C3.x1') + model.connect('sub.par.C2.y', 'sub.C3.x2') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + +def _setup_2ivcs_fdgroupwithpar_1sink(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + model.add_subsystem('p2', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + sub.add_subsystem('C3', om.ExecComp('y = x1 + x2', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + 
model.connect('p2.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.C3.x1') + model.connect('sub.par.C2.y', 'sub.C3.x2') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p2.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_1ivc_dum_fdgroupwithpar_1sink(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + + model.add_subsystem('dum', om.ExecComp('y = x', shape=size)) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x1 + x2', shape=size)) + + model.connect('p.x', 'dum.x') + model.connect('dum.y', 'sub.par.C1.x') + model.connect('dum.y', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.C3.x1') + model.connect('sub.par.C2.y', 'sub.C3.x2') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + +def _setup_1ivc_dum_fdgroupwithpar_1sink_c4(size=7): + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + + model.add_subsystem('p', ivc) + model.add_subsystem('dum', om.ExecComp('y = x', shape=size)) + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + sub.add_subsystem('C3', om.ExecComp('y = x1 + x2', shape=size)) + model.add_subsystem('C4', om.ExecComp('y = x', shape=size)) + + model.connect('p.x', 'dum.x') + model.connect('dum.y', 'sub.par.C1.x') + model.connect('dum.y', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.C3.x1') + model.connect('sub.par.C2.y', 'sub.C3.x2') + model.connect('sub.C3.y', 'C4.x') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_objective('C4.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_2ivcs_fdgroupwithpar_2sinks(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + model.add_subsystem('p2', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x*.5', shape=size)) + sub.add_subsystem('C4', om.ExecComp('y = x*3.', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + model.connect('p2.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.C3.x') + model.connect('sub.par.C2.y', 'sub.C4.x') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p2.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_constraint('sub.C4.y', lower=0.0) + 
model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_2ivcs_fdgroupwith2pars_2sinks(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + model.add_subsystem('p2', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + par2 = sub.add_subsystem('par2', om.ParallelGroup()) + par2.add_subsystem('C1a', om.ExecComp('y = 2.*x', shape=size)) + par2.add_subsystem('C2a', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x*.5', shape=size)) + sub.add_subsystem('C4', om.ExecComp('y = x*3.', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + model.connect('p2.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.par2.C1a.x') + model.connect('sub.par.C2.y', 'sub.par2.C2a.x') + model.connect('sub.par2.C1a.y', 'sub.C3.x') + model.connect('sub.par2.C2a.y', 'sub.C4.x') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p2.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_constraint('sub.par2.C1a.y', lower=0.0) + model.add_constraint('sub.par2.C2a.y', lower=0.0) + model.add_constraint('sub.C4.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_2ivcs_fdgroupwith2pars_1sink(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + model.add_subsystem('p2', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + par2 = sub.add_subsystem('par2', om.ParallelGroup()) + par2.add_subsystem('C1a', om.ExecComp('y = 2.*x', shape=size)) + par2.add_subsystem('C2a', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x1 + x2', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + model.connect('p2.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.par2.C1a.x') + model.connect('sub.par.C2.y', 'sub.par2.C2a.x') + model.connect('sub.par2.C1a.y', 'sub.C3.x1') + model.connect('sub.par2.C2a.y', 'sub.C3.x2') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p2.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_constraint('sub.par2.C1a.y', lower=0.0) + model.add_constraint('sub.par2.C2a.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_2ivcs_fdgroupwithcrisscrosspars_2sinks(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + model.add_subsystem('p2', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + par2 = sub.add_subsystem('par2', om.ParallelGroup()) + 
par2.add_subsystem('C1a', om.ExecComp('y = 2.*x', shape=size)) + par2.add_subsystem('C2a', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x*.5', shape=size)) + sub.add_subsystem('C4', om.ExecComp('y = x*3.', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + model.connect('p2.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.par2.C2a.x') + model.connect('sub.par.C2.y', 'sub.par2.C1a.x') + model.connect('sub.par2.C1a.y', 'sub.C3.x') + model.connect('sub.par2.C2a.y', 'sub.C4.x') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p2.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_constraint('sub.par2.C1a.y', lower=0.0) + model.add_constraint('sub.par2.C2a.y', lower=0.0) + model.add_constraint('sub.C4.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + +def _setup_2ivcs_fdgroupwithcrisscrosspars_1sink(size=7): + prob = om.Problem() + model = prob.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones((size, )))) + model.add_subsystem('p2', om.IndepVarComp('x', np.ones((size, )))) + + sub = model.add_subsystem('sub', om.Group()) + par = sub.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem('C1', om.ExecComp('y = 2.*x', shape=size)) + par.add_subsystem('C2', om.ExecComp('y = 3.*x', shape=size)) + + par2 = sub.add_subsystem('par2', om.ParallelGroup()) + par2.add_subsystem('C1a', om.ExecComp('y = 2.*x', shape=size)) + par2.add_subsystem('C2a', om.ExecComp('y = 3.*x', shape=size)) + + sub.add_subsystem('C3', om.ExecComp('y = x1 + x2', shape=size)) + + model.connect('p.x', 'sub.par.C1.x') + model.connect('p2.x', 'sub.par.C2.x') + model.connect('sub.par.C1.y', 'sub.par2.C2a.x') + model.connect('sub.par.C2.y', 'sub.par2.C1a.x') + model.connect('sub.par2.C1a.y', 'sub.C3.x1') + model.connect('sub.par2.C2a.y', 'sub.C3.x2') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p2.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.par.C1.y', lower=0.0) + model.add_constraint('sub.par.C2.y', lower=0.0) + model.add_constraint('sub.par2.C1a.y', lower=0.0) + model.add_constraint('sub.par2.C2a.y', lower=0.0) + model.add_objective('sub.C3.y', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestFDWithParallelSubGroups(unittest.TestCase): + + N_PROCS = 2 + + def test_group_fd_inner_par_fwd(self): + prob = _setup_1ivc_fdgroupwithpar_1sink(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd'), atol=3e-6) + + def test_group_fd_inner_par_rev(self): + prob = _setup_1ivc_fdgroupwithpar_1sink(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + # assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + assert_check_totals(prob.check_totals(method='fd', show_only_incorrect=True), atol=3e-6) + + def test_group_fd_inner_par2_fwd(self): + prob = _setup_2ivcs_fdgroupwithpar_2sinks(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_group_fd_inner_par2_rev(self): + prob = _setup_2ivcs_fdgroupwithpar_2sinks(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + 
assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwithcrisscrosspars_2sinks_fwd(self): + prob = _setup_2ivcs_fdgroupwithcrisscrosspars_2sinks(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwithcrisscrosspars_2sinks_rev(self): + prob = _setup_2ivcs_fdgroupwithcrisscrosspars_2sinks(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwith2pars_2sinks_fwd(self): + prob = _setup_2ivcs_fdgroupwith2pars_2sinks(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwith2pars_2sinks_rev(self): + prob = _setup_2ivcs_fdgroupwith2pars_2sinks(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwithcrisscrosspars_1sink_fwd(self): + prob = _setup_2ivcs_fdgroupwithcrisscrosspars_1sink(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwithcrisscrosspars_1sink_rev(self): + prob = _setup_2ivcs_fdgroupwithcrisscrosspars_1sink(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwith2pars_1sink_fwd(self): + prob = _setup_2ivcs_fdgroupwith2pars_1sink(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_2ivcs_fdgroupwith2pars_1sink_rev(self): + prob = _setup_2ivcs_fdgroupwith2pars_1sink(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_group_fd_inner_par2ivcs_fwd(self): + prob = _setup_2ivcs_fdgroupwithpar_1sink(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_group_fd_inner_par2ivcs_rev(self): + prob = _setup_2ivcs_fdgroupwithpar_1sink(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd'), atol=3e-6) + + def test_group_fd_inner_par_indirect_fwd(self): + prob = _setup_1ivc_dum_fdgroupwithpar_1sink(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_group_fd_inner_par_indirect_rev(self): + prob = _setup_1ivc_dum_fdgroupwithpar_1sink(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + # assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + assert_check_totals(prob.check_totals(method='fd', show_only_incorrect=True), atol=3e-6) + + def test_group_fd_inner_par_indirect2_fwd(self): + prob = 
_setup_1ivc_dum_fdgroupwithpar_1sink_c4(size=7) + prob.setup(mode='fwd', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_group_fd_inner_par_indirect2_rev(self): + prob = _setup_1ivc_dum_fdgroupwithpar_1sink_c4(size=7) + prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + # assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + assert_check_totals(prob.check_totals(method='fd', show_only_incorrect=True), atol=3e-6) + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestFDWithParallelSubGroups4(TestFDWithParallelSubGroups): + + N_PROCS = 4 + + if __name__ == "__main__": unittest.main() diff --git a/openmdao/core/tests/test_deriv_transfers.py b/openmdao/core/tests/test_deriv_transfers.py index ca348a5b99..22696cefc0 100644 --- a/openmdao/core/tests/test_deriv_transfers.py +++ b/openmdao/core/tests/test_deriv_transfers.py @@ -26,10 +26,8 @@ def _test_func_name(func, num, param): args = [] for p in param.args: - if isinstance(p, str): - p = {p} - elif not isinstance(p, Iterable): - p = {p} + if isinstance(p, str) or not isinstance(p, Iterable): + p = [p] for item in p: try: arg = item.__name__ diff --git a/openmdao/core/tests/test_distrib_derivs.py b/openmdao/core/tests/test_distrib_derivs.py index 4ff53d0a2a..0cd685a4be 100644 --- a/openmdao/core/tests/test_distrib_derivs.py +++ b/openmdao/core/tests/test_distrib_derivs.py @@ -77,8 +77,6 @@ def setup(self): allvars.update(v) sizes, offsets = evenly_distrib_idxs(comm.size, self.arr_size) - start = offsets[rank] - end = start + sizes[rank] for name in outs: if name not in kwargs or not isinstance(kwargs[name], dict): @@ -115,6 +113,37 @@ def compute(self, inputs, outputs): outputs['outvec'] = inputs['invec'] * 3.0 +class SimpleMixedDistrib2(om.ExplicitComponent): + + def setup(self): + self.add_input('in_dist', shape_by_conn=True, distributed=True) + self.add_input('in_nd', shape_by_conn=True) + self.add_output('out_dist', copy_shape='in_dist', distributed=True) + self.add_output('out_nd', copy_shape='in_nd') + + def compute(self, inputs, outputs): + outputs['out_nd'] = inputs['in_nd'] * 3. + outputs['out_dist'] = inputs['in_dist'] * 5. + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == 'fwd': + if 'out_dist' in d_outputs: + if 'in_dist' in d_inputs: + d_outputs['out_dist'] += 5. * d_inputs['in_dist'] + if 'out_nd' in d_outputs: + if 'in_nd' in d_inputs: + d_outputs['out_nd'] += 3. * d_inputs['in_nd'] + + else: + if 'out_dist' in d_outputs: + if 'in_dist' in d_inputs: + d_inputs['in_dist'] += 5. * d_outputs['out_dist'] + + if 'out_nd' in d_outputs: + if 'in_nd' in d_inputs: + d_inputs['in_nd'] += 3. 
* d_outputs['out_nd'] + + class MixedDistrib2(om.ExplicitComponent): # for double diamond case def setup(self): @@ -179,6 +208,133 @@ def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): d_inputs['in_nd'] += dg_dIs * d_outputs['out_nd'] +def _setup_ivc_subivc_dist_parab_sum(): + size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + ivc.add_output('y', np.ones((size, ))) + + model.add_subsystem('p', ivc, promotes=['*']) + sub = model.add_subsystem('sub', om.Group(), promotes=['*']) + + ivc2 = om.IndepVarComp() + ivc2.add_output('a', -3.0 + 0.6 * np.arange(size)) + + sub.add_subsystem('p2', ivc2, promotes=['*']) + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size)), + promotes_inputs=['*']) + + sub.add_subsystem("parab", DistParab(arr_size=size), promotes_outputs=['*'], promotes_inputs=['a']) + model.add_subsystem('sum', om.ExecComp('f_sum = sum(xd)', + f_sum=np.ones((size, )), + xd=np.ones((size, ))), + promotes_outputs=['*']) + + model.promotes('sum', inputs=['xd']) + + sub.connect('dummy.xd', 'parab.x') + sub.connect('dummy.yd', 'parab.y') + + model.add_design_var('x', lower=-50.0, upper=50.0) + model.add_design_var('y', lower=-50.0, upper=50.0) + model.add_constraint('f_xy', lower=0.0) + model.add_objective('f_sum', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_ivc_sub_ivcdistparabcons_nosum(): + # distrib comp is inside of fd group but not a response, and 2 nondistrib + # constraints connect to it downstream, both inside and outside of the fd group. + size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + ivc.add_output('y', np.ones((size, ))) + + model.add_subsystem('p', ivc) + sub = model.add_subsystem('sub', om.Group()) + + sub.add_subsystem('p2', om.IndepVarComp('a', -3.0 + 0.6 * np.arange(size))) + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size))) + + sub.add_subsystem("parab", DistParab(arr_size=size)) + sub.add_subsystem("cons", om.ExecComp("c = x*3. + 7.", x=np.ones(size), c=np.ones(size))) + # model.add_subsystem('sum', om.ExecComp('f_sum = sum(f_xy)', f_sum=np.ones((size, )), f_xy=np.ones((size, )))) + + model.connect('p.x', 'sub.dummy.x') + model.connect('p.y', 'sub.dummy.y') + model.connect('sub.p2.a', 'sub.parab.a') + model.connect('sub.dummy.xd', 'sub.parab.x') + model.connect('sub.dummy.yd', 'sub.parab.y') + # model.connect('sub.parab.f_xy', 'sum.f_xy', src_indices=om.slicer[:]) + model.connect('sub.parab.f_xy', 'sub.cons.x', src_indices=om.slicer[:]) + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p.y', lower=-50.0, upper=50.0) + model.add_constraint('sub.cons.c', lower=0.0) + # model.add_objective('sum.f_sum', index=-1) + + sub.approx_totals(method='fd') + + return prob + + +def _setup_ivc_subivcdistparabconssum_in_sub(): + # distrib comp is inside of fd group but not a response, and 2 nondistrib + # constraints connect to it downstream, both inside of the fd group. 
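+    # The full distributed output 'parab.f_xy' feeds two non-distributed consumers
+    # ('cons' and 'sum') through src_indices=om.slicer[:], so the FD approximation over
+    # 'sub' has to handle a distributed output that only reaches the constraint and
+    # objective through serial downstream variables.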
+ size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + ivc.add_output('y', np.ones((size, ))) + + model.add_subsystem('p', ivc) + sub = model.add_subsystem('sub', om.Group()) + + sub.add_subsystem('p2', om.IndepVarComp('a', -3.0 + 0.6 * np.arange(size))) + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size))) + + sub.add_subsystem("parab", DistParab(arr_size=size)) + sub.add_subsystem("cons", om.ExecComp("c = x*3. + 7.", x=np.ones(size), c=np.ones(size))) + sub.add_subsystem('sum', om.ExecComp('f_sum = sum(f_xy)', f_sum=np.ones((size, )), f_xy=np.ones((size, )))) + + model.connect('p.x', 'sub.dummy.x') + model.connect('p.y', 'sub.dummy.y') + model.connect('sub.p2.a', 'sub.parab.a') + model.connect('sub.dummy.xd', 'sub.parab.x') + model.connect('sub.dummy.yd', 'sub.parab.y') + model.connect('sub.parab.f_xy', 'sub.sum.f_xy', src_indices=om.slicer[:]) + model.connect('sub.parab.f_xy', 'sub.cons.x', src_indices=om.slicer[:]) + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p.y', lower=-50.0, upper=50.0) + model.add_constraint('sub.cons.c', lower=0.0) + model.add_objective('sub.sum.f_sum', index=-1) + + sub.approx_totals(method='fd') + + return prob + + def _test_func_name(func, num, param): args = [] for p in param.args: @@ -631,19 +787,16 @@ def test_distrib_voi_fd(self): assert_near_equal(J['sum.f_sum', 'p.x']['abs error'].reverse, 0.0, 1e-5) assert_near_equal(J['sum.f_sum', 'p.y']['abs error'].reverse, 0.0, 1e-5) - def test_distrib_voi_group_fd(self): + def _setup_distrib_voi_group_fd(self, mode, size=7): # Only supports groups where the inputs to the distributed component whose inputs are # distributed to procs via src_indices don't cross the boundary. 
-        size = 7
-
         prob = om.Problem()
         model = prob.model

-        ivc = om.IndepVarComp()
+        ivc = model.add_subsystem('p', om.IndepVarComp(), promotes=['*'])
         ivc.add_output('x', np.ones((size, )))
         ivc.add_output('y', np.ones((size, )))

-        model.add_subsystem('p', ivc, promotes=['*'])
         sub = model.add_subsystem('sub', om.Group(), promotes=['*'])

         ivc2 = om.IndepVarComp()
@@ -673,10 +826,15 @@ def test_distrib_voi_group_fd(self):

         sub.approx_totals(method='fd')

-        prob.setup(mode='fwd', force_alloc_complex=True)
+        prob.setup(mode=mode, force_alloc_complex=True)

         prob.run_model()

+        return prob
+
+    def test_distrib_voi_group_fd_fwd(self):
+        size = 7
+        prob = self._setup_distrib_voi_group_fd('fwd', size)
         desvar = prob.driver.get_design_var_values()
         con = prob.driver.get_constraint_values()
@@ -685,18 +843,11 @@ def test_distrib_voi_group_fd(self):
                           np.array([27.0, 24.96, 23.64, 23.04, 23.16, 24.0, 25.56]),
                           1e-6)

-        J = prob.check_totals(method='fd', show_only_incorrect=True)
-        assert_near_equal(J['sub.parab.f_xy', 'p.x']['abs error'].forward, 0.0, 1e-5)
-        assert_near_equal(J['sub.parab.f_xy', 'p.y']['abs error'].forward, 0.0, 1e-5)
-        assert_near_equal(J['sub.sum.f_sum', 'p.x']['abs error'].forward, 0.0, 1e-5)
-        assert_near_equal(J['sub.sum.f_sum', 'p.y']['abs error'].forward, 0.0, 1e-5)
-
-        # rev mode
-
-        prob.setup(mode='rev', force_alloc_complex=True)
-
-        prob.run_model()
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None), rtol=1e-5)
+
+    def test_distrib_voi_group_fd_rev(self):
+        size = 7
+        prob = self._setup_distrib_voi_group_fd('rev', size)
         desvar = prob.driver.get_design_var_values()
         con = prob.driver.get_constraint_values()
@@ -705,11 +856,281 @@ def test_distrib_voi_group_fd(self):
                           np.array([27.0, 24.96, 23.64, 23.04, 23.16, 24.0, 25.56]),
                           1e-6)

-        J = prob.check_totals(method='fd', show_only_incorrect=True)
-        assert_near_equal(J['sub.parab.f_xy', 'p.x']['abs error'].reverse, 0.0, 1e-5)
-        assert_near_equal(J['sub.parab.f_xy', 'p.y']['abs error'].reverse, 0.0, 1e-5)
-        assert_near_equal(J['sub.sum.f_sum', 'p.x']['abs error'].reverse, 0.0, 1e-5)
-        assert_near_equal(J['sub.sum.f_sum', 'p.y']['abs error'].reverse, 0.0, 1e-5)
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None), rtol=1e-5)
+
+    def test_distrib_voi_group_fd2_fwd(self):
+        prob = _setup_ivc_subivc_dist_parab_sum()
+        prob.setup(mode='fwd', force_alloc_complex=True)
+        prob.run_model()
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None))
+
+    def test_distrib_voi_group_fd2_rev(self):
+        prob = _setup_ivc_subivc_dist_parab_sum()
+        prob.setup(mode='rev', force_alloc_complex=True)
+        prob.run_model()
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None))
+
+    def test_distrib_voi_group_fd4_fwd(self):
+        prob = _setup_ivc_sub_ivcdistparabcons_nosum()
+        prob.setup(mode='fwd', force_alloc_complex=True)
+        prob.run_model()
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6)
+
+    def test_distrib_voi_group_fd4_rev(self):
+        prob = _setup_ivc_sub_ivcdistparabcons_nosum()
+        prob.setup(mode='rev', force_alloc_complex=True)
+        prob.run_model()
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6)
+
+    def test_distrib_voi_group_fd5_fwd(self):
+        prob = _setup_ivc_subivcdistparabconssum_in_sub()
+        prob.setup(mode='fwd', force_alloc_complex=True)
+        prob.run_model()
+        assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6)
+
+    def test_distrib_voi_group_fd5_rev(self):
+        prob = _setup_ivc_subivcdistparabconssum_in_sub()
+
prob.setup(mode='rev', force_alloc_complex=True) + prob.run_model() + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_distrib_voi_group_fd_loop(self): + # distrib comp is inside of fd group and part of a loop. + size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + + model.add_subsystem('p', ivc) + sub = model.add_subsystem('sub', om.Group()) + + sub.add_subsystem('p2', om.IndepVarComp('a', -3.0 + 0.6 * np.arange(size))) + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size))) + + sub.add_subsystem("parab", DistParab(arr_size=size)) + sub.add_subsystem("cons", om.ExecComp("c = x*3. + 7.", x=np.ones(size), c=np.ones(size))) + sub.add_subsystem('sum', om.ExecComp('f_sum = sum(f_xy)', f_sum=np.ones((size, )), f_xy=np.ones((size, )))) + + model.connect('p.x', 'sub.dummy.x') + model.connect('sub.p2.a', 'sub.parab.a') + model.connect('sub.dummy.xd', 'sub.parab.x') + model.connect('sub.dummy.yd', 'sub.parab.y') + model.connect('sub.parab.f_xy', 'sub.sum.f_xy', src_indices=om.slicer[:]) + model.connect('sub.parab.f_xy', 'sub.cons.x', src_indices=om.slicer[:]) + model.connect('sub.cons.c', 'sub.dummy.y') + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_constraint('sub.cons.c', lower=0.0) + model.add_objective('sub.sum.f_sum', index=-1) + + sub.approx_totals(method='fd') + + prob.setup(mode='fwd', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + # rev mode + + prob.setup(mode='rev', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_distrib_voi_group_nofd(self): + # distrib comp output feeds two nondist inputs + size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + ivc.add_output('y', np.ones((size, ))) + + model.add_subsystem('p', ivc) + + model.add_subsystem('p2', om.IndepVarComp('a', -3.0 + 0.6 * np.arange(size))) + model.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size))) + + model.add_subsystem("parab", DistParab(arr_size=size)) + model.add_subsystem("cons", om.ExecComp("c = x*3. 
+ 7.", x=np.ones(size), c=np.ones(size))) + model.add_subsystem('sum', om.ExecComp('f_sum = sum(f_xy)', f_sum=np.ones((size, )), f_xy=np.ones((size, )))) + + model.connect('p.x', 'dummy.x') + model.connect('p.y', 'dummy.y') + model.connect('p2.a', 'parab.a') + model.connect('dummy.xd', 'parab.x') + model.connect('dummy.yd', 'parab.y') + model.connect('parab.f_xy', 'sum.f_xy', src_indices=om.slicer[:]) + model.connect('parab.f_xy', 'cons.x', src_indices=om.slicer[:]) + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p.y', lower=-50.0, upper=50.0) + model.add_constraint('cons.c', lower=0.0) + model.add_objective('sum.f_sum', index=-1) + + prob.setup(mode='fwd', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='cs', out_stream=None), atol=3e-6) + + # rev mode + + prob.setup(mode='rev', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='cs', out_stream=None), atol=3e-6) + + def test_nondistrib_voi_group_fd2(self): + # nondistrib comp is inside of fd group but not a response, and 2 nondistrib + # constraints connect to it downstream, both inside of the fd group. + size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + ivc.add_output('y', np.ones((size, ))) + + model.add_subsystem('p', ivc) + sub = model.add_subsystem('sub', om.Group()) + + sub.add_subsystem('p2', om.IndepVarComp('a', -3.0 + 0.6 * np.arange(size))) + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size))) + + sub.add_subsystem("parab", om.ExecComp('f_xy = x**2 + 3.*xy - y*y + a', shape=size)) + sub.add_subsystem("cons", om.ExecComp("c = x*3. 
+ 7.", x=np.ones(size), c=np.ones(size))) + sub.add_subsystem('sum', om.ExecComp('f_sum = sum(f_xy)', f_sum=np.ones((size, )), f_xy=np.ones((size, )))) + + model.connect('p.x', 'sub.dummy.x') + model.connect('p.y', 'sub.dummy.y') + model.connect('sub.p2.a', 'sub.parab.a') + model.connect('sub.dummy.xd', 'sub.parab.x') + model.connect('sub.dummy.yd', 'sub.parab.y') + model.connect('sub.parab.f_xy', 'sub.sum.f_xy', src_indices=om.slicer[:]) + model.connect('sub.parab.f_xy', 'sub.cons.x', src_indices=om.slicer[:]) + + model.add_design_var('p.x', lower=-50.0, upper=50.0) + model.add_design_var('p.y', lower=-50.0, upper=50.0) + model.add_constraint('sub.cons.c', lower=0.0) + model.add_objective('sub.sum.f_sum', index=-1) + + sub.approx_totals(method='fd') + + prob.setup(mode='fwd', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + # rev mode + + prob.setup(mode='rev', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=3e-6) + + def test_simple_distrib_voi_group_fd(self): + size = 3 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones(size)) + ivc.add_output('y', np.ones(size)) + + model.add_subsystem('p', ivc, promotes=['*']) + sub = model.add_subsystem('sub', om.Group(), promotes=['*']) + + ivc2 = om.IndepVarComp() + ivc2.add_output('a', -3.0 + 0.6 * np.arange(size)) + + sub.add_subsystem('p2', ivc2, promotes=['*']) + + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size)), + promotes_inputs=['*']) + + sub.add_subsystem("parab", DistParab(arr_size=size), promotes_outputs=['*'], promotes_inputs=['a']) + + sub.connect('dummy.xd', 'parab.x') + sub.connect('dummy.yd', 'parab.y') + + model.add_design_var('x', lower=-50.0, upper=50.0) + model.add_design_var('y', lower=-50.0, upper=50.0) + model.add_constraint('f_xy', lower=0.0) + + sub.approx_totals(method='fd') + + prob.setup(mode='rev', force_alloc_complex=True) + + prob.run_model() + + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=1e-5) + + def test_nondistrib_voi_group_fd(self): + size = 7 + + prob = om.Problem() + model = prob.model + + ivc = om.IndepVarComp() + ivc.add_output('x', np.ones((size, ))) + ivc.add_output('y', np.ones((size, ))) + + model.add_subsystem('p', ivc, promotes=['*']) + sub = model.add_subsystem('sub', om.Group(), promotes=['*']) + + ivc2 = om.IndepVarComp() + ivc2.add_output('a', -3.0 + 0.6 * np.arange(size)) + + sub.add_subsystem('p2', ivc2, promotes=['*']) + sub.add_subsystem('dummy', om.ExecComp(['xd = x', "yd = y"], + x=np.ones(size), xd=np.ones(size), + y=np.ones(size), yd=np.ones(size)), + promotes_inputs=['*']) + + sub.add_subsystem("parab", om.ExecComp('f_xy = x**2 + 3.*xy - y*y + a', shape=size), promotes_outputs=['*'], promotes_inputs=['a']) + model.add_subsystem('sum', om.ExecComp('f_sum = sum(f_xy)', + f_sum=np.ones((size, )), + f_xy=np.ones((size, ))), + promotes_outputs=['*']) + + model.promotes('sum', inputs=['f_xy'], src_indices=om.slicer[:]) + + sub.connect('dummy.xd', 'parab.x') + sub.connect('dummy.yd', 'parab.y') + + model.add_design_var('x', lower=-50.0, upper=50.0) + model.add_design_var('y', lower=-50.0, upper=50.0) + model.add_constraint('f_xy', lower=0.0) + model.add_objective('f_sum', index=-1) + + sub.approx_totals(method='fd') + + prob.setup(mode='rev', force_alloc_complex=True) + + 
prob.run_model() + + assert_check_totals(prob.check_totals(method='fd', out_stream=None)) def test_distrib_group_fd_unsupported_config(self): size = 7 @@ -815,37 +1236,15 @@ def compute(self, inputs, outputs): model.add_constraint('ndp2.g', lower=0.0) model.add_objective('f_sum', index=-1) - mode_idx = {'fwd': 0, 'rev': 1} for mode in ['fwd', 'rev']: prob.setup(mode=mode, force_alloc_complex=True) prob.run_model() - J = prob.check_totals(method='fd', show_only_incorrect=True) - assert_near_equal(J['parab.f_xy', 'p.x']['abs error'][mode_idx[mode]], 0.0, 1e-5) - assert_near_equal(J['parab.f_xy', 'p.y']['abs error'][mode_idx[mode]], 0.0, 1e-5) - assert_near_equal(J['ndp.g', 'p.x']['abs error'][mode_idx[mode]], 0.0, 2e-5) - assert_near_equal(J['ndp.g', 'p.y']['abs error'][mode_idx[mode]], 0.0, 2e-5) - assert_near_equal(J['parab2.f_xy', 'p.x2']['abs error'][mode_idx[mode]], 0.0, 1e-5) - assert_near_equal(J['parab2.f_xy', 'p.y2']['abs error'][mode_idx[mode]], 0.0, 1e-5) - assert_near_equal(J['ndp2.g', 'p.x2']['abs error'][mode_idx[mode]], 0.0, 2e-5) - assert_near_equal(J['ndp2.g', 'p.y2']['abs error'][mode_idx[mode]], 0.0, 2e-5) - assert_near_equal(J['sum.f_sum', 'p.x']['abs error'][mode_idx[mode]], 0.0, 1e-5) - assert_near_equal(J['sum.f_sum', 'p.y']['abs error'][mode_idx[mode]], 0.0, 1e-5) - - J = prob.check_totals(method='cs', show_only_incorrect=True) - assert_near_equal(J['parab.f_xy', 'p.x']['abs error'][mode_idx[mode]], 0.0, 1e-14) - assert_near_equal(J['parab.f_xy', 'p.y']['abs error'][mode_idx[mode]], 0.0, 1e-14) - assert_near_equal(J['ndp.g', 'p.x']['abs error'][mode_idx[mode]], 0.0, 1e-13) - assert_near_equal(J['ndp.g', 'p.y']['abs error'][mode_idx[mode]], 0.0, 1e-13) - assert_near_equal(J['parab2.f_xy', 'p.x2']['abs error'][mode_idx[mode]], 0.0, 1e-14) - assert_near_equal(J['parab2.f_xy', 'p.y2']['abs error'][mode_idx[mode]], 0.0, 1e-14) - assert_near_equal(J['ndp2.g', 'p.x2']['abs error'][mode_idx[mode]], 0.0, 1e-13) - assert_near_equal(J['ndp2.g', 'p.y2']['abs error'][mode_idx[mode]], 0.0, 1e-13) - assert_near_equal(J['sum.f_sum', 'p.x']['abs error'][mode_idx[mode]], 0.0, 1e-14) - assert_near_equal(J['sum.f_sum', 'p.y']['abs error'][mode_idx[mode]], 0.0, 1e-14) - - def run_mixed_distrib2_prob(self, mode): + assert_check_totals(prob.check_totals(method='fd', out_stream=None), atol=2e-5, rtol=2e-5) + assert_check_totals(prob.check_totals(method='cs', out_stream=None), rtol=1e-13) + + def run_mixed_distrib2_prob(self, mode, klass=MixedDistrib2): size = 5 comm = MPI.COMM_WORLD rank = comm.rank @@ -859,7 +1258,7 @@ def run_mixed_distrib2_prob(self, mode): ivc.add_output('x_nd', np.zeros(size)) model.add_subsystem("indep", ivc) - model.add_subsystem("D1", MixedDistrib2()) + model.add_subsystem("D1", klass()) model.connect('indep.x_dist', 'D1.in_dist') model.connect('indep.x_nd', 'D1.in_nd') @@ -891,6 +1290,12 @@ def test_distrib_mixeddistrib2_totals_rev(self): totals = prob.check_totals(show_only_incorrect=True, method='cs') assert_check_totals(totals) + def test_distrib_simplemixeddistrib2_totals_rev(self): + prob = self.run_mixed_distrib2_prob('rev', klass=SimpleMixedDistrib2) + + totals = prob.check_totals(show_only_incorrect=True, method='cs') + assert_check_totals(totals) + def test_distrib_mixeddistrib2_partials_rev(self): prob = self.run_mixed_distrib2_prob('rev') @@ -2113,7 +2518,7 @@ def test_check_totals_rev_old(self): assert_check_totals(data) msg = "During total derivative computation, the following partial derivatives resulted in serial inputs that were inconsistent across 
processes: ['D1.out_dist wrt D1.in_nd']." - self.assertEqual(str(cm.exception), msg) + self.assertTrue(msg in str(cm.exception)) def test_check_partials_cs_old(self): prob = self.get_problem(Distrib_Derivs_Matfree_Old) @@ -2221,6 +2626,105 @@ def test_constraint_aliases(self): totals = prob.check_totals(method='cs', out_stream=None) self._compare_totals(totals) + def test_dist_desvar_dist_input(self): + class SimpleSum(om.ExplicitComponent): + """Simple component to sum distributed vector""" + + def setup(self): + # Inputs + self.add_input('x', 1.0, shape=[2], distributed=True) + + # Outputs + self.add_output('sum', 0.0) + + def compute(self, inputs, outputs): + outputs['sum'] = self.comm.allreduce(np.sum(inputs["x"])) + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == "fwd": + d_outputs['sum'] += self.comm.allreduce(np.sum(d_inputs["x"])) + if mode == "rev": + d_inputs["x"] += d_outputs['sum'] * np.ones(2) + + prob = om.Problem() + prob.model.add_subsystem("ivc", om.IndepVarComp("x", 1.0, shape=[2], distributed=True)) + prob.model.connect("ivc.x", "ParallelSum.x") + prob.model.add_subsystem("ParallelSum", SimpleSum()) + + prob.setup(mode='rev') + + prob.run_model() + assert_check_totals(prob.check_totals("ParallelSum.sum", "ivc.x")) + + +class DummyComp(om.ExplicitComponent): + def initialize(self): + self.options.declare('a',default=0.) + self.options.declare('b',default=0.) + + def setup(self): + self.add_input('x') + self.add_output('y', 0.) + + def compute(self, inputs, outputs): + outputs['y'] = self.options['a']*inputs['x'] + self.options['b'] + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode=='rev': + if 'y' in d_outputs: + if 'x' in d_inputs: + d_inputs['x'] += self.options['a'] * d_outputs['y'] + else: + if 'y' in d_outputs: + if 'x' in d_inputs: + d_outputs['y'] += self.options['a'] * d_inputs['x'] + +class DummyGroup(om.ParallelGroup): + def setup(self): + self.add_subsystem('C1',DummyComp(a=1,b=2.)) + self.add_subsystem('C2',DummyComp(a=3.,b=4.)) + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestLocalSrcIndsParColoring2(unittest.TestCase): + N_PROCS = 2 + + def check_model(self, mode): + # this uses parallel coloring with src_indices indexing into a local array + prob = om.Problem() + model = prob.model + model.add_subsystem('dvs',om.IndepVarComp('x',[1.,2.]), promotes=['*']) + model.add_subsystem('par',DummyGroup()) + model.connect('x','par.C1.x',src_indices=[0]) + model.connect('x','par.C2.x',src_indices=[1]) + + model.add_design_var('x',lower=0.,upper=1.) 
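+        # 'parallel_deriv_color' groups the two constraints below so their total derivatives
+        # are computed simultaneously (in rev mode, one adjoint solve per proc) rather than
+        # sequentially.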
+ + # compute derivatives for made-up y constraints in parallel + model.add_constraint('par.C1.y', lower=1.0, parallel_deriv_color='deriv_color') + model.add_constraint('par.C2.y', lower=1.0, parallel_deriv_color='deriv_color') + + prob.setup(mode=mode) + prob.run_model() + assert_check_totals(prob.check_totals(out_stream=None)) + + def test_local_src_inds_fwd(self): + self.check_model(mode='fwd') + + def test_local_src_inds_rev(self): + self.check_model(mode='rev') + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestLocalSrcIndsParColoring3(TestLocalSrcIndsParColoring2): + N_PROCS = 3 + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestLocalSrcIndsParColoring4(TestLocalSrcIndsParColoring2): + N_PROCS = 4 + + if __name__ == "__main__": from openmdao.utils.mpi import mpirun_tests diff --git a/openmdao/core/tests/test_distribcomp.py b/openmdao/core/tests/test_distribcomp.py index 9b44643f78..a4fb53c686 100644 --- a/openmdao/core/tests/test_distribcomp.py +++ b/openmdao/core/tests/test_distribcomp.py @@ -1056,8 +1056,8 @@ def initialize(self): desc="Size of input vector x.") def setup(self): - self.add_input('x', np.ones(self.options['size'])) - self.add_output('y', 1.0) + self.add_input('x', np.ones(self.options['size']), distributed=True) + self.add_output('y', 1.0, distributed=True) def compute(self, inputs, outputs): outputs['y'] = np.sum(inputs['x'])*2.0 @@ -1068,7 +1068,7 @@ def compute(self, inputs, outputs): promotes_outputs=['x']) # decide what parts of the array we want based on our rank - if self.comm.rank == 0: + if p.comm.rank == 0: idxs = [0, 1, 2] else: # use [3, -1] here rather than [3, 4] just to show that we @@ -1083,12 +1083,12 @@ def compute(self, inputs, outputs): p.run_model() # each rank holds the assigned portion of the input array - assert_near_equal(p['C1.x'], + assert_near_equal(p.get_val('C1.x', get_remote=False), np.arange(3, dtype=float) if p.model.C1.comm.rank == 0 else np.arange(3, 5, dtype=float)) # the output in each rank is based on the local inputs - assert_near_equal(p['C1.y'], 6. if p.model.C1.comm.rank == 0 else 14.) + assert_near_equal(p.get_val('C1.y', get_remote=False), 6. if p.model.C1.comm.rank == 0 else 14.) if __name__ == '__main__': diff --git a/openmdao/core/tests/test_driver.py b/openmdao/core/tests/test_driver.py index d99b948f7e..e88d9c5778 100644 --- a/openmdao/core/tests/test_driver.py +++ b/openmdao/core/tests/test_driver.py @@ -11,7 +11,7 @@ import openmdao.api as om from openmdao.core.driver import Driver from openmdao.utils.units import convert_units -from openmdao.utils.assert_utils import assert_near_equal, assert_warnings +from openmdao.utils.assert_utils import assert_near_equal, assert_warnings, assert_check_totals from openmdao.utils.general_utils import printoptions from openmdao.utils.testing_utils import use_tempdirs from openmdao.test_suite.components.paraboloid import Paraboloid @@ -1012,7 +1012,9 @@ def compute_partials(self, inputs, J): # non-distributed indep var, 'w' ivc = model.add_subsystem('ivc', om.IndepVarComp(distributed=False), promotes=['*']) - ivc.add_output('w', size) + # previous version of this test set w to 2 different values depending on rank, which + # is not valid for a non-distributed variable. Changed to be the same on all procs. 
+ ivc.add_output('w', 3.0) # distributed component, 'dc' model.add_subsystem('dc', DistribComp(), promotes=['*']) @@ -1025,7 +1027,6 @@ def compute_partials(self, inputs, J): driver.supports._read_only = False driver.supports['distributed_design_vars'] = True - # run model p.setup() p.run_model() @@ -1053,10 +1054,9 @@ def compute_partials(self, inputs, J): assert_near_equal(driver.get_design_var_values(get_remote=True)['d_ivc.x'], [5, 5, 5, 9, 9]) - # run driver p.run_driver() - assert_near_equal(p.get_val('dc.y', get_remote=True), [81, 96]) + assert_near_equal(p.get_val('dc.y', get_remote=True), [81, 81]) assert_near_equal(p.get_val('dc.z', get_remote=True), [25, 25, 25, 81, 81]) def test_distrib_desvar_bug(self): diff --git a/openmdao/core/tests/test_mixed_dist.py b/openmdao/core/tests/test_mixed_dist.py new file mode 100644 index 0000000000..f5e8324c52 --- /dev/null +++ b/openmdao/core/tests/test_mixed_dist.py @@ -0,0 +1,138 @@ +import unittest + +import numpy as np +import openmdao.api as om +from openmdao.utils.mpi import MPI +from openmdao.utils.assert_utils import assert_check_totals + + +if MPI: + try: + from openmdao.vectors.petsc_vector import PETScVector + except ImportError: + PETScVector = None + + dist_shape = 1 if MPI.COMM_WORLD.rank > 0 else 2 +else: + dist_shape = 2 + + +class SerialComp(om.ExplicitComponent): + def setup(self): + self.add_input('x') + self.add_output('y') + + def compute(self, inputs, outputs): + outputs['y'] = 2.0* inputs['x'] + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == 'fwd': + if 'y' in d_outputs: + if 'x' in d_inputs: + d_outputs['y'] += 2.0 * d_inputs['x'] + if mode == 'rev': + if 'y' in d_outputs: + if 'x' in d_inputs: + d_inputs['x'] += 2.0 * d_outputs['y'] + + +class MixedSerialInComp(om.ExplicitComponent): + def setup(self): + self.add_input('x') + self.add_output('yd', shape = dist_shape, distributed=True) + + def compute(self, inputs, outputs): + outputs['yd'][:] = 0.5 * inputs['x'] + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == 'fwd': + if 'yd' in d_outputs: + if 'x' in d_inputs: + d_outputs['yd'] += 0.5 * d_inputs['x'] + if mode == 'rev': + if 'yd' in d_outputs: + if 'x' in d_inputs: + d_inputs['x'] += 0.5 * self.comm.allreduce(np.sum(d_outputs['yd'])) + + +class MixedSerialOutComp(om.ExplicitComponent): + def setup(self): + self.add_input('x') + self.add_input('xd', shape = dist_shape, distributed=True) + self.add_output('y') + + def compute(self, inputs, outputs): + outputs['y'] = 2.0 * inputs['x'] + self.comm.allreduce(3.0 * np.sum(inputs['xd'])) + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == 'fwd': + if 'y' in d_outputs: + if 'x' in d_inputs: + d_outputs['y'] += 2.0 * d_inputs['x'] + if 'xd' in d_inputs: + d_outputs['y'] += 3.0 * self.comm.allreduce(np.sum(d_inputs['xd'])) + if mode == 'rev': + if 'y' in d_outputs: + if 'x' in d_inputs: + d_inputs['x'] += 2.0 * d_outputs['y'] + if 'xd' in d_inputs: + d_inputs['xd'] += 3.0 * d_outputs['y'] + + +class DistComp(om.ExplicitComponent): + def setup(self): + self.add_input('xd', shape = dist_shape, distributed=True) + self.add_output('yd', shape = dist_shape, distributed=True) + + def compute(self, inputs, outputs): + outputs['yd'] = 3.0 * inputs['xd'] + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == 'fwd': + if 'yd' in d_outputs: + if 'xd' in d_inputs: + d_outputs['yd'] += 3.0 * d_inputs['xd'] + if mode == 'rev': + if 'yd' in d_outputs: + 
if 'xd' in d_inputs: + d_inputs['xd'] += 3.0 * d_outputs['yd'] + + +def create_problem(): + prob = om.Problem() + model = prob.model + model.add_subsystem('ivc', om.IndepVarComp('x', val = 1.0)) + + model.add_subsystem('S', SerialComp()) # x -> y + model.add_subsystem('MI', MixedSerialInComp()) # x -> yd + model.add_subsystem('D', DistComp()) # xd -> yd + model.add_subsystem('MO', MixedSerialOutComp()) # x, xd -> y + model.connect('ivc.x', 'S.x') + model.connect('S.y','MI.x') + model.connect('MI.yd', 'D.xd') + model.connect('D.yd', 'MO.xd') + model.connect('S.y', 'MO.x') + model.add_design_var("ivc.x") + model.add_objective("MO.xd", index=0) # adding objective using input name + return prob + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestMixedDist2(unittest.TestCase): + N_PROCS = 2 + + def test_mixed_dist(self): + prob = create_problem() + prob.setup(mode='rev') + prob.run_model() + assert_check_totals(prob.check_totals()) + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestMixedDist3(TestMixedDist2): + N_PROCS = 3 + + +if __name__ == '__main__': + from openmdao.utils.mpi import mpirun_tests + mpirun_tests() diff --git a/openmdao/core/tests/test_mpi_coloring_bug.py b/openmdao/core/tests/test_mpi_coloring_bug.py index 9682dbc2ac..b147375419 100644 --- a/openmdao/core/tests/test_mpi_coloring_bug.py +++ b/openmdao/core/tests/test_mpi_coloring_bug.py @@ -4,7 +4,7 @@ import openmdao.api as om import openmdao.utils.coloring as coloring_mod -from openmdao.utils.assert_utils import assert_near_equal +from openmdao.utils.assert_utils import assert_near_equal, assert_check_totals from openmdao.utils.general_utils import set_pyoptsparse_opt from openmdao.utils.testing_utils import use_tempdirs @@ -510,3 +510,32 @@ def run(self): dd = J['phases.burn1.collocation_constraint.defects:deltav']['phases.burn1.indep_states.states:deltav'] assert_near_equal(dd, np.array([[-0.75, 0.75]]), 1e-6) + + def test_bug2(self): + size = 3 + p = om.Problem() + phases = p.model.add_subsystem('phases', om.ParallelGroup()) + phase1 = phases.add_subsystem('phase1', om.Group()) + phase1.add_subsystem('indep', om.IndepVarComp('x', val=np.ones(size))) + phase1.add_subsystem('comp1', om.ExecComp('y=3.0*x', x=np.ones(size), y=np.ones(size))) + phase1.add_subsystem('comp2', om.ExecComp('y=5.0*x', x=np.ones(size), y=np.ones(size))) + phase1.connect('indep.x', 'comp1.x') + phase1.connect('comp1.y', 'comp2.x') + phase1.add_design_var('indep.x') + phase1.add_constraint('comp2.y', lower=0.0) + phase1.add_constraint('comp1.y', lower=0.0) + + phase2 = phases.add_subsystem('phase2', om.Group()) + phase2.add_subsystem('indep', om.IndepVarComp('x', val=np.ones(size))) + phase2.add_subsystem('comp1', om.ExecComp('y=7.0*x', x=np.ones(size), y=np.ones(size))) + phase2.add_subsystem('comp2', om.ExecComp('y=9.0*x', x=np.ones(size), y=np.ones(size))) + phase2.connect('indep.x', 'comp1.x') + phase2.connect('comp1.y', 'comp2.x') + phase2.add_design_var('indep.x') + phase2.add_constraint('comp2.y', lower=0.0) + phase2.add_constraint('comp1.y', lower=0.0) + + p.setup(mode='rev') + p.run_model() + + assert_check_totals(p.check_totals()) diff --git a/openmdao/core/tests/test_parallel_derivatives.py b/openmdao/core/tests/test_parallel_derivatives.py index cdafeba53c..f8e0488d82 100644 --- a/openmdao/core/tests/test_parallel_derivatives.py +++ b/openmdao/core/tests/test_parallel_derivatives.py @@ -854,63 +854,47 @@ class TestAutoIVCParDerivBug(unittest.TestCase): N_PROCS = 4 
def test_auto_ivc_par_deriv_bug(self): - class SimpleAero(om.ExplicitComponent): - """Simple aerodynamic model""" - - def initialize(self): - self.options.declare( "CLwing", default=0.5, desc="Wing lift factor", ) - self.options.declare( "CLtail", default=0.25, desc="tail lift factor", ) - self.options.declare( "CDwing", default=0.05, desc="Wing drag factor", ) - self.options.declare( "CDtail", default=0.025, desc="tail drag factor", ) + class Simple(om.ExplicitComponent): + def __init__(self, mult1, mult2, mult3, mult4, **kwargs): + super().__init__(**kwargs) + self.mult1 = mult1 + self.mult2 = mult2 + self.mult3 = mult3 + self.mult4 = mult4 def setup(self): - # Inputs - self.add_input('alpha', 0.1, desc="Angle of attack of wing") - self.add_input('tail_angle', 0.01, desc="Angle of attack of tail") + self.add_input('x1', 0.1) + self.add_input('x2', 0.01) - # Outputs - self.add_output('L', 0.0, desc="Total lift") - self.add_output('D', 0.0, desc="Total drag") - - # Set options - self.CLwing = self.options["CLwing"] - self.CDwing = self.options["CDwing"] - self.CLtail = self.options["CLtail"] - self.CDtail = self.options["CDtail"] + self.add_output('y1', 0.0) + self.add_output('y2', 0.0) self.declare_partials(of='*', wrt='*') def compute(self, inputs, outputs): - """ A simple surrogate for a 2 dof aero problem""" - outputs['L'] = self.CLwing * inputs['alpha'] + self.CLtail * inputs['tail_angle'] - outputs['D'] = self.CDwing * inputs['alpha'] ** 2 + self.CDtail * inputs['tail_angle'] ** 2 + outputs['y1'] = self.mult1 * inputs['x1'] + self.mult3 * inputs['x2'] + outputs['y2'] = self.mult2 * inputs['x1'] ** 2 + self.mult4 * inputs['x2'] ** 2 def compute_partials(self, inputs, partials): - """Analytical derivatives""" - partials['L', 'alpha'] = self.CLwing - partials['L', 'tail_angle'] = self.CLtail - partials['D', 'alpha'] = 2 * self.CDwing * inputs['alpha'] - partials['D', 'tail_angle'] = 2 * self.CDtail * inputs['tail_angle'] - - # Use parallel group to solve flight conditions simultaneously - flight_conds = om.ParallelGroup() - flight_conds.add_subsystem("cruise", SimpleAero(CLwing=0.5, CDwing=0.05, CLtail=0.25, CDtail=0.025)) - flight_conds.add_subsystem("maneuver", SimpleAero(CLwing=0.75, CDwing=0.25, CLtail=0.45, CDtail=0.15)) - - # build the model + partials['y1', 'x1'] = self.mult1 + partials['y1', 'x2'] = self.mult3 + partials['y2', 'x1'] = 2 * self.mult2 * inputs['x1'] + partials['y2', 'x2'] = 2 * self.mult4 * inputs['x2'] + prob = om.Problem() - prob.model.add_subsystem('flight_conditions', flight_conds) - - # Set dvs for each flight condition - prob.model.add_design_var('flight_conditions.cruise.alpha', lower=-50, upper=50) - prob.model.add_design_var('flight_conditions.cruise.tail_angle', lower=-50, upper=50) - prob.model.add_design_var('flight_conditions.maneuver.alpha', lower=-50, upper=50) - prob.model.add_design_var('flight_conditions.maneuver.tail_angle', lower=-50, upper=50) - # Use the parallel derivative option to solve lift constraint simultaneously - prob.model.add_constraint('flight_conditions.cruise.L', equals=1.0, parallel_deriv_color="lift") - prob.model.add_constraint('flight_conditions.maneuver.L', equals=2.5, parallel_deriv_color="lift") - # Set cruise drag as objective - prob.model.add_objective('flight_conditions.cruise.D') + par = prob.model.add_subsystem('par', om.ParallelGroup()) + par.add_subsystem("C1", Simple(mult1=0.5, mult2=0.15, mult3=0.25, mult4=0.35)) + par.add_subsystem("C2", Simple(mult1=0.75, mult2=0.65, mult3=0.45, mult4=0.15)) + + 
prob.model.add_design_var('par.C1.x1', lower=-50, upper=50) + prob.model.add_design_var('par.C1.x2', lower=-50, upper=50) + prob.model.add_design_var('par.C2.x1', lower=-50, upper=50) + prob.model.add_design_var('par.C2.x2', lower=-50, upper=50) + + # Use the parallel derivative option to solve constraint simultaneously + prob.model.add_constraint('par.C1.y1', equals=1.0, parallel_deriv_color="pd1") + prob.model.add_constraint('par.C2.y1', equals=2.5, parallel_deriv_color="pd1") + prob.model.add_objective('par.C1.y2') prob.setup(mode='rev', force_alloc_complex=True) diff --git a/openmdao/core/tests/test_parallel_groups.py b/openmdao/core/tests/test_parallel_groups.py index 9eff5f0444..1e27a06c8b 100644 --- a/openmdao/core/tests/test_parallel_groups.py +++ b/openmdao/core/tests/test_parallel_groups.py @@ -24,8 +24,11 @@ from openmdao.test_suite.groups.parallel_groups import \ FanOutGrouped, FanInGrouped2, Diamond, ConvergeDiverge -from openmdao.utils.assert_utils import assert_near_equal +from openmdao.core.tests.test_distrib_derivs import DistribExecComp + +from openmdao.utils.assert_utils import assert_near_equal, assert_check_totals from openmdao.utils.logger_utils import TestLogger +from openmdao.utils.array_utils import evenly_distrib_idxs from openmdao.error_checking.check_config import _default_checks @@ -40,8 +43,8 @@ def check_config(self, logger): def _test_func_name(func, num, param): args = [] for p in param.args: - if not isinstance(p, Iterable): - p = {p} + if isinstance(p, str) or not isinstance(p, Iterable): + p = [p] for item in p: try: arg = item.__name__ @@ -51,11 +54,8 @@ def _test_func_name(func, num, param): return func.__name__ + '_' + '_'.join(args) -@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") class TestParallelGroups(unittest.TestCase): - N_PROCS = 2 - @parameterized.expand(itertools.product([(om.LinearRunOnce, None)], [om.NonlinearBlockGS, om.NonlinearRunOnce]), name_func=_test_func_name) @@ -164,9 +164,10 @@ def test_fan_in_grouped_feature(self): assert_near_equal(prob['c3.y'], 29.0, 1e-6) @parameterized.expand(itertools.product([om.LinearRunOnce], - [om.NonlinearBlockGS, om.NonlinearRunOnce]), + [om.NonlinearBlockGS, om.NonlinearRunOnce], + ['fwd', 'rev']), name_func=_test_func_name) - def test_diamond(self, solver, nlsolver): + def test_diamond(self, solver, nlsolver, mode): prob = om.Problem() prob.model = Diamond() @@ -174,7 +175,7 @@ def test_diamond(self, solver, nlsolver): prob.model.linear_solver = solver() prob.model.nonlinear_solver = nlsolver() - prob.setup(check=False, mode='fwd') + prob.setup(check=False, mode=mode) prob.set_solver_print(level=0) prob.run_model() @@ -188,20 +189,11 @@ def test_diamond(self, solver, nlsolver): assert_near_equal(J['c4.y1', 'iv.x'][0][0], 25, 1e-6) assert_near_equal(J['c4.y2', 'iv.x'][0][0], -40.5, 1e-6) - prob.setup(check=False, mode='rev') - prob.run_model() - - assert_near_equal(prob['c4.y1'], 46.0, 1e-6) - assert_near_equal(prob['c4.y2'], -93.0, 1e-6) - - J = prob.compute_totals(of=unknown_list, wrt=indep_list) - assert_near_equal(J['c4.y1', 'iv.x'][0][0], 25, 1e-6) - assert_near_equal(J['c4.y2', 'iv.x'][0][0], -40.5, 1e-6) - @parameterized.expand(itertools.product([om.LinearRunOnce], - [om.NonlinearBlockGS, om.NonlinearRunOnce]), + [om.NonlinearBlockGS, om.NonlinearRunOnce], + ['fwd', 'rev']), name_func=_test_func_name) - def test_converge_diverge(self, solver, nlsolver): + def test_converge_diverge(self, solver, nlsolver, mode): prob = om.Problem() prob.model = ConvergeDiverge() @@ 
-209,7 +201,7 @@ def test_converge_diverge(self, solver, nlsolver): prob.model.linear_solver = solver() prob.model.nonlinear_solver = nlsolver() - prob.setup(check=False, mode='fwd') + prob.setup(check=False, mode=mode) prob.set_solver_print(level=0) prob.run_model() @@ -221,15 +213,37 @@ def test_converge_diverge(self, solver, nlsolver): J = prob.compute_totals(of=unknown_list, wrt=indep_list) assert_near_equal(J['c7.y1', 'iv.x'][0][0], -40.75, 1e-6) - prob.setup(check=False, mode='rev') - prob.run_model() - assert_near_equal(prob['c7.y1'], -102.7, 1e-6) +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestParallelGroupsMPI2(TestParallelGroups): - J = prob.compute_totals(of=unknown_list, wrt=indep_list) - assert_near_equal(J['c7.y1', 'iv.x'][0][0], -40.75, 1e-6) + N_PROCS = 2 - assert_near_equal(prob['c7.y1'], -102.7, 1e-6) + @parameterized.expand(['fwd', 'rev'], name_func=_test_func_name) + def test_par_with_only_1_subsystem(self, mode): + p = om.Problem() + model = p.model + + model.add_subsystem('p1', om.IndepVarComp('x', np.ones(3))) + par = model.add_subsystem('par', om.ParallelGroup()) + G = par.add_subsystem('G', om.Group()) + G.add_subsystem('c1', om.ExecComp('y=2.0*x', shape=3)) + G.add_subsystem('c2', DistribExecComp(['y=5.0*x', 'y=7.0*x'], arr_size=3)) + model.add_subsystem('c2', om.ExecComp('y=3.0*x', shape=3)) + + model.connect('p1.x', 'par.G.c1.x') + model.connect('par.G.c1.y', 'par.G.c2.x') + model.connect('par.G.c2.y', 'c2.x', src_indices=om.slicer[:]) + + model.add_design_var('p1.x', lower=-50.0, upper=50.0) + model.add_constraint('par.G.c1.y', lower=-15.0, upper=15.0) + model.add_constraint('par.G.c2.y', lower=-15.0, upper=15.0) + model.add_objective('c2.y', index=0) + + p.setup(check=False, mode=mode) + p.run_model() + + assert_check_totals(p.check_totals(out_stream=None), rtol=2e-5, atol=2e-5) def test_zero_shape(self): raise unittest.SkipTest("zero shapes not fully supported yet") @@ -338,6 +352,7 @@ def test_setup_messages_only_on_proc0(self): else: self.fail("Didn't find '%s' in info messages." 
% msg) + @unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") class TestDistDriverVars(unittest.TestCase): @@ -664,6 +679,180 @@ def setup(self): self.assertEqual(con_names, ['phases.climb.comp.y', 'phases.cruise.comp.y', 'phases.descent.comp.y']) + +class SimpleDistComp(om.ExplicitComponent): + + def setup(self): + comm = self.comm + rank = comm.rank + + sizes, _ = evenly_distrib_idxs(comm.size, 3) + io_size = sizes[rank] + + # src_indices will be computed automatically + self.add_input('x', val=np.ones(io_size), distributed=True) + self.add_input('a', val=-3.0 * np.ones(io_size), distributed=True) + + self.add_output('y', val=np.ones(io_size), distributed=True) + + self.declare_partials('y', ['x', 'a']) + + def compute(self, inputs, outputs): + x = inputs['x'] + a = inputs['a'] + + outputs['y'] = 2*x*a + x + a + + def compute_partials(self, inputs, partials): + x = inputs['x'] + a = inputs['a'] + + partials['y', 'x'] = np.diag(2.0 * a + 1.0) + partials['y', 'a'] = np.diag(2.0 * x + 1.0) + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestFD(unittest.TestCase): + + N_PROCS = 2 + + def test_fd_rev_mode(self): + size = 3 + + p = om.Problem() + model = p.model + + model.add_subsystem('p', om.IndepVarComp('x', np.ones(size)), promotes=['*']) + sub = model.add_subsystem('sub', om.Group(), promotes=['*']) + + sub.add_subsystem('p2', om.IndepVarComp('a', -3.0 + 0.6 * np.arange(size)), promotes=['*']) + + sub.add_subsystem('C1', om.ExecComp(['xd = x'], + x=np.ones(size), xd=np.ones(size)), + promotes_inputs=['*']) + + sub.add_subsystem("D1", SimpleDistComp(), promotes_outputs=['*'], promotes_inputs=['a']) + + sub.connect('C1.xd', 'D1.x') + + model.add_design_var('x', lower=-50.0, upper=50.0) + model.add_constraint('y', lower=0.0) + + sub.approx_totals(method='fd') + + p.setup(mode='rev', force_alloc_complex=True) + + p.run_model() + + data = p.check_totals(method='fd', out_stream=None) + + assert_check_totals(data, atol=1e-5) + + + +class DoubleComp(om.ExplicitComponent): + # dummy component to use outputs of parallel group + def setup(self): + self.add_input('x') + self.add_output('y', 0.) + + def compute(self, inputs, outputs): + outputs['y'] = 2. * inputs['x'] + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode=='rev': + if 'y' in d_outputs: + if 'x' in d_inputs: + d_inputs['x'] += 2. * d_outputs['y'] + else: + if 'y' in d_outputs: + if 'x' in d_inputs: + d_outputs['y'] += 2. * d_inputs['x'] + + +class BcastComp(om.ExplicitComponent): + # dummy component to be evaluated in pararallel by om.ParallelGroup + def setup(self): + self.add_input('x', shape_by_conn=True) + self.add_output('y', 0.0) + + def compute(self,inputs,outputs): + y = None + if self.comm.rank==0: # pretend that this must be computed on one or a subset of procs... + # which may be necessary for various solvers/analyses + y = np.sum(inputs['x']) + + # bcast to all procs + outputs['y'] = self.comm.bcast(y, root=0) + + def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): + if mode == 'rev': + if 'y' in d_outputs: + # this used to give bad derivatives in parallel group due to non-uniform d_outputs['y'] across procs... 
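+                # computing the contribution on rank 0 only and then broadcasting d_inputs['x']
+                # below keeps the reverse-mode result identical on every proc, which is what
+                # OpenMDAO expects for a duplicated (non-distributed) input like 'x'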
+ if self.comm.rank==0: # compute on first proc + d_inputs['x'] += d_outputs['y'] + + # bcast to all procs + d_inputs['x'] = self.comm.bcast(d_inputs['x'], root=0) + else: + if 'y' in d_outputs: + if self.comm.rank==0: # compute on first proc + d_outputs['y'] += np.sum(d_inputs['x']) + + # bcast to all procs + d_outputs['y'] = self.comm.bcast(d_outputs['y'], root=0) + + +@unittest.skipUnless(MPI and PETScVector, "MPI and PETSc are required.") +class TestSingleRankRunWithBcast(unittest.TestCase): + + N_PROCS = 2 + + def setup_model(self, mode): + prob = om.Problem() + model = prob.model + + # dummy structural design variables + model.add_subsystem('indep', om.IndepVarComp('x', 0.01*np.ones(20))) + par = model.add_subsystem('par', om.ParallelGroup()) + + par.add_subsystem('C1', BcastComp()) + par.add_subsystem('C2', BcastComp()) + + model.connect('indep.x', f'par.C1.x') + model.connect('indep.x', f'par.C2.x') + + # add component that uses outputs from parallel components + model.add_subsystem('dummy_comp', DoubleComp()) + model.connect('par.C1.y','dummy_comp.x') + + + prob.setup(mode=mode) + prob.run_model() + + return prob + + def test_bcast_fwd(self): + prob = self.setup_model(mode='fwd') + + for i in range(1,3): + y = prob.get_val(f'par.C{i}.y',get_remote=True) + assert_near_equal(y, 0.2, 1e-6) + + assert_check_totals(prob.check_totals(of=['dummy_comp.y','par.C1.y'], + wrt=['indep.x'], out_stream=None)) + + def test_bcast_rev(self): + prob = self.setup_model(mode='rev') + + for i in range(1,3): + y = prob.get_val(f'par.C{i}.y',get_remote=True) + assert_near_equal(y, 0.2, 1e-6) + + assert_check_totals(prob.check_totals(of=['dummy_comp.y','par.C1.y'], + wrt=['indep.x'], out_stream=None)) + + if __name__ == "__main__": from openmdao.utils.mpi import mpirun_tests mpirun_tests() diff --git a/openmdao/core/tests/test_theory_rev_derivs.py b/openmdao/core/tests/test_theory_rev_derivs.py index 5bed9f380d..460ee494bc 100644 --- a/openmdao/core/tests/test_theory_rev_derivs.py +++ b/openmdao/core/tests/test_theory_rev_derivs.py @@ -111,8 +111,8 @@ def test_theory_example(self): all_dinputs = model.comm.allgather(model._dinputs.asarray()) - assert_near_equal(all_dinputs[0], np.array([16., 8., 2., 3.])) - assert_near_equal(all_dinputs[1], np.array([36., 18., 2., 3.])) + assert_near_equal(all_dinputs[0], np.array([26., 4., 2., 3.])) + assert_near_equal(all_dinputs[1], np.array([26., 9., 2., 3.])) if __name__ == "__main__": diff --git a/openmdao/core/total_jac.py b/openmdao/core/total_jac.py index d96ea4ab61..df93ab1844 100644 --- a/openmdao/core/total_jac.py +++ b/openmdao/core/total_jac.py @@ -11,7 +11,7 @@ import numpy as np from openmdao.core.constants import INT_DTYPE -from openmdao.utils.general_utils import ContainsAll, _src_or_alias_dict, _src_or_alias_name +from openmdao.utils.general_utils import _contains_all, _src_or_alias_dict, _src_or_alias_name from openmdao.utils.mpi import MPI, check_mpi_env from openmdao.utils.coloring import _initialize_model_approx, Coloring @@ -29,7 +29,6 @@ elif use_mpi is False: PETSc = None -_contains_all = ContainsAll() _directional_rng = np.random.default_rng(99) @@ -91,7 +90,7 @@ class _TotalJacInfo(object): (local indices, local sizes). in_idx_map : dict Mapping of jacobian row/col index to a tuple of the form - (ndups, relevant_systems, cache_linear_solutions_flag) + (relevant_systems, cache_linear_solutions_flag, voi name) total_relevant_systems : set The set of names of all systems relevant to the computation of the total derivatives. 
directional : bool @@ -129,7 +128,7 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, If True, perform a single directional derivative. """ driver = problem.driver - prom2abs = problem.model._var_allprocs_prom2abs_list['output'] + prom2abs_out = problem.model._var_allprocs_prom2abs_list['output'] prom2abs_in = problem.model._var_allprocs_prom2abs_list['input'] conns = problem.model._conn_global_abs_in2out @@ -145,12 +144,6 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, self.get_remote = get_remote self.directional = directional - if isinstance(wrt, str): - wrt = [wrt] - - if isinstance(of, str): - of = [of] - # convert designvar and response dicts to use src or alias names # keys will all be absolute names or aliases after conversion design_vars = _src_or_alias_dict(driver._designvars) @@ -167,16 +160,21 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, # that don't specify them, so we need these here. if wrt is None: wrt = driver_wrt + elif isinstance(wrt, str): + wrt = [wrt] + if of is None: of = driver_of + elif isinstance(of, str): + of = [of] # Convert 'wrt' names from promoted to absolute prom_wrt = wrt wrt = [] self.ivc_print_names = {} for name in prom_wrt: - if name in prom2abs: - wrt_name = prom2abs[name][0] + if name in prom2abs_out: + wrt_name = prom2abs_out[name][0] elif name in prom2abs_in: in_abs = prom2abs_in[name][0] wrt_name = conns[in_abs] @@ -190,8 +188,8 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, of = [] src_of = [] for name in prom_of: # these names could be aliases - if name in prom2abs: - of_name = prom2abs[name][0] + if name in prom2abs_out: + of_name = prom2abs_out[name][0] elif name in prom2abs_in: # An auto_ivc design var can be used as a response too. in_abs = prom2abs_in[name][0] @@ -243,7 +241,7 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, } self._dist_driver_vars = driver._dist_driver_vars - abs2meta_out = model._var_allprocs_abs2meta['output'] + all_abs2meta_out = model._var_allprocs_abs2meta['output'] constraints = driver._cons @@ -257,8 +255,6 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, self.has_lin_cons = has_lin_cons self.dist_input_range_map = {} - self.has_input_dist = {} - self.has_output_dist = {} self.total_relevant_systems = set() self.simul_coloring = None @@ -308,12 +304,10 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, self.dist_idx_map = {m: None for m in modes} self.modes = modes - self.of_meta, self.of_size, has_of_dist = \ - self._get_tuple_map(of, responses, abs2meta_out) - self.has_input_dist['rev'] = self.has_output_dist['fwd'] = has_of_dist - self.wrt_meta, self.wrt_size, has_wrt_dist = \ - self._get_tuple_map(wrt, design_vars, abs2meta_out) - self.has_input_dist['fwd'] = self.has_output_dist['rev'] = has_wrt_dist + self.of_meta, self.of_size, _ = \ + self._get_tuple_map(of, responses, all_abs2meta_out) + self.wrt_meta, self.wrt_size, self.has_wrt_dist = \ + self._get_tuple_map(wrt, design_vars, all_abs2meta_out) # always allocate a 2D dense array and we can assign views to dict keys later if # return format is 'dict' or 'flat_dict'. 
@@ -321,7 +315,7 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, # if we have distributed 'wrt' variables in fwd mode we have to broadcast the jac # columns from the owner of a given range of dist indices to everyone else. - if self.get_remote and has_wrt_dist and self.comm.size > 1: + if self.get_remote and self.has_wrt_dist and self.comm.size > 1: abs2idx = model._var_allprocs_abs2idx sizes = model._var_sizes['output'] meta = self.wrt_meta @@ -345,7 +339,6 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, # create scratch array for jac scatters self.jac_scratch = None - self.jac_dist_col_mask = None if self.comm.size > 1 and self.get_remote: # need 2 scratch vectors of the same size here @@ -368,21 +361,31 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, self.jac_scratch['rev'] = [scratch[0][:J.shape[1]]] if self.simul_coloring is not None: # when simul coloring, need two scratch arrays self.jac_scratch['rev'].append(scratch[1][:J.shape[1]]) - if self.has_output_dist['rev']: - sizes = model._var_sizes['output'] - abs2idx = model._var_allprocs_abs2idx - self.jac_dist_col_mask = mask = np.ones(J.shape[1], dtype=bool) - start = end = 0 - for name in self.wrt: - meta = abs2meta_out[name] - end += meta['global_size'] - if meta['distributed']: - # see if we have an odd dist var like some auto_ivcs connected to - # remote vars, which are zero everywhere except for one proc - sz = sizes[:, abs2idx[name]] - if np.count_nonzero(sz) > 1: - mask[start:end] = False - start = end + + # create a column mask to zero out contributions to the Allreduce from + # duplicated vars + self.rev_allreduce_mask = np.ones(J.shape[1], dtype=bool) + + voimeta = self.output_meta['rev'] + start = end = 0 + has_dist = False + for name, pname in zip(wrt, prom_wrt): + vmeta = all_abs2meta_out[name] + if pname in voimeta: + end += voimeta[pname]['size'] + else: + end += vmeta['global_size'] + + dist = vmeta['distributed'] + has_dist |= dist + if not dist and model._owning_rank[name] != model.comm.rank: + self.rev_allreduce_mask[start:end] = False + start = end + + # if rev_allreduce_mask isn't all True on all procs, then we need to do an Allreduce + need_allreduce = not np.all(self.rev_allreduce_mask) + if not (has_dist or any(model.comm.allgather(need_allreduce))): + self.rev_allreduce_mask = None if not approx: for mode in modes: @@ -396,7 +399,7 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, for mode in modes: self.sol2jac_map[mode] = self._get_sol2jac_map(self.output_list[mode], self.output_meta[mode], - abs2meta_out, mode) + all_abs2meta_out, mode) self.jac_scatters = {} self.tgt_petsc = {n: {} for n in modes} self.src_petsc = {n: {} for n in modes} @@ -420,8 +423,8 @@ def __init__(self, problem, of, wrt, use_abs_names, return_format, approx=False, self.dist_idx_map[mode] = dist_map = np.zeros(arr.size, dtype=bool) start = end = 0 for name in self.output_list[mode]: - end += abs2meta_out[name]['size'] - if abs2meta_out[name]['distributed']: + end += all_abs2meta_out[name]['size'] + if all_abs2meta_out[name]['distributed']: dist_map[start:end] = True start = end @@ -463,7 +466,7 @@ def _compute_jac_scatters(self, mode, rowcol_size, get_remote): myrank = self.comm.rank if get_remote: myoffset = rowcol_size * myrank - elif not get_remote: # rev and not get_remote + else: # rev and not get_remote # reduce size of vector by not including distrib vars arr = np.ones(rowcol_size, 
dtype=bool) start = end = 0 @@ -489,7 +492,7 @@ def _compute_jac_scatters(self, mode, rowcol_size, get_remote): owns = self.model._owning_rank abs2meta_out = self.model._var_allprocs_abs2meta['output'] - loc_abs = self.model._var_abs2meta['output'] + loc_abs2meta = self.model._var_abs2meta['output'] sizes = self.model._var_sizes['output'] abs2idx = self.model._var_allprocs_abs2idx full_j_tgts = [] @@ -502,7 +505,7 @@ def _compute_jac_scatters(self, mode, rowcol_size, get_remote): is_dist = abs2meta_out[name]['distributed'] - if name in loc_abs: + if name in loc_abs2meta: end += abs2meta_out[name]['size'] if get_remote and is_dist: @@ -532,7 +535,7 @@ def _compute_jac_scatters(self, mode, rowcol_size, get_remote): full_j_srcs.append(myinds) full_j_tgts.append(srcinds + offset) - if name in loc_abs: + if name in loc_abs2meta: start = end if full_j_srcs: @@ -631,7 +634,7 @@ def _create_in_idx_map(self, mode): model = self.model relevant = model._relevant has_par_deriv_color = False - abs2meta_out = model._var_allprocs_abs2meta['output'] + all_abs2meta_out = model._var_allprocs_abs2meta['output'] var_sizes = model._var_sizes var_offsets = model._get_var_offsets() abs2idx = model._var_allprocs_abs2idx @@ -658,34 +661,34 @@ def _create_in_idx_map(self, mode): # so we just bulk check the outputs here. qoi_i = self.input_meta[mode] qoi_o = self.output_meta[mode] + non_rel_outs = False if qoi_i and qoi_o: for out in self.output_list[mode]: if out not in qoi_o and out not in qoi_i: non_rel_outs = True break - else: - non_rel_outs = False - else: - non_rel_outs = False for name in input_list: + parallel_deriv_color = None + if name in self.responses and self.responses[name]['alias'] is not None: path = self.responses[name]['source'] else: path = name - if path not in abs2meta_out: + if path not in all_abs2meta_out: # could be promoted input name abs_in = model._var_allprocs_prom2abs_list['input'][path][0] path = model._conn_global_abs_in2out[abs_in] - in_var_meta = abs2meta_out[path] + in_var_meta = all_abs2meta_out[path] + dist = in_var_meta['distributed'] if name in vois: # if name is in vois, then it has been declared as either a design var or # a constraint or an objective. 
meta = vois[name] - if meta['distributed']: + if dist: end += meta['global_size'] else: end += meta['size'] @@ -694,7 +697,7 @@ def _create_in_idx_map(self, mode): cache_lin_sol = meta['cache_linear_solution'] _check_voi_meta(name, parallel_deriv_color, simul_coloring) - if parallel_deriv_color: + if parallel_deriv_color is not None: if parallel_deriv_color not in self.par_deriv_printnames: self.par_deriv_printnames[parallel_deriv_color] = [] @@ -715,7 +718,7 @@ def _create_in_idx_map(self, mode): else: # name is not a design var or response (should only happen during testing) end += in_var_meta['global_size'] irange = np.arange(in_var_meta['global_size'], dtype=INT_DTYPE) - in_idxs = parallel_deriv_color = None + in_idxs = None cache_lin_sol = False in_var_idx = abs2idx[path] @@ -726,7 +729,7 @@ def _create_in_idx_map(self, mode): # if we're doing parallel deriv coloring, we only want to set the seed on one proc # for each var in a given color - if parallel_deriv_color: + if parallel_deriv_color is not None: if fwd: relev = relevant[name]['@all'][0]['output'] else: @@ -734,25 +737,11 @@ def _create_in_idx_map(self, mode): else: relev = None - dist = in_var_meta['distributed'] - if dist: - if self.jac_dist_col_mask is None: - ndups = 1 # we don't divide by ndups for distributed inputs - else: - ndups = np.nonzero(sizes[:, in_var_idx])[0].size - else: + if not dist: # if the var is not distributed, convert the indices to global. # We don't iterate over the full distributed size in this case. irange += gstart - if fwd or parallel_deriv_color: - ndups = 1 - else: - # find the number of duplicate components in rev mode so we can divide - # the seed between 'ndups' procs so that at the end after we do an - # Allreduce, the contributions from all procs will add up properly. 
- ndups = np.nonzero(sizes[:, in_var_idx])[0].size - # all local idxs that correspond to vars from other procs will be -1 # so each entry of loc_i will either contain a valid local index, # indicating we should set the local vector entry to 1.0 before running @@ -762,7 +751,7 @@ def _create_in_idx_map(self, mode): if gend > gstart and (relev is None or relev): loc = np.nonzero(np.logical_and(irange >= gstart, irange < gend))[0] if in_idxs is None: - if in_var_meta['distributed']: + if dist: loc_i[loc] = np.arange(0, gend - gstart, dtype=INT_DTYPE) else: loc_i[loc] = irange[loc] - gstart @@ -786,27 +775,31 @@ def _create_in_idx_map(self, mode): imeta = defaultdict(bool) imeta['par_deriv_color'] = parallel_deriv_color imeta['idx_list'] = [(start, end)] + imeta['seed_vars'] = {name} idx_iter_dict[parallel_deriv_color] = (imeta, it) else: imeta = idx_iter_dict[parallel_deriv_color][0] imeta['idx_list'].append((start, end)) + imeta['seed_vars'].add(name) elif self.directional: imeta = defaultdict(bool) imeta['idx_list'] = range(start, end) + imeta['seed_vars'] = {name} idx_iter_dict[name] = (imeta, self.directional_iter) elif not simul_coloring: # plain old single index iteration imeta = defaultdict(bool) imeta['idx_list'] = range(start, end) + imeta['seed_vars'] = {name} idx_iter_dict[name] = (imeta, self.single_index_iter) if path in relevant and not non_rel_outs: relsystems = relevant[path]['@all'][1] if self.total_relevant_systems is not _contains_all: self.total_relevant_systems.update(relsystems) - tup = (ndups, relsystems, cache_lin_sol, name) + tup = (relsystems, cache_lin_sol, name) else: self.total_relevant_systems = _contains_all - tup = (ndups, _contains_all, cache_lin_sol, name) + tup = (_contains_all, cache_lin_sol, name) idx_map.extend([tup] * (end - start)) start = end @@ -829,10 +822,13 @@ def _create_in_idx_map(self, mode): imeta['itermeta'] = itermeta = [] locs = None for ilist in simul_coloring.color_iter(mode): + all_vois = set() + for i in ilist: - _, rel_systems, cache_lin_sol, _ = idx_map[i] + rel_systems, cache_lin_sol, voiname = idx_map[i] all_rel_systems = _update_rel_systems(all_rel_systems, rel_systems) cache |= cache_lin_sol + all_vois.add(voiname) iterdict = defaultdict(bool) @@ -844,6 +840,7 @@ def _create_in_idx_map(self, mode): iterdict['relevant'] = all_rel_systems iterdict['cache_lin_solve'] = cache + iterdict['seed_vars'] = all_vois itermeta.append(iterdict) idx_iter_dict['@simul_coloring'] = (imeta, self.simul_coloring_iter) @@ -920,6 +917,7 @@ def _get_sol2jac_map(self, names, vois, allprocs_abs2meta_out, mode): if (path in abs2idx and path in slices and path not in self.remote_vois): var_idx = abs2idx[path] slc = slices[path] + slcsize = slc.stop - slc.start if MPI and meta['distributed'] and self.get_remote: if indices is not None: @@ -933,16 +931,20 @@ def _get_sol2jac_map(self, names, vois, allprocs_abs2meta_out, mode): name2jinds.append((path, jac_inds[-1])) else: dist_offset = np.sum(sizes[:myproc, var_idx]) - inds.append(np.arange(slc.start, slc.stop, dtype=INT_DTYPE)) + inds.append(range(slc.start, slc.stop) if slcsize > 0 + else np.zeros(0, dtype=INT_DTYPE)) jac_inds.append(np.arange(jstart + dist_offset, jstart + dist_offset + sizes[myproc, var_idx], dtype=INT_DTYPE)) name2jinds.append((path, jac_inds[-1])) else: - idx_array = np.arange(slc.start, slc.stop, dtype=INT_DTYPE) - if indices is not None: - idx_array = idx_array[indices.flat()] - inds.append(idx_array) + if indices is None: + sol_inds = range(slc.start, slc.stop) if slcsize > 0 \ + 
else np.zeros(0, dtype=INT_DTYPE) + else: + sol_inds = np.arange(slc.start, slc.stop, dtype=INT_DTYPE) + sol_inds = sol_inds[indices.flat()] + inds.append(sol_inds) jac_inds.append(np.arange(jstart, jstart + sz, dtype=INT_DTYPE)) if fwd or not self.get_remote: name2jinds.append((path, jac_inds[-1])) @@ -1005,6 +1007,8 @@ def _get_tuple_map(self, names, vois, abs2meta_out): else: size = voi['size'] indices = vois[name]['indices'] + if indices: + size = indices.indexed_src_size if responses: path = vois[name]['source'] @@ -1047,7 +1051,7 @@ def single_index_iter(self, imeta, mode): Iteration metadata. """ for i in imeta['idx_list']: - yield i, self.single_input_setter, self.single_jac_setter, None + yield i, self.single_input_setter, self.single_jac_setter, imeta def simul_coloring_iter(self, imeta, mode): """ @@ -1076,11 +1080,7 @@ def simul_coloring_iter(self, imeta, mode): jac_setter = self.simul_coloring_jac_setter for color, ilist in enumerate(coloring.color_iter(mode)): - if len(ilist) == 1: - yield ilist, input_setter, jac_setter, None - else: - # yield all indices for a color at once - yield ilist, input_setter, jac_setter, imeta['itermeta'][color] + yield ilist, input_setter, jac_setter, imeta['itermeta'][color] def par_deriv_iter(self, imeta, mode): """ @@ -1161,7 +1161,7 @@ def single_input_setter(self, idx, imeta, mode): int or None key used for storage of cached linear solve (if active, else None). """ - _, rel_systems, cache_lin_sol, _ = self.in_idx_map[mode][idx] + rel_systems, cache_lin_sol, _ = self.in_idx_map[mode][idx] self._zero_vecs(mode) @@ -1196,7 +1196,7 @@ def simul_coloring_input_setter(self, inds, itermeta, mode): int or None key used for storage of cached linear solve (if active, else None). """ - if itermeta is None: + if len(inds) == 1: return self.single_input_setter(inds[0], None, mode) self._zero_vecs(mode) @@ -1233,14 +1233,16 @@ def par_deriv_input_setter(self, inds, imeta, mode): all_rel_systems = set() vec_names = set() - dist = self.comm.size > 1 - for i in inds: - if not dist or self.in_loc_idxs[mode][i] >= 0: + if self.in_loc_idxs[mode][i] >= 0: rel_systems, vnames, _ = self.single_input_setter(i, imeta, mode) - all_rel_systems = _update_rel_systems(all_rel_systems, rel_systems) if vnames is not None: vec_names.add(vnames[0]) + else: + rel_systems, _, _ = self.in_idx_map[mode][i] + all_rel_systems = _update_rel_systems(all_rel_systems, rel_systems) + + self.model._problem_meta['parallel_deriv_color'] = imeta['par_deriv_color'] if vec_names: return all_rel_systems, sorted(vec_names), (inds[0], mode) @@ -1270,7 +1272,7 @@ def directional_input_setter(self, inds, itermeta, mode): Not used. """ for i in inds: - _, rel_systems, _, _ = self.in_idx_map[mode][i] + rel_systems, _, _ = self.in_idx_map[mode][i] break self._zero_vecs(mode) @@ -1298,6 +1300,7 @@ def simple_single_jac_scatter(self, i, mode): """ deriv_idxs, jac_idxs, _ = self.sol2jac_map[mode] deriv_val = self.output_vec[mode].asarray() + if not self.get_remote: loc_idx = self.loc_jac_idxs[mode][i] if loc_idx >= 0: @@ -1312,7 +1315,7 @@ def simple_single_jac_scatter(self, i, mode): def _jac_setter_dist(self, i, mode): """ - Scatter the i'th row or allreduce the i'th column of the jacobian. + Scatter the i'th column or allreduce the i'th row of the jacobian. Parameters ---------- @@ -1321,52 +1324,22 @@ def _jac_setter_dist(self, i, mode): mode : str Direction of derivative solution. 
""" - if self.get_remote and mode == 'fwd': - if self.jac_scatters[mode] is not None: - self.src_petsc[mode].array = self.J[:, i] - self.tgt_petsc[mode].array[:] = self.J[:, i] - self.jac_scatters[mode].scatter(self.src_petsc[mode], self.tgt_petsc[mode], - addv=False, mode=False) - self.J[:, i] = self.tgt_petsc[mode].array - - elif mode == 'rev': - # for rows corresponding to serial 'of' vars, we need to correct for - # duplication of their seed values by dividing by the number of duplications. - ndups, _, _, _ = self.in_idx_map[mode][i] + if mode == 'fwd': if self.get_remote: - scratch = self.jac_scratch['rev'][0] - scratch[:] = self.J[i] + if self.jac_scatters[mode] is not None: + self.src_petsc[mode].array = self.J[:, i] + self.tgt_petsc[mode].array[:] = self.J[:, i] + self.jac_scatters[mode].scatter(self.src_petsc[mode], self.tgt_petsc[mode], + addv=False, mode=False) + self.J[:, i] = self.tgt_petsc[mode].array + else: # rev + if self.get_remote and self.rev_allreduce_mask is not None: + scratch = self.jac_scratch['rev'][0] + scratch[:] = 0.0 + scratch[self.rev_allreduce_mask] = self.J[i][self.rev_allreduce_mask] self.comm.Allreduce(scratch, self.J[i], op=MPI.SUM) - if ndups > 1: - if self.jac_dist_col_mask is not None: - scratch[:] = 1.0 - scratch[self.jac_dist_col_mask] = (1.0 / ndups) - self.J[i] *= scratch - else: - self.J[i] *= (1.0 / ndups) - else: - scatter = self.jac_scatters[mode] - if scatter is not None: - if self.dist_idx_map[mode][i]: # distrib var, skip scatter - return - loc = self.loc_jac_idxs[mode][i] - if loc >= 0: - self.tgt_petsc[mode].array[:] = self.J[loc, :][self.nondist_loc_map[mode]] - self.src_petsc[mode].array[:] = self.J[loc, :][self.nondist_loc_map[mode]] - else: - self.src_petsc[mode].array[:] = 0.0 - - scatter.scatter(self.src_petsc[mode], self.tgt_petsc[mode], - addv=True, mode=False) - if loc >= 0: - if ndups > 1: - self.J[loc, :][self.nondist_loc_map[mode]] = \ - self.tgt_petsc[mode].array * (1.0 / ndups) - else: - self.J[loc, :][self.nondist_loc_map[mode]] = self.tgt_petsc[mode].array - def single_jac_setter(self, i, mode, meta): """ Set the appropriate part of the total jacobian for a single input index. @@ -1397,8 +1370,7 @@ def par_deriv_jac_setter(self, inds, mode, meta): meta : dict Metadata dict. """ - dist = self.comm.size > 1 - if dist: + if self.comm.size > 1: for i in inds: if self.in_loc_idxs[mode][i] >= 0: self.simple_single_jac_scatter(i, mode) @@ -1551,7 +1523,8 @@ def compute_totals(self): for key, idx_info in self.idx_iter_dict[mode].items(): imeta, idx_iter = idx_info for inds, input_setter, jac_setter, itermeta in idx_iter(imeta, mode): - rel_systems, vec_names, cache_key = input_setter(inds, itermeta, mode) + self.model._problem_meta['seed_vars'] = itermeta['seed_vars'] + rel_systems, _, cache_key = input_setter(inds, itermeta, mode) if debug_print: if par_print and key in par_print: @@ -1588,13 +1561,17 @@ def compute_totals(self): jac_setter(inds, mode, imeta) + # reset any Problem level data for the current iteration + self.model._problem_meta['parallel_deriv_color'] = None + self.model._problem_meta['seed_vars'] = None + # Driver scaling. 
if self.has_scaling: self._do_driver_scaling(self.J_dict) # if some of the wrt vars are distributed in fwd mode, we have to bcast from the rank # where each part of the distrib var exists - if self.get_remote and mode == 'fwd' and self.has_input_dist[mode]: + if self.get_remote and mode == 'fwd' and self.has_wrt_dist: for start, stop, rank in self.dist_input_range_map[mode]: contig = self.J[:, start:stop].copy() model.comm.Bcast(contig, root=rank) @@ -2027,7 +2004,7 @@ def _check_voi_meta(name, parallel_deriv_color, simul_coloring): def _fix_pdc_lengths(idx_iter_dict): """ - Take any parallel_deriv_color entries and make sure their index arrays are same length. + Take any parallel_deriv_color entries and make sure their index arrays are the same length. Parameters ---------- diff --git a/openmdao/devtools/debug.py b/openmdao/devtools/debug.py index e79efbf963..c32c15ee2e 100644 --- a/openmdao/devtools/debug.py +++ b/openmdao/devtools/debug.py @@ -9,11 +9,12 @@ from contextlib import contextmanager from collections import Counter -from openmdao.core.constants import _SetupStatus +from openmdao.core.constants import _SetupStatus, _DEFAULT_OUT_STREAM from openmdao.utils.mpi import MPI from openmdao.utils.om_warnings import issue_warning, MPIWarning from openmdao.utils.reports_system import register_report -from openmdao.utils.file_utils import text2html +from openmdao.utils.file_utils import text2html, _load_and_exec +from openmdao.utils.rangemapper import RangeMapper from openmdao.visualization.tables.table_builder import generate_table @@ -636,3 +637,260 @@ def comm_info(system, outfile=None, verbose=False, table_format='box_grid'): else: with open(outfile, 'w') as f: print("No MPI process info available.", file=f) + + +def is_full_slice(range, inds): + size = range[1] - range[0] + inds = np.asarray(inds) + if len(inds) > 1 and inds[0] == 0 and inds[-1] == size - 1: + step = inds[1] - inds[0] + diffs = inds[1:] - inds[:-1] + return np.all(diffs == step) + + return len(inds) == 1 and inds[0] == 0 + + +def show_dist_var_conns(group, rev=False, out_stream=_DEFAULT_OUT_STREAM): + """ + Show all distributed variable connections in the given group and below. + + The ranks displayed will be relative to the communicator of the given top group. + + Parameters + ---------- + group : Group + The top level group to be searched. Connections in all subgroups will also be displayed. + rev : bool + If True show reverse transfers instead of forward transfers. + out_stream : file-like + Where the output will go. + + Returns + ------- + dict or None + Dictionary containing the data for the connections. None is returned on all ranks except + rank 0. + """ + from openmdao.core.group import Group + + if out_stream is _DEFAULT_OUT_STREAM: + out_stream = sys.stdout + + if out_stream is None: + printer = lambda *args, **kwargs: None + else: + printer = print + + direction = 'rev' if rev else 'fwd' + arrowdir = {'fwd': '->', 'rev': '<-'} + arrow = arrowdir[direction] + + gdict = {} + + for g in group.system_iter(typ=Group, include_self=True): + if g._transfers[direction]: + in_ranges = list(g.dist_size_iter('input', group.comm)) + out_ranges = list(g.dist_size_iter('output', group.comm)) + + inmapper = RangeMapper.create(in_ranges) + outmapper = RangeMapper.create(out_ranges) + + gprint = False + + gprefix = g.pathname + '.' 
if g.pathname else '' + skip = len(gprefix) + + for sub, transfer in g._transfers[direction].items(): + if sub is not None and (not isinstance(sub, tuple) or sub[0] is not None): + if not gprint: + gdict[g.pathname] = {} + gprint = True + + conns = {} + for iidx, oidx in zip(transfer._in_inds, transfer._out_inds): + idata, irind = inmapper.index2key_rel(iidx) + ivar, irank = idata + odata, orind = outmapper.index2key_rel(oidx) + ovar, orank = odata + + if odata not in conns: + conns[odata] = {} + if idata not in conns[odata]: + conns[odata][idata] = [] + conns[odata][idata].append((orind, irind)) + + strs = {} # use to avoid duplicate printouts for different ranks + + for odata, odict in conns.items(): + ovar, orank = odata + ovar = ovar[skip:] + + for idata, dlist in odict.items(): + ivar, irank = idata + ivar = ivar[skip:] + ranktup = (orank, irank) + + oinds = [d[0] for d in dlist] + iinds = [d[1] for d in dlist] + + orange = outmapper.key2range(odata) + irange = inmapper.key2range(idata) + + if is_full_slice(orange, oinds) and is_full_slice(irange, iinds): + s = f"{ovar} {arrow} {ivar}" + if s not in strs: + strs[s] = set() + strs[s].add(ranktup) + continue + + for oidx, iidx in zip(oinds, iinds): + s = f"{ovar}[{oidx}] {arrow} {ivar}[{iidx}]" + if s not in strs: + strs[s] = set() + strs[s].add(ranktup) + + gdict[g.pathname][str(sub)] = strs + + do_ranks = False + + if group.comm.size > 1: + do_ranks = True + final = {} + gatherlist = group.comm.gather(gdict, root=0) + if group.comm.rank == 0: + for dct in gatherlist: + for gpath, subdct in dct.items(): + if gpath not in final: + final[gpath] = subdct + else: + fgpath = final[gpath] + for sub, strs in subdct.items(): + if sub not in fgpath: + fgpath[sub] = strs + else: + fgpathsub = fgpath[sub] + for s, ranks in strs.items(): + if s not in fgpathsub: + fgpathsub[s] = ranks + else: + fgpathsub[s] |= ranks + + gdict = final + + if group.comm.rank == 0: + fwd = direction == 'fwd' + for gpath, subdct in sorted(gdict.items(), key=lambda x: x[0]): + indent = 0 if gpath == '' else gpath.count('.') + 1 + pad = ' ' * indent + printer(f"{pad}In Group '{gpath}'", file=out_stream) + for sub, strs in sorted(subdct.items(), key=lambda x: x[0]): + if fwd: + printer(f"{pad} {arrow} {sub}", file=out_stream) + else: + printer(f"{pad} {sub} {arrow}", file=out_stream) + for s, ranks in strs.items(): + if do_ranks: + oranks = np.empty(len(ranks), dtype=int) + iranks = np.empty(len(ranks), dtype=int) + for i, (ornk, irnk) in enumerate(sorted(ranks)): + oranks[i] = ornk + iranks[i] = irnk + + if np.all(oranks == oranks[0]): + orstr = str(oranks[0]) + else: + sorted_ranks = sorted(oranks) + orstr = str(sorted_ranks) + if len(sorted_ranks) > 3: + for j, r in enumerate(sorted_ranks): + if j == 0 or r - val == 1: + val = r + else: + break + else: + orstr = f"[{sorted_ranks[0]} to {sorted_ranks[-1]}]" + + if np.all(iranks == iranks[0]): + irstr = str(iranks[0]) + else: + sorted_ranks = sorted(iranks) + irstr = str(sorted(iranks)) + if len(sorted_ranks) > 3: + for j, r in enumerate(sorted_ranks): + if j == 0 or r - val == 1: + val = r + else: + break + else: + irstr = f"[{sorted_ranks[0]} to {sorted_ranks[-1]}]" + + if orstr == irstr and '[' not in orstr: + printer(f"{pad} {s} rank {orstr}", file=out_stream) + else: + printer(f"{pad} {s} ranks {orstr} {arrow} {irstr}", file=out_stream) + else: + printer(f"{pad} {s}", file=out_stream) + + return gdict + + +def _dist_conns_setup_parser(parser): + """ + Set up the openmdao subparser for the 'openmdao dist_conns' 
command. + + Parameters + ---------- + parser : argparse subparser + The parser we're adding options to. + """ + parser.add_argument('file', nargs=1, help='Python file containing the model.') + parser.add_argument('-o', default=None, action='store', dest='outfile', + help='Name of output file. By default, output goes to stdout.') + parser.add_argument('-r', '--rev', action='store_true', dest='rev', + help='If set, use "rev" transfer direction instead of "fwd".') + parser.add_argument('-p', '--problem', action='store', dest='problem', help='Problem name') + + +def _dist_conns_cmd(options, user_args): + """ + Run the `openmdao dist_conns` command. + + The command shows connections, with relative indexing information, between all + variables in the model across all MPI processes. + + Parameters + ---------- + options : argparse Namespace + Command line options. + user_args : list of str + Args to be passed to the user script. + """ + import openmdao.utils.hooks as hooks + + def _dist_conns(prob): + model = prob.model + if options.problem: + if model._problem_meta['name'] != options.problem and \ + model._problem_meta['pathname'] != options.problem: + return + elif '/' in model._problem_meta['pathname']: + # by default, only display comm info for a top level problem + return + + if options.outfile is None: + out_stream = sys.stdout + else: + out_stream = open(options.outfile, 'w') + + try: + show_dist_var_conns(model, rev=options.rev, out_stream=out_stream) + finally: + if out_stream is not sys.stdout: + out_stream.close() + + exit() + + # register the hook to be called right after final_setup on the problem + hooks._register_hook('final_setup', 'Problem', post=_dist_conns) + + _load_and_exec(options.file[0], user_args) diff --git a/openmdao/docs/openmdao_book/features/core_features/working_with_components/distributed_components.ipynb b/openmdao/docs/openmdao_book/features/core_features/working_with_components/distributed_components.ipynb index ae9cbe6fe1..f97f71faa1 100644 --- a/openmdao/docs/openmdao_book/features/core_features/working_with_components/distributed_components.ipynb +++ b/openmdao/docs/openmdao_book/features/core_features/working_with_components/distributed_components.ipynb @@ -33,7 +33,7 @@ "\n", "At times when you need to perform a computation using large input arrays, you may want to perform that computation in multiple processes, where each process operates on some subset of the input values. This may be done purely for performance reasons, or it may be necessary because the entire input will not fit in the memory of a single machine. In any case, this can be accomplished in OpenMDAO by declaring those inputs and outputs as distributed. By definition, a `distributed variable` is an input or output where each process contains only a part of the whole variable. Distributed variables are declared by setting the optional \"distributed\" argument to True when adding the variable to a component. A component that has at least one distributed variable can also be called a distributed component.\n", "\n", - "Any variable that is not distributed is called a `non-distributed variable`. When the model is run under MPI, every process contains a copy of the entire variable.\n", + "Any variable that is not distributed is called a `non-distributed variable`. When the model is run under MPI, every process contains a copy of the entire non-distributed variable. 
We also call these duplicated variables.\n", "\n", "We’ve already seen that by using [src_indices](connect-with-src-indices), we can connect an input to only a subset of an output variable. By giving different values for src_indices in each MPI process, we can distribute computations on a distributed output across the processes. All of the scenarios that involve connecting distributed and non-distributed variables are detailed in [Connections involving distributed variables](../working_with_groups/dist_serial.ipynb)." ] diff --git a/openmdao/docs/openmdao_book/features/core_features/working_with_groups/src_indices.ipynb b/openmdao/docs/openmdao_book/features/core_features/working_with_groups/src_indices.ipynb index b068a22c84..e75791eb35 100644 --- a/openmdao/docs/openmdao_book/features/core_features/working_with_groups/src_indices.ipynb +++ b/openmdao/docs/openmdao_book/features/core_features/working_with_groups/src_indices.ipynb @@ -264,8 +264,8 @@ " self.idxs = idxs\n", "\n", " def setup(self):\n", - " self.add_input('x', np.ones(len(self.idxs)))\n", - " self.add_output('y', 1.0)\n", + " self.add_input('x', np.ones(len(self.idxs)), distributed=True)\n", + " self.add_output('y', 1.0, distributed=True)\n", "\n", " def compute(self, inputs, outputs):\n", " outputs['y'] = np.sum(inputs['x'])*2.0\n", @@ -321,11 +321,11 @@ "%%px\n", "from openmdao.utils.assert_utils import assert_near_equal\n", "\n", - "assert_near_equal(p['C1.x'],\n", + "assert_near_equal(p.get_val('C1.x'),\n", " np.arange(3, dtype=float) if p.model.C1.comm.rank == 0 else np.arange(3, 5, dtype=float))\n", "\n", "# the output in each rank is based on the local inputs\n", - "assert_near_equal(p['C1.y'], 6. if p.model.C1.comm.rank == 0 else 14.)" + "assert_near_equal(p.get_val('C1.y'), 6. 
if p.model.C1.comm.rank == 0 else 14.)" ] } ], diff --git a/openmdao/docs/openmdao_book/theory_manual/images/Par1.png b/openmdao/docs/openmdao_book/theory_manual/images/Par1.png index 1ea865d5a6..8b4b7df5c4 100644 Binary files a/openmdao/docs/openmdao_book/theory_manual/images/Par1.png and b/openmdao/docs/openmdao_book/theory_manual/images/Par1.png differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/Par2.png b/openmdao/docs/openmdao_book/theory_manual/images/Par2.png deleted file mode 100644 index 1da84fa68c..0000000000 Binary files a/openmdao/docs/openmdao_book/theory_manual/images/Par2.png and /dev/null differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/Par3.png b/openmdao/docs/openmdao_book/theory_manual/images/Par3.png deleted file mode 100644 index 5e86f9229e..0000000000 Binary files a/openmdao/docs/openmdao_book/theory_manual/images/Par3.png and /dev/null differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/Par4.png b/openmdao/docs/openmdao_book/theory_manual/images/Par4.png deleted file mode 100644 index 610a6bc0f4..0000000000 Binary files a/openmdao/docs/openmdao_book/theory_manual/images/Par4.png and /dev/null differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/nonpar_fwd.png b/openmdao/docs/openmdao_book/theory_manual/images/nonpar_fwd.png new file mode 100644 index 0000000000..cad93443cd Binary files /dev/null and b/openmdao/docs/openmdao_book/theory_manual/images/nonpar_fwd.png differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/nonpar_rev.png b/openmdao/docs/openmdao_book/theory_manual/images/nonpar_rev.png new file mode 100644 index 0000000000..32c0f0f072 Binary files /dev/null and b/openmdao/docs/openmdao_book/theory_manual/images/nonpar_rev.png differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/par_fwd.png b/openmdao/docs/openmdao_book/theory_manual/images/par_fwd.png new file mode 100644 index 0000000000..b19fe4fa13 Binary files /dev/null and b/openmdao/docs/openmdao_book/theory_manual/images/par_fwd.png differ diff --git a/openmdao/docs/openmdao_book/theory_manual/images/par_rev.png b/openmdao/docs/openmdao_book/theory_manual/images/par_rev.png new file mode 100644 index 0000000000..26408fc041 Binary files /dev/null and b/openmdao/docs/openmdao_book/theory_manual/images/par_rev.png differ diff --git a/openmdao/docs/openmdao_book/theory_manual/mpi.ipynb b/openmdao/docs/openmdao_book/theory_manual/mpi.ipynb index 612986a6b7..f1113c6b2e 100644 --- a/openmdao/docs/openmdao_book/theory_manual/mpi.ipynb +++ b/openmdao/docs/openmdao_book/theory_manual/mpi.ipynb @@ -34,33 +34,39 @@ "\n", "![Non-parallel example](images/Par1.png)\n", "\n", - "This model contains two components that don't depend on each other's outputs, and thus those calculations can be performed simultaneously by placing them in a `ParallelGroup`. When a model containing a `ParallelGroup` is run without using MPI, its components are just executed in succession. But when it is run using `mpirun` or `mpiexec`, its components are divided amongst the processors. To take fullest advantage of the available processors, the number of subsystems in the ParallelGroup should be equal to the number of available processors. However, it will still work if the number of processors is higher or lower than you need. If you don't give it enough processors, then some processors will sequentially execute some of the components. 
If the number of subsystems is evenly divisible by the number of processors, then you won't have idle time on any of the processors, so that is ideal. If you give your model too many processors, those processors will be idle during execution of the parallel subsystem.\n", + "This model contains two components that don't depend on each other's outputs, and thus those calculations can be performed simultaneously by placing them in a `ParallelGroup`. When a model containing a `ParallelGroup` is run without using MPI, its components are just executed in succession. But when it is run using `mpirun` or `mpiexec`, its components are divided amongst the processes. To ensure that all subsystems execute in parallel, you should run with at least as many processes as there are subsystems. If you don't provide enough processes, then some processes will sequentially execute some of the components. Some subsystems may require more processes than others. If you give your model more processes than are needed by the subsystems, those processes will be either be idle during execution of the parallel subsystem or will perform duplicate computations, depending upon how processes are assigned within a subsystem.\n", "\n", - "The following diagram shows the same example executing on 2 processors.\n", + "OpenMDAO can compute derivatives in forward or reverse mode, with the best choice of mode being determined by the ratio of the number of design variables vs. the number of responses. If the number of responses is greater than the number of design variables, then forward mode is best, and reverse is best if the number of design variables exceeds the number of responses. 'Best' in this case means requiring a smaller number of linear solves in order to compute the total jacobian matrix.\n", "\n", - "![Parallel subsystem example](images/Par2.png)\n", + "The following diagram shows forward mode derivative transfers for our example model running on 1 process.\n", "\n", - "We see here that every component that isn't under the ParallelGroup is executed on all processors. This is done to limit data transfer between the processors. Similarly, the input and output vectors on these components is the same on all processors. We sometimes call these duplicated variables, but it is clearer to call them non-parallel non-distributed variables.\n", + "![Non-parallel forward mode derivatives](images/nonpar_fwd.png)\n", "\n", - "In contrast, the inputs and outputs on the parallel components only exist on their execution processor. In this model, there are parallel outputs that need to be passed downstream to the final component. To make this happen, OpenMDAO broadcasts them from the rank that owns them to every rank. This can be seen in the diagram as the crossed arrows that connect to x1 and x2.\n", + "The next diagram shows the forward mode transfers for the same example executing on 2 processes. Note that the derivative values at the input and output of each component in the model are the same as they were in the 1 process case.\n", "\n", - "Since component execution is repeated on all processors, component authors need to be careful about file operations which can collide if they are called from all processors at once. The safest way to handle these is to restrict them to only write files on the root processor. In addition, the computation of derivatives is duplicated on all processes except for the components in the parallel group, which handle their own unique parts of the calculation. 
However this is only true in forward mode. \n", + "![Parallel forward model derivatives](images/par_fwd.png)\n", + "\n", + "We see here that every component that isn't under the ParallelGroup is executed on all processes. This is done to limit data transfer between the processes. Similarly, the input and output vectors on these components are the same on all processes. We sometimes call these duplicated variables, but it is clearer to call them non-parallel non-distributed variables.\n", + "\n", + "In contrast, the inputs and outputs on the parallel components only exist on their execution processor(s). In this model, there are parallel outputs that need to be passed downstream to the final component. To make this happen, OpenMDAO scatters them from the rank(s) that contain them to the rank(s) that don't. This can be seen in the diagram as the crossed arrows that connect to x1 and x2. Data transfers are done so as to minimize the amount of data passed between processes so, for example, the 'y' value from the duplicated 'y=2x' component, since it exists in both processes, is only passed to the connected 'x' input in the same process.\n", + "\n", + "Since component execution is repeated on all processes, component authors need to be careful about file operations which can collide if they are called from all processes at once. The safest way to handle these is to restrict them to only write files on the root processor. In addition, the computation of derivatives is duplicated on all processes except for the components in the parallel group, which handle their own unique parts of the calculation.\n", "\n", "## Reverse-mode Derivatives in Parallel Subsystems\n", "\n", - "Reverse-mode derivative calculation is the single exception where the computation on non-parallel, non-distributed portions is different on each processor. This can cause confusion if you are, for example, printing the values of the derivatives vectors when using the matrix-free API.\n", + "Reverse-mode derivative calculation uses different transfers than forward mode in order to ensure that the values of non-parallel non-distributed derivatives are consistent across processes and agree with derivatives from the same model if run in a single process.\n", "\n", - "To understand what is happening, let's examine how derivatives are computed in reverse mode for the example used above.\n", + "The following diagram shows reverse mode derivative transfers for our example model running on 1 process.\n", "\n", - "![Non-parallel reverse mode derivatives](images/Par3.png)\n", + "![Non-parallel reverse mode derivatives](images/nonpar_rev.png)\n", "\n", "In this diagram, our model has one derivative to compute. We start with a seed of 1.0 in the output, and propagate that through the model (as denoted by the red arrows), multiplying by the sub-jacobians in each component as we go. Whenever we have an internal output that is connected to multiple inputs, we need to sum the contributions that are propagated there in reverse mode. The end result is the derivative across these components.\n", "\n", "Now, let's examine this process under MPI with 2 processors:\n", "\n", - "![Parallel reverse mode derivatives](images/Par4.png)\n", + "![Parallel reverse mode derivatives](images/par_rev.png)\n", "\n", - "The biggest surprise here is that the parallel components receive a value that is double the corresponding value in the non-parallel example. This is because the output is computed as the sum of the inputs from each rank. 
This is a slightly unusual way to do it, but it is motivated by memory performance. The operation that transfers data from an output to an input, either of which may be local or remote, is done using a set of source indices and a set of target indices. These index sets may be large, and the scale with the full-model variable size. We found that we could save memory by using the same index sets, while swapping the source and target sets, for both forward and reverse modes. When this is done, different parts of the derivative calculation end up propagating on different ranks in the non-parallel part of the model, as the diagram shows. The correct derivative result can be extracted as a final step by summing up the values on all ranks, then dividing by the number of processors.\n", + "We see here, as in the forward case, the derivative values in the component inputs and outputs agree with those we saw in the non-parallel case. Note that, as mentioned above, we have to sum the values from multiple inputs if they are connected to the same output. However, when running on multiple processes, some of our inputs are duplicated, i.e. we have the *same* input existing in multiple processes. In that case, assuming the input is not distributed, we do not sum the multiple instances together but instead use only one of the values, either the value from the same process as the output if it exists, or the value from the lowest rank process where it does exist.\n", "\n", "\n", "## Distributed Components\n", diff --git a/openmdao/drivers/scipy_optimizer.py b/openmdao/drivers/scipy_optimizer.py index 4f049274f8..49b890e18a 100644 --- a/openmdao/drivers/scipy_optimizer.py +++ b/openmdao/drivers/scipy_optimizer.py @@ -621,8 +621,9 @@ def _objfunc(self, x_new): return 0 # print("Functions calculated") - # print(' xnew', x_new) - # print(' fnew', f_new) + # rank = MPI.COMM_WORLD.rank if MPI else 0 + # print(rank, ' xnew', x_new) + # print(rank, ' fnew', f_new) return f_new diff --git a/openmdao/drivers/tests/test_scipy_optimizer.py b/openmdao/drivers/tests/test_scipy_optimizer.py index 9022294e0a..5330e3abab 100644 --- a/openmdao/drivers/tests/test_scipy_optimizer.py +++ b/openmdao/drivers/tests/test_scipy_optimizer.py @@ -17,7 +17,7 @@ from openmdao.test_suite.components.sellar_feature import SellarMDA from openmdao.test_suite.components.simple_comps import NonSquareArrayComp from openmdao.test_suite.groups.sin_fitter import SineFitter -from openmdao.utils.assert_utils import assert_near_equal, assert_warning +from openmdao.utils.assert_utils import assert_near_equal, assert_warning, assert_check_totals from openmdao.utils.general_utils import run_driver from openmdao.utils.testing_utils import set_env_vars_context from openmdao.utils.mpi import MPI @@ -155,6 +155,8 @@ def test_opt_distcomp(self): con = prob.driver.get_constraint_values() obj = prob.driver.get_objective_values() + assert_check_totals(prob.check_totals(method='cs', out_stream=None)) + assert_near_equal(obj['sum.f_sum'], 0.0, 2e-6) assert_near_equal(con['parab.f_xy'], np.zeros(7), diff --git a/openmdao/jacobians/dictionary_jacobian.py b/openmdao/jacobians/dictionary_jacobian.py index c16e4764a3..b4b807d509 100644 --- a/openmdao/jacobians/dictionary_jacobian.py +++ b/openmdao/jacobians/dictionary_jacobian.py @@ -21,6 +21,8 @@ class DictionaryJacobian(Jacobian): ---------- _iter_keys : list of (vname, vname) tuples List of tuples of variable names that match subjacs in the this Jacobian. 
+ _key_owner : dict + Dict mapping subjac keys to the rank where that subjac is local. """ def __init__(self, system, **kwargs): @@ -29,6 +31,7 @@ def __init__(self, system, **kwargs): """ super().__init__(system, **kwargs) self._iter_keys = None + self._key_owner = None def _iter_abs_keys(self, system): """ @@ -47,17 +50,63 @@ def _iter_abs_keys(self, system): List of keys matching this jacobian for the current system. """ if self._iter_keys is None: + # determine the set of remote keys (keys where either of or wrt is remote somewhere) + # only if we're under MPI with comm size > 1 and the given system is a Group that + # computes its derivatives using finite difference or complex step. + include_remotes = system.pathname and \ + system.comm.size > 1 and system._owns_approx_jac and system._subsystems_allprocs subjacs = self._subjacs_info keys = [] - for res_name in system._var_abs2meta['output']: + if include_remotes: + ofnames = system._var_allprocs_abs2meta['output'] + wrtnames = system._var_allprocs_abs2meta + else: + ofnames = system._var_abs2meta['output'] + wrtnames = system._var_abs2meta + + for res_name in ofnames: for type_ in ('output', 'input'): - for name in system._var_abs2meta[type_]: + for name in wrtnames[type_]: key = (res_name, name) if key in subjacs: keys.append(key) self._iter_keys = keys + if include_remotes: + local_out = system._var_abs2meta['output'] + local_in = system._var_abs2meta['input'] + remote_keys = [] + for key in keys: + of, wrt = key + if of not in local_out or (wrt not in local_in and wrt not in local_out): + remote_keys.append(key) + + abs2idx = system._var_allprocs_abs2idx + sizes_out = system._var_sizes['output'] + sizes_in = system._var_sizes['input'] + owner_dict = {} + for keys in system.comm.allgather(remote_keys): + for key in keys: + if key not in owner_dict: + of, wrt = key + ofsizes = sizes_out[:, abs2idx[of]] + if wrt in ofnames: + wrtsizes = sizes_out[:, abs2idx[wrt]] + else: + wrtsizes = sizes_in[:, abs2idx[wrt]] + for rank, (ofsz, wrtsz) in enumerate(zip(ofsizes, wrtsizes)): + # find first rank where both of and wrt are local + if ofsz and wrtsz: + owner_dict[key] = rank + break + else: # no rank was found where both were local. Use 'of' local rank + owner_dict[key] = np.min(np.nonzero(ofsizes)[0]) + + self._key_owner = owner_dict + else: + self._key_owner = {} + return self._iter_keys def _apply(self, system, d_inputs, d_outputs, d_residuals, mode): @@ -78,8 +127,8 @@ def _apply(self, system, d_inputs, d_outputs, d_residuals, mode): 'fwd' or 'rev'. 
""" fwd = mode == 'fwd' - d_res_names = d_residuals._names d_out_names = d_outputs._names + d_res_names = d_residuals._names d_inp_names = d_inputs._names if not d_out_names and not d_inp_names: @@ -95,33 +144,34 @@ def _apply(self, system, d_inputs, d_outputs, d_residuals, mode): with system._unscaled_context(outputs=[d_outputs], residuals=[d_residuals]): for abs_key in self._iter_abs_keys(system): res_name, other_name = abs_key - if res_name in d_res_names: - if other_name in d_out_names: + ofvec = rflat(res_name) if res_name in d_res_names else None + + if other_name in d_out_names: + wrtvec = oflat(other_name) + elif other_name in d_inp_names: + wrtvec = iflat(other_name) + else: + wrtvec = None + + if fwd: + if is_explicit and res_name is other_name and wrtvec is not None: # skip the matvec mult completely for identity subjacs - if is_explicit and res_name is other_name: - if fwd: - val = rflat(res_name) - val -= oflat(other_name) - else: - val = oflat(other_name) - val -= rflat(res_name) - continue - if fwd: - left_vec = rflat(res_name) - right_vec = oflat(other_name) - else: - left_vec = oflat(other_name) - right_vec = rflat(res_name) - elif other_name in d_inp_names: - if fwd: - left_vec = rflat(res_name) - right_vec = iflat(other_name) - else: - left_vec = iflat(other_name) - right_vec = rflat(res_name) - else: + ofvec -= wrtvec continue + left_vec = ofvec + right_vec = wrtvec + else: # rev + left_vec = wrtvec + right_vec = ofvec + + if abs_key in self._key_owner and abs_key in system._cross_keys: + wrtowner = system._owning_rank[other_name] + if system.comm.rank == wrtowner: + system.comm.bcast(right_vec, root=wrtowner) + else: + right_vec = system.comm.bcast(None, root=wrtowner) + if left_vec is not None and right_vec is not None: subjac_info = subjacs_info[abs_key] if randgen: subjac = self._randomize_subjac(subjac_info['val'], abs_key) @@ -150,6 +200,21 @@ def _apply(self, system, d_inputs, d_outputs, d_residuals, mode): subjac = subjac.transpose() left_vec += subjac.dot(right_vec) + if abs_key in self._key_owner: + owner = self._key_owner[abs_key] + if owner == system.comm.rank: + system.comm.bcast(left_vec, root=owner) + elif owner is not None: + left_vec = system.comm.bcast(None, root=owner) + if fwd: + if res_name in d_res_names: + d_residuals._abs_set_val(res_name, left_vec) + else: # rev + if other_name in d_out_names: + d_outputs._abs_set_val(other_name, left_vec) + elif other_name in d_inp_names: + d_inputs._abs_set_val(other_name, left_vec) + class _CheckingJacobian(DictionaryJacobian): """ @@ -268,11 +333,15 @@ def set_col(self, system, icol, column): if key in self._subjacs_info: subjac = self._subjacs_info[key] if subjac['cols'] is None: + if subjac['val'] is None: # can happen for matrix free comp + subjac['val'] = np.zeros(subjac['shape']) subjac['val'][:, loc_idx] = column[start:end] else: match_inds = np.nonzero(subjac['cols'] == loc_idx)[0] if match_inds.size > 0: row_inds = subjac['rows'][match_inds] + if subjac['val'] is None: + subjac['val'] = np.zeros(len(subjac['rows'])) subjac['val'][match_inds] = column[start:end][row_inds] else: row_inds = np.zeros(0, dtype=INT_DTYPE) diff --git a/openmdao/jacobians/jacobian.py b/openmdao/jacobians/jacobian.py index 20a35823ce..3410d54b67 100644 --- a/openmdao/jacobians/jacobian.py +++ b/openmdao/jacobians/jacobian.py @@ -357,7 +357,7 @@ def _setup_index_maps(self, system): if ridxs is not _full_slice or cidxs is not _full_slice: # replace our local subjac with a smaller one but don't # change the subjac belonging to 
the system (which has values - # shared with subsystems) + # shared with other systems) if self._subjacs_info is system._subjacs_info: self._subjacs_info = system._subjacs_info.copy() meta = self._subjacs_info[key] = meta.copy() diff --git a/openmdao/solvers/linear/tests/test_linear_block_gs.py b/openmdao/solvers/linear/tests/test_linear_block_gs.py index 9c6b1e9ee9..dcdda4fa80 100644 --- a/openmdao/solvers/linear/tests/test_linear_block_gs.py +++ b/openmdao/solvers/linear/tests/test_linear_block_gs.py @@ -651,7 +651,6 @@ def compute(self, inputs, outputs): def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode): self.count += 1 - # print(f'{self.name}: call to compute jacvecs have x? {"x" in d_inputs} have y? {"y" in d_inputs} have w? {"w" in d_inputs}') if mode == 'fwd': if 'z' in d_outputs: if 'x' in d_inputs: diff --git a/openmdao/solvers/nonlinear/newton.py b/openmdao/solvers/nonlinear/newton.py index 161e68058f..1162cf5069 100644 --- a/openmdao/solvers/nonlinear/newton.py +++ b/openmdao/solvers/nonlinear/newton.py @@ -76,7 +76,6 @@ def _setup_solvers(self, system, depth): depth of the current system (already incremented). """ super()._setup_solvers(system, depth) - rank = MPI.COMM_WORLD.rank if MPI is not None else 0 self._disallow_discrete_outputs() @@ -220,35 +219,36 @@ def _single_iteration(self): approx_status = system._owns_approx_jac system._owns_approx_jac = False - system._dresiduals.set_vec(system._residuals) - system._dresiduals *= -1.0 - my_asm_jac = self.linear_solver._assembled_jac + try: + system._dresiduals.set_vec(system._residuals) + system._dresiduals *= -1.0 + my_asm_jac = self.linear_solver._assembled_jac - system._linearize(my_asm_jac, sub_do_ln=do_sub_ln) - if (my_asm_jac is not None and system.linear_solver._assembled_jac is not my_asm_jac): - my_asm_jac._update(system) + system._linearize(my_asm_jac, sub_do_ln=do_sub_ln) + if (my_asm_jac is not None and system.linear_solver._assembled_jac is not my_asm_jac): + my_asm_jac._update(system) - self._linearize() + self._linearize() - self.linear_solver.solve('fwd') + self.linear_solver.solve('fwd') - if self.linesearch and not system.under_complex_step: - self.linesearch._do_subsolve = do_subsolve - self.linesearch.solve() - else: - system._outputs += system._doutputs + if self.linesearch and not system.under_complex_step: + self.linesearch._do_subsolve = do_subsolve + self.linesearch.solve() + else: + system._outputs += system._doutputs - self._solver_info.pop() + self._solver_info.pop() - # Hybrid newton support. - if do_subsolve: - with Recording('Newton_subsolve', 0, self): - self._solver_info.append_solver() - self._gs_iter() - self._solver_info.pop() - - # Enable local fd - system._owns_approx_jac = approx_status + # Hybrid newton support. 
+ if do_subsolve: + with Recording('Newton_subsolve', 0, self): + self._solver_info.append_solver() + self._gs_iter() + self._solver_info.pop() + finally: + # Enable local fd + system._owns_approx_jac = approx_status def _set_complex_step_mode(self, active): """ diff --git a/openmdao/test_suite/components/paraboloid_distributed.py b/openmdao/test_suite/components/paraboloid_distributed.py index b8d7de1189..73b94c2e48 100644 --- a/openmdao/test_suite/components/paraboloid_distributed.py +++ b/openmdao/test_suite/components/paraboloid_distributed.py @@ -30,7 +30,6 @@ def setup(self): start = offsets[rank] io_size = sizes[rank] self.offset = offsets[rank] - end = start + io_size # src_indices will be computed automatically self.add_input('x', val=np.ones(io_size), distributed=True) diff --git a/openmdao/test_suite/groups/parallel_groups.py b/openmdao/test_suite/groups/parallel_groups.py index c1c2afa560..76bab2d986 100644 --- a/openmdao/test_suite/groups/parallel_groups.py +++ b/openmdao/test_suite/groups/parallel_groups.py @@ -233,16 +233,18 @@ class ConvergeDiverge(om.Group): Used for testing parallel reverse scatters. """ - def __init__(self): + def __init__(self, parallel=True, inner_ivc=True): super().__init__() - self.add_subsystem('iv', om.IndepVarComp('x', 2.0)) + if inner_ivc: + self.add_subsystem('iv', om.IndepVarComp('x', 2.0)) self.add_subsystem('c1', om.ExecComp(['y1 = 2.0*x1**2', 'y2 = 3.0*x1' ])) - g1 = self.add_subsystem('g1', om.ParallelGroup()) + grp = om.ParallelGroup() if parallel else om.Group() + g1 = self.add_subsystem('g1', grp) g1.add_subsystem('c2', om.ExecComp('y1 = 0.5*x1')) g1.add_subsystem('c3', om.ExecComp('y1 = 3.5*x1')) @@ -250,14 +252,16 @@ def __init__(self): 'y2 = 3.0*x1 - 5.0*x2' ])) - g2 = self.add_subsystem('g2', om.ParallelGroup()) + grp = om.ParallelGroup() if parallel else om.Group() + g2 = self.add_subsystem('g2', grp) g2.add_subsystem('c5', om.ExecComp('y1 = 0.8*x1')) g2.add_subsystem('c6', om.ExecComp('y1 = 0.5*x1')) self.add_subsystem('c7', om.ExecComp('y1 = x1 + 3.0*x2')) # make connections - self.connect('iv.x', 'c1.x1') + if inner_ivc: + self.connect('iv.x', 'c1.x1') self.connect('c1.y1', 'g1.c2.x1') self.connect('c1.y2', 'g1.c3.x1') diff --git a/openmdao/utils/coloring.py b/openmdao/utils/coloring.py index 2cfffc729c..7ae112e36a 100644 --- a/openmdao/utils/coloring.py +++ b/openmdao/utils/coloring.py @@ -2697,12 +2697,10 @@ def _initialize_model_approx(model, driver, of=None, wrt=None): """ Set up internal data structures needed for computing approx totals. """ - design_vars = driver._designvars - if of is None: of = driver._get_ordered_nl_responses() if wrt is None: - wrt = list(design_vars) + wrt = list(driver._designvars) # Initialization based on driver (or user) -requested "of" and "wrt". if (not model._owns_approx_jac or model._owns_approx_of is None or @@ -2712,19 +2710,12 @@ def _initialize_model_approx(model, driver, of=None, wrt=None): model._owns_approx_wrt = wrt # Support for indices defined on driver vars. 
- if MPI and model.comm.size > 1: - of_idx = model._owns_approx_of_idx - for key, meta in driver._responses.items(): - if meta['indices'] is not None: - of_idx[key] = meta['indices'] - else: - model._owns_approx_of_idx = { - key: meta['indices'] - for key, meta in _src_or_alias_item_iter(driver._responses) - if meta['indices'] is not None - } + model._owns_approx_of_idx = { + key: meta['indices'] for key, meta in _src_or_alias_item_iter(driver._responses) + if meta['indices'] is not None + } model._owns_approx_wrt_idx = { - key: meta['indices'] for key, meta in _src_or_alias_item_iter(design_vars) + key: meta['indices'] for key, meta in _src_or_alias_item_iter(driver._designvars) if meta['indices'] is not None } diff --git a/openmdao/utils/general_utils.py b/openmdao/utils/general_utils.py index f096632a61..7817e7d9e0 100644 --- a/openmdao/utils/general_utils.py +++ b/openmdao/utils/general_utils.py @@ -320,6 +320,9 @@ def __contains__(self, name): return True +_contains_all = ContainsAll() + + def all_ancestors(pathname, delim='.'): """ Return a generator of pathnames of the starting object and all of its parents. @@ -1173,10 +1176,12 @@ def wing_dbg(): class LocalRangeIterable(object): """ - Iterable object yielding local indices while iterating over local or distributed vars. + Iterable object yielding local indices while iterating over local, distributed, or remote vars. The number of iterations for a distributed variable will be the full distributed size of the - variable but None will be returned for any indices that are not local to the given rank. + variable. + + None will be returned for any indices that are not local to the given rank. Parameters ---------- @@ -1185,15 +1190,17 @@ class LocalRangeIterable(object): vname : str Name of the variable. use_vec_offset : bool - If True, return indices for the given variable within its vector, else just return + If True, return indices for the given variable within its parent vector, else just return indices within the variable itself, i.e. range(var_size). Attributes ---------- + _vname : str + Name of the variable. _inds : ndarray Variable indices (unused for distributed variables). - _dist_size : int - Full size of distributed variable. + _var_size : int + Full size of distributed or remote variable. _start : int Starting index of distributed variable on this rank. _end : int @@ -1208,18 +1215,21 @@ def __init__(self, system, vname, use_vec_offset=True): """ Initialize the iterator. 
""" - self._dist_size = 0 + self._vname = vname + self._var_size = 0 - abs2meta = system._var_allprocs_abs2meta['output'] - if vname in abs2meta: + all_abs2meta = system._var_allprocs_abs2meta['output'] + if vname in all_abs2meta: sizes = system._var_sizes['output'] slices = system._outputs.get_slice_dict() + abs2meta = system._var_abs2meta['output'] else: - abs2meta = system._var_allprocs_abs2meta['input'] + all_abs2meta = system._var_allprocs_abs2meta['input'] sizes = system._var_sizes['input'] slices = system._inputs.get_slice_dict() + abs2meta = system._var_abs2meta['input'] - if abs2meta[vname]['distributed']: + if all_abs2meta[vname]['distributed']: var_idx = system._var_allprocs_abs2idx[vname] rank = system.comm.rank self._offset = np.sum(sizes[rank, :var_idx]) if use_vec_offset else 0 @@ -1227,13 +1237,32 @@ def __init__(self, system, vname, use_vec_offset=True): self._iter = self._dist_iter self._start = np.sum(sizes[:rank, var_idx]) self._end = self._start + sizes[rank, var_idx] - self._dist_size = np.sum(sizes[:, var_idx]) + self._var_size = np.sum(sizes[:, var_idx]) + elif vname not in abs2meta: # variable is remote + self._iter = self._remote_iter + self._var_size = all_abs2meta[vname]['global_size'] else: self._iter = self._serial_iter if use_vec_offset: self._inds = range(slices[vname].start, slices[vname].stop) else: self._inds = range(slices[vname].stop - slices[vname].start) + self._var_size = all_abs2meta[vname]['global_size'] + + def __str__(self): + """ + Return a string representation of the iterator. + + Returns + ------- + str + String representation of the iterator. + """ + if self._iter is self._dist_iter: + return f"LocalRangeIterable({self._vname}, dist: {self._start} to {self._end})" + elif self._iter is self._remote_iter: + return f"LocalRangeIterable({self._vname}, remote: size={self._var_size})" + return f"LocalRangeIterable({self._vname}, serial: size={self._var_size})" def _serial_iter(self): """ @@ -1258,12 +1287,24 @@ def _dist_iter(self): start = self._start end = self._end - for i in range(self._dist_size): + for i in range(self._var_size): if i >= start and i < end: yield i - start + self._offset else: yield None + def _remote_iter(self): + """ + Iterate over a remote variable. + + Yields + ------ + None + Always yields None. + """ + for _ in range(self._var_size): + yield None + def __iter__(self): """ Return an iterator. @@ -1363,3 +1404,26 @@ def inconsistent_across_procs(comm, arr, tol=1e-15, return_array=True): comm.gather(arr, root=0) return comm.bcast(None, root=0) + + +def get_rev_conns(conns): + """ + Return a dict mapping each connected output to a list of its connected inputs. + + Parameters + ---------- + conns : dict + Dict mapping each input to its connected output. + + Returns + ------- + dict + Dict mapping each connected output to a list of its connected inputs. + """ + rev = {} + for tgt, src in conns.items(): + if src in rev: + rev[src].append(tgt) + else: + rev[src] = [tgt] + return rev diff --git a/openmdao/utils/graph_utils.py b/openmdao/utils/graph_utils.py index f9bd4535be..1c04fa0538 100644 --- a/openmdao/utils/graph_utils.py +++ b/openmdao/utils/graph_utils.py @@ -54,63 +54,3 @@ def get_out_of_order_nodes(graph, orders): out_of_order.append((u, v)) return strongcomps, out_of_order - - -def get_hybrid_graph(connections, model=None): - """ - Return a graph of all variables and components in the model. 
- - Each component is connected each of its input and output variables, and - those variables are connected to other variables based on the connections - in the model. - - Parameters - ---------- - connections : dict - Dictionary of connections in the model, of the form {tgt: src}. - model : - The model that contains the connections, or None. If not None, all - outputs, even unconnected ones, will be included in the graph. - - Returns - ------- - networkx.DiGraph - Graph of all variables and components in the model. - """ - # Create a hybrid graph with components and all connected vars. If a var is connected, - # also connect it to its corresponding component. This results in a smaller graph - # (fewer edges) than would be the case for a pure variable graph where all inputs - # to a particular component would have to be connected to all outputs from that component. - graph = nx.DiGraph() - - for tgt, src in connections.items(): - if src not in graph: - graph.add_node(src, type_='out') - - graph.add_node(tgt, type_='in') - - src_sys, _, _ = src.rpartition('.') - graph.add_edge(src_sys, src) - - tgt_sys, _, _ = tgt.rpartition('.') - graph.add_edge(tgt, tgt_sys) - - graph.add_edge(src, tgt) - - if model is None: - return graph - - # it's possible to have outputs that are unconnected, so we need to add them to the graph - # and connect them to their component. All inputs are connected so we don't need to do the - # same for them. - for n in model._var_allprocs_abs2meta['output']: - if n not in graph: - graph.add_node(n, type_='out') - graph.add_edge(n.rpartition('.')[0], n) - - for n in model._discrete_outputs: - if n not in graph: - graph.add_node(n, type_='out') - graph.add_edge(n.rpartition('.')[0], n) - - return graph diff --git a/openmdao/utils/indexer.py b/openmdao/utils/indexer.py index 92ed608221..e4225b8f28 100644 --- a/openmdao/utils/indexer.py +++ b/openmdao/utils/indexer.py @@ -271,8 +271,6 @@ def set_src_shape(self, shape, dist_shape=None): raise self._shaped_inst = None - self._src_shape = sshape - return self def to_json(self): @@ -345,6 +343,24 @@ def __str__(self): """ return f"{self._idx}" + def apply_offset(self, offset, flat=True): + """ + Apply an offset to this index. + + Parameters + ---------- + offset : int + The offset to apply. + flat : bool + If True, return a flat index. + + Returns + ------- + int + The offset index. + """ + return self._idx + offset + def copy(self): """ Copy this Indexer. @@ -520,6 +536,24 @@ def __str__(self): """ return f"{self._slice}" + def apply_offset(self, offset, flat=True): + """ + Apply an offset to this index. + + Parameters + ---------- + offset : int + The offset to apply. + flat : bool + If True, return a flat index. + + Returns + ------- + slice + The offset slice. + """ + return slice(self._slice.start + offset, self._slice.stop + offset, self._slice.step) + def copy(self): """ Copy this Indexer. @@ -752,6 +786,24 @@ def __str__(self): """ return _truncate(f"{self._arr}".replace('\n', '')) + def apply_offset(self, offset, flat=True): + """ + Apply an offset to this index. + + Parameters + ---------- + offset : int + The offset to apply. + flat : bool + If True, return a flat index. + + Returns + ------- + slice + The offset slice. + """ + return self.as_array(flat=flat) + offset + def copy(self): """ Copy this Indexer. @@ -957,6 +1009,26 @@ def __str__(self): """ return str(self._tup) + def apply_offset(self, offset, flat=True): + """ + Apply an offset to this index. 
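Each Indexer flavor now gets `apply_offset`, which shifts the stored index into a parent array's frame: an int indexer returns `idx + offset`, a slice indexer returns a shifted slice, and the array-based indexers return `as_array(flat) + offset`. A rough sketch using the `indexer` factory for the first two cases; the exact return types depend on how the factory normalizes each input, so treat the expected values as illustrative:

    from openmdao.utils.indexer import indexer

    print(indexer(3).apply_offset(10))            # 13
    print(indexer(slice(2, 5)).apply_offset(10))  # a slice covering 12..15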
+ + Parameters + ---------- + offset : int + The offset to apply. + flat : bool + If True, return a flat index. + + Returns + ------- + ndarray + The offset array. + """ + if flat: + return self.flat() + offset + return self.as_array(flat=False) + offset + def copy(self): """ Copy this Indexer. @@ -1169,6 +1241,24 @@ def __str__(self): """ return f"{self._tup}" + def apply_offset(self, offset, flat=True): + """ + Apply an offset to this index. + + Parameters + ---------- + offset : int + The offset to apply. + flat : bool + If True, return a flat index. + + Returns + ------- + ndarray + The offset array. + """ + return self.as_array(flat=flat) + offset + def copy(self): """ Copy this Indexer. diff --git a/openmdao/utils/om.py b/openmdao/utils/om.py index 4f90635844..3d8b0f029b 100644 --- a/openmdao/utils/om.py +++ b/openmdao/utils/om.py @@ -43,7 +43,8 @@ from openmdao.components.meta_model_structured_comp import MetaModelStructuredComp from openmdao.components.meta_model_unstructured_comp import MetaModelUnStructuredComp from openmdao.core.component import Component -from openmdao.devtools.debug import config_summary, tree, comm_info +from openmdao.devtools.debug import config_summary, tree, comm_info, _dist_conns_cmd, \ + _dist_conns_setup_parser from openmdao.devtools.itrace import _itrace_exec, _itrace_setup_parser from openmdao.devtools.iprofile_app.iprofile_app import _iprof_exec, _iprof_setup_parser from openmdao.devtools.iprofile import _iprof_totals_exec, _iprof_totals_setup_parser @@ -440,7 +441,7 @@ def _list_pre_post(prob): def _get_deps(dep_dict: dict, package_name: str) -> None: """ - Recursively determine all installed depenency versions and add newly found ones to dep_dict. + Recursively determine all installed dependency versions and add newly found ones to dep_dict. Parameters ---------- @@ -536,6 +537,8 @@ def _set_dyn_hook(prob): 'Print MPI communicator info for systems.'), 'compute_entry_points': (_compute_entry_points_setup_parser, _compute_entry_points_exec, 'Compute entry point declarations to add to the setup.py file.'), + 'dist_conns': (_dist_conns_setup_parser, _dist_conns_cmd, + 'Display connection information for variables across multiple MPI processes.'), 'find_plugins': (_find_plugins_setup_parser, _find_plugins_exec, 'Find openmdao plugins on github.'), 'iprof': (_iprof_setup_parser, _iprof_exec, diff --git a/openmdao/utils/rangemapper.py b/openmdao/utils/rangemapper.py new file mode 100644 index 0000000000..382ec7bb69 --- /dev/null +++ b/openmdao/utils/rangemapper.py @@ -0,0 +1,403 @@ +""" +A collection of classes for mapping indices to variable names and vice versa. +""" + +from openmdao.utils.array_utils import shape_to_len + + +# default size of array for which we use a FlatRangeMapper instead of a RangeTree +MAX_FLAT_RANGE_SIZE = 10000 + + +class RangeMapper(object): + """ + A mapper of indices to variable names and vice versa. + + Parameters + ---------- + sizes : iterable of (key, size) tuples + Iterable of (key, size) tuples. key must be hashable. + + Attributes + ---------- + size : int + Total size of all of the sizes combined. + _key2range : dict + Dictionary mapping key to an index range. + """ + + def __init__(self, sizes): + """ + Initialize a RangeMapper. + """ + self._key2range = {} + self.size = sum(size for _, size in sizes) + + @staticmethod + def create(sizes, max_flat_range_size=MAX_FLAT_RANGE_SIZE): + """ + Return a mapper that maps indices to variable names and relative indices. 
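`RangeMapper.create` chooses between the two concrete mappers based on total size: below `MAX_FLAT_RANGE_SIZE` it builds a `FlatRangeMapper` (one list entry per index, O(1) lookup), and above that a `RangeTree` (O(log n) lookup, far less memory for large totals). The sketch below mirrors the sizes used in the new unit tests:

    from openmdao.utils.rangemapper import RangeMapper

    sizes = [('a', 1), ('b', 8), ('x', 6), ('y', 21), ('z', 6)]  # total size 42

    # 42 is below the default threshold, so a FlatRangeMapper is returned
    print(type(RangeMapper.create(sizes)))
    # lowering the threshold forces the tree-based mapper
    print(type(RangeMapper.create(sizes, max_flat_range_size=40)))

    mapper = RangeMapper.create(sizes)
    print(mapper[14])                 # 'x'  (index 14 falls in x's range 9:15)
    print(mapper.key2range('y'))      # (15, 36)
    print(mapper.index2key_rel(22))   # ('y', 7)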
+ + Parameters + ---------- + sizes : iterable of (key, size) + Iterable of (key, size) tuples. + max_flat_range_size : int + If the total array size is less than this, a FlatRangeMapper will be returned instead + of a RangeTree. + + Returns + ------- + FlatRangeMapper or RangeTree + A mapper that maps indices to variable key and relative indices. + """ + size = sum(size for _, size in sizes) + return FlatRangeMapper(sizes) if size < max_flat_range_size else RangeTree(sizes) + + def key2range(self, key): + """ + Get the range corresponding to the given key. + + Parameters + ---------- + key : object (must be hashable) + Data corresponding to an index range. + + Returns + ------- + tuple of (int, int) + The range of indices corresponding to the given key. + """ + return self._key2range[key] + + def key2size(self, key): + """ + Get the size corresponding to the given key. + + Parameters + ---------- + key : object (must be hashable) + Key corresponding to an index range. + + Returns + ------- + int + The size corresponding to the given key. + """ + start, stop = self._key2range[key] + return stop - start + + def __getitem__(self, idx): + """ + Find the key corresponding to the given index. + + Parameters + ---------- + idx : int + The index into the full array. + """ + raise NotImplementedError("__getitem__ method must be implemented by subclass.") + + def __iter__(self): + """ + Iterate over (key, start, stop) tuples. + + Yields + ------ + (obj, int, int) + (key, start index, stop index), where key is a hashable object. + """ + raise NotImplementedError("__getitem__ method must be implemented by subclass.") + + def dump(self): + """ + Dump the contents of the mapper to stdout. + """ + for key, (start, stop) in self._key2range.items(): + print(f'{key}: {start} - {stop}') + + +class RangeTreeNode(RangeMapper): + """ + A node in a binary search tree of sizes, mapping key to an index range. + + Parameters + ---------- + key : object + Data corresponding to an index range. + start : int + Starting index of the variable. + stop : int + Ending index of the variable. + + Attributes + ---------- + key : object + Data corresponding to an index range. + start : int + Starting index of the variable. + stop : int + Ending index of the variable. + left : RangeTreeNode or None + Left child node. + right : RangeTreeNode or None + Right child node. + """ + + __slots__ = ['key', 'start', 'stop', 'left', 'right'] + + def __init__(self, key, start, stop): + """ + Initialize a RangeTreeNode. + """ + self.key = key + self.start = start + self.stop = stop + self.left = None + self.right = None + + def __repr__(self): + """ + Return a string representation of the RangeTreeNode. + """ + return f"RangeTreeNode({self.key}, ({self.start}:{self.stop}))" + + +def _size_of_ranges(ranges): + size = 0 + for _, start, stop in ranges: + size += stop - start + + return size + + +class RangeTree(RangeMapper): + """ + A binary search tree of sizes, mapping key to an index range. + + Allows for fast lookup of the key corresponding to a given index. The sizes must be + contiguous, but they can be of different sizes. + + Search complexity is O(log2 n). Uses less memory than FlatRangeMapper when total array size is + large. + + Parameters + ---------- + sizes : list of (key, start, stop) + Ordered list of (key, start, stop) tuples, where start and stop define the range of + indices for the key. Ranges must be contiguous. key must be hashable. + + Attributes + ---------- + root : RangeTreeNode + Root node of the binary search tree. 
+ """ + + def __init__(self, sizes): + """ + Initialize a RangeTree. + """ + super().__init__(sizes) + ranges = [] + start = stop = 0 + for key, size in sizes: + stop += size + ranges.append((key, start, stop)) + start = stop + + self.root = self.build(ranges) + + def __getitem__(self, idx): + """ + Find the key corresponding to the given index. + + Parameters + ---------- + idx : int + The index into the full array. + + Returns + ------- + object or None + The key corresponding to the given index, or None if not found. + int or None + The rank corresponding to the given index, or None if not found. + """ + node = self.root + while node is not None: + if idx < node.start: + node = node.left + elif idx >= node.stop: + node = node.right + else: + return node.key + + def __iter__(self): + """ + Iterate over (key, start, stop) tuples. + + Yields + ------ + (obj, int, int) + (key, start index, stop index), where key is a hashable object. + """ + node = self.root + stack = [[node, node.left, node.right]] + while stack: + sub = stack[-1] + node, left, right = sub + if left: + stack.append([left, left.left, left.right]) + sub[1] = None # zero left + else: + if right: + stack.append([right, right.left, right.right]) + sub[2] = None # zero right + else: + stack.pop() + yield (node.key, node.start, node.stop) + + def index2key_rel(self, idx): + """ + Find the key and relative index corresponding to the matched range. + + Parameters + ---------- + idx : int + The index into the full array. + + Returns + ------- + obj or None + The key corresponding to the matched range, or None if not found. + int or None + The relative index into the matched range, or None if not found. + """ + node = self.root + while node is not None: + if idx < node.start: + node = node.left + elif idx >= node.stop: + node = node.right + else: + return node.key, idx - node.start + + return None, None + + def build(self, ranges): + """ + Build a binary search tree to map indices to variable key. + + Parameters + ---------- + ranges : list of (key, start, stop) + List of (key, start, stop) tuples, where start and stop + define the range of indices for the key. Ranges must be ordered and contiguous. + key must be hashable. + + Returns + ------- + RangeTreeNode + Root node of the binary search tree. + """ + mid = len(ranges) // 2 + + key, start, stop = ranges[mid] + + node = RangeTreeNode(key, start, stop) + self._key2range[key] = (start, stop) + + left_slices = ranges[:mid] + right_slices = ranges[mid + 1:] + + if left_slices: + node.left = self.build(left_slices) + + if right_slices: + node.right = self.build(right_slices) + + return node + + +class FlatRangeMapper(RangeMapper): + """ + A flat list mapping indices to variable key and relative indices. + + Parameters + ---------- + sizes : list of (key, size) + Ordered list of (key, size) tuples. key must be hashable. + + Attributes + ---------- + ranges : list of (key, start, stop) + List of (key, start, stop) tuples, where start and stop define the range of + indices for that key. Ranges must be contiguous. key must be hashable. + """ + + def __init__(self, sizes): + """ + Initialize a FlatRangeMapper. + """ + super().__init__(sizes) + self.ranges = [None] * self.size + start = stop = 0 + for key, size in sizes: + stop += size + self.ranges[start:stop] = [(key, start, stop)] * size + self._key2range[key] = (start, stop) + start = stop + + def __getitem__(self, idx): + """ + Find the key corresponding to the given index. 
+ + Parameters + ---------- + idx : int + The index into the full array. + + Returns + ------- + object or None + The key corresponding to the given index, or None if not found. + """ + try: + return self.ranges[idx][0] + except IndexError: + return None + + def __iter__(self): + """ + Iterate over (key, start, stop) tuples. + + Yields + ------ + (obj, int, int) + (key, start index, stop index), where key is a hashable object. + """ + for key, (start, stop) in self._key2range.items(): + yield (key, start, stop) + + def index2key_rel(self, idx): + """ + Find the key and relative index corresponding to the matched range. + + Parameters + ---------- + idx : int + The index into the full array. + + Returns + ------- + object or None + The key corresponding to the matched range, or None if not found. + int or None + The relative index into the matched range, or None if not found. + """ + try: + key, start, _ = self.ranges[idx] + except IndexError: + return (None, None) + + return (key, idx - start) diff --git a/openmdao/utils/tests/test_rangemapper.py b/openmdao/utils/tests/test_rangemapper.py new file mode 100644 index 0000000000..d29411f7e2 --- /dev/null +++ b/openmdao/utils/tests/test_rangemapper.py @@ -0,0 +1,86 @@ +import unittest + +from openmdao.utils.assert_utils import assert_near_equal +from openmdao.utils.rangemapper import RangeMapper, RangeTree, FlatRangeMapper, MAX_FLAT_RANGE_SIZE + + +_data = { + 'a': 1, # 0:1 + 'b': 8, # 1:9 + 'x': 6, # 9:15 + 'y': 21, # 15:36 + 'z': 6, # 36:42 +} + + +class TestRangeMapper(unittest.TestCase): + def test_create(self): + mapper = RangeMapper.create(_data.items()) + self.assertEqual(type(mapper), FlatRangeMapper) + mapper = RangeMapper.create(_data.items(), max_flat_range_size=40) + self.assertEqual(type(mapper), RangeTree) + + def test_get_item(self): + for mclass in (RangeTree, FlatRangeMapper): + with self.subTest(msg=f'{mclass.__name__} test'): + mapper = mclass(_data.items()) + inds = [0, 1, 7, 9, 14, 15, 22, 41, 42, 43] + expected = ['a', 'b', 'b', 'x', 'x', 'y', 'y', 'z', None, None] + for i, ex_i in zip(inds, expected): + got = mapper[i] + self.assertEqual(got, ex_i) + + def test_key2range(self): + for mclass in (RangeTree, FlatRangeMapper): + with self.subTest(msg=f'{mclass.__name__} test'): + mapper = mclass(_data.items()) + keys = ['a', 'b', 'x', 'y', 'z'] + expected = [(0, 1), (1, 9), (9, 15), (15, 36), (36, 42)] + for key, ex in zip(keys, expected): + got = mapper.key2range(key) + self.assertEqual(got, ex) + + try: + mapper.key2range('bad') + except KeyError: + pass + else: + self.fail("Expected KeyError") + + def test_key2size(self): + for mclass in (RangeTree, FlatRangeMapper): + with self.subTest(msg=f'{mclass.__name__} test'): + mapper = mclass(_data.items()) + keys = ['a', 'b', 'x', 'y', 'z'] + expected = [1, 8, 6, 21, 6] + for key, ex in zip(keys, expected): + got = mapper.key2size(key) + self.assertEqual(got, ex) + + try: + mapper.key2size('bad') + except KeyError: + pass + else: + self.fail("Expected KeyError") + + def test_index2key_rel(self): + for mclass in (RangeTree, FlatRangeMapper): + with self.subTest(msg=f'{mclass.__name__} test'): + mapper = mclass(_data.items()) + inds = [0, 1, 7, 9, 14, 15, 22, 41, 42, 43] + expected = [('a',0), ('b',0), ('b',6), ('x',0), ('x',5), ('y',0), ('y',7), ('z',5), (None, None), (None, None)] + for i, ex in zip(inds, expected): + got = mapper.index2key_rel(i) + self.assertEqual(got, ex) + + def test_iter(self): + for mclass in (RangeTree, FlatRangeMapper): + with 
self.subTest(msg=f'{mclass.__name__} test'): + mapper = mclass(_data.items()) + expected = [('a', 0, 1), ('b', 1, 9), ('x', 9, 15), ('y', 15, 36), ('z', 36, 42)] + for got, ex in zip(mapper, expected): + self.assertEqual(got, ex) + +if __name__ == "__main__": + unittest.main() diff --git a/openmdao/vectors/default_transfer.py b/openmdao/vectors/default_transfer.py index c815e3f283..b0551619ef 100644 --- a/openmdao/vectors/default_transfer.py +++ b/openmdao/vectors/default_transfer.py @@ -9,14 +9,101 @@ from openmdao.utils.array_utils import _global2local_offsets from openmdao.utils.mpi import MPI -_empty_idx_array = np.array([], dtype=INT_DTYPE) +def _fill(arr, indices_list): + """ + Fill the given array with the given list of indices. + + Parameters + ---------- + arr : ndarray + Array to be filled. + indices_list : list of int ndarrays or ranges + List of ranges/indices to be placed into arr. + """ + start = end = 0 + for inds in indices_list: + end += len(inds) + arr[start:end] = inds + start = end + + +def _setup_index_views(tot_size, in_xfers, out_xfers): + """ + Create index views for all subsystems and allocate full transfer arrays. + + Parameters + ---------- + tot_size : int + Total size of each full array. + in_xfers : dict + Mapping of subsystem name to input index arrays. + out_xfers : dict + Mapping of subsystem name to output index arrays. + """ + full_in = np.empty(tot_size, dtype=INT_DTYPE) + full_out = np.empty(tot_size, dtype=INT_DTYPE) + + start = end = 0 + for sname, ranges in in_xfers.items(): + # input inds are always ranges. output inds may be ranges or ndarrays. + rstart = rend = start + for rng in ranges: + rend += len(rng) + full_in[rstart:rend] = rng + rstart = rend + + end += rend - start + _fill(full_out[start:end], out_xfers[sname]) + + # change subsystem transfer entries to be views of the full transfer arrays + in_xfers[sname] = full_in[start:end] + out_xfers[sname] = full_out[start:end] + start = end + + return full_in, full_out + + +def _setup_index_arrays(tot_size, in_xfers, out_xfers, vectors): + """ + Create index arrays for all subsystems. -def _merge(indices_list): - if len(indices_list) > 0: - return np.concatenate(indices_list) + Parameters + ---------- + tot_size : int + Total size of each full array. + in_xfers : dict + Mapping of subsystem name to input index arrays. + out_xfers : dict + Mapping of subsystem name to output index arrays. + vectors : dict + Dictionary of input and output vectors. + + Returns + ------- + dict + Mapping of subsystem name to Transfer object. None key maps to the + 'full' transfer across all subsystems. + """ + xfer_in, xfer_out = _setup_index_views(tot_size, in_xfers, out_xfers) + + if tot_size > 0: + xfer_all = DefaultTransfer(vectors['input']['nonlinear'], + vectors['output']['nonlinear'], xfer_in, xfer_out) else: - return _empty_idx_array + xfer_all = None + + xfer_dict = {None: xfer_all} + + for sname, inds in in_xfers.items(): + if inds.size > 0: + xfer_dict[sname] = DefaultTransfer(vectors['input']['nonlinear'], + vectors['output']['nonlinear'], + inds, out_xfers[sname]) + else: + xfer_dict[sname] = None + + return xfer_dict class DefaultTransfer(Transfer): @@ -33,12 +120,10 @@ class DefaultTransfer(Transfer): Input indices for the transfer. out_inds : int ndarray Output indices for the transfer. - comm : MPI.Comm or - Communicator of the system that owns this transfer. 
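The replacement for the old `_merge`/`np.concatenate` approach allocates each full transfer index array once and then rebinds every subsystem's entry to a view of it, so no per-subsystem copies are kept. A small sketch calling the private helper directly (hypothetical subsystem names, shown only to illustrate that the per-subsystem entries end up as views):

    import numpy as np
    from openmdao.vectors.default_transfer import _setup_index_views

    # per-subsystem input indices are ranges; output indices may be ranges or arrays
    in_xfers = {'c1': [range(0, 3)], 'c2': [range(3, 5)]}
    out_xfers = {'c1': [np.array([7, 8, 9])], 'c2': [range(0, 2)]}

    full_in, full_out = _setup_index_views(5, in_xfers, out_xfers)
    print(full_in)    # [0 1 2 3 4]
    print(full_out)   # [7 8 9 0 1]

    # each subsystem entry is now a view into the full array, not a copy
    print(in_xfers['c1'])                              # [0 1 2]
    print(np.shares_memory(in_xfers['c1'], full_in))   # True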
""" @staticmethod - def _setup_transfers(group): + def _setup_transfers(group, desvars, responses): """ Compute all transfers that are owned by our parent group. @@ -46,12 +131,16 @@ def _setup_transfers(group): ---------- group : Parent group. + desvars : dict + Dictionary of all design variable metadata. Keyed by absolute source name or alias. + responses : dict + Dictionary of all response variable metadata. Keyed by absolute source name or alias. """ iproc = group.comm.rank rev = group._mode == 'rev' or group._mode == 'auto' for subsys in group._subgroups_myproc: - subsys._setup_transfers() + subsys._setup_transfers(desvars, responses) abs2meta = group._var_abs2meta @@ -61,8 +150,6 @@ def _setup_transfers(group): mypathlen = len(group.pathname + '.' if group.pathname else '') # Initialize empty lists for the transfer indices - xfer_in = [] - xfer_out = [] fwd_xfer_in = defaultdict(list) fwd_xfer_out = defaultdict(list) if rev: @@ -79,12 +166,13 @@ def _setup_transfers(group): if offsets_out.size > 0: offsets_out = offsets_out[iproc] + tot_size = 0 + # Loop through all connections owned by this group for abs_in, abs_out in group._conn_abs_in2out.items(): # This weeds out discrete vars (all vars are local if using this Transfer) if abs_in in abs2meta['input']: - indices = None # Get meta meta_in = abs2meta['input'][abs_in] @@ -99,15 +187,14 @@ def _setup_transfers(group): # 1. Compute the output indices offset = offsets_out[idx_out] if src_indices is None: - output_inds = np.arange(offset, offset + meta_in['size'], dtype=INT_DTYPE) + output_inds = range(offset, offset + meta_in['size']) else: output_inds = src_indices + offset # 2. Compute the input indices - input_inds = np.arange(offsets_in[idx_in], - offsets_in[idx_in] + sizes_in[idx_in], dtype=INT_DTYPE) - if indices is not None: - input_inds = input_inds.reshape(indices.shape) + # all input indices can be simple ranges during this part in order to save memory + input_inds = range(offsets_in[idx_in], offsets_in[idx_in] + sizes_in[idx_in]) + tot_size += sizes_in[idx_in] # Now the indices are ready - input_inds, output_inds sub_in = abs_in[mypathlen:].split('.', 1)[0] @@ -118,52 +205,9 @@ def _setup_transfers(group): rev_xfer_in[sub_out].append(input_inds) rev_xfer_out[sub_out].append(output_inds) - tot_size = 0 - for sname, inds in fwd_xfer_in.items(): - fwd_xfer_in[sname] = arr = _merge(inds) - fwd_xfer_out[sname] = _merge(fwd_xfer_out[sname]) - tot_size += arr.size - - if rev: - for sname, inds in rev_xfer_in.items(): - rev_xfer_in[sname] = _merge(inds) - rev_xfer_out[sname] = _merge(rev_xfer_out[sname]) - - if tot_size > 0: - try: - xfer_in = np.concatenate(list(fwd_xfer_in.values())) - xfer_out = np.concatenate(list(fwd_xfer_out.values())) - except ValueError: - xfer_in = xfer_out = np.zeros(0, dtype=INT_DTYPE) - - xfer_all = DefaultTransfer(vectors['input']['nonlinear'], - vectors['output']['nonlinear'], xfer_in, xfer_out, - group.comm) - else: - xfer_all = None - - transfers['fwd'] = xfwd = {} - xfwd[None] = xfer_all + transfers['fwd'] = _setup_index_arrays(tot_size, fwd_xfer_in, fwd_xfer_out, vectors) if rev: - transfers['rev'] = xrev = {} - xrev[None] = xfer_all - - for sname, inds in fwd_xfer_in.items(): - if inds.size > 0: - xfwd[sname] = DefaultTransfer(vectors['input']['nonlinear'], - vectors['output']['nonlinear'], - inds, fwd_xfer_out[sname], group.comm) - else: - xfwd[sname] = None - - if rev: - for sname, inds in rev_xfer_out.items(): - if inds.size > 0: - xrev[sname] = DefaultTransfer(vectors['input']['nonlinear'], - 
vectors['output']['nonlinear'], - rev_xfer_in[sname], inds, group.comm) - else: - xrev[sname] = None + transfers['rev'] = _setup_index_arrays(tot_size, rev_xfer_in, rev_xfer_out, vectors) @staticmethod def _setup_discrete_transfers(group): diff --git a/openmdao/vectors/petsc_transfer.py b/openmdao/vectors/petsc_transfer.py index d06988ce86..4d1cef2cc5 100644 --- a/openmdao/vectors/petsc_transfer.py +++ b/openmdao/vectors/petsc_transfer.py @@ -1,7 +1,13 @@ """Define the PETSc Transfer class.""" +import numpy as np +import networkx as nx from openmdao.utils.mpi import check_mpi_env +from openmdao.utils.general_utils import common_subpath +from openmdao.core.constants import INT_DTYPE use_mpi = check_mpi_env() +_empty_idx_array = np.array([], dtype=INT_DTYPE) + if use_mpi is False: PETScTransfer = None @@ -14,12 +20,10 @@ if use_mpi is True: raise ImportError("Importing petsc4py failed and OPENMDAO_USE_MPI is true.") - import numpy as np from petsc4py import PETSc from collections import defaultdict - from openmdao.vectors.default_transfer import DefaultTransfer, _merge - from openmdao.core.constants import INT_DTYPE + from openmdao.vectors.default_transfer import DefaultTransfer, _setup_index_views class PETScTransfer(DefaultTransfer): """ @@ -48,15 +52,15 @@ def __init__(self, in_vec, out_vec, in_inds, out_inds, comm): """ Initialize all attributes. """ - super().__init__(in_vec, out_vec, in_inds, out_inds, comm) - in_indexset = PETSc.IS().createGeneral(self._in_inds, comm=self._comm) - out_indexset = PETSc.IS().createGeneral(self._out_inds, comm=self._comm) + super().__init__(in_vec, out_vec, in_inds, out_inds) + in_indexset = PETSc.IS().createGeneral(self._in_inds, comm=comm) + out_indexset = PETSc.IS().createGeneral(self._out_inds, comm=comm) self._scatter = PETSc.Scatter().create(out_vec._petsc, out_indexset, in_vec._petsc, in_indexset).scatter @staticmethod - def _setup_transfers(group): + def _setup_transfers(group, desvars, responses): """ Compute all transfers that are owned by our parent group. @@ -64,162 +68,279 @@ def _setup_transfers(group): ---------- group : Parent group. + desvars : dict + Dictionary of all design variable metadata. Keyed by absolute source name or alias. + responses : dict + Dictionary of all response variable metadata. Keyed by absolute source name or + alias. 
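Keeping the per-connection input indices as `range` objects until the full arrays are filled is purely a memory optimization; a range is a constant-size object while an equivalent int64 index array grows linearly with the variable size. Roughly:

    import sys
    import numpy as np

    n = 1_000_000
    print(sys.getsizeof(range(0, n)))           # ~48 bytes on CPython, independent of n
    print(np.arange(n, dtype=np.int64).nbytes)  # 8_000_000 bytes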
""" rev = group._mode != 'fwd' for subsys in group._subgroups_myproc: - subsys._setup_transfers() + subsys._setup_transfers(desvars, responses) + + group._transfers = { + 'fwd': PETScTransfer._setup_transfers_fwd(group, desvars, responses) + } + + if rev: + group._transfers['rev'] = PETScTransfer._setup_transfers_rev(group, desvars, + responses) + + @staticmethod + def _setup_transfers_fwd(group, desvars, responses): + transfers = {} + + if not group._conn_abs_in2out: + return transfers abs2meta_in = group._var_abs2meta['input'] - abs2meta_out = group._var_abs2meta['output'] - allprocs_abs2meta_out = group._var_allprocs_abs2meta['output'] - myproc = group.comm.rank + myrank = group.comm.rank + + offsets_in = group._get_var_offsets()['input'][myrank, :] + mypathlen = len(group.pathname) + 1 if group.pathname else 0 + + xfer_in = defaultdict(list) + xfer_out = defaultdict(list) + + allprocs_abs2idx = group._var_allprocs_abs2idx + sizes_in = group._var_sizes['input'][myrank, :] + + total_len = 0 + + # Loop through all connections owned by this system + for abs_in, abs_out in group._conn_abs_in2out.items(): + sub_in = abs_in[mypathlen:].partition('.')[0] + + # Only continue if the input exists on this processor + if abs_in in abs2meta_in: + + output_inds, _ = _get_output_inds(group, abs_out, abs_in) + + idx_in = allprocs_abs2idx[abs_in] + input_inds = range(offsets_in[idx_in], offsets_in[idx_in] + sizes_in[idx_in]) + + total_len += len(input_inds) + + xfer_in[sub_in].append(input_inds) + xfer_out[sub_in].append(output_inds) + else: + # not a local input but still need entries in the transfer dicts to + # avoid hangs + xfer_in[sub_in] # defaultdict will create an empty list there + xfer_out[sub_in] + + if xfer_in: + full_xfer_in, full_xfer_out = _setup_index_views(total_len, xfer_in, xfer_out) + # full transfer (transfer to all subsystems at once) + transfers[None] = PETScTransfer(group._vectors['input']['nonlinear'], + group._vectors['output']['nonlinear'], + full_xfer_in, full_xfer_out, group.comm) + + # transfers to individual subsystems + for sname, inds in xfer_in.items(): + transfers[sname] = PETScTransfer(group._vectors['input']['nonlinear'], + group._vectors['output']['nonlinear'], + inds, xfer_out[sname], group.comm) - transfers = group._transfers = {} + return transfers + + @staticmethod + def _setup_transfers_rev(group, desvars, responses): + abs2meta_in = group._var_abs2meta['input'] + abs2meta_out = group._var_abs2meta['output'] + allprocs_abs2prom = group._var_allprocs_abs2prom + + # for an FD group, we use the relevance graph to determine which inputs on the + # boundary of the group are upstream of responses within the group so + # that we can perform any necessary corrections to the derivative inputs. + if group._owns_approx_jac: + if group.comm.size > 1 and group.pathname != '': + all_abs2meta_out = group._var_allprocs_abs2meta['output'] + all_abs2meta_in = group._var_allprocs_abs2meta['input'] + + # connections internal to this group and all direct/indirect subsystems + conns = group._conn_global_abs_in2out + + inp_boundary_set = set(all_abs2meta_in).difference(conns) + + for resp, dvdct in group._relevant.items(): + if resp in all_abs2meta_out: # resp is continuous and inside this group + if all_abs2meta_out[resp]['distributed']: # distributed response + for dv, tup in dvdct.items(): + # use only dvs outside of this group. 
+ if dv not in allprocs_abs2prom: + rel = tup[0] + for inp in inp_boundary_set.intersection(rel['input']): + if inp in abs2meta_in: + if resp not in group._fd_rev_xfer_correction_dist: + group._fd_rev_xfer_correction_dist[resp] = set() + group._fd_rev_xfer_correction_dist[resp].add(inp) + + # FD groups don't need reverse transfers + return {} + + myrank = group.comm.rank + allprocs_abs2idx = group._var_allprocs_abs2idx + transfers = group._transfers vectors = group._vectors offsets = group._get_var_offsets() mypathlen = len(group.pathname) + 1 if group.pathname else 0 - # Initialize empty lists for the transfer indices - xfer_in = [] - xfer_out = [] - fwd_xfer_in = defaultdict(list) - fwd_xfer_out = defaultdict(list) - if rev: - rev_xfer_in = defaultdict(list) - rev_xfer_out = defaultdict(list) + has_rev_par_coloring = any([m['parallel_deriv_color'] is not None + for m in responses.values()]) + xfer_in = defaultdict(list) + xfer_out = defaultdict(list) + + # xfers that are only active when parallel coloring is not + xfer_in_nocolor = defaultdict(list) + xfer_out_nocolor = defaultdict(list) - allprocs_abs2idx = group._var_allprocs_abs2idx sizes_in = group._var_sizes['input'] - sizes_out = group._var_sizes['output'] offsets_in = offsets['input'] offsets_out = offsets['output'] + total_size = total_size_nocolor = 0 + # Loop through all connections owned by this system for abs_in, abs_out in group._conn_abs_in2out.items(): + sub_out = abs_out[mypathlen:].partition('.')[0] + # Only continue if the input exists on this processor if abs_in in abs2meta_in: - - # Get meta meta_in = abs2meta_in[abs_in] - meta_out = allprocs_abs2meta_out[abs_out] - idx_in = allprocs_abs2idx[abs_in] idx_out = allprocs_abs2idx[abs_out] - # Read in and process src_indices - src_indices = meta_in['src_indices'] - if src_indices is None: - owner = group._owning_rank[abs_out] - # if the input is larger than the output on a single proc, we have - # to just loop over the procs in the same way we do when src_indices - # is defined. - if meta_in['size'] > sizes_out[owner, idx_out]: - src_indices = np.arange(meta_in['size'], dtype=INT_DTYPE) - else: - src_indices = src_indices.shaped_array() - - # 1. Compute the output indices - # NOTE: src_indices are relative to a single, possibly distributed variable, - # while the output_inds that we compute are relative to the full distributed - # array that contains all local variables from each rank stacked in rank order. - if src_indices is None: - if meta_out['distributed']: - # input in this case is non-distributed (else src_indices would be - # defined by now). dist output to non-distributed input conns w/o - # src_indices are not allowed. 
- raise RuntimeError(f"{group.msginfo}: Can't connect distributed output " - f"'{abs_out}' to non-distributed input '{abs_in}' " - "without declaring src_indices.", - ident=(abs_out, abs_in)) - else: - rank = myproc if abs_out in abs2meta_out else owner - offset = offsets_out[rank, idx_out] - output_inds = np.arange(offset, offset + meta_in['size'], - dtype=INT_DTYPE) - else: - output_inds = np.zeros(src_indices.size, INT_DTYPE) - start = end = 0 - for iproc in range(group.comm.size): - end += sizes_out[iproc, idx_out] - if start == end: - continue - - # The part of src on iproc - on_iproc = np.logical_and(start <= src_indices, src_indices < end) - - if np.any(on_iproc): - # This converts from iproc-then-ivar to ivar-then-iproc ordering - # Subtract off part of previous procs - # Then add all variables on previous procs - # Then all previous variables on this proc - # - np.sum(out_sizes[:iproc, idx_out]) - # + np.sum(out_sizes[:iproc, :]) - # + np.sum(out_sizes[iproc, :idx_out]) - # + inds - offset = offsets_out[iproc, idx_out] - start - output_inds[on_iproc] = src_indices[on_iproc] + offset - - start = end + output_inds, src_indices = _get_output_inds(group, abs_out, abs_in) # 2. Compute the input indices - input_inds = np.arange(offsets_in[myproc, idx_in], - offsets_in[myproc, idx_in] + - sizes_in[myproc, idx_in], dtype=INT_DTYPE) + input_inds = range(offsets_in[myrank, idx_in], + offsets_in[myrank, idx_in] + sizes_in[myrank, idx_in]) # Now the indices are ready - input_inds, output_inds - sub_in = abs_in[mypathlen:].partition('.')[0] - fwd_xfer_in[sub_in].append(input_inds) - fwd_xfer_out[sub_in].append(output_inds) - if rev: - sub_out = abs_out[mypathlen:].partition('.')[0] - rev_xfer_in[sub_out].append(input_inds) - rev_xfer_out[sub_out].append(output_inds) - else: - # not a local input but still need entries in the transfer dicts to - # avoid hangs - sub_in = abs_in[mypathlen:].partition('.')[0] - fwd_xfer_in[sub_in] # defaultdict will create an empty list there - fwd_xfer_out[sub_in] - if rev: - sub_out = abs_out[mypathlen:].partition('.')[0] - rev_xfer_in[sub_out] - rev_xfer_out[sub_out] - - for sname, inds in fwd_xfer_in.items(): - fwd_xfer_in[sname] = _merge(inds) - fwd_xfer_out[sname] = _merge(fwd_xfer_out[sname]) - if rev: - for sname, inds in rev_xfer_out.items(): - rev_xfer_in[sname] = _merge(rev_xfer_in[sname]) - rev_xfer_out[sname] = _merge(inds) + inp_is_dup, inp_missing, distrib_in = group.get_var_dup_info(abs_in, 'input') + out_is_dup, _, distrib_out = group.get_var_dup_info(abs_out, 'output') + + iowninput = myrank == group._owning_rank[abs_in] + + if inp_is_dup and (abs_out not in abs2meta_out or + (distrib_out and not iowninput)): + xfer_in[sub_out] + xfer_out[sub_out] + elif out_is_dup and inp_missing > 0 and (iowninput or distrib_in): + # if this rank owns the input or the input is distributed, + # and the output is duplicated, then we send the owning/distrib input + # to each duplicated output that doesn't have a corresponding connected + # input on the same rank. 
+ oidxlist = [] + iidxlist = [] + oidxlist_nc = [] + iidxlist_nc = [] + size = size_nc = 0 + for rnk, osize, isize in zip(range(group.comm.size), + group.get_var_sizes(abs_out, 'output'), + group.get_var_sizes(abs_in, 'input')): + if rnk == myrank: # transfer to output on same rank + oidxlist.append(output_inds) + iidxlist.append(input_inds) + size += len(input_inds) + elif osize > 0 and isize == 0: + # dup output exists on this rank but there is no corresponding + # input, so we send the owning/distrib input to the dup output + offset = offsets_out[rnk, idx_out] + if src_indices is None: + oarr = range(offset, offset + meta_in['size']) + elif src_indices.size > 0: + oarr = np.asarray(src_indices + offset, dtype=INT_DTYPE) + else: + continue + + if has_rev_par_coloring: + # these transfers will only happen if parallel coloring is + # not active for the current seed response + oidxlist_nc.append(oarr) + iidxlist_nc.append(input_inds) + size_nc += len(input_inds) + else: + oidxlist.append(oarr) + iidxlist.append(input_inds) + size += len(input_inds) + + if len(iidxlist) > 1: + input_inds = _merge(iidxlist, size) + output_inds = _merge(oidxlist, size) + else: + input_inds = iidxlist[0] + output_inds = oidxlist[0] - if fwd_xfer_in: - xfer_in = np.concatenate(list(fwd_xfer_in.values())) - xfer_out = np.concatenate(list(fwd_xfer_out.values())) - else: - xfer_in = xfer_out = np.zeros(0, dtype=INT_DTYPE) + total_size += len(input_inds) - out_vec = vectors['output']['nonlinear'] + xfer_in[sub_out].append(input_inds) + xfer_out[sub_out].append(output_inds) - xfer_all = PETScTransfer(vectors['input']['nonlinear'], out_vec, - xfer_in, xfer_out, group.comm) + if has_rev_par_coloring and iidxlist_nc: + # keep transfers separate that shouldn't happen when parallel + # deriv coloring is active + if len(iidxlist_nc) > 1: + input_inds = _merge(iidxlist_nc, size_nc) + output_inds = _merge(oidxlist_nc, size_nc) + else: + input_inds = iidxlist_nc[0] + output_inds = oidxlist_nc[0] - transfers['fwd'] = xfwd = {} - xfwd[None] = xfer_all - if rev: - transfers['rev'] = xrev = {} - xrev[None] = xfer_all + total_size_nocolor += len(input_inds) - for sname, inds in fwd_xfer_in.items(): - transfers['fwd'][sname] = PETScTransfer( - vectors['input']['nonlinear'], vectors['output']['nonlinear'], - inds, fwd_xfer_out[sname], group.comm) - if rev: - for sname, inds in rev_xfer_out.items(): - transfers['rev'][sname] = PETScTransfer( - vectors['input']['nonlinear'], vectors['output']['nonlinear'], - rev_xfer_in[sname], inds, group.comm) + xfer_in_nocolor[sub_out].append(input_inds) + xfer_out_nocolor[sub_out].append(output_inds) + else: + if (inp_is_dup and out_is_dup and src_indices is not None and + src_indices.size > 0): + offset = offsets_out[myrank, idx_out] + output_inds = np.asarray(src_indices + offset, dtype=INT_DTYPE) + + total_size += len(input_inds) + + xfer_in[sub_out].append(input_inds) + xfer_out[sub_out].append(output_inds) + else: + # remote input but still need entries in the transfer dicts to avoid hangs + xfer_in[sub_out] + xfer_out[sub_out] + if has_rev_par_coloring: + xfer_in_nocolor[sub_out] + xfer_out_nocolor[sub_out] + + full_xfer_in, full_xfer_out = _setup_index_views(total_size, xfer_in, xfer_out) + + transfers = { + None: PETScTransfer(vectors['input']['nonlinear'], + vectors['output']['nonlinear'], + full_xfer_in, full_xfer_out, group.comm) + } + + for sname, inds in xfer_out.items(): + transfers[sname] = PETScTransfer(vectors['input']['nonlinear'], + vectors['output']['nonlinear'], + 
xfer_in[sname], inds, group.comm) + + if xfer_in_nocolor: + full_xfer_in, full_xfer_out = _setup_index_views(total_size_nocolor, + xfer_in_nocolor, + xfer_out_nocolor) + + transfers[(None, 'nocolor')] = PETScTransfer(vectors['input']['nonlinear'], + vectors['output']['nonlinear'], + full_xfer_in, full_xfer_out, + group.comm) + + for sname, inds in xfer_out_nocolor.items(): + transfers[(sname, 'nocolor')] = PETScTransfer(vectors['input']['nonlinear'], + vectors['output']['nonlinear'], + xfer_in_nocolor[sname], inds, + group.comm) + + return transfers def _transfer(self, in_vec, out_vec, mode='fwd'): """ @@ -276,3 +397,89 @@ def _transfer(self, in_vec, out_vec, mode='fwd'): if in_vec._alloc_complex: data = in_vec._get_data() data[:] = in_petsc.array + + +def _merge(inds_list, tot_size): + """ + Convert a list of indices and/or ranges into an array. + + Parameters + ---------- + inds_list : list of ranges or ndarrays + List of indices. + tot_size : int + Total size of the indices in the list. + + Returns + ------- + ndarray + Array of indices. + """ + if inds_list: + arr = np.empty(tot_size, dtype=INT_DTYPE) + start = end = 0 + for inds in inds_list: + end += len(inds) + arr[start:end] = inds + start = end + + return arr + + return _empty_idx_array + + +def _get_output_inds(group, abs_out, abs_in): + owner = group._owning_rank[abs_out] + meta_in = group._var_abs2meta['input'][abs_in] + out_dist = group._var_allprocs_abs2meta['output'][abs_out]['distributed'] + in_dist = meta_in['distributed'] + src_indices = meta_in['src_indices'] + + rank = group.comm.rank if abs_out in group._var_abs2meta['output'] else owner + out_idx = group._var_allprocs_abs2idx[abs_out] + offsets = group._get_var_offsets()['output'][:, out_idx] + sizes = group._var_sizes['output'][:, out_idx] + + if src_indices is None: + orig_src_inds = src_indices + else: + src_indices = src_indices.shaped_array() + orig_src_inds = src_indices + if not out_dist and not in_dist: # convert from local to distributed src_indices + off = np.sum(sizes[:rank]) + if off > 0.: # adjust for local offsets + # don't do += to avoid modifying stored value + src_indices = src_indices + off + + # NOTE: src_indices are relative to a single, possibly distributed variable, + # while the output_inds that we compute are relative to the full distributed + # array that contains all local variables from each rank stacked in rank order. + if src_indices is None: + if out_dist: + # input in this case is non-distributed (else src_indices would be + # defined by now). dist output to non-distributed input conns w/o + # src_indices are not allowed. 
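`_merge` serves the same purpose for mixed lists of ranges and index arrays: the total size is already known, so it preallocates once and copies each piece into place instead of calling `np.concatenate`. The sketch below restates the helper rather than importing it, since `petsc_transfer` may only be fully importable when PETSc/mpi4py are available:

    import numpy as np

    def merge(inds_list, tot_size):
        # mirrors _merge() in petsc_transfer.py: preallocate once, then copy
        # each range/array into its slot
        arr = np.empty(tot_size, dtype=np.int64)
        start = end = 0
        for inds in inds_list:
            end += len(inds)
            arr[start:end] = inds
            start = end
        return arr

    print(merge([range(0, 3), np.array([7, 9])], 5))   # [0 1 2 7 9]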
+ raise RuntimeError(f"{group.msginfo}: Can't connect distributed output " + f"'{abs_out}' to non-distributed input '{abs_in}' " + "without declaring src_indices.", ident=(abs_out, abs_in)) + else: + offset = offsets[rank] + output_inds = range(offset, offset + sizes[rank]) + else: + output_inds = np.empty(src_indices.size, INT_DTYPE) + start = end = 0 + for iproc in range(group.comm.size): + end += sizes[iproc] + if start == end: + continue + + # The part of src on iproc + on_iproc = np.logical_and(start <= src_indices, src_indices < end) + + if np.any(on_iproc): + # This converts from global to variable specific ordering + output_inds[on_iproc] = src_indices[on_iproc] + (offsets[iproc] - start) + + start = end + + return output_inds, orig_src_inds diff --git a/openmdao/vectors/transfer.py b/openmdao/vectors/transfer.py index df959547ef..f44a26be8b 100644 --- a/openmdao/vectors/transfer.py +++ b/openmdao/vectors/transfer.py @@ -15,8 +15,6 @@ class Transfer(object): Input indices for the transfer. out_inds : int ndarray Output indices for the transfer. - comm : MPI.Comm or - Communicator of the system that owns this transfer. Attributes ---------- @@ -24,17 +22,14 @@ class Transfer(object): input indices for the transfer. _out_inds : int ndarray output indices for the transfer. - _comm : MPI.Comm or FakeComm - communicator of the system that owns this transfer. """ - def __init__(self, in_vec, out_vec, in_inds, out_inds, comm): + def __init__(self, in_vec, out_vec, in_inds, out_inds): """ Initialize all attributes. """ self._in_inds = in_inds self._out_inds = out_inds - self._comm = comm def __str__(self): """ diff --git a/openmdao/vectors/vector.py b/openmdao/vectors/vector.py index 9e456e9807..ab4452906d 100644 --- a/openmdao/vectors/vector.py +++ b/openmdao/vectors/vector.py @@ -226,18 +226,23 @@ def items(self): Yields ------ str - Name of each variable. + Relative name of each variable. ndarray or float Value of each variable. """ + if self._system().pathname: + plen = len(self._system().pathname) + 1 + else: + plen = 0 + if self._under_complex_step: for n, v in self._views.items(): if n in self._names: - yield n, v + yield n[plen:], v else: for n, v in self._views.items(): if n in self._names: - yield n, v.real + yield n[plen:], v.real def _name2abs_name(self, name): """ @@ -397,6 +402,24 @@ def _abs_get_val(self, name, flat=True): else: return self._views[name].real + def _abs_set_val(self, name, val): + """ + Set the variable value using the absolute name. + + No error checking is performed on the name. + + Parameters + ---------- + name : str + Absolute name in the owning system's namespace. + val : float or ndarray + Value to set. + """ + if self._under_complex_step: + self._views[name][:] = val + else: + self._views[name].real[:] = val + def __setitem__(self, name, value): """ Set the variable value.