Incorrect derivatives when using bcasts and parallel groups connected to outer components #2883

Closed
Asthelen opened this issue Apr 10, 2023 · 9 comments · Fixed by #3069

@Asthelen

Description

When running our model with parallel groups instead of serial groups, derivatives were observed to be incorrect for outputs that are functions of both a parallel group subsystem and an outer subsystem. Derivatives of outputs that depend on only one of the two are correct, and derivatives are also correct when the parallel group subsystem only uses one processor.

Looking at a simpler example, this issue seems to stem from a combination of inconsistent d_outputs values across processors (within the parallel group subsystem's compute_jacvec_product function) and a bcast of the computed d_inputs (which may be necessary when a solver/analysis only exists on a subset of ranks).

As a side note, the example's check_totals also seems to get stuck when using a parallel group (hence the if statement at the end of the script).

Example

import numpy as np
from mpi4py import MPI
import openmdao.api as om

use_parallel_group = True

# dummy scenarios to run in parallel
scenario_labels = ['full_fea','full_fea2']

# dummy structural thickness design variables
dv_struct = 0.01*np.ones(20)

class DummyOuterComp(om.ExplicitComponent):
    # dummy component to use outputs of parallel group
    def setup(self):
        self.add_input('VAR', tags=['mphys_inputs'])
        self.add_output('VAR2', 0.)
    def compute(self, inputs, outputs):
        outputs['VAR2'] = 2. * inputs['VAR']
    def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode):
        if mode=='rev':
            if 'VAR2' in d_outputs:
                if 'VAR' in d_inputs:
                    d_inputs['VAR'] += 2. * d_outputs['VAR2']

class DummyInnerComp(om.ExplicitComponent):
    # dummy component to be evaluated in parallel by om.ParallelGroup
    def setup(self):
        self.add_input('dv_struct', shape_by_conn=True)
        self.add_output('mass', 0.0)

    def compute(self,inputs,outputs):
        mass = None
        if self.comm.rank==0: # pretend that this must be computed on one or a subset of procs...
                              # which may be necessary for various solvers/analyses
            mass = np.sum(inputs['dv_struct'])

        # broadcast to all procs
        mass = self.comm.bcast(mass)
        outputs['mass'] = mass

    def compute_jacvec_product(self,inputs, d_inputs, d_outputs, mode):
        if mode == 'rev':
            if 'mass' in d_outputs:
                print(f'JACVEC, comm {self.comm.rank}: d_outputs["mass"] = ',d_outputs['mass'])

                if 1: # bad derivative in parallel group due to non-uniform d_outputs['mass'] across procs...
                    if self.comm.rank==0: # compute on first proc
                        d_inputs['dv_struct'] += d_outputs['mass']

                    # broadcast to all procs
                    d_inputs['dv_struct'] = self.comm.bcast(d_inputs['dv_struct'])

                elif 0: # hack for getting correct derivative... sum over procs and divide by number of procs, since the average d_outputs['mass'] is still correct?
                    d_inputs['dv_struct'] += d_outputs['mass']
                    d_inputs['dv_struct'] = self.comm.allreduce(d_inputs['dv_struct'],op=MPI.SUM) / self.comm.size

                else: # alternative fix... don't use d_outputs until after the bcast
                    d_dv_struct = None
                    if self.comm.rank==0: # compute on first proc
                        d_dv_struct = 1.
                    d_dv_struct = self.comm.bcast(d_dv_struct)
                    d_inputs['dv_struct'] = d_dv_struct * d_outputs['mass']

class StructParallel(om.ParallelGroup): # derivatives in parallel are only correct if not using more than 1 proc per scenario
    def setup(self):
        for scenario in scenario_labels:
            self.add_subsystem(scenario, DummyInnerComp())

class StructSerial(om.Group): # derivatives are fine serially
    def setup(self):
        for scenario in scenario_labels:
            self.add_subsystem(scenario, DummyInnerComp())

# top-level group
class Top(om.Group):
    def setup(self):

        # dummy structural design variables
        self.add_subsystem('thickness_dv', om.IndepVarComp(), promotes=['*'])
        self.thickness_dv.add_output('dv_struct', dv_struct)

        # add the scenarios
        if use_parallel_group:
            self.add_subsystem('multipoint',StructParallel())
        else:
            self.add_subsystem('multipoint',StructSerial())

        # make necessary connections
        for scenario in scenario_labels:
            self.connect('dv_struct', f'multipoint.{scenario}.dv_struct')

        # add component that uses outputs from parallel components
        self.add_subsystem('dummy_comp', DummyOuterComp())
        self.connect('multipoint.full_fea.mass','dummy_comp.VAR')

prob = om.Problem()
prob.model = Top()
prob.setup(mode='rev')
prob.run_model()

# print response
for scenario in scenario_labels:
    mass = prob.get_val(f'multipoint.{scenario}.mass',get_remote=True)
    print(f'Scenario "{scenario}": mass = ', mass)

# check totals
if 0:
    prob.check_totals(
                of=['dummy_comp.VAR2','multipoint.full_fea.mass'],
                wrt=['dv_struct']
                )
else: # for when check_totals gets stuck in ParallelGroup...
    deriv = prob.compute_totals(
                of=['dummy_comp.VAR2','multipoint.full_fea.mass'],
                wrt=['dv_struct']
                )
    if MPI.COMM_WORLD.rank==0: print(deriv)

OpenMDAO Version

3.25.1-dev

Relevant environment information

I am using python 3.9.5 with the following package versions (among other packages):

numpy==1.24.2
scipy==1.10.1
mpi4py==3.1.4
petsc4py==3.15.0

@naylor-b
Member

@Asthelen, I put a fix up on my check_totals_hang branch for your other issue, #2884, and running this example on that branch seems to give me correct total derivatives. Are you able to give that branch a try before the PR goes in to see if it fixes this issue?

@Asthelen
Author

Thanks @naylor-b, that does seem to fix the hang, and check_totals shows the correct derivatives for the issue #2884 example.

The #2883 example doesn't get stuck now, but derivatives are the same as before (e.g. 6's instead of 2's when using 10 processors... 21's vs. 2's on 40). I assume you were referring to the other example showing correct derivatives, and not this example?

@naylor-b
Member

Ah, sorry, I was just looking at the 2 processor case. I'll dig into this.

@Asthelen
Author

Ah I see--thanks. Yeah, for this example, derivatives are correct with 1 processor per parallel group. Then they increase by 1 per processor per subsystem. I've seen another, more complicated case that also scaled linearly with the number of processors, and one that didn't.

@naylor-b
Member

naylor-b commented Apr 18, 2023

I played around with your example, and by removing the bcast from the compute_jacvec_product method of the DummyInnerComp I was able to get correct derivatives for various numbers of processors. So the new version of compute_jacvec_product just looks like this:

    def compute_jacvec_product(self,inputs, d_inputs, d_outputs, mode):
        if mode == 'rev':
            if 'mass' in d_outputs:
                d_inputs['dv_struct'] += d_outputs['mass']

@Asthelen
Author

Yeah, I had noticed that it doesn't happen without bcasts... I guess even though d_outputs['mass'] varies across processors (unlike in a serial group), the resulting total derivative is still fine.

Unfortunately our aero/structural solvers rely on bcasts in quite a few places, in both implicit and explicit components. That way our FEM can be run on a subset of processors, for example, since it doesn't make sense for it to use the same number of processors as our CFD. I did try tweaking our VLM solver (run on only the first rank) knowing this fix, but haven't gotten that working so far (its use of implicit components seems to complicate things).
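
For reference, the "alternative fix" branch in the example above already sketches the pattern that seems compatible with this: compute the partial on rank 0, broadcast the partial itself, and only multiply by d_outputs after the bcast, so nothing derived from the rank-varying seed gets broadcast. Condensed into a standalone method (this is just the toy mass/dv_struct component from the example, not our actual solver code, and it hasn't fixed the VLM case so far):

    def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode):
        if mode == 'rev' and 'mass' in d_outputs and 'dv_struct' in d_inputs:
            # compute the partial d(mass)/d(dv_struct) on rank 0 only,
            # as if the analysis only lived on a subset of ranks
            d_dv_struct = None
            if self.comm.rank == 0:
                d_dv_struct = np.ones_like(inputs['dv_struct'])
            # broadcast the partial itself, not anything built from d_outputs
            d_dv_struct = self.comm.bcast(d_dv_struct, root=0)
            # apply the (possibly rank-varying) seed only after the bcast
            d_inputs['dv_struct'] += d_dv_struct * d_outputs['mass']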

@Asthelen
Author

For what it's worth, I found a while back that this "fix" worked for the aero/structural cases I was trying to run:

d_inputs['dv_struct'] = self.comm.allreduce(d_outputs['mass'], op=MPI.SUM) / self.comm.size

I just needed to do this with the d_outputs of whatever final quantity of interest was being computed (e.g. aero coefficients, KS stress aggregate, structural mass, etc.)... no changes necessary in upstream components like the implicit aero/structural solvers.
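
Spelled out in terms of the toy example above, that would sit in the final component's compute_jacvec_product roughly like this (a sketch mirroring the "hack" branch of the example, with the dummy mass/dv_struct names rather than our real aero/structural variables; it leans on the average of the rank-varying d_outputs seed still being correct):

    def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode):
        if mode == 'rev' and 'mass' in d_outputs and 'dv_struct' in d_inputs:
            # local contribution uses a d_outputs value that can differ
            # from rank to rank inside the ParallelGroup
            d_local = d_inputs['dv_struct'] + d_outputs['mass']
            # sum over ranks and divide by comm size so the averaged
            # (and therefore correct) seed is what ends up in d_inputs
            d_inputs['dv_struct'] = self.comm.allreduce(d_local, op=MPI.SUM) / self.comm.size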

@naylor-b
Member

I think this issue is fixed now on my transfers5 branch. If you have time to check it out, please let me know if it's working. The derivative values at the outputs should be correct now without having to do any division by comm size or any other hacks like that.

@Asthelen
Author

This example works as well. I also tested something similar with the MPhys example (https://github.com/Asthelen/mphys/blob/e20877696e9f280b9b56fe469e93ef5c312cd9c9/examples/aerostructural/supersonic_panel/as_opt_parallel.py), which worked too.

However, that did hang when I tried this check_totals:

        prob.check_totals(step_calc='rel_avg',
                          of=['multipoint.aerostructural1.C_L','dummy.VAR2'],
                          wrt=['aoa','geometry_morph_param'],
                          compact_print=True,
                          directional=False,
                          show_progress=True)

with this objective/constraint and DummyOuterComp setup:

        # add objective/constraints
        self.add_objective(f'multipoint.{self.scenario_names[0]}.mass', ref=0.01)
        self.add_constraint(f'multipoint.{self.scenario_names[0]}.func_struct', upper=1.0, parallel_deriv_color='struct_cons') # run func_struct derivatives in parallel
        self.add_constraint(f'multipoint.{self.scenario_names[1]}.func_struct', upper=1.0, parallel_deriv_color='struct_cons')
        self.add_constraint(f'multipoint.{self.scenario_names[0]}.C_L', lower=0.15, ref=0.1, parallel_deriv_color='lift_cons') # run C_L derivatives in parallel
        self.add_constraint(f'multipoint.{self.scenario_names[1]}.C_L', lower=0.45, ref=0.1, parallel_deriv_color='lift_cons')

        self.add_subsystem('dummy', DummyOuterComp())
        self.connect(f'multipoint.{self.scenario_names[0]}.C_L', 'dummy.VAR')

When I comment out the first C_L constraint, it doesn't hang. Maybe this is similar to #2911, which unfortunately isn't fixed.
