Merge pull request #2468 from PrincetonUniversity/devel
Devel
kmantel committed Aug 18, 2022
2 parents dbdba04 + ab6e911 commit 86c70a9
Showing 9 changed files with 100 additions and 96 deletions.
@@ -769,18 +769,22 @@ def _is_static(it:SampleIterator):
return False

assert all(_is_static(sample_iterator) for sample_iterator in self.search_space)

assert ocm is ocm.agent_rep.controller
# Compiled evaluate expects the same variable as mech function
variable = [input_port.parameters.value.get(context) for input_port in ocm.input_ports]

# Compiled evaluate expects the same variable as composition
state_features = ocm.parameters.state_feature_values._get(context)
inputs, num_inputs_sets = ocm.agent_rep._parse_run_inputs(state_features, context)

num_evals = np.prod([d.num for d in self.search_space])

# Map allocations to values
comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, [context.execution_id])
execution_mode = ocm.parameters.comp_execution_mode._get(context)
if execution_mode == "PTX":
outcomes = comp_exec.cuda_evaluate(variable, num_evals)
outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals)
elif execution_mode == "LLVM":
outcomes = comp_exec.thread_evaluate(variable, num_evals)
outcomes = comp_exec.thread_evaluate(inputs, num_inputs_sets, num_evals)
else:
assert False, f"Unknown execution mode for {ocm.name}: {execution_mode}."
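The hunk above switches the compiled evaluate path from the controller's own input-port values to run-style composition inputs: `state_feature_values` are passed through `agent_rep._parse_run_inputs`, and the resulting inputs plus the number of input sets are handed to `cuda_evaluate` / `thread_evaluate`. A minimal, self-contained sketch of the kind of structure that parsing has to produce (node names and shapes here are hypothetical, not PsyNeuLink's API):

```python
import numpy as np

# Hypothetical stand-in for run-style inputs: one entry per INPUT node,
# shaped (input sets x ports x values), plus the number of input sets.
def parse_run_inputs(inputs):
    parsed = {}
    num_input_sets = None
    for node, values in inputs.items():
        arr = np.asarray(values, dtype=float)
        assert arr.ndim == 3, "expected (input sets x ports x values)"
        if num_input_sets is None:
            num_input_sets = arr.shape[0]
        # Every node must supply the same number of input sets.
        assert arr.shape[0] == num_input_sets, f"input set count mismatch for {node}"
        parsed[node] = arr
    return parsed, num_input_sets

state_features = {"stimulus": [[[0.5]]], "reward": [[[1.0]]]}
inputs, num_input_sets = parse_run_inputs(state_features)
print(num_input_sets)  # -> 1
```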

@@ -1744,14 +1748,46 @@ def _gen_llvm_select_min_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags:fr
return builder.function

def _gen_llvm_function_body(self, ctx, builder, params, state_features, arg_in, arg_out, *, tags:frozenset):
ocm = self._get_optimized_controller()
if ocm is not None:
assert ocm.function is self
obj_func = ctx.import_llvm_function(ocm, tags=tags.union({"evaluate"}))
controller = self._get_optimized_controller()
if controller is not None:
assert controller.function is self
obj_func = ctx.import_llvm_function(controller, tags=tags.union({"evaluate"}))
comp_args = builder.function.args[-3:]
obj_param_ptr = comp_args[0]
obj_state_ptr = comp_args[1]
extra_args = [arg_in, comp_args[2]]

# Construct input
comp_input = builder.alloca(obj_func.args[4].type.pointee, name="sim_input")

input_initialized = [False] * len(comp_input.type.pointee)
for src_idx, ip in enumerate(controller.input_ports):
if ip.shadow_inputs is None:
continue

# Shadow inputs point to an input port of a node.
# If that node takes direct input, it will have an associated
# (input_port, output_port) pair in the input_CIM.
# Use the former as an index into the composition input variable.
cim_in_port = controller.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
dst_idx = controller.agent_rep.input_CIM.input_ports.index(cim_in_port)

# Check that all inputs are unique
assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
input_initialized[dst_idx] = True

src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
# Destination is a struct of 2d arrays
dst = builder.gep(comp_input, [ctx.int32_ty(0),
ctx.int32_ty(dst_idx),
ctx.int32_ty(0)])
builder.store(builder.load(src), dst)

# Assert that we have populated all inputs
assert all(input_initialized), \
"Not all inputs to the simulated composition are initialized: {}".format(input_initialized)

# Extra args: input and data
extra_args = [comp_input, comp_args[2]]
else:
obj_func = ctx.import_llvm_function(self.objective_function)
obj_state_ptr = pnlvm.helpers.get_state_ptr(builder, self, state_features,
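The block added to `_gen_llvm_function_body` builds the simulated composition's input by routing each controller input port that shadows a node's input port to the matching slot of the composition input (via the `input_CIM`), asserting that every slot is filled exactly once. A plain-Python sketch of the same bookkeeping, using simplified stand-ins rather than PsyNeuLink port objects:

```python
# Simplified stand-ins for the controller's input ports and the input_CIM.
controller_input_ports = [
    {"name": "outcome", "shadow_inputs": None},      # not a shadowed input
    {"name": "shadow_A", "shadow_inputs": "A[in]"},  # shadows node A's input port
    {"name": "shadow_B", "shadow_inputs": "B[in]"},
]
# shadowed port -> (input_CIM input port, input_CIM output port)
input_CIM_ports = {"A[in]": ("cim_in_A", "cim_out_A"), "B[in]": ("cim_in_B", "cim_out_B")}
input_CIM_input_ports = ["cim_in_A", "cim_in_B"]      # order defines input indices

controller_values = [[3.0], [0.5], [1.5]]             # one value per controller port
comp_input = [None] * len(input_CIM_input_ports)

initialized = [False] * len(comp_input)
for src_idx, port in enumerate(controller_input_ports):
    if port["shadow_inputs"] is None:
        continue
    cim_in_port = input_CIM_ports[port["shadow_inputs"]][0]
    dst_idx = input_CIM_input_ports.index(cim_in_port)
    # Check that all inputs are unique.
    assert not initialized[dst_idx], f"double initialization of input {dst_idx}"
    initialized[dst_idx] = True
    comp_input[dst_idx] = controller_values[src_idx]

assert all(initialized), "not all composition inputs are initialized"
print(comp_input)  # -> [[0.5], [1.5]]
```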
5 changes: 3 additions & 2 deletions psyneulink/core/components/functions/userdefinedfunction.py
@@ -9,6 +9,7 @@
#
# ***************************************** USER-DEFINED FUNCTION ****************************************************

import builtins
import numpy as np
import typecheck as tc
from inspect import signature, _empty, getsourcelines, getsourcefile, getclosurevars
@@ -34,7 +35,7 @@ def __init__(self, *args, **kwargs):
self.functions = set()

def visit_Name(self, node):
if node.id not in __builtins__:
if node.id not in dir(builtins):
self.vars.add(node.id)

def visit_Call(self, node):
@@ -44,7 +45,7 @@ def visit_Call(self, node):
except AttributeError:
func_id = node.func.id

if func_id not in __builtins__:
if func_id not in dir(builtins):
self.functions.add(func_id)

for c in ast.iter_child_nodes(node):
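The `userdefinedfunction.py` change replaces membership tests against `__builtins__` with `dir(builtins)`: `__builtins__` is a CPython detail that is the `builtins` module in `__main__` but a plain dict in imported modules, so `name not in __builtins__` is unreliable (and can raise `TypeError` when it is a module). A standalone sketch of the patched visitor, with hypothetical sample code:

```python
import ast
import builtins

class UserIdentifierVisitor(ast.NodeVisitor):
    """Collect names and called functions that are not Python builtins
    (a simplified stand-in for the visitor patched above)."""
    def __init__(self):
        self.vars = set()
        self.functions = set()

    def visit_Name(self, node):
        if node.id not in dir(builtins):
            self.vars.add(node.id)

    def visit_Call(self, node):
        func_id = getattr(node.func, "attr", None) or getattr(node.func, "id", None)
        if func_id is not None and func_id not in dir(builtins):
            self.functions.add(func_id)
        for child in ast.iter_child_nodes(node):
            self.visit(child)

visitor = UserIdentifierVisitor()
visitor.visit(ast.parse("y = my_func(len(x), np.exp(x))"))
print(visitor.vars)       # e.g. {'y', 'x', 'np', 'my_func'}
print(visitor.functions)  # e.g. {'my_func', 'exp'}
```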
@@ -3196,10 +3196,6 @@ def evaluate_agent_rep(self, control_allocation, context=None, return_results=Fa
context=context
)

def _get_evaluate_input_struct_type(self, ctx):
# We construct input from optimization function input
return ctx.get_input_struct_type(self.function)

def _get_evaluate_output_struct_type(self, ctx):
# Returns a scalar that is the predicted net_outcome
return ctx.float_ty
@@ -3326,15 +3322,15 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=froz
ctx.get_state_struct_type(self.agent_rep).as_pointer(),
self._get_evaluate_alloc_struct_type(ctx).as_pointer(),
self._get_evaluate_output_struct_type(ctx).as_pointer(),
self._get_evaluate_input_struct_type(ctx).as_pointer(),
ctx.get_input_struct_type(self.agent_rep).as_pointer(),
ctx.get_data_struct_type(self.agent_rep).as_pointer()]

builder = ctx.create_llvm_function(args, self, str(self) + "_evaluate")
llvm_func = builder.function
for p in llvm_func.args:
p.attributes.add('nonnull')

comp_params, base_comp_state, allocation_sample, arg_out, arg_in, base_comp_data = llvm_func.args
comp_params, base_comp_state, allocation_sample, arg_out, comp_input, base_comp_data = llvm_func.args

if "const_params" in debug_env:
comp_params = builder.alloca(comp_params.type.pointee, name="const_params_loc")
@@ -3390,37 +3386,8 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=froz
ctx.int32_ty(0)])
builder.store(builder.load(sample_ptr), sample_dst)

# Construct input
comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")

input_initialized = [False] * len(comp_input.type.pointee)
for src_idx, ip in enumerate(self.input_ports):
if ip.shadow_inputs is None:
continue

# shadow inputs point to an input port of of a node.
# If that node takes direct input, it will have an associated
# (input_port, output_port) in the input_CIM.
# Take the former as an index to composition input variable.
cim_in_port = self.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
dst_idx = self.agent_rep.input_CIM.input_ports.index(cim_in_port)

# Check that all inputs are unique
assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
input_initialized[dst_idx] = True

src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
# Destination is a struct of 2d arrays
dst = builder.gep(comp_input, [ctx.int32_ty(0),
ctx.int32_ty(dst_idx),
ctx.int32_ty(0)])
builder.store(builder.load(src), dst)

# Assert that we have populated all inputs
assert all(input_initialized), \
"Not all inputs to the simulated composition are initialized: {}".format(input_initialized)

if "const_input" in debug_env:
comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")
if not debug_env["const_input"]:
input_init = [[os.defaults.variable.tolist()] for os in self.agent_rep.input_CIM.input_ports]
print("Setting default input: ", input_init)
38 changes: 20 additions & 18 deletions psyneulink/core/llvm/execution.py
@@ -149,7 +149,7 @@ def upload_ctype(self, data, name='other'):
# 0-sized structures fail to upload
# provide a small device buffer instead
return jit_engine.pycuda.driver.mem_alloc(4)
return jit_engine.pycuda.driver.to_device(bytearray(data))
return jit_engine.pycuda.driver.to_device(bytes(data))

def download_ctype(self, source, ty, name='other'):
self._downloaded_bytes[name] += ctypes.sizeof(ty)
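`upload_ctype` now passes `bytes(data)` rather than `bytearray(data)` to `to_device`; both copy the ctypes object's raw memory, but an immutable `bytes` buffer makes the read-only intent of a host-to-device upload explicit. A minimal illustration of taking a ctypes structure's raw bytes (the structure here is invented for the example):

```python
import ctypes

class Params(ctypes.Structure):
    _fields_ = [("gain", ctypes.c_double), ("size", ctypes.c_uint32)]

p = Params(gain=1.5, size=4)

# bytes() copies the structure's memory into an immutable buffer,
# which is all a host-to-device upload needs to read from.
raw = bytes(p)
assert len(raw) == ctypes.sizeof(Params)
print(raw.hex())
```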
@@ -563,8 +563,11 @@ def cuda_execute(self, inputs):

# Methods used to accelerate "Run"

def _get_run_input_struct(self, inputs, num_input_sets):
input_type = self._bin_run_func.byref_arg_types[3]
def _get_run_input_struct(self, inputs, num_input_sets, arg=3):
# Callers that override the input arg should ensure that _bin_func is not None
bin_f = self._bin_run_func if arg == 3 else self._bin_func

input_type = bin_f.byref_arg_types[arg]
c_input = (input_type * num_input_sets) * len(self._execution_contexts)
if len(self._execution_contexts) == 1:
inputs = [inputs]
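`_get_run_input_struct` packs Python-side inputs into a ctypes array of shape `(input_type * num_input_sets) * len(execution_contexts)`; with the new `arg` parameter it can build that structure for the evaluate entry point (argument 5) as well as for run (argument 3). A self-contained sketch of the same nesting, with a hypothetical per-trial input type:

```python
import ctypes

# Hypothetical per-trial input: two input ports, one double each.
TrialInput = ctypes.c_double * 2

def make_run_input_struct(inputs, num_input_sets, n_contexts=1):
    # Same nesting as above: (input_type * num_input_sets) * number_of_contexts
    run_input_ty = (TrialInput * num_input_sets) * n_contexts
    c_input = run_input_ty()
    for ctx_idx in range(n_contexts):
        for trial_idx, trial in enumerate(inputs):
            for port_idx, value in enumerate(trial):
                c_input[ctx_idx][trial_idx][port_idx] = value
    return c_input

ct_inputs = make_run_input_struct([[0.1, 0.2], [0.3, 0.4]], num_input_sets=2)
print(ct_inputs[0][1][0])  # -> 0.3
```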
@@ -676,7 +679,7 @@ def cuda_run(self, inputs, runs, num_input_sets):
assert runs_np[0] <= runs, "Composition ran more times than allowed!"
return _convert_ctype_to_python(ct_out)[0:runs_np[0]]

def _prepare_evaluate(self, variable, num_evaluations):
def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations):
ocm = self._composition.controller
assert len(self._execution_contexts) == 1

ct_comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1)
ct_comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)

# Construct input variable
var_dty = _element_dtype(bin_func.byref_arg_types[5])
converted_variable = np.concatenate(variable, dtype=var_dty)
# Construct input variable, the 5th parameter of the evaluate function
ct_inputs = self._get_run_input_struct(inputs, num_input_sets, 5)

# Output ctype
out_ty = bin_func.byref_arg_types[4] * num_evaluations

# return variable as numpy array. pycuda can use it directly
return ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty
return ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty

def cuda_evaluate(self, variable, num_evaluations):
ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty = \
self._prepare_evaluate(variable, num_evaluations)
self._uploaded_bytes['input'] += converted_variable.nbytes
def cuda_evaluate(self, inputs, num_input_sets, num_evaluations):
ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty = \
self._prepare_evaluate(inputs, num_input_sets, num_evaluations)

# Output is allocated on device, but we need the ctype (out_ty).
cuda_args = (self.upload_ctype(ct_comp_param, 'params'),
self.upload_ctype(ct_comp_state, 'state'),
jit_engine.pycuda.driver.mem_alloc(ctypes.sizeof(out_ty)),
jit_engine.pycuda.driver.In(converted_variable),
self.upload_ctype(ct_inputs, 'input'),
self.upload_ctype(ct_comp_data, 'data'),
)


return ct_results

def thread_evaluate(self, variable, num_evaluations):
ct_param, ct_state, ct_data, converted_variale, out_ty = \
self._prepare_evaluate(variable, num_evaluations)
def thread_evaluate(self, inputs, num_input_sets, num_evaluations):
ct_param, ct_state, ct_data, ct_inputs, out_ty = \
self._prepare_evaluate(inputs, num_input_sets, num_evaluations)

ct_results = out_ty()
ct_variable = converted_variale.ctypes.data_as(self.__bin_func.c_func.argtypes[5])
jobs = min(os.cpu_count(), num_evaluations)
evals_per_job = (num_evaluations + jobs - 1) // jobs

results = [ex.submit(self.__bin_func, ct_param, ct_state,
int(i * evals_per_job),
min((i + 1) * evals_per_job, num_evaluations),
ct_results, ct_variable, ct_data)
ct_results,
ctypes.cast(ctypes.byref(ct_inputs), self.__bin_func.c_func.argtypes[5]),
ct_data)
for i in range(jobs)]

parallel_stop = time.time()
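`thread_evaluate` now hands the whole ctypes input structure to each worker (cast to the compiled function's argument type) instead of a per-evaluation numpy variable, and still splits `num_evaluations` into contiguous ranges by ceiling division, one range per worker. A self-contained sketch of that partitioning; the work function is a placeholder, not the compiled evaluate:

```python
import os
from concurrent.futures import ThreadPoolExecutor

def evaluate_range(results, start, stop):
    # Placeholder for the compiled per-allocation evaluation.
    for i in range(start, stop):
        results[i] = float(i) ** 2

num_evaluations = 10
results = [None] * num_evaluations

jobs = min(os.cpu_count(), num_evaluations)
evals_per_job = (num_evaluations + jobs - 1) // jobs   # ceiling division

with ThreadPoolExecutor(max_workers=jobs) as ex:
    futures = [ex.submit(evaluate_range, results,
                         i * evals_per_job,
                         min((i + 1) * evals_per_job, num_evaluations))
               for i in range(jobs)]
    # Surface any exception raised in a worker.
    [f.result() for f in futures]

print(results)
```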
4 changes: 2 additions & 2 deletions psyneulink/core/llvm/helpers.py
@@ -443,15 +443,15 @@ def printf_float_array(builder, array, prefix="", suffix="\n", override_debug=Fa
printf(builder, prefix, override_debug=override_debug)

with array_ptr_loop(builder, array, "print_array_loop") as (b1, i):
printf(b1, "%lf ", b1.load(b1.gep(array, [ir.IntType(32)(0), i])), override_debug=override_debug)
printf(b1, "%lf ", b1.load(b1.gep(array, [i.type(0), i])), override_debug=override_debug)

printf(builder, suffix, override_debug=override_debug)


def printf_float_matrix(builder, matrix, prefix="", suffix="\n", override_debug=False):
printf(builder, prefix, override_debug=override_debug)
with array_ptr_loop(builder, matrix, "print_row_loop") as (b1, i):
row = b1.gep(matrix, [ir.IntType(32)(0), i])
row = b1.gep(matrix, [i.type(0), i])
printf_float_array(b1, row, suffix="\n", override_debug=override_debug)
printf(builder, suffix, override_debug=override_debug)

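The `helpers.py` change derives the leading zero GEP index from the loop index's own type (`i.type(0)`) instead of hard-coding `ir.IntType(32)(0)`, so the constant always matches the index's integer type. A minimal llvmlite sketch of the same idiom (the function built here is invented for illustration):

```python
import llvmlite.ir as ir

# Build a toy function that loads element idx from a [4 x double] array.
mod = ir.Module(name="gep_index_demo")
fn_ty = ir.FunctionType(ir.DoubleType(),
                        [ir.ArrayType(ir.DoubleType(), 4).as_pointer(),
                         ir.IntType(32)])
fn = ir.Function(mod, fn_ty, name="load_elem")
arr, idx = fn.args
b = ir.IRBuilder(fn.append_basic_block(name="entry"))

# idx.type(0) makes a zero constant of the same integer type as idx,
# mirroring the change from ir.IntType(32)(0) to i.type(0).
ptr = b.gep(arr, [idx.type(0), idx])
b.ret(b.load(ptr))

print(mod)
```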
5 changes: 2 additions & 3 deletions requirements.txt
@@ -6,9 +6,8 @@ graphviz<0.21.0
grpcio<1.43.0
grpcio-tools<1.43.0
llvmlite<0.40
matplotlib<3.5.3
modeci_mdf>=0.3.4, <0.4.2
modelspec<0.2.7
matplotlib<3.5.4
modeci_mdf<0.5, >=0.3.4
networkx<2.9
numpy<1.21.7, >=1.17.0
pillow<9.3.0
4 changes: 2 additions & 2 deletions tests/functions/test_transfer.py
@@ -76,15 +76,15 @@ def test_execute(func, variable, params, expected, benchmark, func_mode):
benchmark(ex, variable)


relu_derivative_helper = lambda x : RAND1 if x > 0 else RAND1 * RAND3
logistic_helper = RAND4 / (1 + np.exp(-(RAND1 * (test_var - RAND2)) + RAND3))
tanh_derivative_helper = (RAND1 * (test_var + RAND2) + RAND3)
tanh_derivative_helper = (1 - np.tanh(tanh_derivative_helper)**2) * RAND4 * RAND1

derivative_test_data = [
(Functions.Linear, test_var, {'slope':RAND1, 'intercept':RAND2}, RAND1),
(Functions.Exponential, test_var, {'scale':RAND1, 'rate':RAND2}, RAND1 * RAND2 * np.exp(RAND2 * test_var)),
(Functions.Logistic, test_var, {'gain':RAND1, 'x_0':RAND2, 'offset':RAND3, 'scale':RAND4}, RAND1 * RAND4 * logistic_helper * (1 - logistic_helper)),
(Functions.ReLU, test_var, {'gain':RAND1, 'bias':RAND2, 'leak':RAND3}, list(map(relu_derivative_helper, test_var))),
(Functions.ReLU, test_var, {'gain':RAND1, 'bias':RAND2, 'leak':RAND3}, np.where(test_var > 0, RAND1, RAND1 * RAND3)),
(Functions.Tanh, test_var, {'gain':RAND1, 'bias':RAND2, 'offset':RAND3, 'scale':RAND4}, tanh_derivative_helper),
]

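The test now computes the expected ReLU derivative with a vectorized `np.where(test_var > 0, RAND1, RAND1 * RAND3)` rather than mapping a scalar lambda over `test_var`; both yield `gain` for positive inputs and `gain * leak` otherwise. A quick standalone check of that equivalence (the constants below are arbitrary stand-ins for RAND1 and RAND3):

```python
import numpy as np

rng = np.random.default_rng(0)
test_var = rng.standard_normal(10)
GAIN, LEAK = 0.7, 0.2   # arbitrary stand-ins for RAND1 / RAND3

lambda_result = np.array(list(map(lambda x: GAIN if x > 0 else GAIN * LEAK, test_var)))
where_result = np.where(test_var > 0, GAIN, GAIN * LEAK)

assert np.allclose(lambda_result, where_result)
print(where_result)
```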
