
Commit 55a1146

SeeForTwo authored and tensorflower-gardener committed
rollforward of cl/441571702: Make RemoteCall decide if its outputs are host memory types using fulltype.

NEW: Removed DT_TO_FT and full_type_from_spec from structure.py and related tests from structure_test.py. Added fulltype_list_to_product to type_utils.py. multi_device_iterator_ops.py now uses fulltypes_for_flat_tensors and fulltype_list_to_product from python/framework/type_utils.py.

PiperOrigin-RevId: 463976466
1 parent f0f17a1 commit 55a1146

7 files changed: +142 −7 lines changed

tensorflow/core/kernels/function_ops.cc

Lines changed: 26 additions & 2 deletions

@@ -25,6 +25,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/graph_constructor.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/full_type.pb.h"
+#include "tensorflow/core/framework/full_type_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -273,7 +275,8 @@ REGISTER_KERNEL_BUILDER(Name(kGradientOp).Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name(kGradientOp).Device(DEVICE_DEFAULT),
                         SymbolicGradientOp);
 
-RemoteCallOp::RemoteCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+RemoteCallOp::RemoteCallOp(OpKernelConstruction* ctx)
+    : AsyncOpKernel(ctx), return_type_(ctx->def().experimental_type()) {
   OP_REQUIRES_OK(ctx,
                  ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, &func_));
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_dtypes_));
@@ -358,9 +361,30 @@ void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
     opts.args_alloc_attrs.push_back(arg_alloc_attrs);
   }
   opts.rets_alloc_attrs.reserve(output_dtypes_.size());
+  DCHECK(!return_type_.IsInitialized() ||
+         (return_type_.type_id() == TFT_UNSET) ||
+         (output_dtypes_.size() == return_type_.args_size()))
+      << "RemoteCall op has full type information for "
+      << return_type_.args_size() << " outputs but the number of outputs is "
+      << output_dtypes_.size();
   for (const auto& dtype : output_dtypes_) {
     AllocatorAttributes ret_alloc_attrs;
-    ret_alloc_attrs.set_on_host(DataTypeAlwaysOnHost(dtype));
+    bool on_host = DataTypeAlwaysOnHost(dtype);
+    if (return_type_.IsInitialized() && (return_type_.type_id() != TFT_UNSET)) {
+      DCHECK(return_type_.type_id() == TFT_PRODUCT)
+          << return_type_.DebugString();
+      FullTypeDef ftd = full_type::GetArgDefaultUnset(
+          return_type_, opts.rets_alloc_attrs.size());
+      if (full_type::IsHostMemoryType(ftd)) {
+        on_host = true;
+      }
+      VLOG(5) << "FulltypeDef for RemoteCall output="
+              << opts.rets_alloc_attrs.size()
+              << ", IsHostMemoryType=" << full_type::IsHostMemoryType(ftd)
+              << ":\n"
+              << ftd.DebugString();
+    }
+    ret_alloc_attrs.set_on_host(on_host);
     opts.rets_alloc_attrs.push_back(ret_alloc_attrs);
   }
   auto* rets = new std::vector<Tensor>;
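The hunk above is the heart of the change: each RemoteCall output is pinned to host memory either because its dtype always lives on host, or because its full type reveals a host-memory payload (for example, a string inside a variant-encoded ragged tensor). As a rough Python sketch of that per-output decision using the full_type_pb2 protos; is_host_memory_type here is a simplified stand-in for full_type::IsHostMemoryType (an assumption for illustration, the real helper covers more cases):

# Rough sketch only (not the TensorFlow API): mirrors the per-output
# host-memory decision made in RemoteCallOp::ComputeAsync.
from tensorflow.core.framework import full_type_pb2


def is_host_memory_type(ft):
  # Simplified stand-in for full_type::IsHostMemoryType: treat any full
  # type tree containing TFT_STRING as host memory.
  if ft.type_id == full_type_pb2.TFT_STRING:
    return True
  return any(is_host_memory_type(arg) for arg in ft.args)


def rets_on_host(dtype_always_on_host, return_type):
  """One bool per output: True if the output must stay in host memory."""
  decisions = []
  for i, on_host in enumerate(dtype_always_on_host):
    if return_type.type_id == full_type_pb2.TFT_PRODUCT:
      # Analogue of full_type::GetArgDefaultUnset(return_type, i).
      ftd = (return_type.args[i] if i < len(return_type.args)
             else full_type_pb2.FullTypeDef())
      on_host = on_host or is_host_memory_type(ftd)
    decisions.append(on_host)
  return decisions


# A variant-encoded ragged tensor of strings is not host-only by dtype
# (DT_VARIANT), but its full type exposes the string payload:
ragged_str = full_type_pb2.FullTypeDef(
    type_id=full_type_pb2.TFT_RAGGED,
    args=[full_type_pb2.FullTypeDef(type_id=full_type_pb2.TFT_STRING)])
product = full_type_pb2.FullTypeDef(
    type_id=full_type_pb2.TFT_PRODUCT, args=[ragged_str])
assert rets_on_host([False], product) == [True]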

tensorflow/core/kernels/function_ops.h

Lines changed: 5 additions & 0 deletions

@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_
 
+#include "tensorflow/core/framework/full_type_util.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -70,6 +71,10 @@ class RemoteCallOp : public AsyncOpKernel {
   NameAttrList func_;
   DataTypeVector input_dtypes_;
   DataTypeVector output_dtypes_;
+  // Note that in the future if all RemoteCall ops have full type
+  // information, the kernel will not need access to the "Tout" Attr and
+  // return_type_ will replace output_dtypes_.
+  FullTypeDef return_type_;
 
   mutex mu_;
   typedef std::pair<string, FunctionLibraryRuntime*> FunctionTarget;

tensorflow/python/BUILD

Lines changed: 18 additions & 0 deletions

@@ -3879,6 +3879,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "factory_ops_test",
+    size = "small",
+    srcs = ["ops/factory_ops_test.py"],
+    tags = [
+        "no_gpu",  # TODO(b/213596871): a similar test times out (delete
+        # the "no_gpu" tag once this bug is fully resolved)
+    ],
+    deps = [
+        ":sparse_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/eager:def_function",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "decode_proto_ops_gen",
     deps = [
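With this target registered, the new test should be runnable through the usual workflow, presumably bazel test //tensorflow/python:factory_ops_test (hedged: the exact invocation and GPU configuration depend on the local build; note the no_gpu tag above while b/213596871 remains open).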

tensorflow/python/data/ops/multi_device_iterator_ops.py

Lines changed: 13 additions & 1 deletion

@@ -25,6 +25,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import type_spec
+from tensorflow.python.framework import type_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -89,11 +90,22 @@ def _next_func(string_handle):
         attributes={"experimental_ints_on_device": True},
         autograph=False)  # Pure graph code.
     def _remote_next_func(string_handle):
-      return functional_ops.remote_call(
+      return_values = functional_ops.remote_call(
           target=source_device,
          args=[string_handle] + next_func_concrete.captured_inputs,
          Tout=structure.get_flat_tensor_types(self._element_spec),
          f=next_func_concrete)
+      # Add full type information to the graph so that the RemoteCall op
+      # can determine for each of its outputs whether or not they are ragged
+      # tensors (or other types that use variants) that contain strings
+      # (or other host memory types). Then RemoteCall can
+      # appropriately set AllocatorAttributes to control copies so
+      # strings/host memory types stay on CPU.
+      fulltype_list = type_utils.fulltypes_for_flat_tensors(self._element_spec)
+      fulltype = type_utils.fulltype_list_to_product(fulltype_list)
+      for return_value in return_values:
+        return_value.op.experimental_set_type(fulltype)
+      return return_values
 
     self._next_func = _remote_next_func.get_concrete_function()
     self._next_captured_args = self._next_func.captured_inputs
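The annotation pattern in this hunk is not specific to iterators. A hedged sketch (assuming a TF build where Operation.experimental_set_type is available, as it is in this commit) of attaching a TFT_PRODUCT full type to an op's outputs:

import tensorflow as tf
from tensorflow.core.framework import full_type_pb2

g = tf.Graph()
with g.as_default():
  # One string output, so the TFT_PRODUCT carries exactly one arg.
  out = tf.constant(['a', 'b'])
  product = full_type_pb2.FullTypeDef(
      type_id=full_type_pb2.TFT_PRODUCT,
      args=[
          full_type_pb2.FullTypeDef(
              type_id=full_type_pb2.TFT_TENSOR,
              args=[full_type_pb2.FullTypeDef(
                  type_id=full_type_pb2.TFT_STRING)])
      ])
  out.op.experimental_set_type(product)
  # The annotation should now travel with the NodeDef that kernels see.
  print(out.op.node_def.experimental_type)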

tensorflow/python/framework/type_utils.py

Lines changed: 6 additions & 1 deletion

@@ -162,5 +162,10 @@ def fulltypes_for_flat_tensors(element_spec):
   specs = _specs_for_flat_tensors(element_spec)
   full_types_lists = [_translate_to_fulltype_for_flat_tensors(s) for s in specs]
   rval = nest.flatten(full_types_lists)  # flattens list-of-list to flat list.
-  assert len(rval) == len(element_spec._flat_tensor_specs)  # pylint: disable=protected-access
   return rval
+
+
+def fulltype_list_to_product(fulltype_list):
+  """Convert a list of FullTypeDef into a single TFT_PRODUCT FullTypeDef."""
+  return full_type_pb2.FullTypeDef(
+      type_id=full_type_pb2.TFT_PRODUCT, args=fulltype_list)
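For reference, a small usage sketch of the new helper (hedged; it builds the element full types by hand rather than via fulltypes_for_flat_tensors):

from tensorflow.core.framework import full_type_pb2
from tensorflow.python.framework import type_utils

int64_tensor = full_type_pb2.FullTypeDef(
    type_id=full_type_pb2.TFT_TENSOR,
    args=[full_type_pb2.FullTypeDef(type_id=full_type_pb2.TFT_INT64)])
ragged_str = full_type_pb2.FullTypeDef(
    type_id=full_type_pb2.TFT_RAGGED,
    args=[full_type_pb2.FullTypeDef(type_id=full_type_pb2.TFT_STRING)])

product = type_utils.fulltype_list_to_product([int64_tensor, ragged_str])
assert product.type_id == full_type_pb2.TFT_PRODUCT
assert len(product.args) == 2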
tensorflow/python/ops/factory_ops_test.py

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that sparse tensors work with GPU, such as placement of int and string.
+
+Test using sparse tensors with distributed dataset. Since GPU does
+not support strings, sparse tensors containing strings should always be placed
+on CPU.
+"""
+
+from absl.testing import parameterized
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+def sparse_int64():
+  return sparse_tensor.SparseTensor(
+      indices=[[0, 0], [1, 1], [2, 2], [3, 3], [4, 0], [5, 1], [6, 2], [7, 3]],
+      values=constant_op.constant([1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.int64),
+      dense_shape=[8, 4])
+
+
+def sparse_str():
+  return sparse_tensor.SparseTensor(
+      indices=[[0, 0], [1, 1], [2, 2], [3, 3], [4, 0], [5, 1], [6, 2], [7, 3]],
+      values=constant_op.constant(['1', '2', '3', '4', '5', '6', '7', '8']),
+      dense_shape=[8, 4])
+
+
+class FactoryOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      (sparse_int64,),
+      (sparse_str,),
+  )
+  def testSparseWithDistributedDataset(self, sparse_factory):
+
+    @def_function.function
+    def distributed_dataset_producer(t):
+      strategy = mirrored_strategy.MirroredStrategy(['GPU:0', 'GPU:1'])
+      sparse_ds = dataset_ops.Dataset.from_tensor_slices(t).batch(2)
+      dist_dataset = strategy.experimental_distribute_dataset(sparse_ds)
+      ds = iter(dist_dataset)
+      return strategy.experimental_local_results(next(ds))[0]
+
+    t = sparse_factory()
+
+    result = distributed_dataset_producer(t)
+    self.assertAllEqual(
+        self.evaluate(sparse_ops.sparse_tensor_to_dense(t)[0]),
+        self.evaluate(sparse_ops.sparse_tensor_to_dense(result)[0]))
+
+
+if __name__ == '__main__':
+  test.main()

tensorflow/python/ops/ragged/ragged_factory_ops_test.py

Lines changed: 1 addition & 3 deletions

@@ -128,15 +128,13 @@ def distributed_dataset_producer(t):
       return strategy.experimental_local_results(next(ds))[0]
 
     t = ragged_factory()
-    if t.dtype == dtypes.string:
-      self.skipTest('b/194439197: fix ragged tensor of string')
 
     result = distributed_dataset_producer(t)
     self.assertAllEqual(self.evaluate(t[0]), self.evaluate(result[0]))
 
   @parameterized.parameters(
       (dense_str,),
-      # (ragged_str,),  # TODO(b/194439197) fix ragged tensor of string
+      (ragged_str,),
   )
   def testIntStringWithDistributedDataset(self, string_factory):
 