In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib 
import os

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [3]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12421168673629810770
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 17376871065945972175
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 15895756372821059687
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_GPU:1"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 12422713576042441445
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_GPU:2"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 10083096500865780179
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_GPU:3"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 18307627054457248060
physical_device_desc: "device: XLA_GPU device"
]


In [4]:
import tensorflow as tf
print("Num XLA_GPUs Available: ", len(tf.config.list_physical_devices('XLA_GPU')))

Num XLA_GPUs Available:  4


In [5]:
# @tf.function(jit_compile=True)
@tf.function(experimental_compile=True)
def matmul(a, b):
    tf.debugging.set_log_device_placement(True)    
    c = tf.matmul(a, b)
    return c

In [6]:
# Create some tensors
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

c = matmul(a, b)
print(c)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


In [7]:
s = tf.compat.v1.Session()

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device



In [8]:
tf.config.list_physical_devices(
    device_type=None
)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:2', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:3', device_type='XLA_GPU')]

In [9]:
physical_devices = tf.config.list_physical_devices('XLA_GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 4


In [10]:
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [11]:
tf.config.get_visible_devices(
    device_type=None
)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:2', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:3', device_type='XLA_GPU')]

In [13]:
local_device_protos = device_lib.list_local_devices()
[x.name for x in local_device_protos if x.device_type == 'XLA_GPU']

['/device:XLA_GPU:0',
 '/device:XLA_GPU:1',
 '/device:XLA_GPU:2',
 '/device:XLA_GPU:3']

In [14]:
tf.config.experimental.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:XLA_CPU:0', device_type='XLA_CPU')]

In [15]:
tf.debugging.set_log_device_placement(True)

try:
  # Specify an invalid GPU device
  with tf.device('/device:XLA_GPU:0'):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = tf.matmul(a, b)
except RuntimeError as e:
    print(e)

# Use XLA with tf.function

### This tutorial trains a TensorFlow model to classify the MNIST dataset, where the training function is compiled using XLA.

### First, load TensorFlow and enable eager execution.

In [16]:
# # In TF 2.4 jit_compile is called experimental_compile
# pip install -q tf-nightly

In [17]:
import tensorflow as tf
tf.compat.v1.enable_eager_execution()

INFO:tensorflow:Enabling eager execution


### Then define some necessary constants and prepare the MNIST dataset.

In [18]:
# Size of each input image, 28 x 28 pixels
IMAGE_SIZE = 28 * 28
# Number of distinct number labels, [0..9]
NUM_CLASSES = 10
# Number of examples in each training batch (step)
TRAIN_BATCH_SIZE = 100
# Number of training steps to run
TRAIN_STEPS = 1000

# Loads MNIST dataset.
train, test = tf.keras.datasets.mnist.load_data()
train_ds = tf.data.Dataset.from_tensor_slices(train).batch(TRAIN_BATCH_SIZE).repeat()

# Casting from raw data to the required datatypes.
def cast(images, labels):
    images = tf.cast(
      tf.reshape(images, [-1, IMAGE_SIZE]), tf.float32)
    labels = tf.cast(labels, tf.int64)
    return (images, labels)

### Finally, define the model and the optimizer. The model uses a single dense layer.

In [19]:
layer = tf.keras.layers.Dense(NUM_CLASSES)
optimizer = tf.keras.optimizers.Adam()

## Define the training function

### In the training function, you get the predicted labels using the layer defined above, and then minimize the gradient of the loss using the optimizer. In order to compile the computation using XLA, place it inside tf.function with jit_compile=True.

In [20]:
@tf.function(jit_compile=True)
def train_mnist(images, labels):
    images, labels = cast(images, labels)

    with tf.GradientTape() as tape:
        predicted_labels = layer(images)
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=predicted_labels, labels=labels
        ))
        layer_variables = layer.trainable_variables
        grads = tape.gradient(loss, layer_variables)
        optimizer.apply_gradients(zip(grads, layer_variables))

## Train and test the model

### Once you have defined the training function, define the model.

In [21]:
for images, labels in train_ds:
    if optimizer.iterations > TRAIN_STEPS:
        break
    train_mnist(images, labels)

### And, finally, check the accuracy:

In [22]:
images, labels = cast(test[0], test[1])
predicted_labels = layer(images)
correct_prediction = tf.equal(tf.argmax(predicted_labels, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print("Prediction accuracy after training: %s" % accuracy)

Prediction accuracy after training: tf.Tensor(0.8789, shape=(), dtype=float32)


### Behind the scenes, the XLA compiler has compiled the entire TF function to HLO, which has enabled fusion optimizations. Using the introspection facilities, we can see the HLO code (other interesting possible values for "stage" are optimized_hlo for HLO after optimizations and optimized_hlo_dot for a Graphviz graph):

In [23]:
print(train_mnist.experimental_get_compiler_ir(images, labels)(stage='hlo'))

HloModule a_inference_train_mnist_5302__.205, input_output_alias={ {0}: (2, {}, may-alias), {1}: (3, {}, may-alias), {2}: (5, {}, may-alias), {3}: (8, {}, may-alias), {4}: (9, {}, may-alias), {5}: (10, {}, may-alias), {6}: (11, {}, may-alias) }

%max_float_.64 (x.65: f32[], y.66: f32[]) -> f32[] {
  %x.65 = f32[] parameter(0)
  %y.66 = f32[] parameter(1)
  ROOT %maximum.67 = f32[] maximum(f32[] %x.65, f32[] %y.66)
}

%add_float_.74 (x.75: f32[], y.76: f32[]) -> f32[] {
  %x.75 = f32[] parameter(0)
  %y.76 = f32[] parameter(1)
  ROOT %add.77 = f32[] add(f32[] %x.75, f32[] %y.76)
}

%add_float_.93 (x.94: f32[], y.95: f32[]) -> f32[] {
  %x.94 = f32[] parameter(0)
  %y.95 = f32[] parameter(1)
  ROOT %add.96 = f32[] add(f32[] %x.94, f32[] %y.95)
}

%Mean-reduction.105 (x.106: f32[], y.107: f32[]) -> f32[] {
  %x.106 = f32[] parameter(0)
  %y.107 = f32[] parameter(1)
  ROOT %add.108 = f32[] add(f32[] %x.106, f32[] %y.107)
}

%add_float_.123 (x.124: f32[], y.125: f32[]) -> f32[] {
  %x.124 =

# XLA autoclustering

## Classifying CIFAR-10 with XLA

### This tutorial trains a TensorFlow model to classify the CIFAR-10 dataset, and we compile it using XLA.

### Load and normalize the dataset using the Keras API:

In [24]:
import tensorflow as tf

# # Check that GPU is available: cf. https://colab.research.google.com/notebooks/gpu.ipynb
# assert(tf.test.gpu_device_name())

tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(False) # Start with XLA disabled.

def load_data():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train = x_train.astype('float32') / 256
    x_test = x_test.astype('float32') / 256

    # Convert class vectors to binary class matrices.
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)
    return ((x_train, y_train), (x_test, y_test))

(x_train, y_train), (x_test, y_test) = load_data()

### We define the model, adapted from the Keras CIFAR-10 example:

In [25]:
def generate_model():
    return tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:]),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Conv2D(32, (3, 3)),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),

    tf.keras.layers.Conv2D(64, (3, 3), padding='same'),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Conv2D(64, (3, 3)),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Activation('softmax')
    ])

model = generate_model()

### We train the model using the RMSprop optimizer:

In [35]:
def compile_model(model):
    opt = tf.keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)
    model.compile(loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])
    return model

model = compile_model(model)

def train_model(model, x_train, y_train, x_test, y_test, epochs=25):
    model.fit(x_train, y_train, batch_size=256, epochs=epochs, validation_data=(x_test, y_test), shuffle=True)

def warmup(model, x_train, y_train, x_test, y_test):
    # Warm up the JIT, we do not wish to measure the compilation time.
    initial_weights = model.get_weights()
    train_model(model, x_train, y_train, x_test, y_test, epochs=1)
    model.set_weights(initial_weights)

warmup(model, x_train, y_train, x_test, y_test)
%time train_model(model, x_train, y_train, x_test, y_test)

scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

 38/196 [====>.........................] - ETA: 13s - loss: 1.8264 - accuracy: 0.3398

KeyboardInterrupt: 

### Now let's train the model again, using the XLA compiler. To enable the compiler in the middle of the application, we need to reset the Keras session.

In [34]:
# We need to clear the session to enable JIT in the middle of the program.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True) # Enable XLA.
model = compile_model(generate_model())
(x_train, y_train), (x_test, y_test) = load_data()

warmup(model, x_train, y_train, x_test, y_test)
%time train_model(model, x_train, y_train, x_test, y_test)



Epoch 1/25
Epoch 2/25

KeyboardInterrupt: 

In [32]:
@tf.function(experimental_compile=True)
def gpu_test():
    sz = 25000
    while True:
        z = tf.math.multiply(
    tf.random.uniform(shape=[sz]), tf.random.uniform(shape=[sz]), name=None) 
        

In [33]:
# Use 'watch nvidida-smi' and htop to monitor GPU and CPU usage repectively.
gpu_test()

KeyboardInterrupt: 