Merge branch 'master' into config-version-update

# Conflicts: # .ps_project/config.yaml
Paperspace · May 14, 2019 · 51b5608 · 51b5608
2 parents c259fd1 + 3b0115a
commit 51b5608
Show file tree

Hide file tree

Showing 6 changed files with 256 additions and 59 deletions.
diff --git a/.gitignore b/.gitignore
@@ -102,3 +102,5 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+.idea/
diff --git a/README.md b/README.md
@@ -8,37 +8,109 @@ and
 [tf.layers](https://www.tensorflow.org/api_docs/python/tf/layers)
 APIs.
 
+# Gradient Setup
 
-## Setup
+## Single Node Training on Gradient
 
-To begin, you'll simply need the latest version of TensorFlow installed.
-First make sure you've [added the models folder to your Python path](/official/#running-the-models); otherwise you may encounter an error like `ImportError: No module named mnist`.
+### Install Gradient CLI
+
+```
+pip install paperspace
+```
+
+[Please check our documentation on how to install Gradient CLI and obtain a Token](https://app.gitbook.com/@paperspace/s/gradient/cli/install-the-cli)
+
+### Create project and obtain its handle
 
-Then to train the model, run the following:
+[Please check our documentation on how to create a project](https://app.gitbook.com/@paperspace/s/gradient/projects/create-a-project)
+
+### Create and start single node experiment
 
 ```
-python mnist.py
+paperspace-python experiments createAndStart singlenode --name mnist --projectHandle <your project handle> --experimentEnv "{\"EPOCHS_EVAL\":5,\"TRAIN_EPOCHS\":10,\"MAX_STEPS\":1000,\"EVAL_SECS\":10}" --container tensorflow/tensorflow:1.13.1-gpu-py3 --machineType K80 --command "python mnist.py" --workspaceUrl https://github.com/Paperspace/mnist-sample.git
+```
+
+That's it!
+
+## Multinode Training on Gradient
+
+### Create and start distributed multinode experiment
+
+```
+paperspace-python experiments createAndStart multinode --name mnist-multinode --projectHandle <your project handle> --experimentEnv "{\"EPOCHS_EVAL\":5,\"TRAIN_EPOCHS\":10,\"MAX_STEPS\":1000,\"EVAL_SECS\":10}" --experimentTypeId GRPC --workerContainer tensorflow/tensorflow:1.13.1-gpu-py3 --workerMachineType K80 --workerCommand 'pip install -r requirements.txt && python mnist.py' --workerCount 2 --parameterServerContainer tensorflow/tensorflow:1.13.1-py3 --parameterServerMachineType K80 --parameterServerCommand 'pip install -r requirements.txt && python mnist.py' --parameterServerCount 1 --workspaceUrl https://github.com/Paperspace/mnist-sample.git
+```
+
+### Modify your code to run distributed on Gradient
+
+#### Set `TF_CONFIG` environment variable
+
+First import from gradient-sdk:
+
+```
+from gradient_sdk import get_tf_config
+```
+
+Then call it from your script's `__main__` block:
+
+```
+if __name__ == '__main__':
+    get_tf_config()
+```
+
+This function will set `TF_CONFIG`, `INDEX` and `TYPE` for each node.
+
+For multi-worker training, as mentioned before, you need to set the `TF_CONFIG` environment variable for each binary running in your cluster. The `TF_CONFIG` environment variable is a JSON string that specifies the tasks that constitute a cluster, each task's address, and each task's role in the cluster.
+
+### Exporting a Model for deployments
+
+#### Export your Tensorflow model
+
+In order to serve a TensorFlow model, simply export a SavedModel from your TensorFlow program. [SavedModel](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) is a language-neutral, recoverable, hermetic serialization format that enables higher-level systems and tools to produce, consume, and transform TensorFlow models.
+
+Please refer to the [TensorFlow documentation](https://www.tensorflow.org/guide/saved_model#save_and_restore_models) for detailed instructions on how to export SavedModels.
+
+#### Example code showing how to export your model:
+
+```
+tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec)
+
+#Starting to Export model
+image = tf.placeholder(tf.float32, [None, 28, 28])
+input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
+            'image': image,
+        })
+mnist_classifier.export_savedmodel(<export directory>,
+                                    input_fn,
+                                    strip_default_attrs=True)
+#Model Exported
+```
+
+We use TensorFlow's [SavedModelBuilder module](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py) to export the model. SavedModelBuilder saves a "snapshot" of the trained model to reliable storage so that it can be loaded later for inference.
+
+For details on the SavedModel format, please see the documentation at [SavedModel README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+
+Be sure to set the export directory to `PS_MODEL_PATH` when running a model deployment on Gradient:
+
+```
+export_dir = os.path.abspath(os.environ.get('PS_MODEL_PATH'))
 ```
 
-Distributed Training on Gradient
+You can also use Gradient SDK to ensure you have the correct path:
 
-Just run the example with following parameters:
 ```
-  "name": "Mnist Sample",
-  "projectHandle": "<your project handle",
-  "parameterServerContainer": "tensorflow/tensorflow:1.13.1-gpu-py3",
-  "parameterServerMachineType": "K80",
-  "parameterServerCount": 1,
-  "workerCommand": "python mnist.py",
-  "workerContainer": "tensorflow/tensorflow:1.13.1-gpu-py3",
-  "workspaceUrl": "git+https://github.com/paperspace/mnist-sample.git",
-  "workerMachineType": "K80",
-  "workerCount": 2,
-  "parameterServerCommand": "python mnist.py"
+from gradient_sdk.utils import data_dir, model_dir, export_dir
 ```
-Gradient will generate TF_CONFIG in base64 format for each node so all you need to do in your other projects:
+
+# Local Setup
+
+To begin, you'll simply need the latest version of TensorFlow installed.
+
+First make sure you've [added the models folder to your Python path](/official/#running-the-models); otherwise you may encounter an error like `ImportError: No module named mnist`.
+
+Then, to train the model, simply run:
+
 ```
-paperspace_tf_config = json.loads(base64.urlsafe_b64decode(os.environ.get('TF_CONFIG')).decode('utf-8'))
+python mnist.py
 ```
 
 ## Exporting the model
@@ -51,12 +123,14 @@ python mnist.py --export_dir /tmp/mnist_saved_model
 
 ## Training the model for use with Tensorflow Serving on a CPU
 
-If you are training on Tensorflow using a GPU but would like to export the model for use in Tensorflow Serving on a CPU-only server you can train and/or export the model using ` --data_format=channels_last`:
+If you are training on Tensorflow using a GPU but would like to export the model for use in Tensorflow Serving on a CPU-only server, you can train and/or export the model using `--data_format=channels_last`:
+
 ```
 python mnist.py --data_format=channels_last
 ```
 
 The SavedModel will be saved in a timestamped directory under `/tmp/mnist_saved_model/` (e.g. `/tmp/mnist_saved_model/1513630966/`).
 
-**Getting predictions with SavedModel**
+## Getting predictions with SavedModel
+
 Use [`saved_model_cli`](https://www.tensorflow.org/guide/saved_model#cli_to_inspect_and_execute_savedmodel) to inspect and execute the SavedModel.
diff --git a/inference-rest-client-test.ipynb b/inference-rest-client-test.ipynb
@@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from random import randint\n",
+    "try:\n",
+    "    import matplotlib.pyplot as plt\n",
+    "except ImportError:\n",
+    "    print('Matplotlib not detected - images plotting not available')\n",
+    "\n",
+    "plotting = True\n",
+    "try:\n",
+    "    from matplotlib import image as mpimage\n",
+    "except ImportError:\n",
+    "    from PIL import Image as pilimage\n",
+    "    plotting = False\n",
+    "\n",
+    "import requests\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "def get_image_from_drive(path):\n",
+    "    # Load the image\n",
+    "    try:\n",
+    "        image = pilimage.open(path)\n",
+    "    except ImportError:\n",
+    "        image = mpimage.open(path)\n",
+    "    except Exception:\n",
+    "        raise\n",
+    "    return image\n",
+    "\n",
+    "def get_random_image_from_dataset(image_index=randint(0, 9999)):\n",
+    "    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
+    "    return x_test[image_index]\n",
+    "\n",
+    "def show_selected_image(image):\n",
+    "    fig = plt.figure()\n",
+    "    plt.subplot(1, 1, 1)\n",
+    "    plt.tight_layout()\n",
+    "    plt.imshow(image, cmap='gray', interpolation='none')\n",
+    "    plt.xticks([])\n",
+    "    plt.yticks([])\n",
+    "    plt.show()\n",
+    "    \n",
+    "def get_random_image_from_dataset(image_index=randint(0, 9999)):\n",
+    "    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
+    "    return x_test[image_index]\n",
+    "\n",
+    "\n",
+    "def show_selected_image(image):\n",
+    "    fig = plt.figure()\n",
+    "    plt.subplot(1, 1, 1)\n",
+    "    plt.tight_layout()\n",
+    "    plt.imshow(image, cmap='gray', interpolation='none')\n",
+    "    plt.xticks([])\n",
+    "    plt.yticks([])\n",
+    "    plt.show()\n",
+    "\n",
+    "\n",
+    "def make_vector(image):\n",
+    "    vector = []\n",
+    "    for item in image.tolist():\n",
+    "        vector.extend(item)\n",
+    "    return vector\n",
+    "\n",
+    "\n",
+    "def make_prediction_request(image, prediction_url):\n",
+    "    vector = make_vector(image)\n",
+    "    json = {\n",
+    "        \"inputs\": [vector]\n",
+    "    }\n",
+    "    response = requests.post(prediction_url, json=json)\n",
+    "\n",
+    "    print(response.status_code)\n",
+    "    print(response.text)\n",
+    "\n",
+    "image = get_random_image_from_dataset()\n",
+    "\n",
+    "show_selected_image(image)\n",
+    "make_prediction_request(image, 'http://127.0.0.1:8501/v1/models/mnist:predict')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/mnist.py b/mnist.py
@@ -17,14 +17,19 @@
 from __future__ import division
 from __future__ import print_function
 
-import base64
-import json
 import os
 
 from absl import app as absl_app
 from absl import flags
 import tensorflow as tf  # pylint: disable=g-bad-import-order
 
+gradient_sdk = True
+try:
+    from gradient_sdk import get_tf_config
+except ImportError:
+    print("Gradient SDK not installed. Distributed training is not possible")
+    gradient_sdk = False
+
 import dataset
 from utils.flags import core as flags_core
 from utils.logs import hooks_helper
@@ -83,36 +88,13 @@ def create_model(data_format):
         ])
 
 
-def get_tf_config():
-    tf_config = os.environ.get('TF_CONFIG')
-    if not tf_config:
-        return
-    return json.loads(tf_config)
-
-
-def get_paperspace_tf_config():
-    tf_config = os.environ.get('TF_CONFIG')
-    if not tf_config:
-        return
-    paperspace_tf_config = json.loads(base64.urlsafe_b64decode(tf_config).decode('utf-8'))
-
-    tf.logging.debug(str(paperspace_tf_config))
-    return paperspace_tf_config
-
-
-def set_tf_config():
-    tf_config = get_paperspace_tf_config()
-    if tf_config:
-        os.environ['TF_CONFIG'] = json.dumps(tf_config)
-
-
 def define_mnist_flags():
-    flags.DEFINE_integer('eval_secs', 60, 'How frequently to run evaluation step')
-    flags.DEFINE_integer('ckpt_steps', 100, 'How frequently to save a model checkpoin')
-    flags.DEFINE_integer('max_ckpts', 2, 'Maximum number of checkpoints to keep')
+    flags.DEFINE_integer('eval_secs', os.environ.get('EVAL_SECS', 600), 'How frequently to run evaluation step')
+    flags.DEFINE_integer('ckpt_steps', os.environ.get('CKPT_STEPS', 600), 'How frequently to save a model checkpoin')
+    flags.DEFINE_integer('max_ckpts', 5, 'Maximum number of checkpoints to keep')
     flags.DEFINE_integer('max_steps', os.environ.get('MAX_STEPS', 150000), 'Max steps')
-    flags.DEFINE_integer('save_summary_steps', 10, 'How frequently to save TensorBoard summaries')
-    flags.DEFINE_integer('log_step_count_steps', 10, 'How frequently to log loss & global steps/s')
+    flags.DEFINE_integer('save_summary_steps', 100, 'How frequently to save TensorBoard summaries')
+    flags.DEFINE_integer('log_step_count_steps', 100, 'How frequently to log loss & global steps/s')
     flags_core.define_base()
     flags_core.define_performance(num_parallel_calls=False)
     flags_core.define_image()
@@ -249,8 +231,12 @@ def eval_input_fn():
 
     tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec)
 
-    # Export the model if node is master and export_dir is set
-    if flags_obj.export_dir is not None and os.environ.get('TYPE') == 'master':
+    # Export the model only when export_dir is set; in a multinode experiment (PS_CONFIG present), only the master node exports.
+    if os.environ.get('PS_CONFIG') and os.environ.get('TYPE') != 'master':
+        tf.logging.debug('No model was exported')
+        return
+
+    if flags_obj.export_dir:
         tf.logging.debug('Starting to Export model to {}'.format(str(flags_obj.export_dir)))
         image = tf.placeholder(tf.float32, [None, 28, 28])
         input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
@@ -268,7 +254,12 @@ def main(_):
 if __name__ == '__main__':
 
     tf.logging.set_verbosity(tf.logging.DEBUG)
-    set_tf_config()
+
+    if gradient_sdk:
+        try:
+            get_tf_config()
+        except:
+            pass
     define_mnist_flags()
     # Print ENV Variables
     tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20)

diff --git a/requirements.txt b/requirements.txt
@@ -10,3 +10,8 @@ py-cpuinfo>=3.3.0
 scipy>=0.19.1
 typing
 matplotlib
+gradient-sdk
+pillow
+requests
+absl-py # for local run
+tensorflow==1.13.1 # for local run