Kubeflow Seldon e2e NLP ML pipeline using re-usable components #589

Merged: 24 commits, May 30, 2019
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,6 +2,7 @@
/target/
/public
cluster-manager/.m2/
.mypy_cache

.ipynb_checkpoints

@@ -202,4 +203,4 @@ wrappers/s2i/python/_python/

seldon-controller/go

testing/scripts/go
testing/scripts/go
1,152 changes: 1,152 additions & 0 deletions examples/kubeflow/README.md

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions examples/kubeflow/deploy_pipeline/pvc-access.yaml
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Pod
metadata:
  name: pvc-access-container
spec:
  containers:
  - name: pvc-access-container
    image: busybox
    command: ["/bin/sh", "-ec", "sleep 1000"]
    volumeMounts:
    - name: mypvc
      mountPath: /mnt
  volumes:
  - name: mypvc
    persistentVolumeClaim:
      claimName: PVC_NAME
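The `PVC_NAME` value above is a placeholder that must be substituted before the pod is applied. A minimal sketch of that substitution in Python (the claim name `nlp-pipeline-pvc` is illustrative, not taken from this PR):

```python
# Minimal sketch: fill in the PVC_NAME placeholder before piping the
# manifest to `kubectl apply -f -`. The claim name here is illustrative.
template = """apiVersion: v1
kind: Pod
metadata:
  name: pvc-access-container
spec:
  volumes:
  - name: mypvc
    persistentVolumeClaim:
      claimName: PVC_NAME
"""  # in practice, read from deploy_pipeline/pvc-access.yaml

manifest = template.replace("PVC_NAME", "nlp-pipeline-pvc")
print(manifest)
```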
71 changes: 71 additions & 0 deletions examples/kubeflow/deploy_pipeline/seldon_production_pipeline.yaml
@@ -0,0 +1,71 @@
---
apiVersion: machinelearning.seldon.io/v1alpha2
kind: SeldonDeployment
metadata:
  labels:
    app: seldon
  name: nlp-classifier
  namespace: kubeflow
spec:
  annotations:
    project_name: NLP Pipeline
    deployment_version: v1
  name: nlp-classifier
  oauth_key: oauth-key
  oauth_secret: oauth-secret
  predictors:
  - componentSpecs:
    - spec:
        containers:
        - image: clean_text_transformer:0.1
          imagePullPolicy: IfNotPresent
          name: cleantext
          resources:
            requests:
              memory: 1Mi
        - image: spacy_tokenizer:0.1
          imagePullPolicy: IfNotPresent
          name: spacytokenizer
        - image: tfidf_vectorizer:0.1
          imagePullPolicy: IfNotPresent
          name: tfidfvectorizer
          volumeMounts:
          - name: mypvc
            mountPath: /mnt
        - image: lr_text_classifier:0.1
          imagePullPolicy: IfNotPresent
          name: lrclassifier
          volumeMounts:
          - name: mypvc
            mountPath: /mnt
        terminationGracePeriodSeconds: 20
        volumes:
        - name: mypvc
          persistentVolumeClaim:
            claimName: PVC_NAME
    graph:
      children:
      - name: spacytokenizer
        endpoint:
          type: REST
        type: MODEL
        children:
        - name: tfidfvectorizer
          endpoint:
            type: REST
          type: MODEL
          children:
          - name: lrclassifier
            endpoint:
              type: REST
            type: MODEL
            children: []
      name: cleantext
      endpoint:
        type: REST
      type: MODEL
    name: single-model
    replicas: 1
    annotations:
      predictor_version: v1
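Once this SeldonDeployment is running, the inference graph is served over REST, and a request wraps the input text in Seldon's ndarray payload. A hedged sketch of building that payload (the 1-D list shape and the ingress URL in the comment are assumptions following Seldon's REST convention, not shown in this PR):

```python
import json

def build_payload(texts):
    # Seldon's REST protocol wraps inputs in {"data": {"ndarray": [...]}}.
    # A flat list of strings is assumed here; adjust to the model's shape.
    return {"data": {"ndarray": list(texts)}}

payload = build_payload(["This is an example post"])
body = json.dumps(payload)
# POST `body` to something like (host/path depend on your ingress):
# http://<ingress>/seldon/kubeflow/nlp-classifier/api/v0.1/predictions
```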

Binary file added examples/kubeflow/img/completed-pipeline.jpg
Binary file added examples/kubeflow/img/k-pipeline-dashboard.jpg
Binary file added examples/kubeflow/img/pipeline-only-run.jpg
Binary file added examples/kubeflow/img/pipeline-view.jpg
Binary file added examples/kubeflow/img/running-pipeline.jpg
Binary file added examples/kubeflow/img/upload-pipeline.jpg
1,280 changes: 1,280 additions & 0 deletions examples/kubeflow/kubeflow_seldon_e2e_pipeline.ipynb

Large diffs are not rendered by default.

@@ -0,0 +1,4 @@
MODEL_NAME=Transformer
API_TYPE=REST
SERVICE_TYPE=MODEL
PERSISTENCE=0
@@ -0,0 +1,43 @@
import re
import html
import logging

import numpy as np

class Transformer():
    __uplus_pattern = \
        re.compile(r"\<[uU]\+(?P<digit>[a-zA-Z0-9]+)\>")
    __markup_link_pattern = \
        re.compile(r"\[(.*)\]\((.*)\)")

    def predict(self, X, feature_names=None):
        logging.warning(X)
        f = np.vectorize(Transformer.transform_clean_text)
        X_clean = f(X)
        logging.warning(X_clean)
        return X_clean

    def fit(self, X, y=None, **fit_params):
        return self

    @staticmethod
    def transform_clean_text(raw_text):
        try:
            decoded = raw_text.encode("ISO-8859-1").decode("utf-8")
        except UnicodeDecodeError:
            decoded = raw_text.encode("ISO-8859-1").decode("cp1252")
        # html.unescape replaces the deprecated HTMLParser().unescape
        html_unescaped = html.unescape(decoded)
        html_unescaped = re.sub(r"\r\n", " ", html_unescaped)
        html_unescaped = re.sub(r"\r\r\n", " ", html_unescaped)
        html_unescaped = re.sub(r"\r", " ", html_unescaped)
        html_unescaped = html_unescaped.replace("&gt;", " > ")
        html_unescaped = html_unescaped.replace("&lt;", " < ")
        html_unescaped = html_unescaped.replace("--", " - ")
        html_unescaped = Transformer.__uplus_pattern.sub(
            r" U\g<digit> ", html_unescaped)
        html_unescaped = Transformer.__markup_link_pattern.sub(
            r" \1 \2 ", html_unescaped)
        html_unescaped = html_unescaped.replace("\\", "")
        return html_unescaped
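To see what these cleaning steps do in isolation, here is a small self-contained illustration using the stdlib `html.unescape` and the same markup-link regex; the sample string is made up:

```python
import re
import html

# Illustrative walk-through of the main cleaning steps on one sample string.
markup_link_pattern = re.compile(r"\[(.*)\]\((.*)\)")

raw = "this &amp; that -- see [docs](http://example.com)\r\n"
clean = html.unescape(raw)                          # "&amp;" -> "&"
clean = re.sub(r"\r\n", " ", clean)                 # drop carriage returns
clean = clean.replace("--", " - ")
clean = markup_link_pattern.sub(r" \1 \2 ", clean)  # keep link text and URL
print(clean)
```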

Empty file.
@@ -0,0 +1,4 @@
#!/bin/bash

s2i build . seldonio/seldon-core-s2i-python3:0.6 clean_text_transformer:0.1

@@ -0,0 +1,24 @@
import dill
import click

try:
    # Running for tests
    from .Transformer import Transformer
except ImportError:
    # Running from CLI
    from Transformer import Transformer

@click.command()
@click.option('--in-path', default="/mnt/raw_text.data")
@click.option('--out-path', default="/mnt/clean_text.data")
def run_pipeline(in_path, out_path):
    clean_text_transformer = Transformer()
    with open(in_path, 'rb') as in_f:
        x = dill.load(in_f)
    y = clean_text_transformer.predict(x)
    with open(out_path, "wb") as out_f:
        dill.dump(y, out_f)

if __name__ == "__main__":
    run_pipeline()

@@ -0,0 +1,3 @@
dill==0.2.9
click
numpy
@@ -0,0 +1,4 @@
FROM python:3.7-slim
COPY . /microservice
WORKDIR /microservice
RUN pip install -r requirements.txt
Empty file.
@@ -0,0 +1,4 @@
#!/bin/bash

docker build . -t data_downloader:0.1

@@ -0,0 +1,35 @@
import click
import numpy as np
import dill
import pandas as pd

@click.command()
@click.option('--labels-path', default="/mnt/labels.data")
@click.option('--features-path', default="/mnt/features.data")
@click.option('--csv-url', default="https://raw.githubusercontent.com/axsauze/reddit-classification-exploration/master/data/reddit_train.csv")
@click.option('--csv-encoding', default="ISO-8859-1")
@click.option('--features-column', default="BODY")
@click.option('--labels-column', default="REMOVED")
def run_pipeline(
        labels_path,
        features_path,
        csv_url,
        csv_encoding,
        features_column,
        labels_column):

    df = pd.read_csv(csv_url, encoding=csv_encoding)

    x = df[features_column].values

    with open(features_path, "wb") as out_f:
        dill.dump(x, out_f)

    y = df[labels_column].values

    with open(labels_path, "wb") as out_f:
        dill.dump(y, out_f)

if __name__ == "__main__":
    run_pipeline()

@@ -0,0 +1,5 @@
dill==0.2.9
click
numpy==1.16.3
pandas==0.24.2

@@ -0,0 +1,4 @@
MODEL_NAME=Transformer
API_TYPE=REST
SERVICE_TYPE=MODEL
PERSISTENCE=0
@@ -0,0 +1,17 @@
import dill
import logging

class Transformer(object):
    def __init__(self):
        with open('/mnt/lr.model', 'rb') as model_file:
            self._lr_model = dill.load(model_file)

    def predict(self, X, feature_names):
        logging.warning(X)
        prediction = self._lr_model.predict_proba(X)
        logging.warning(prediction)
        return prediction


Empty file.
@@ -0,0 +1,4 @@
#!/bin/bash

s2i build . seldonio/seldon-core-s2i-python3:0.6 lr_text_classifier:0.1

@@ -0,0 +1,49 @@
import click
import numpy as np
import dill
from sklearn.linear_model import LogisticRegression

@click.command()
@click.option('--in-path', default="/mnt/tfidf_vectors.data")
@click.option('--labels-path', default="/mnt/labels.data")
@click.option('--out-path', default="/mnt/lr_prediction.data")
@click.option('--c-param', default=0.1)
@click.option('--action', default="predict",
              type=click.Choice(['predict', 'train']))
@click.option('--model-path', default="/mnt/lr_text.model")
def run_pipeline(
        in_path,
        labels_path,
        out_path,
        c_param,
        action,
        model_path):

    with open(in_path, 'rb') as in_f:
        x = dill.load(in_f)

    if action == "train":
        lr_model = LogisticRegression(
            C=c_param,  # was hardcoded to 0.1, which ignored --c-param
            solver='sag')

        with open(labels_path, "rb") as f:
            labels = dill.load(f)

        lr_model.fit(x, labels)

        with open(model_path, "wb") as model_f:
            dill.dump(lr_model, model_f)

    elif action == "predict":
        with open(model_path, "rb") as model_f:
            lr_model = dill.load(model_f)

        y = lr_model.predict_proba(x)

        with open(out_path, "wb") as out_f:
            dill.dump(y, out_f)

if __name__ == "__main__":
    run_pipeline()
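Stripped of the dill I/O, the train and predict branches above are a standard scikit-learn fit followed by `predict_proba`. A self-contained sketch on synthetic vectors (the data and shapes here are illustrative, not the pipeline's real TF-IDF features):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy stand-in for the TF-IDF vectors and labels the step loads from /mnt.
x = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
labels = np.array([0, 0, 1, 1])

# Same estimator configuration as the step's train branch.
lr_model = LogisticRegression(C=0.1, solver="sag")
lr_model.fit(x, labels)

# One probability row per input, one column per class; rows sum to 1.
probs = lr_model.predict_proba(x)
```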

@@ -0,0 +1,4 @@
dill==0.2.9
click
numpy==1.16.3
scikit-learn==0.20.3
@@ -0,0 +1,4 @@
MODEL_NAME=Transformer
API_TYPE=REST
SERVICE_TYPE=MODEL
PERSISTENCE=0
@@ -0,0 +1,40 @@
import logging

import numpy as np
import spacy

# The model must be present, e.g. via `python -m spacy download en_core_web_sm`.
# Unused pipeline components are disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

class Transformer():
    __symbols = set("!$%^&*()_+|~-=`{}[]:\";'<>?,./-")

    def predict(self, X, feature_names=None):
        logging.warning(X)
        f = np.vectorize(Transformer.transform_to_token, otypes=[object])
        X_tokenized = f(X)
        logging.warning(X_tokenized)
        return X_tokenized

    def fit(self, X, y=None, **fit_params):
        return self

    @staticmethod
    def transform_to_token(text):
        doc = nlp(str(text))
        tokens = []
        for token in doc:
            if token.like_url:
                clean_token = "URL"
            else:
                clean_token = token.lemma_.lower().strip()
                if len(clean_token) < 1 or clean_token in \
                        Transformer.__symbols:
                    continue
            tokens.append(clean_token)
        return tokens

Empty file.
@@ -0,0 +1,5 @@
#!/bin/bash

docker build . -t seldon-core-s2i-python3-spacy:0.6
s2i build . seldon-core-s2i-python3-spacy:0.6 spacy_tokenizer:0.1

@@ -0,0 +1,23 @@
import click
import dill

try:
    # Running for tests
    from .Transformer import Transformer
except ImportError:
    # Running from CLI
    from Transformer import Transformer

@click.command()
@click.option('--in-path', default="/mnt/clean_text.data")
@click.option('--out-path', default="/mnt/tokenized_text.data")
def run_pipeline(in_path, out_path):
    spacy_transformer = Transformer()
    with open(in_path, 'rb') as in_f:
        x = dill.load(in_f)
    y = spacy_transformer.predict(x)
    with open(out_path, "wb") as out_f:
        dill.dump(y, out_f)

if __name__ == "__main__":
    run_pipeline()
