In [None]:
# This pipeline is to be run in Vertex Workbench
# Install Kubeflow Pipelines and GCP AI Platform
!pip3 install kfp --user -q
!pip3 install --upgrade google-cloud-platform --user -q

In [None]:
from datetime import date, datetime
from typing import NamedTuple # for passing data between steps
import google.cloud.aiplatform as aip
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component

In [None]:
# --- Project and storage configuration ---
PROJECT_ID = "iowa-steam"
BUCKET_NAME = "iowa-source-steam-data"

# Locations derived from the bucket name.
BUCKET_URI = f"gs://{BUCKET_NAME}"
TRAINING_DATA_URI = f"{BUCKET_URI}/train_canonical.csv"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/control"

# Captured once, when this cell runs; embedded in resource names further down.
TIMESTAMP = format(datetime.now(), "%Y_%m_%d__%H_%M_%S")

In [None]:
# Initialise the Vertex AI SDK for this project, staging artifacts in the bucket.
aip.init(
    staging_bucket=BUCKET_URI,
    project=PROJECT_ID,
)

In [None]:
@component(
    # Only installable, actually-used packages: "pickle" is part of the standard
    # library (not a pip package), and numpy/matplotlib were never imported here.
    packages_to_install=[
        "pandas",
        "google-cloud-storage",
    ]
)
def preprocess_data_op(
    bucket_name: str,
    train_data_uri: str,
    content_column: str = "Content",
) -> str:
  """Clean the raw text column of the training CSV and upload the result to GCS.

  The following code is from Andre's preprocessing pipeline, modified to use
  GCP cloud storage.

  Args:
    bucket_name: Name of the GCS bucket that receives the processed CSV.
    train_data_uri: Location of the raw CSV. `gs://` URIs are downloaded with
      the storage client; anything else is handed to `pandas.read_csv` as-is.
    content_column: Name of the column holding the raw text to clean.
      NOTE(review): the default "Content" is inferred from the
      `Content_Parsed_*` output names - confirm against the source CSV.

  Returns:
    The object path (relative to the bucket) of the uploaded processed CSV.

  Raises:
    ValueError: If `content_column` is not a column of the source CSV.
  """
  # Imports live inside the function because the component body runs on its own
  # in a container; notebook-level imports and globals are not available there.
  import io
  from datetime import datetime

  import pandas as pd
  from google.cloud import storage

  def process_data(data, content):
    '''
    data is in a csv format

    content is the column in data set that you want to process

    Content_Parsed_4 is the final processed output
    '''
    # Every pattern below is literal text, so regex=False is passed explicitly:
    # "?" and "." must never be interpreted as regular expressions, whatever
    # the installed pandas version uses as its default.

    # \r and \n
    data['Content_Parsed_1'] = content.str.replace("\r", " ", regex=False)
    data['Content_Parsed_1'] = data['Content_Parsed_1'].str.replace("\n", " ", regex=False)
    data['Content_Parsed_1'] = data['Content_Parsed_1'].str.replace("    ", " ", regex=False)

    # quotation marks
    data['Content_Parsed_1'] = data['Content_Parsed_1'].str.replace('"', '', regex=False)

    # Lower casing all words so that upper case words (ex: at the beginning of
    # a sentence) are read the same as lower case words
    data['Content_Parsed_2'] = data['Content_Parsed_1'].str.lower()

    # punctuation signs
    data['Content_Parsed_3'] = data['Content_Parsed_2']
    for sign in "?:!.,;":
      data['Content_Parsed_3'] = data['Content_Parsed_3'].str.replace(sign, '', regex=False)

    # Possessive pronouns
    data['Content_Parsed_4'] = data['Content_Parsed_3'].str.replace("'s", "", regex=False)

    return data

  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_name)

  # read raw source data from GCS
  if train_data_uri.startswith("gs://"):
    # Download through the storage client so that reading does not depend on
    # pandas' optional gcsfs integration, which this component does not install.
    source_blob = storage.Blob.from_string(train_data_uri, client=storage_client)
    data = pd.read_csv(io.BytesIO(source_blob.download_as_bytes()))
  else:
    data = pd.read_csv(train_data_uri)

  if content_column not in data.columns:
    raise ValueError(
        f"Column {content_column!r} not found in {train_data_uri}; "
        f"available columns: {list(data.columns)}"
    )

  # process data
  processed = process_data(data, data[content_column])

  # Store as ephemeral CSV. The timestamp is computed here rather than read
  # from the notebook's TIMESTAMP global, which does not exist in the container.
  timestamp = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")
  processed_filename = f"train_canonical_{timestamp}.csv"
  processed.to_csv(processed_filename, index=False, header=False)

  # upload to GCS
  gcs_dest_path = f"data_processed/{processed_filename}"
  blob = bucket.blob(gcs_dest_path)
  blob.upload_from_filename(processed_filename)

  return gcs_dest_path

In [None]:
# Placeholder for the component that will produce the AutoML import file.
# To do: integrate Andre's component here
@component(
    packages_to_install=[
        "pandas",
        "google-cloud-storage",
        "google-cloud-aiplatform",
    ]
)
def create_automl_import_file_op(bucket_name: str) -> NamedTuple('Outputs', [('uri', str), ('size', int)]):
  # Stub: `bucket_name` is not used yet and both outputs are hard-coded.
  # Todo: integrate Andre's work
  placeholder_uri = "gs://test"
  placeholder_size = 1000
  return (placeholder_uri, placeholder_size)

In [None]:
@dsl.pipeline(
    name="iowa-steam-sentiment-pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline():
  # Step 1: produce the import file for AutoML. Component inputs are passed by
  # keyword so the call does not depend on positional-argument support.
  import_op = create_automl_import_file_op(bucket_name=BUCKET_NAME)

  # Step 2: create a Vertex AI text dataset from that file.
  # - import_schema_uri is the SDK constant itself, not its dotted name quoted
  #   as a string.
  # - No `sync` argument and no `.wait()`: this function only declares the
  #   pipeline graph; ordering comes from the data dependency on
  #   import_op.outputs['uri'].
  ds_op = gcc_aip.TextDatasetCreateOp(
      project=PROJECT_ID,
      display_name=f"iowa-steam-reviews-processed-{TIMESTAMP}",
      gcs_source=import_op.outputs['uri'],
      import_schema_uri=aiplatform.schema.dataset.ioformat.text.single_label_classification,
  )

  # todo: chain together training of model (consume ds_op's dataset output)

In [None]:
# Compile the pipeline function into a JSON job spec.
# `compile` is an instance method, so a Compiler object has to be created first;
# calling it on the class leaves `self` unbound and raises TypeError.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="iowa-steam-sentiment-pipeline.json"
)

In [None]:
# Service account the pipeline run should use.
# TODO: add service account from GCP, e.g. "<name>@<project>.iam.gserviceaccount.com".
# Left as None, no explicit account is sent with the request, instead of
# submitting a placeholder string that is not a valid service account.
# NOTE(review): confirm which identity Vertex AI uses when this is None.
PIPELINE_SERVICE_ACCOUNT = None

# Create the job from the compiled spec written by the compile cell.
job = aip.PipelineJob(
    display_name="iowa-steam-pipeline",
    template_path="iowa-steam-sentiment-pipeline.json",
    pipeline_root=PIPELINE_ROOT
)

job.submit(service_account=PIPELINE_SERVICE_ACCOUNT)