In [1]:
import apache_beam as beam
import glob
import os
import sys

import google.auth
from datetime import datetime
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions, StandardOptions
from apache_beam.runners import DataflowRunner

from apache_beam.ml.gcp import naturallanguageml as nlp
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners import DataflowRunner
from apache_beam.runners import DirectRunner


from apache_beam import DoFn, GroupByKey, io, ParDo, Pipeline, PTransform, WindowInto, WithKeys

In [2]:
# Apache Beam pipeline options for this notebook (streaming mode, main session saved).
options = pipeline_options.PipelineOptions(streaming=True, save_main_session=True)

# Every Google Cloud setting below is written through this single view of `options`.
gcp_options = options.view_as(GoogleCloudOptions)

# Project: the default project of the current Google Cloud environment
# (google.auth.default() returns a pair; only the second item is kept).
_, gcp_options.project = google.auth.default()

# Google Cloud region in which Cloud Dataflow runs.
gcp_options.region = 'us-east1'

# Job name carries a timestamp taken when this cell runs.
gcp_options.job_name = f'sa-{datetime.now().strftime("%Y%m%d-%H%M%S")}'

# Per-job root folder in Cloud Storage; the three locations below hang off it.
dataflow_gcs_location = f'gs://text-analysis-323506/{gcp_options.job_name}'

# Directory for the job's output files.
output_gcs_location = f"{dataflow_gcs_location}/output"

# Staging location: where the Dataflow pipeline and SDK binary are staged.
gcp_options.staging_location = f"{dataflow_gcs_location}/staging"

# Temp location: temporary files / intermediate results written before the sink.
gcp_options.temp_location = f"{dataflow_gcs_location}/temp"

In [3]:
def get_review(message: bytes):
    """Return the value stored under 'review' in a UTF-8 encoded JSON message.

    Args:
        message: raw message bytes holding a JSON object.

    Returns:
        Whatever the JSON object maps the 'review' key to.

    Raises:
        UnicodeDecodeError: the bytes are not valid UTF-8.
        json.JSONDecodeError: the decoded text is not valid JSON.
        KeyError: the JSON object has no 'review' key.
    """
    import json

    payload = json.loads(message.decode("utf-8"))
    return payload['review']

In [4]:
def strip_lines(line: str):
    """Trim both ends of ``line`` and collapse each whitespace run to one space.

    Args:
        line: text to normalise.

    Returns:
        The whitespace-separated words of ``line`` joined by single spaces
        (an empty string when ``line`` holds no words).
    """
    # str.split() without a separator already drops leading and trailing
    # whitespace, so splitting and re-joining covers the trimming as well.
    words = line.split()
    return ' '.join(words)

In [5]:
def remove_emojis(line: str):
    """Remove emoji and pictographic symbols from ``line``.

    Every maximal run of matching characters is deleted; all other text,
    including surrounding whitespace, is left untouched.

    The previous character class contained the range U+24C2..U+1F251, which
    covers almost the entire Basic Multilingual Plane above U+24C2 and so
    deleted CJK ideographs, kana, Hangul and other ordinary text. Its two
    endpoints (circled M and circled ideograph accept) are emoji, so they are
    now matched individually, together with the few BMP emoji that were only
    covered by that range.

    Args:
        line: text to clean.

    Returns:
        ``line`` with the matched characters removed.
    """
    import re

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U0001F926-\U0001F937"  # gesture / people emoji
        # Catch-all kept from the original: every supplementary-plane code
        # point (this also covers U+1F251 and the ranges listed above).
        "\U00010000-\U0010FFFF"
        # Box drawing through miscellaneous symbols and arrows; includes the
        # dingbats (U+2702-U+27B0) and U+2600-U+2B55 the original also listed.
        "\u2500-\u2BEF"
        "\u24C2"  # circled M
        "\u200d"  # zero-width joiner
        "\u23cf"  # eject symbol
        "\u23e9"  # fast-forward symbol
        "\u231a"  # watch
        "\ufe0f"  # variation selector-16
        "\u3030"  # wavy dash
        "\u303d"  # part alternation mark
        "\u3297"  # circled ideograph congratulation
        "\u3299"  # circled ideograph secret
        "]+"
    )
    return emoji_pattern.sub('', line)

In [6]:
def parse_response(response):
    """Serialise an annotation response as UTF-8 encoded JSON bytes.

    Args:
        response: object exposing ``sentences`` (each item providing
            ``text.content``) and ``document_sentiment.score``.

    Returns:
        The bytes of a JSON object with two keys: 'review', the sentence
        contents concatenated in order, and 'sentiment', the document score.
    """
    import json

    # NOTE(review): sentences are joined with no separator, exactly as before;
    # confirm whether whitespace between sentences should be preserved.
    review_text = ''.join(sentence.text.content for sentence in response.sentences)
    payload = {
        'review': review_text,
        'sentiment': response.document_sentiment.score,
    }
    return json.dumps(payload).encode("utf-8")

In [7]:
def convert_to_doc(line: str):
    """Wrap ``line`` in an ``nlp.Document`` of type 'PLAIN_TEXT'.

    Args:
        line: text to hand to the Natural Language transform.

    Returns:
        The ``nlp.Document`` built from ``line``.
    """
    from apache_beam.ml.gcp import naturallanguageml as nlp

    document = nlp.Document(line, type='PLAIN_TEXT')
    return document

In [8]:
# Create the pipeline object, configured with the `options` assembled earlier
# in the notebook. Transforms are attached to it in the following cell.
pipeline = beam.Pipeline(options=options)

### Build Pipeline

In [9]:
# Request document-level sentiment only from the Natural Language API.
features = nlp.types.AnnotateTextRequest.Features(extract_document_sentiment=True)

# Pipeline steps, in order: read review messages from Pub/Sub, extract and
# clean the review text, annotate it, then publish the result to Pub/Sub.
sa_results = (
    pipeline
    | 'Consume messages' >> beam.io.ReadFromPubSub(
        topic='projects/text-analysis-323506/topics/reviews-texts')
    # Text preparation.
    | 'get review' >> beam.Map(get_review)
    | 'Strip lines' >> beam.Map(strip_lines)
    | 'Remove emojis' >> beam.Map(remove_emojis)
    # Sentiment annotation.
    | 'convert to doc' >> beam.Map(convert_to_doc)
    | 'Call gcloud nlp api' >> nlp.AnnotateText(features)
    # Serialise and publish.
    | 'process response' >> beam.Map(parse_response)
    | 'To result topic' >> beam.io.WriteToPubSub(
        topic='projects/text-analysis-323506/topics/sa-results')
)



### Direct Runner

In [None]:
# Run the pipeline with the DirectRunner.
# The result object is bound to `pipeline_result` before blocking, so the name
# refers to the same kind of object as in the Dataflow cell and is still
# available if the wait below is interrupted.
pipeline_result = DirectRunner().run_pipeline(pipeline, options=options)

# Block until the run finishes.
pipeline_result.wait_until_finish()

### Dataflow Runner

In [None]:
# Hand the pipeline to the DataflowRunner and keep the returned result object.
# This cell does not call wait_until_finish() on the result.
pipeline_result = DataflowRunner().run_pipeline(pipeline, options=options)