In [5]:
import apache_beam as beam
import glob
import os
import sys

import google.auth
from google.cloud import language_v1

from datetime import datetime
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner

import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

In [6]:
# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=[])

# Sets the project to the default project in your current Google Cloud environment.
_, options.view_as(GoogleCloudOptions).project = google.auth.default()

# Sets the Google Cloud Region in which Cloud Dataflow runs.
options.view_as(GoogleCloudOptions).region = 'us-east1'

options.view_as(GoogleCloudOptions).job_name = f'textparser-{datetime.now().strftime("%Y%m%d-%H%M%S")}'

dataflow_gcs_location = f'gs://text-analysis-323506/{options.view_as(GoogleCloudOptions).job_name}'

# The directory to store the output files of the job.
output_gcs_location = f"{dataflow_gcs_location}/output"

# Dataflow Staging Location. This location is used to stage the Dataflow Pipeline and SDK binary.
options.view_as(GoogleCloudOptions).staging_location = f"{dataflow_gcs_location}/staging"

# Dataflow Temp Location. This location is used to store temporary files or intermediate results before finally outputting to the sink.
options.view_as(GoogleCloudOptions).temp_location = f"{dataflow_gcs_location}/temp"

In [7]:
# Create pipeline object
p = beam.Pipeline(InteractiveRunner(), options=options)

In [14]:
def strip_lines(message: bytes):
    import json
    
    return json.loads(message.decode("utf-8"))

In [8]:
def strip_lines(message: dict):
    line = line.strip()
    line = ' '.join(line.split())
    return lin

In [11]:
def detect_sentiments(line: str):
    document = {'content': text, 'type_': language_v1.Document.Type.PLAIN_TEXT, 'language': 'en'}
    response  = language_client.analyze_sentiment(request= {'document': document})
    if response.document_sentiment.score > 0:
        sentiment = 1
    else:
        sentiment = 0
    return {'sentence': line, 'sentiment': sentiment}
    

In [13]:
# Add pipeline components
language_client = language_v1.LanguageServiceClient()
csv_details =   (
                    p 
                    | 'Consume messages' >> beam.io.ReadFromPubSub(topic='projects/text-analysis-323506/topics/reviews-texts')
                    | 'Convert to str' >> beam.Map()
                    | 'Strip lines' >> beam.Map(strip_lines)
                    | 'Remove special characters' >> beam.Map(remove_special_chars)
                    | 'Remove emojis' >> beam.Map(remove_emojis)
                    | 'Call gcloud nlp api' >> beam.Map(detect_sentiments)
                    | 'To result topic' >> beam.io.WriteToPubSub(topic='projects/text-analysis-323506/topics/sa-results')
)

TypeCheckError: Type hint violation for '[13]: Strip lines': requires <class 'str'> but got <class 'bytes'> for line
Full type hint:
IOTypeHints[inputs=((<class 'str'>,), {}), outputs=((Any,), {})]
strip_iterable()

based on:
  IOTypeHints[inputs=((<class 'str'>,), {}), outputs=((Iterable[Any],), {})]
  File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3444, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
  File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3524, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-cbfa07d004fc>", line 10, in <module>
      | 'To result topic' >> beam.io.WriteToPubSub(topic='projects/text-analysis-323506/topics/sa-results')
  File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/apache_beam/transforms/core.py", line 1656, in Map
      wrapper = with_output_types(typehints.Iterable[output_hint])(wrapper)
  File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/apache_beam/typehints/decorators.py", line 863, in annotate_output_types
      f._type_hints = th.with_output_types(return_type_hint)  # pylint: disable=protected-access
  
  based on:
    IOTypeHints[inputs=((<class 'str'>,), {}), outputs=None]
    File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3444, in run_ast_nodes
        if (await self.run_code(code, result,  async_=asy)):
    File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3524, in run_code
        exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-13-cbfa07d004fc>", line 10, in <module>
        | 'To result topic' >> beam.io.WriteToPubSub(topic='projects/text-analysis-323506/topics/sa-results')
    File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/apache_beam/transforms/core.py", line 1653, in Map
        wrapper)
    File "/root/apache-beam-2.37.0/lib/python3.7/site-packages/apache_beam/typehints/decorators.py", line 776, in annotate_input_types
        *converted_positional_hints, **converted_keyword_hints)

In [None]:
ib.show_graph(p)

In [None]:
# Submit job
# Important: Enable dataflow api if not enabled, in google cloud platform console
pipeline_result = DataflowRunner().run_pipeline(p, options=options)