In [1]:
import apache_beam as beam
import glob
import os
import sys
import dill

import pandas as pd
import numpy as np
import google.auth

from typing import Dict
from datetime import datetime
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions, StandardOptions
from apache_beam.runners import DataflowRunner

from google.cloud import bigquery

from apache_beam.runners import DataflowRunner
from apache_beam.runners import DirectRunner


from apache_beam import DoFn, GroupByKey, io, ParDo, Pipeline, PTransform, WindowInto, WithKeys

In [2]:
# Apache Beam pipeline options: a streaming job that also pickles the main
# session (save_main_session=True).
options = pipeline_options.PipelineOptions(streaming=True, save_main_session=True)

# One Google Cloud view of `options`, reused for every GCP-specific setting below.
gcp_options = options.view_as(GoogleCloudOptions)

# google.auth.default() returns (credentials, project id); only the project id
# of the current Google Cloud environment is kept.
_, default_project = google.auth.default()
gcp_options.project = default_project

# Google Cloud region in which Cloud Dataflow runs.
gcp_options.region = 'us-east1'

# Job name made unique with a launch timestamp.
job_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
gcp_options.job_name = f'sa-{job_timestamp}'

# Per-job root folder in Cloud Storage.
dataflow_gcs_location = f'gs://text-analysis-323506/{gcp_options.job_name}'

# Directory for the output files of the job.
output_gcs_location = f"{dataflow_gcs_location}/output"

# Staging location: where the Dataflow pipeline and SDK binary are staged.
gcp_options.staging_location = f"{dataflow_gcs_location}/staging"

# Temp location: temporary files and intermediate results written before the sink.
gcp_options.temp_location = f"{dataflow_gcs_location}/temp"

In [3]:
def load_comment(message: bytes):
    """Parse a UTF-8 encoded JSON payload.

    Args:
        message: Raw bytes containing UTF-8 encoded JSON text.

    Returns:
        The Python object obtained by parsing the JSON text.
    """
    import json

    decoded_text = message.decode("utf-8")
    return json.loads(decoded_text)

In [4]:
def preprocess_comment(message: Dict):
    """Clean the comment text and attach it as a Natural Language document.

    The text under ``message['text']`` is normalised in this order: whitespace,
    backslashes, ``#`` and ``&`` are stripped or replaced; HTML tags, @-mentions
    and URLs are removed; a small set of punctuation characters, purely numeric
    tokens and emoji are dropped; the text is lower-cased; and a few
    contractions and abbreviations are expanded.

    Args:
        message: Comment record; must contain a ``'text'`` string.

    Returns:
        A new dict with every key of ``message`` plus ``'preprocessed'``, an
        ``apache_beam.ml.gcp.naturallanguageml.Document`` of type
        ``'PLAIN_TEXT'`` holding the cleaned text. The input dict is not
        modified.

    Raises:
        KeyError: If ``message`` has no ``'text'`` key.
    """
    # Imported locally, like the other helpers in this notebook, so the
    # function does not depend on notebook-level names being defined.
    import re
    from apache_beam.ml.gcp import naturallanguageml as nlp

    line = message['text']

    # Remove extra spaces, hashtags and new line characters
    line = line.strip()
    for unwanted, replacement in (('\n', ''), ('\\', ''), ('#', ''), ('&', ' ')):
        line = line.replace(unwanted, replacement)
    line = ' '.join(line.split())

    # Href strings in comments: strip HTML tags. The result must be assigned
    # back, otherwise the substitution has no effect.
    line = re.sub("<[^>]+>", "", line)

    # Remove @ mentions and URLs
    line = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", line)

    # Remove extra spaces
    line = " ".join(line.split())

    # Remove special characters
    line = re.sub('[-+.^:,!]', '', line)

    # Remove numbers (tokens made only of digits)
    line = ' '.join(token for token in line.split() if not token.isdigit())

    # Remove emojis.
    # NOTE(review): several ranges are very broad (for example U+24C2-U+1F251
    # also covers CJK ideographs), so non-emoji characters are stripped too.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"
        "\u3030"
        "]+",
        re.UNICODE,
    )
    line = emoji_pattern.sub('', line)

    line = line.lower().strip()

    # Expanding short forms. Matching is per whitespace-separated token, so the
    # "'s" entry only applies to a token that is exactly "'s".
    contraction_dict = {"ain't": "are not", "'s": " is", "i'm": "i am", "aren't": "are not", "don't": "do not",
                        "didn't": "did not", "won't": "will not",
                        "can't": "cannot", "wouldn't": "would not", "hv": "have", "ik": "i know", "fr": "for real"}
    line = ' '.join(contraction_dict.get(word, word) for word in line.split())

    # Return a copy instead of mutating the element that was passed in.
    return {**message, 'preprocessed': nlp.Document(line, type='PLAIN_TEXT')}


In [5]:
def detect_sentiments(message: Dict):
    """Run sentiment analysis on the preprocessed comment.

    Args:
        message: Record whose ``'preprocessed'`` value exposes ``content`` and
            ``type`` attributes.

    Returns:
        A new dict with every key of ``message`` plus ``'response'``: the value
        returned by ``LanguageServiceClient.analyze_sentiment``, or ``None``
        when the call raised. The input dict is not modified.

    Raises:
        KeyError: If ``message`` has no ``'preprocessed'`` key.
    """
    import logging
    from google.cloud import language

    client = language.LanguageServiceClient()
    document = message['preprocessed']

    try:
        # NOTE(review): the document dict uses the key 'type'; confirm that the
        # installed google-cloud-language version accepts it.
        response = client.analyze_sentiment(
            document={'content': document.content, 'type': document.type}
        )
    except Exception:
        # Best effort: keep the element flowing, but log the failure so that a
        # systematic error is not mistaken for "no sentiment".
        logging.warning('Sentiment analysis failed; storing None as response.', exc_info=True)
        response = None

    return {**message, 'response': response}

In [6]:
def prepare_results(message, hate_threshold=-0.6):
    """Turn the sentiment response into flat ``score`` / ``sentiment`` fields.

    Args:
        message: Record containing a ``'response'`` entry. When it is not
            ``None`` it must expose ``document_sentiment.score``.
        hate_threshold: Scores less than or equal to this value are labelled
            ``'hate'``; defaults to -0.6.

    Returns:
        A new dict with the keys of ``message`` except ``'preprocessed'`` and
        ``'response'``, plus:
          * ``'score'``: the document sentiment score, or ``None`` when no
            response is available;
          * ``'sentiment'``: ``'hate'``, ``'normal'``, or ``'NA'`` when no
            response is available.
        The input dict is not modified.

    Raises:
        KeyError: If ``message`` has no ``'response'`` key.
    """
    import logging

    response = message['response']

    if response is not None:
        score = response.document_sentiment.score
        sentiment = 'hate' if score <= hate_threshold else 'normal'
    else:
        # None rather than NaN: NaN is not valid JSON, whereas None serialises
        # as null.
        score = None
        sentiment = 'NA'

    # Drop the intermediate fields without mutating the incoming element.
    result = {
        key: value
        for key, value in message.items()
        if key not in ('preprocessed', 'response')
    }
    result['score'] = score
    result['sentiment'] = sentiment

    logging.debug('Prepared result: %s', result)
    return result

In [7]:
# Separate Results into Hate comments or Normal comments

class ResultsFilter(beam.DoFn):
    """DoFn that routes each result to one of two tagged outputs by sentiment."""

    # Tag names of the two outputs produced by this DoFn.
    OUTPUT_TAG_HATE = 'Hate comments'
    OUTPUT_TAG_NORM = 'Normal comments'

    def process(self, result):
        """Yield ``result`` under the tag matching its ``'sentiment'`` value.

        Results whose sentiment is ``'hate'`` go to ``OUTPUT_TAG_HATE``; every
        other value goes to ``OUTPUT_TAG_NORM``.
        """
        from apache_beam import pvalue

        is_hate = result['sentiment'] == 'hate'
        output_tag = self.OUTPUT_TAG_HATE if is_hate else self.OUTPUT_TAG_NORM
        yield pvalue.TaggedOutput(output_tag, result)

In [8]:
def convert_to_bytes(result):
    """Serialise ``result`` to JSON and return it as UTF-8 encoded bytes."""
    import json

    json_text = json.dumps(result)
    return json_text.encode("utf-8")

In [9]:
# Pipeline object, configured with the options defined above; the transforms
# in the following cells are attached to it.
pipeline = Pipeline(options=options)

### Build Pipeline

In [10]:
# Read comments from the `yt-comments` Pub/Sub topic and score their sentiment.
raw_messages = pipeline | 'From PubSub' >> beam.io.ReadFromPubSub(
    topic='projects/text-analysis-323506/topics/yt-comments'
)
loaded_comments = raw_messages | 'Load Comments' >> beam.Map(load_comment)
preprocessed_comments = loaded_comments | 'Preprocess Comments' >> beam.Map(preprocess_comment)
scored_comments = preprocessed_comments | 'Detect Sentiments' >> beam.Map(detect_sentiments)
results = scored_comments | 'Prepare Results' >> beam.Map(prepare_results)



In [11]:
# Split the results into the two tagged outputs declared on ResultsFilter.
separated_results = results | 'Divide Results' >> beam.ParDo(ResultsFilter()).with_outputs(
    ResultsFilter.OUTPUT_TAG_HATE,
    ResultsFilter.OUTPUT_TAG_NORM,
)

### Results to pubsub
In this example, only the hate-comment results are published to the results Pub/Sub topic.

In [12]:
# Publish hate-comment results to the results Pub/Sub topic as UTF-8 JSON bytes.
hate_comments_serialized = (
    separated_results[ResultsFilter.OUTPUT_TAG_HATE]
    | 'Bytes Conversion' >> beam.Map(convert_to_bytes)
)
hate_comments_pubsub = hate_comments_serialized | 'PS Hate Comments' >> beam.io.WriteToPubSub(
    topic='projects/text-analysis-323506/topics/hs-results'
)

### Results to Bigquery

We will send normal comments and hate comments to separate tables. These can then be used for analysis.

In [13]:
# Columns shared by the hate-comment and normal-comment tables.
schema = [
        bigquery.SchemaField("timestamp", "TIMESTAMP", mode="NULLABLE"),
        bigquery.SchemaField("text", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("user_name", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("user_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("user_profile", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("video_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("score", "FLOAT", mode="NULLABLE"),
        bigquery.SchemaField("sentiment", "STRING", mode="NULLABLE")
    ]

# The same columns in the {'fields': [...]} dict form, which is what gets
# handed to WriteToBigQuery below. Previously `schema` was defined but never
# passed to either sink.
bq_table_schema = {
    'fields': [
        {'name': field.name, 'type': field.field_type, 'mode': field.mode}
        for field in schema
    ]
}

hate_comments_bq = (
                separated_results['Hate comments']
                | 'BQ Hate Comments' >> beam.io.WriteToBigQuery(
                    table='hate_comments',
                    dataset='yt_comments_analysis',
                    project='text-analysis-323506',
                    schema=bq_table_schema)
            )

normal_comments_bq = (
                separated_results['Normal comments']
                | 'BQ Norm Comments' >> beam.io.WriteToBigQuery(
                    table='normal_comments',
                    dataset='yt_comments_analysis',
                    project='text-analysis-323506',
                    schema=bq_table_schema)
            )

### Direct Runner

In [None]:
# Run the pipeline locally with the DirectRunner. The result handle is bound
# before blocking, so it is still available (for example to cancel the job) if
# the wait is interrupted; the value returned by the wait is kept separately.
pipeline_result = DirectRunner().run_pipeline(pipeline, options=options)
pipeline_state = pipeline_result.wait_until_finish()

### DataFlow Runner

In [14]:
# Run the pipeline with the DataflowRunner; this call does not wait for the
# job to finish.
dataflow_runner = DataflowRunner()
pipeline_result = dataflow_runner.run_pipeline(pipeline, options=options)

