In [1]:
import apache_beam as beam
import glob
import os
import sys

import google.auth

from datetime import datetime
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner

import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

In [None]:
# Data preparation: download the archive from the bucket, unpack it, and upload
# the decompressed text files back to the bucket under `split_data`.
# NOTE(review): not safe to re-run as-is. If ./split_data already exists, the
# `mv ./data/ ./split_data` step moves ./data *inside* it instead of renaming,
# and `unzip` may stop to ask before overwriting existing files.

# Copy the bucket's `data` folder into the current working directory.
! gsutil -m cp -r gs://text-analysis-323506/data ./
# Extract the archive into the current working directory.
! unzip ./data/data_archive.zip 
# Decompress the train/test files (presumably extracted from the archive above - confirm).
! bzip2 -d train.ft.txt.bz2 
! bzip2 -d test.ft.txt.bz2
# Collect the decompressed .ft.txt files in ./data, then rename the folder.
! mv *.ft.txt ./data/
! mv ./data/ ./split_data
# Upload the prepared folder back to the bucket.
! gsutil -m cp -r ./split_data gs://text-analysis-323506/

In [2]:
def strip_lines(line: str):
    """Normalise the whitespace of ``line``.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed into a single space.

    Args:
        line: Raw text line.

    Returns:
        The whitespace-normalised text; an empty string for blank input.
    """
    # str.split() with no separator already ignores leading/trailing whitespace.
    tokens = line.split()
    return ' '.join(tokens)

In [3]:
def contract_lines(line: str):
    """Expand common English contractions in ``line``.

    The line is split on whitespace, every token that is a known contraction
    is replaced by its expansion, and the tokens are re-joined with single
    spaces (so runs of whitespace are collapsed as a side effect).

    Matching is case-insensitive, because callers may pass text that has not
    been lower-cased yet; a matched contraction is emitted in lower case.
    A token that merely *ends* in "'s" (e.g. "it's") has that suffix rewritten
    to " is". Note that this also rewrites possessives ("John's" becomes
    "John is"), which is what the "'s" mapping implies.

    Args:
        line: Text line to process.

    Returns:
        The line with known contractions expanded.
    """
    # Defined locally so the function does not depend on module-level state.
    contraction_dict = {"ain't": "are not", "'s": " is", "aren't": "are not",
                        "don't": "do not", "didn't": "did not",
                        "won't": "will not", "can't": "cannot"}
    is_suffix = "'s"

    expanded = []
    for word in line.split():
        key = word.lower()
        if key in contraction_dict:
            # Whole-token contraction such as "don't" / "Don't".
            expanded.append(contraction_dict[key])
        elif key.endswith(is_suffix):
            # "'s" is glued to the preceding word, so it can never match as a
            # whole token; rewrite the suffix and keep the stem untouched.
            expanded.append(word[:-len(is_suffix)] + contraction_dict[is_suffix])
        else:
            expanded.append(word)
    return ' '.join(expanded)

In [4]:
def to_lower_case(line: str):
    """Return a copy of ``line`` with all cased characters in lower case.

    Args:
        line: Text to convert.

    Returns:
        The lower-cased text.
    """
    lowered = line.lower()
    return lowered

In [5]:
def remove_punctuations(line: str):
    """Delete every character listed in ``string.punctuation`` from ``line``.

    Only ASCII punctuation is affected; all other characters, including
    whitespace, are left as they are.

    Args:
        line: Text to clean.

    Returns:
        The text without ASCII punctuation characters.
    """
    import string

    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return line.translate(deletion_table)

In [6]:
def remove_stopwords(line: str):
    """Drop English stop words from ``line``.

    The line is split on whitespace, tokens found in NLTK's English stop-word
    list are discarded (exact, case-sensitive comparison), and the remaining
    tokens are joined with single spaces.

    The stop-word set is rebuilt from ``nltk.corpus.stopwords`` on every call.

    Args:
        line: Text to filter.

    Returns:
        The text without stop words.
    """
    from nltk.corpus import stopwords

    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for token in line.split():
        if token in english_stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [7]:
def remove_special_chars(line: str):
    """Strip the characters ``-``, ``+``, ``.``, ``^``, ``:`` and ``,`` from ``line``.

    Args:
        line: Text to clean.

    Returns:
        The text with those six characters removed; everything else is kept.
    """
    import re

    special_chars = re.compile('[-+.^:,]')
    return special_chars.sub('', line)

In [8]:
def remove_emojis(line: str):
    """Remove emoji and pictographic symbols from ``line``.

    Characters removed:
      * every code point outside the Basic Multilingual Plane
        (U+10000-U+10FFFF), which is where emoticons, pictographs,
        transport symbols and flags live;
      * U+24C2-U+2BEF (enclosed alphanumerics, box drawing, miscellaneous
        symbols, dingbats, arrows);
      * a few individual emoji-related code points listed below.

    Ordinary text in any script inside the BMP (CJK ideographs, kana, Hangul,
    accented Latin, ...) is preserved. The previous character class contained
    the range U+24C2-U+1F251, which silently deleted all such text.

    Args:
        line: Text to clean.

    Returns:
        The text with the characters above removed.
    """
    import re

    emoji_pattern = re.compile(
        "["
        "\U00010000-\U0010FFFF"  # supplementary planes: emoticons, pictographs, flags, ...
        "\u24C2-\u2BEF"          # enclosed alphanumerics .. misc symbols and arrows
        "\u200d"                 # zero-width joiner (glues emoji sequences)
        "\u231a"                 # watch
        "\u23cf"                 # eject symbol
        "\u23e9"                 # fast-forward
        "\u3030"                 # wavy dash
        "\u303d"                 # part alternation mark
        "\u3297\u3299"           # circled ideographs "congratulation" / "secret"
        "\ufe0f"                 # variation selector-16 (emoji presentation)
        "]+"
    )
    return emoji_pattern.sub('', line)

In [9]:
def split_data(line: str):
    """Split a cleaned line into its text and a binary label.

    The first whitespace-delimited token is treated as the label marker and
    the remainder of the line as the text.

    Args:
        line: Line of the form ``"<marker> <text>"``.

    Returns:
        ``[text, label]`` where ``label`` is 1 when the marker equals
        ``'label1'`` and 0 for any other marker, or ``None`` when the line
        does not contain both a marker and some text.
    """
    parts = line.split(maxsplit=1)
    if len(parts) != 2:
        # Blank line or marker without text: nothing usable.
        return None

    marker, text = parts
    return [text, 1 if marker == 'label1' else 0]

In [10]:
def to_csv(line):
    """Serialise a sequence of values as a single CSV record.

    Every entry is converted with ``str()`` and the record is written with
    the standard ``csv`` module, so a field containing a comma, a double
    quote or a line break is quoted/escaped instead of corrupting the row.
    Fields without such characters are emitted exactly as a plain
    comma-join would produce them.

    Args:
        line: Iterable of values forming one row.

    Returns:
        The CSV record as a string, without a trailing line terminator.
    """
    import csv
    import io

    buffer = io.StringIO()
    # lineterminator='' because the caller adds its own line breaks.
    csv.writer(buffer, lineterminator='').writerow([str(entry) for entry in line])
    return buffer.getvalue()

## Create Apache Beam Pipeline

__Configurations__

In [11]:
# Apache Beam pipeline options; no command-line flags are parsed.
options = pipeline_options.PipelineOptions(flags=[])

# All Google Cloud specific settings go through this single view of `options`.
gcp_options = options.view_as(GoogleCloudOptions)

# Use the default project of the current Google Cloud environment
# (google.auth.default() yields a pair; the first item is discarded).
_, gcp_options.project = google.auth.default()

# Google Cloud region in which Cloud Dataflow runs.
gcp_options.region = 'us-east1'

# Job name with a timestamp suffix (second resolution).
gcp_options.job_name = f'textparser-{datetime.now().strftime("%Y%m%d-%H%M%S")}'

# Per-job root folder in the bucket; the locations below live underneath it.
dataflow_gcs_location = f'gs://text-analysis-323506/{gcp_options.job_name}'

# Directory for the job's output files.
output_gcs_location = f"{dataflow_gcs_location}/output"

# Staging location: where the Dataflow pipeline and SDK binary are staged.
gcp_options.staging_location = f"{dataflow_gcs_location}/staging"

# Temp location: temporary files and intermediate results written before the final sink.
gcp_options.temp_location = f"{dataflow_gcs_location}/temp"

In [12]:
# Create the pipeline object, bound to the InteractiveRunner and configured
# with the options defined above.
p = beam.Pipeline(InteractiveRunner(), options=options)

In [13]:
# GCS path of the text file the pipeline reads, one record per line.
INPUT_FILE = 'gs://text-analysis-323506/split_data/train.ft.txt'

In [18]:
# Add pipeline components.
# Every input line is cleaned step by step, split into [text, label], rendered
# as a CSV record and written to the bucket.
# NOTE(review): split_data compares the first token with 'label1' *after*
# lower-casing and punctuation removal; presumably the raw marker looks like
# '__label__1' - confirm against the input data.
csv_details = (
    p
    | 'Read text file' >> beam.io.ReadFromText(INPUT_FILE)
    | 'Strip lines' >> beam.Map(strip_lines)
    | 'Contract lines' >> beam.Map(contract_lines)
    | 'Lower case' >> beam.Map(to_lower_case)
    | 'Remove punctuations' >> beam.Map(remove_punctuations)
    # | 'Remove stopwords' >> beam.Map(remove_stopwords)
    | 'Remove special characters' >> beam.Map(remove_special_chars)
    | 'Remove emojis' >> beam.Map(remove_emojis)
    | 'Split data' >> beam.Map(split_data)
    # split_data returns None for lines without both a label and text.
    | 'Filter none values' >> beam.Filter(lambda x: x is not None)
    | 'To csv' >> beam.Map(to_csv)
    | 'Write as csv' >> beam.io.WriteToText(
        'gs://text-analysis-323506/preprocessed/train',
        file_name_suffix='.csv',
        # No space after the comma: with 'text, label' a CSV reader sees the
        # second column name as ' label' (leading space).
        header='text,label',
    )
)

In [19]:
# Display the graph of the pipeline `p` via the interactive Beam helper.
ib.show_graph(p)

### Run the pipeline on Cloud Dataflow

In [215]:
# Submit the pipeline `p` to Cloud Dataflow using the options configured above.
# Important: the Dataflow API must be enabled for the project in the Google
# Cloud Platform console before running this cell.
pipeline_result = DataflowRunner().run_pipeline(p, options=options)

