In [54]:
import glob
import os
import subprocess
import sys
from datetime import datetime

import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
import google.auth
import pandas as pd
from apache_beam.io.gcp.internal.clients import bigquery as BeamBigQuery
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from google.cloud import bigquery

To access the data in our CSV files, we first need to load it into a BigQuery table.

## Pushing data from CSV files to a BigQuery table

In [6]:
# BigQuery client used for the dataset, table and load-job calls in the cells below.
bq_client = bigquery.Client()

### Create Dataset

In [7]:
# Dataset object identified by its fully-qualified ID ("<project>.<dataset>").
dataset = bigquery.Dataset('text-analysis-323506.dataflow_dataset')

In [8]:
# Location for the dataset; the same region (us-east1) is used for the Dataflow job later on.
dataset.location = "us-east1"

In [9]:
# Create the dataset in BigQuery, waiting at most 30 seconds for the API call.
# exists_ok=True keeps this cell re-runnable: if the dataset already exists it is
# returned instead of the call failing with a Conflict error.
dataset = bq_client.create_dataset(dataset, exists_ok=True, timeout=30)

### Create Table 

In [26]:
# (column name, BigQuery type) pairs in table column order.
column_types = [
    ("Series_reference", "STRING"),
    ("Period", "FLOAT"),
    ("Data_value", "FLOAT"),
    ("STATUS", "STRING"),
    ("UNITS", "STRING"),
    ("MAGNTUDE", "INTEGER"),
    ("Subject", "STRING"),
    ("Group", "STRING"),
    ("Series_title_1", "STRING"),
    ("Series_title_2", "STRING"),
    ("Series_title_3", "STRING"),
    ("Series_title_4", "STRING"),
    ("Series_title_5", "STRING"),
]

# Every column is declared NULLABLE.
schema = [
    bigquery.SchemaField(column_name, column_type, mode="NULLABLE")
    for column_name, column_type in column_types
]

In [27]:
# Fully-qualified table ID in the form "<project>.<dataset>.<table>".
table_id = 'text-analysis-323506.dataflow_dataset.df_table'

In [28]:
# Table definition combining the table ID with the column schema declared earlier.
table = bigquery.Table(table_id, schema=schema)

In [29]:
# Create the table in BigQuery. exists_ok=True keeps this cell re-runnable: if the
# table already exists it is returned instead of the call failing with a Conflict error.
table = bq_client.create_table(table, exists_ok=True)

## Insert data into table

In [30]:
# Local CSV files to upload. glob.glob returns matches in arbitrary order, so the
# list is sorted to make the upload/load order deterministic between runs.
file_list = sorted(glob.glob('./data/*.csv'))
# Cloud Storage location the CSV files are copied to before being loaded into BigQuery.
gcs_dir = 'gs://text-analysis-323506/data'

In [31]:
# Load-job configuration used for every CSV file loaded below.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema
# Skip the first row of each file (presumably the CSV header row).
job_config.skip_leading_rows = 1
# The source format defaults to CSV, so setting it here is optional.
job_config.source_format = bigquery.SourceFormat.CSV
# WRITE_APPEND adds rows to the existing table contents
# (WRITE_TRUNCATE would replace the existing data instead).
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

In [32]:
# For each local CSV: copy it to Cloud Storage, then load it into the BigQuery table.
for file in file_list:
    # A GCS URI is not a filesystem path, so join with '/' explicitly instead of
    # os.path.join (which uses the local OS path separator).
    gcs_uri = f"{gcs_dir}/{os.path.basename(file)}"

    # Copy to the exact object URI that is loaded below, so the upload and the load
    # always agree on the object name. Passing an argument list (no shell) keeps
    # paths containing spaces or shell metacharacters intact, and check=True raises
    # CalledProcessError if gsutil fails instead of silently ignoring its exit status.
    subprocess.run(['gsutil', '-m', 'cp', file, gcs_uri], check=True)

    load_job = bq_client.load_table_from_uri(
        gcs_uri, table_id, job_config=job_config,
    )

    load_job.result()  # Waits for the job to complete.

In [33]:
# Fetch the table's current metadata and report how many rows it holds.
destination_table = bq_client.get_table(table_id)
loaded_row_count = destination_table.num_rows
print("Loaded {} rows.".format(loaded_row_count))

Loaded 1318215 rows.


#### The data is now in the BigQuery table!

## Run the pipeline as a Dataflow job

In [61]:
# Set up the Apache Beam pipeline options (no command-line flags).
options = pipeline_options.PipelineOptions(flags=[])

# View of the same options object exposing the Google Cloud specific settings;
# values assigned through the view are stored on `options`.
google_cloud_options = options.view_as(GoogleCloudOptions)

# Use the default project of the current Google Cloud environment
# (google.auth.default() returns a (credentials, project_id) pair).
_, google_cloud_options.project = google.auth.default()

# Google Cloud region in which Cloud Dataflow runs.
google_cloud_options.region = 'us-east1'

In [62]:
# Dataflow job names may contain only lowercase letters, digits and hyphens (starting
# with a letter and ending with a letter or digit), so the name and its timestamp
# suffix use '-' as the only separator rather than '_' or ':'.
options.view_as(GoogleCloudOptions).job_name = f'dataflow-csvreader-{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'

In [63]:
# Root Cloud Storage location for this job, named after the Dataflow job.
dataflow_job_name = options.view_as(GoogleCloudOptions).job_name
dataflow_gcs_location = f'gs://text-analysis-323506/{dataflow_job_name}'

# The directory to store the output files of the job.
output_gcs_location = f"{dataflow_gcs_location}/output"

In [64]:
# Both locations live under the job's root GCS location and are set through a
# single GoogleCloudOptions view of `options`.
google_cloud_options = options.view_as(GoogleCloudOptions)

# Dataflow staging location: used to stage the Dataflow pipeline and SDK binary.
google_cloud_options.staging_location = f"{dataflow_gcs_location}/staging"

# Dataflow temp location: used to store temporary files or intermediate results
# before they are finally written to the sink.
google_cloud_options.temp_location = f"{dataflow_gcs_location}/temp"

In [65]:
# Module-level counter of how many DataFrames get_info has described so far.
count = 0


def get_info(df):
    """Summarise a DataFrame and number it with a running counter.

    Each call increments the module-level ``count`` and uses the new value
    as the DataFrame's sequence number.

    Args:
        df: Object exposing ``columns`` and ``shape`` (e.g. a pandas DataFrame).

    Returns:
        dict with keys ``'df_num'`` (the updated counter), ``'columns'``
        (the column labels as a list) and ``'shape'`` (``df.shape``).
    """
    global count
    count += 1
    summary = {}
    summary['df_num'] = count
    summary['columns'] = [*df.columns]
    summary['shape'] = df.shape
    return summary

In [66]:
# Create the pipeline object on the interactive runner.
# The PipelineOptions configured in the earlier cells are passed in so that transforms
# which read the pipeline's options while the graph is being built (for example
# ReadFromBigQuery looking up temp_location) see the project, job name and GCS locations.
p = beam.Pipeline(InteractiveRunner(), options=options)

In [67]:
# Add pipeline components: read every row of the table, then write the rows as text.
read_from_bigquery = beam.io.ReadFromBigQuery(
    query='Select * From text-analysis-323506.dataflow_dataset.df_table',
    use_standard_sql=True,
    project='text-analysis-323506',
)
write_to_gcs = beam.io.WriteToText(f"{output_gcs_location}/csv_details-output.txt")

csv_details = (
    p
    | 'Get data from BigQuery' >> read_from_bigquery
    | 'Write data to Cloud Storage' >> write_to_gcs
)

  temp_location = pcoll.pipeline.options.view_as(


#### Visualize Pipeline

In [68]:
# Render the graph of the transforms applied to pipeline `p`.
ib.show_graph(p)

In [69]:
# Submit the pipeline to Dataflow using the options configured earlier.
# Important: the Dataflow API must be enabled for the project in the
# Google Cloud Platform console before submitting.
pipeline_result = DataflowRunner().run_pipeline(p, options=options)

