In [1]:
import apache_beam as beam
import pandas as pd
import glob
import os

from datetime import datetime

import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

In [2]:
count = 0  # module-level tally of how many DataFrames have been reported


def print_info(df):
    """Print a short summary of ``df`` and advance the global report counter.

    Every call increments the module-level ``count`` and then writes, framed
    by a blank line before and after, the running index, the list of column
    names, and the shape of ``df``.

    Args:
        df: Object exposing ``columns`` and ``shape`` (e.g. a pandas DataFrame).

    Returns:
        None. The summary is only written to standard output.
    """
    global count
    count += 1

    # Empty first/last entries produce the surrounding blank lines.
    report_lines = (
        "",
        f"Dataframe: {count}",
        f"columns:  {list(df.columns)}",
        f"Shape:  {df.shape}",
        "",
    )
    print("\n".join(report_lines))

In [3]:
# Instantiate the Beam pipeline; it is executed by an InteractiveRunner instance.
p = beam.Pipeline(runner=InteractiveRunner())

In [4]:
# Assemble the pipeline steps: list the CSV paths under ./data, read each one
# with pandas, then print a summary of every resulting DataFrame.
csv_details = (
    p
    | 'List csv files' >> beam.Create(glob.glob('./data/*.csv'))
    | 'Read csv files' >> beam.Map(pd.read_csv)
    | 'Print csv details' >> beam.Map(print_info)
)

#### Visualize Pipeline

In [5]:
# Display the graph of the pipeline `p` for visual inspection.
ib.show_graph(p)

##### Make sure the tar file in the './data' folder has been extracted before running the pipeline. Extracting it yields 4 CSV files.

In [6]:
# Record timestamps around the pipeline run, waiting for it to finish
# before taking the end timestamp.
start_time = datetime.now()
pipeline_result = p.run()
pipeline_result.wait_until_finish()
end_time = datetime.now()

  bundle_processor.process_bundle(instruction_id))



Dataframe: 1
columns:  ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5']
Shape:  (1118488, 13)


Dataframe: 2
columns:  ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5']
Shape:  (25468, 13)



  bundle_processor.process_bundle(instruction_id))



Dataframe: 3
columns:  ['STATUS', 'SER_NBR', 'Series_reference', 'Period', 'Data_value', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Age Group 3 brackets', 'Age Group', 'Age Group 6 brackets', 'Duration of unemployment', 'Employed and Unemployed Persons, Full-Time and Part-Time Status', 'Employment relationship', 'Employment status', 'Ethnic Single / Combination', 'Ethnic Total Response', 'Formal study status', 'Highest qualification', 'Hours Worked', 'Household Composition', 'Household Labour Force Status', 'Industry ANZSIC06', 'Industry ANZSIC06 Supplementary', 'Job', 'Job tenure', 'Labour force and education status', 'Labour Force Status', 'Main activity', 'Main job', 'Methods of seeking employment', 'Occupation ANZSCO Level 1', 'Percentage change from previous period and same period previous year', 'Persons Employed, Unemployed, Not in Labour Force (for current quarter)', 'Reason for leaving last job', 'Reason not seeking work', 'Reason not wanting work', 'Reasons not available for 

  bundle_processor.process_bundle(instruction_id))



Dataframe: 4
columns:  ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5']
Shape:  (174259, 13)



### Time taken when using pipeline

In [7]:
# Report the elapsed time between the most recently recorded
# start_time and end_time values.
elapsed_seconds = (end_time - start_time).total_seconds()
print(f"Time taken: {elapsed_seconds} Seconds")

Time taken: 19.270276 Seconds


In [8]:
# Baseline: read and summarize the same CSV files with a plain Python loop,
# recording timestamps before and after.
start_time = datetime.now()
for csv_path in glob.glob('./data/*.csv'):
    print_info(pd.read_csv(csv_path))
end_time = datetime.now()

  interactivity=interactivity, compiler=compiler, result=result)



Dataframe: 5
columns:  ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5']
Shape:  (1118488, 13)


Dataframe: 6
columns:  ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5']
Shape:  (25468, 13)



  interactivity=interactivity, compiler=compiler, result=result)



Dataframe: 7
columns:  ['STATUS', 'SER_NBR', 'Series_reference', 'Period', 'Data_value', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Age Group 3 brackets', 'Age Group', 'Age Group 6 brackets', 'Duration of unemployment', 'Employed and Unemployed Persons, Full-Time and Part-Time Status', 'Employment relationship', 'Employment status', 'Ethnic Single / Combination', 'Ethnic Total Response', 'Formal study status', 'Highest qualification', 'Hours Worked', 'Household Composition', 'Household Labour Force Status', 'Industry ANZSIC06', 'Industry ANZSIC06 Supplementary', 'Job', 'Job tenure', 'Labour force and education status', 'Labour Force Status', 'Main activity', 'Main job', 'Methods of seeking employment', 'Occupation ANZSCO Level 1', 'Percentage change from previous period and same period previous year', 'Persons Employed, Unemployed, Not in Labour Force (for current quarter)', 'Reason for leaving last job', 'Reason not seeking work', 'Reason not wanting work', 'Reasons not available for 

  interactivity=interactivity, compiler=compiler, result=result)



Dataframe: 8
columns:  ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5']
Shape:  (174259, 13)



### Time taken when using plain Python directly

In [9]:
# Report the elapsed time between the most recently recorded
# start_time and end_time values.
elapsed_seconds = (end_time - start_time).total_seconds()
print(f"Time taken: {elapsed_seconds} Seconds")

Time taken: 12.626352 Seconds


##### Plain Python is faster in this local run (about 12.6 s versus 19.3 s for the pipeline). However, when the same job is submitted to Cloud Dataflow, it is expected to be faster, since the work is distributed across multiple machines, and it is also scalable.