In [1]:
import apache_beam as beam
import pandas as pd
import glob
import os

from datetime import datetime

import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

In [2]:
# Module-level call counter; get_info() bumps it on every invocation.
count = 0


def get_info(df):
    """Summarise a DataFrame and tag the summary with a running call number.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        A dict with three keys, in this order:
        ``'count'``   - value of the module-level ``count`` after incrementing it,
        ``'columns'`` - the column labels as a plain list,
        ``'Shape'``   - the ``df.shape`` tuple.

    Note:
        ``count`` is never reset here, so the numbering keeps growing across
        every call made in the same kernel session.
    """
    global count
    count += 1
    return dict(count=count, columns=[*df.columns], Shape=df.shape)

In [3]:
# Instantiate the Beam pipeline, handing it an InteractiveRunner as its runner.
p = beam.Pipeline(runner=InteractiveRunner())

In [4]:
# Attach the transforms to the pipeline.
# The CSV paths are collected right here, when this cell runs, and handed to
# beam.Create as an in-memory list.
csv_paths = glob.glob('./data/*.csv')

# Steps: paths -> DataFrames -> summary dicts -> print each summary.
# Note that `csv_details` is bound to the result of the final
# 'Print information' step, not to the summary dicts themselves.
csv_details = (
    p
    | 'List csv files' >> beam.Create(csv_paths)
    | 'Read csv files' >> beam.Map(pd.read_csv)
    | 'Get dataframe details' >> beam.Map(get_info)
    | 'Print information' >> beam.Map(print)
)

#### Visualize Pipeline

In [5]:
# Visualize pipeline `p` (with the transforms attached to it above) as a graph.
ib.show_graph(p)

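##### Make sure the tar file in the `./data` folder has been un-tarred before the pipeline is built (the list of CSV files is globbed when the pipeline components are added). Extracting it yields 3 CSV files.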

In [6]:
# Stamp the wall clock before and after a blocking run of the pipeline.
# start_time / end_time stay datetime objects: the reporting cell subtracts
# them and calls .total_seconds() on the difference.
start_time = datetime.now()
pipeline_result = p.run()
pipeline_result.wait_until_finish()
end_time = datetime.now()



{'count': 1, 'columns': ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'], 'Shape': (25468, 13)}


  bundle_processor.process_bundle(instruction_id))


{'count': 2, 'columns': ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'], 'Shape': (1118488, 13)}


  bundle_processor.process_bundle(instruction_id))


{'count': 3, 'columns': ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'], 'Shape': (174259, 13)}


### Time taken when using the Beam pipeline

In [7]:
# Seconds elapsed between the start_time and end_time stamps currently in scope.
elapsed_seconds = (end_time - start_time).total_seconds()
print(f"Time taken: {elapsed_seconds} Seconds")

Time taken: 4.920789 Seconds


In [8]:
# Same work without Beam: read each CSV in a plain loop and print its summary.
# start_time / end_time are overwritten here, replacing the pipeline timing.
start_time = datetime.now()
csv_files = glob.glob('./data/*.csv')
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    info = get_info(df)
    print(info)
end_time = datetime.now()

{'count': 4, 'columns': ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'], 'Shape': (25468, 13)}


  interactivity=interactivity, compiler=compiler, result=result)


{'count': 5, 'columns': ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'], 'Shape': (1118488, 13)}
{'count': 6, 'columns': ['Series_reference', 'Period', 'Data_value', 'STATUS', 'UNITS', 'MAGNTUDE', 'Subject', 'Group', 'Series_title_1', 'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'], 'Shape': (174259, 13)}


  interactivity=interactivity, compiler=compiler, result=result)


### Time taken when using plain Python directly

In [9]:
# Seconds elapsed between the start_time and end_time stamps currently in scope.
elapsed_seconds = (end_time - start_time).total_seconds()
print(f"Time taken: {elapsed_seconds} Seconds")

Time taken: 2.835588 Seconds


##### Plain Python looks faster here, on three local files. However, when the same job is submitted to Cloud Dataflow, the pipeline can be faster, since Dataflow runs the computation in parallel on distributed systems and is also scalable.