In [3]:
!pip install apache-beam
import apache_beam as beam

You should consider upgrading via the '/Users/Shehryar/Documents/Virgin Media 02/venv/bin/python3 -m pip install --upgrade pip' command.[0m


## Read the input file (TODO: change the path to read from the bucket once complete)

In [4]:
# Sanity check: read every line of the CSV (header row included) and print it.
with beam.Pipeline() as pipeline:
    read_csv = beam.io.ReadFromText('/Users/Shehryar/Downloads/transactions.csv')
    data = pipeline | 'Read CSV file' >> read_csv
    data | 'Print data' >> beam.Map(print)

timestamp,origin,destination,transaction_amount
2009-01-09 02:54:25 UTC,wallet00000e719adfeaa64b5a,wallet00001866cb7e0f09a890,1021101.99
2017-01-01 04:22:23 UTC,wallet00000e719adfeaa64b5a,wallet00001e494c12b3083634,19.95
2017-03-18 14:09:16 UTC,wallet00001866cb7e0f09a890,wallet00001e494c12b3083634,2102.22
2017-03-18 14:10:44 UTC,wallet00001866cb7e0f09a890,wallet00000e719adfeaa64b5a,1.00030
2017-08-31 17:00:09 UTC,wallet00001e494c12b3083634,wallet00005f83196ec58e4ffe,13700000023.08
2018-02-27 16:04:11 UTC,wallet00005f83196ec58e4ffe,wallet00001866cb7e0f09a890,129.12


## Find all transactions that have a `transaction_amount` greater than `20`

In [8]:
class FilterTransactionsGreaterThan20(beam.DoFn):
    """Keep only CSV lines whose ``transaction_amount`` exceeds a threshold.

    Each element is one raw comma-separated line of the transactions file
    (``timestamp,origin,destination,transaction_amount``). A header line
    cannot be parsed as a float, so it must be skipped upstream (for example
    with ``skip_header_lines=1`` on the reader).

    Args:
        threshold: Exclusive lower bound for the amount. Defaults to ``20``
            so that ``FilterTransactionsGreaterThan20()`` keeps its original
            behaviour.
    """

    # transaction_amount is the 4th column in the CSV (zero-based index 3).
    AMOUNT_COLUMN = 3

    def __init__(self, threshold=20):
        super().__init__()
        self._threshold = threshold

    def process(self, element):
        """Yield ``element`` unchanged when its amount is strictly above the threshold.

        Raises:
            IndexError: if the line has fewer than four comma-separated fields.
            ValueError: if the amount field is not a valid float.
        """
        transaction_amount = float(element.split(',')[self.AMOUNT_COLUMN])
        if transaction_amount > self._threshold:
            yield element

# Read the CSV (header skipped) and print only the rows that pass the amount filter.
with beam.Pipeline() as pipeline:
    csv_rows = pipeline | 'Read CSV file' >> beam.io.ReadFromText(
        '/Users/Shehryar/Downloads/transactions.csv', skip_header_lines=1
    )
    data = csv_rows | 'Filter transactions' >> beam.ParDo(FilterTransactionsGreaterThan20())
    data | 'Print filtered data' >> beam.Map(print)


2009-01-09 02:54:25 UTC,wallet00000e719adfeaa64b5a,wallet00001866cb7e0f09a890,1021101.99
2017-03-18 14:09:16 UTC,wallet00001866cb7e0f09a890,wallet00001e494c12b3083634,2102.22
2017-08-31 17:00:09 UTC,wallet00001e494c12b3083634,wallet00005f83196ec58e4ffe,13700000023.08
2018-02-27 16:04:11 UTC,wallet00005f83196ec58e4ffe,wallet00001866cb7e0f09a890,129.12


## Exclude all transactions made before the year `2010`. Note: amounts less than `20` have not been excluded in this step — is that correct?

In [11]:
from datetime import datetime

class CheckTimestampYear(beam.DoFn):
    """Keep only CSV lines whose timestamp falls in or after a given year.

    Each element is one raw comma-separated line whose first field is a
    timestamp such as ``2009-01-09 02:54:25 UTC``.

    Args:
        min_year: Earliest year (inclusive) to keep. Defaults to ``2010`` so
            that ``CheckTimestampYear()`` keeps its original behaviour.
    """

    # Layout of the timestamp column, e.g. '2017-03-18 14:09:16 UTC'.
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S %Z'

    def __init__(self, min_year=2010):
        super().__init__()
        self._min_year = min_year

    def process(self, element):
        """Yield ``element`` unchanged when its timestamp year is ``>= min_year``.

        Raises:
            ValueError: if the first field does not match ``TIMESTAMP_FORMAT``.
        """
        timestamp_str = element.split(',')[0]  # Timestamp is the 1st column
        timestamp = datetime.strptime(timestamp_str, self.TIMESTAMP_FORMAT)
        if timestamp.year >= self._min_year:
            yield element


# Read the CSV (header skipped) and print only the rows that pass the year filter.
with beam.Pipeline() as pipeline:
    csv_rows = pipeline | 'Read CSV file' >> beam.io.ReadFromText(
        '/Users/Shehryar/Downloads/transactions.csv', skip_header_lines=1
    )
    data = csv_rows | 'Filter transactions for year' >> beam.ParDo(CheckTimestampYear())
    data | 'Print data' >> beam.Map(print)


2017-01-01 04:22:23 UTC,wallet00000e719adfeaa64b5a,wallet00001e494c12b3083634,19.95
2017-03-18 14:09:16 UTC,wallet00001866cb7e0f09a890,wallet00001e494c12b3083634,2102.22
2017-03-18 14:10:44 UTC,wallet00001866cb7e0f09a890,wallet00000e719adfeaa64b5a,1.00030
2017-08-31 17:00:09 UTC,wallet00001e494c12b3083634,wallet00005f83196ec58e4ffe,13700000023.08
2018-02-27 16:04:11 UTC,wallet00005f83196ec58e4ffe,wallet00001866cb7e0f09a890,129.12


## Sum the total by `date`

In [12]:
class CheckTimestampYear(beam.DoFn):
    """Keep only CSV lines whose timestamp falls in or after a given year.

    Each element is one raw comma-separated line whose first field is a
    timestamp such as ``2009-01-09 02:54:25 UTC``.

    Args:
        min_year: Earliest year (inclusive) to keep. Defaults to ``2010`` so
            that ``CheckTimestampYear()`` keeps its original behaviour.
    """

    # Layout of the timestamp column, e.g. '2017-03-18 14:09:16 UTC'.
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S %Z'

    def __init__(self, min_year=2010):
        super().__init__()
        self._min_year = min_year

    def process(self, element):
        """Yield ``element`` unchanged when its timestamp year is ``>= min_year``.

        Raises:
            ValueError: if the first field does not match ``TIMESTAMP_FORMAT``.
        """
        timestamp_str = element.split(',')[0]  # Timestamp is the 1st column
        timestamp = datetime.strptime(timestamp_str, self.TIMESTAMP_FORMAT)
        if timestamp.year >= self._min_year:
            yield element

class ExtractDateAmount(beam.DoFn):
    """Turn one raw CSV line into a ``(date, amount)`` pair.

    The line must split into exactly four comma-separated fields
    (timestamp, origin, destination, amount); the two wallet fields are
    ignored. The date is the ``YYYY-MM-DD`` part of the timestamp and the
    amount is parsed as a float.
    """

    def process(self, element):
        raw_timestamp, _origin, _destination, raw_amount = element.split(',')
        parsed = datetime.strptime(raw_timestamp, '%Y-%m-%d %H:%M:%S %Z')
        value = float(raw_amount)
        # Emit a key/value tuple so a per-key combiner can aggregate by day.
        yield (parsed.strftime('%Y-%m-%d'), value)

# Pipeline: read rows -> keep rows passing the year filter -> (date, amount) -> total per date -> print.
with beam.Pipeline() as pipeline:
    csv_rows = pipeline | 'Read CSV file' >> beam.io.ReadFromText(
        '/Users/Shehryar/Downloads/transactions.csv', skip_header_lines=1
    )
    rows_in_range = csv_rows | 'Filter transactions for year' >> beam.ParDo(CheckTimestampYear())
    date_amounts = rows_in_range | 'Extract date and amount' >> beam.ParDo(ExtractDateAmount())
    # CombinePerKey(sum) plays the role of a SQL GROUP BY date / SUM(amount).
    data = date_amounts | 'Sum amounts by date' >> beam.CombinePerKey(sum)
    data | 'Print data' >> beam.Map(print)


('2017-01-01', 19.95)
('2017-03-18', 2103.2203)
('2017-08-31', 13700000023.08)
('2018-02-27', 129.12)


## Save the output into `output/results.jsonl.gz` and make sure all files in the `output/` directory are git-ignored
**TODO:** change the output path to the specified location once complete.

In [14]:
class CheckTimestampYear(beam.DoFn):
    """Keep only CSV lines whose timestamp falls in or after a given year.

    Each element is one raw comma-separated line whose first field is a
    timestamp such as ``2009-01-09 02:54:25 UTC``.

    Args:
        min_year: Earliest year (inclusive) to keep. Defaults to ``2010`` so
            that ``CheckTimestampYear()`` keeps its original behaviour.
    """

    # Layout of the timestamp column, e.g. '2017-03-18 14:09:16 UTC'.
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S %Z'

    def __init__(self, min_year=2010):
        super().__init__()
        self._min_year = min_year

    def process(self, element):
        """Yield ``element`` unchanged when its timestamp year is ``>= min_year``.

        Raises:
            ValueError: if the first field does not match ``TIMESTAMP_FORMAT``.
        """
        timestamp_str = element.split(',')[0]  # Timestamp is the 1st column
        timestamp = datetime.strptime(timestamp_str, self.TIMESTAMP_FORMAT)
        if timestamp.year >= self._min_year:
            yield element

class ExtractDateAmount(beam.DoFn):
    """Turn one raw CSV line into a ``(date, amount)`` pair.

    The line must split into exactly four comma-separated fields
    (timestamp, origin, destination, amount); the two wallet fields are
    ignored. The date is the ``YYYY-MM-DD`` part of the timestamp and the
    amount is parsed as a float.
    """

    def process(self, element):
        raw_timestamp, _origin, _destination, raw_amount = element.split(',')
        parsed = datetime.strptime(raw_timestamp, '%Y-%m-%d %H:%M:%S %Z')
        value = float(raw_amount)
        # Emit a key/value tuple so a per-key combiner can aggregate by day.
        yield (parsed.strftime('%Y-%m-%d'), value)

class FormatOutput(beam.DoFn):
    """Render a ``(date, total)`` pair as one ``'<date>, <total>'`` text line."""

    def process(self, element):
        # Unpacking enforces that the element is exactly a two-item pair.
        day, day_total = element
        yield f'{day}, {day_total}'

# Pipeline: read rows -> keep rows passing the year filter -> (date, amount)
# -> total per date -> format as text -> write to a CSV file under output/.
with beam.Pipeline() as pipeline:
    csv_rows = pipeline | 'Read CSV file' >> beam.io.ReadFromText(
        '/Users/Shehryar/Downloads/transactions.csv', skip_header_lines=1
    )
    rows_in_range = csv_rows | 'Filter transactions for year' >> beam.ParDo(CheckTimestampYear())
    date_amounts = rows_in_range | 'Extract date and amount' >> beam.ParDo(ExtractDateAmount())
    # CombinePerKey(sum) plays the role of a SQL GROUP BY date / SUM(amount).
    daily_totals = date_amounts | 'Sum amounts by date' >> beam.CombinePerKey(sum)
    text_lines = daily_totals | 'Format output' >> beam.ParDo(FormatOutput())
    data = text_lines | 'Write to CSV file' >> beam.io.WriteToText(
        'output/summed_amounts_v3',
        file_name_suffix='.csv',
        header='date, total_amount',
    )


output/summed_amounts_v3-00000-of-00001.csv
