In [1]:
pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.51.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.7/14.7 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting orjson<4,>=3.9.7 (from apache-beam)
  Downloading orjson-3.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.7/138.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:0

In [2]:
import apache_beam as beam

class FilterAndCountNames(beam.PTransform):
    """Composite transform: keep records at or above a minimum age, then count names.

    Each input element is indexed with ``record['age']`` and ``record['name']``,
    so it must support both lookups and ``age`` must be comparable to ``min_age``.
    The output is keyed by name, with the per-key 1s combined via ``sum``.

    Args:
        min_age: Inclusive lower bound applied to ``record['age']``. Defaults
            to 25, the threshold this transform originally hard-coded.
    """

    def __init__(self, min_age=25):
        super().__init__()
        self._min_age = min_age

    def expand(self, pcoll):
        """Apply the age filter and the per-name count to ``pcoll``."""
        # Bind to a local so the lambda closes over a plain value rather than
        # over ``self`` (the whole transform object).
        min_age = self._min_age

        # Step 1: Filter users based on age
        filtered_users = pcoll | "Filter Users" >> beam.Filter(
            lambda record: record['age'] >= min_age
        )

        # Step 2: Count the occurrences of each name
        name_counts = (
            filtered_users
            | "Extract Names" >> beam.Map(lambda record: (record['name'], 1))
            | "Count Names" >> beam.CombinePerKey(sum)
        )

        return name_counts


In [7]:
def _parse_user_line(line):
    """Turn one CSV data line into a dict keyed by user_id, name, email, age.

    Uses ``csv.reader`` instead of ``str.split(',')`` so that quoted fields
    containing commas (e.g. ``"Doe, Jane"``) stay in one column.
    """
    # Local import keeps this helper self-contained.
    import csv

    # The [] default avoids StopIteration if the reader yields no row.
    fields = next(csv.reader([line]), [])
    return dict(zip(['user_id', 'name', 'email', 'age'], fields))


def run_pipeline(input_path='/content/users_dataset.csv',
                 output_path='/content/name_counts.txt'):
    """Read the users CSV, count names of users passing the age filter, write results.

    Args:
        input_path: Path handed to ``beam.io.ReadFromText``; the first line is
            skipped as a header. Defaults to the original hard-coded location.
        output_path: Path handed to ``beam.io.WriteToText``. Defaults to the
            original hard-coded location.

    Raises:
        ValueError: Propagated from ``int()`` if an ``age`` field is not an
            integer literal (how the runner wraps it is up to Beam).
    """
    # Define the pipeline options. For local execution, we use the DirectRunner.
    pipeline_options = beam.options.pipeline_options.PipelineOptions(runner="DirectRunner")

    # Set up the pipeline
    with beam.Pipeline(options=pipeline_options) as p:
        # Read the dataset
        records = (
            p
            | "Read CSV" >> beam.io.ReadFromText(input_path, skip_header_lines=1)
            # Blank lines carry no record and would otherwise fail on the 'age' lookup.
            | "Drop Blank Lines" >> beam.Filter(lambda line: bool(line.strip()))
            | "Parse CSV" >> beam.Map(_parse_user_line)
            | "Convert Age to Int" >> beam.Map(lambda record: {**record, 'age': int(record['age'])})
        )

        # Apply the composite transform
        name_counts = records | "Filter and Count Names" >> FilterAndCountNames()

        # Write the results
        name_counts | "Write Results" >> beam.io.WriteToText(output_path)

    print("Pipeline executed successfully!")

# Call run_pipeline() to execute the pipeline (this is done in the next cell).


In [8]:
# Execute the pipeline: it reads '/content/users_dataset.csv' and passes
# '/content/name_counts.txt' to WriteToText for the results.
run_pipeline()





Pipeline executed successfully!
