In [7]:
%%writefile Dockerfile
# Image for the Dataflow Flex Template: starts from the official Apache Beam
# SDK image and adds the Flex Template launcher binary plus the pipeline code.
FROM apache/beam_python3.10_sdk:2.53.0

# Copy the Flex Template launcher binary from Google's launcher base image.
# NOTE(review): the launcher is taken from the python311 base while the SDK
# image is Python 3.10 -- confirm this pairing is intended.
COPY --from=gcr.io/dataflow-templates-base/python311-template-launcher-base:20230622_RC00 /opt/google/dataflow/python_template_launcher /opt/google/dataflow/python_template_launcher

# curl and unzip are required by the protoc download step further down.
# The apt package lists are removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates curl unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (versions are NOT pinned here; pin them if
# reproducible builds are required).
RUN pip install --no-cache-dir \
    paramiko \
    protobuf \
    pyarrow \
    google-cloud-storage

# Download and install rclone
# RUN curl -O https://downloads.rclone.org/rclone-current-linux-amd64.zip \
#     && unzip rclone-current-linux-amd64.zip \
#     && cd rclone-*-linux-amd64 \
#     && cp rclone /usr/bin/ \
#     && chown root:root /usr/bin/rclone \
#     && chmod 755 /usr/bin/rclone \
#     && mkdir -p /usr/local/share/man/man1 \
#     && cp rclone.1 /usr/local/share/man/man1/
    # && mandb

# Fix protoc and gcp-storage problem
ENV PB_REL="https://github.com/protocolbuffers/protobuf/releases"
# PATH is set with ENV because an `export` inside RUN is lost when that layer's
# shell exits, so protoc would not be on PATH in later layers or at runtime.
ENV PATH="${PATH}:/root/.local/bin"
# -f makes curl fail on an HTTP error instead of saving the error page as a zip.
RUN curl -fLO "$PB_REL/download/v30.2/protoc-30.2-linux-x86_64.zip" \
    && unzip protoc-30.2-linux-x86_64.zip -d /root/.local \
    && rm protoc-30.2-linux-x86_64.zip \
    && pip install --no-cache-dir protobuf \
    && pip install --no-cache-dir --upgrade google-cloud-storage \
    && pip install --no-cache-dir --upgrade grpcio

# Set the working directory that holds the pipeline code
ARG WORKDIR=/template
WORKDIR ${WORKDIR}

# Copy the Beam pipeline code into the container
COPY main.py .
# COPY pyproject.toml .
# COPY requirements.txt .
# COPY setup.py .
COPY src src


# Tell the Flex Template launcher which Python file is the pipeline entry point
ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/main.py"


# Use the Beam boot executable as the container entrypoint
ENTRYPOINT ["/opt/apache/beam/boot"]


Writing Dockerfile


In [None]:
%%writefile tranformation.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.parquetio import ReadFromParquet, WriteToParquet
import typing
import pyarrow as pa
from apache_beam.io.filesystems import FileSystems
import pyarrow.parquet as pq
from apache_beam.transforms.combiners import Sample
import json 

# 1. Extract column-value pairs
def extract_object_column_values(record: dict):
    """Yield a ``(column, value)`` pair for each string-valued field of ``record``.

    Only ``str`` values are emitted (they are the candidates for label
    encoding); every other value, including ``None``, is skipped.
    """
    yield from (
        (name, value)
        for name, value in record.items()
        if isinstance(value, str)
    )


# 2. Create label mapping
def create_label_mapping(kv: typing.Tuple[str, typing.Iterable[str]], max_cardinality: int = 15):
    """Build a label-encoding table for one column, if it is low-cardinality.

    Args:
        kv: ``(column_name, values)`` pair, e.g. one element produced by a
            group-by-key over ``(column, value)`` pairs.
        max_cardinality: Largest number of distinct values a column may have
            and still be encoded. Defaults to 15, the previously hard-coded
            limit.

    Returns:
        ``(column_name, {value: code})`` where codes are assigned in sorted
        order of the distinct values starting at 0, or ``None`` when the
        column has more than ``max_cardinality`` distinct values.
    """
    col, values = kv
    unique_vals = set(values)
    if len(unique_vals) > max_cardinality:
        return None  # skip this column: too many distinct values
    # Sorting makes the value -> code assignment deterministic across runs.
    return (col, {v: i for i, v in enumerate(sorted(unique_vals))})

class Convert(beam.DoFn):
    """DoFn that re-emits each incoming mapping as a fresh plain ``dict``."""

    def process(self, record):
        """Return a one-element list holding a shallow dict copy of ``record``."""
        return [dict(record.items())]

# 3. Encode
class FilterUsingLength(beam.DoFn):
    """DoFn that label-encodes a record using the mappings side input."""

    def process(self, record: dict, mappings: list):
        """Yield a copy of ``record`` with mapped columns replaced by codes.

        ``mappings[0]`` is a ``{column: {value: code}}`` dict. For each mapped
        column: a known value becomes its code, an unknown non-``None`` value
        becomes ``-1``, and ``None`` (or a missing column) is left untouched.
        """
        encoded = record.copy()
        per_column_labels = mappings[0]
        for column, labels in per_column_labels.items():
            current = encoded.get(column)
            if current in labels:
                encoded[column] = labels[current]
                continue
            if current is not None:
                encoded[column] = -1  # Unknown
        yield encoded

def infer_schema(parquet_path: str):
    """Return the Arrow schema of the Parquet file at ``parquet_path``.

    The file is opened through Beam's ``FileSystems`` and closed again before
    the function returns.
    """
    with FileSystems.open(parquet_path) as handle:
        parquet_file = pq.ParquetFile(handle)
        return parquet_file.schema_arrow


def run():
    """Build and run the label-encoding pipeline.

    Steps:
      1. Read the Parquet input.
      2. Collect the string values of every column and build a
         ``{column: {value: code}}`` table via ``create_label_mapping``.
      3. Re-read the records with that table as a side input and replace the
         mapped values by their integer codes.
      4. Write the result as a single snappy-compressed Parquet file.
    """
    # The same file is used to infer the schema and as the pipeline input.
    source_path = 'gs://new_remove/test_transformation/result.parquet'

    # Columns declared as int64 in the output schema.
    # NOTE(review): which columns actually get encoded is decided from the data
    # (every string column that create_label_mapping keeps), while this list is
    # hand-written. Presumably the two must agree for the write to succeed --
    # confirm against the real data.
    mapped_cols = [
        'po_priority_common_key',
        'po_to_department_key',
        'po_department_key',
        'po_mode_common_key',
        'po_type_common_key',
        'po_comm_common_key',
        'po_user_department_key',
        'po_encounter_type_common_key',
        'po_prescription_status_common_key',
        'duration_uom_common_key',
        'status_common_key',
        'result_status_common_key',
        'priority_common_key',
        'status_flag',
        'result_type',
        'HLN',
    ]

    options = PipelineOptions()
    with beam.Pipeline(options=options) as p:

        # No schema is provided for the parquet file, so read it from the file.
        # infer_schema closes the file handle (the previous inline
        # FileSystems.open call never did).
        orig = infer_schema(source_path)

        schema = pa.schema(
            # first, the newly-int64 columns:
            [pa.field(c, pa.int64()) for c in mapped_cols] +
            # then all the other fields untouched:
            [f for f in orig if f.name not in mapped_cols]
        )

        # Read data
        records = p | 'Read Parquet' >> ReadFromParquet(source_path)

        # For each record emit (column, value) pairs for the string values.
        column_values = records | 'Extract Col-Value' >> beam.FlatMap(extract_object_column_values)

        # Group the values under their column name: (column, [values...]).
        grouped = column_values | 'Group by Column' >> beam.GroupByKey()

        # Create mappings for low-cardinality columns; one column per element.
        raw_mappings = grouped | 'Create Mappings' >> beam.Map(create_label_mapping)
        # Drop the columns for which no mapping was produced.
        label_mappings = raw_mappings | 'Drop Nones' >> beam.Filter(lambda x: x is not None)

        # Combine all (column, mapping) pairs into a single dict for the side input.
        mappings_dict = label_mappings | 'To Dict' >> beam.combiners.ToDict()

        mappings_dict2 = mappings_dict | 'test1' >> beam.ParDo(Convert())

        encoded = records | 'Encode Records' >> beam.ParDo(FilterUsingLength(), beam.pvalue.AsList(mappings_dict2))

        encoded | 'Write Compressed Parquet' >> WriteToParquet(
            file_path_prefix='gs://new_remove/test_transformation/output2/encoded',
            schema=schema,            # use the pre-computed schema
            codec='snappy',
            num_shards=1,
            shard_name_template='',
            file_name_suffix='.parquet'
        )


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()


Writing tranformation.py


In [None]:
# Commands for deployment

In [None]:
# Deployment settings used by the commands below.
# PROJECT is left empty here and must be filled in before running them.
export PROJECT=""
export BUCKET="new_remove"
export REGION="us-central1"
export TAG="parquet-processing"
export SDK_CONTAINER_IMAGE="gcr.io/${PROJECT}/dataflow-flex-template-parquest-processing:${TAG}"
export TEMPLATE_FILE="gs://${BUCKET}/dataflow_template_configuration-${TAG}.json"

In [None]:
# Build the container image and push it to the image repo.
gcloud builds submit . \
  --tag "$SDK_CONTAINER_IMAGE" \
  --project "$PROJECT"

# Build the flex-template spec file in the bucket.
gcloud dataflow flex-template build "$TEMPLATE_FILE" \
  --image "$SDK_CONTAINER_IMAGE" \
  --sdk-language "PYTHON" \
  --metadata-file=metadata.txt \
  --project "$PROJECT"

# Run the flex-template; the job name gets a timestamp so each run is unique.
gcloud dataflow flex-template run "flex-parquet-$(date +%Y%m%d-%H%M%S)" \
  --template-file-gcs-location="$TEMPLATE_FILE" \
  --region="$REGION" \
  --staging-location="gs://$BUCKET/staging" \
  --parameters="sdk_container_image=$SDK_CONTAINER_IMAGE,project=$PROJECT,schema_directory=gs://$BUCKET/test_transformation/result.parquet,target_file_location=gs://$BUCKET/test_transformation/result.parquet,result_file_location=gs://$BUCKET/test_transformation/output2/encoded" \
  --project="$PROJECT"

In [None]:
# Test run: open an interactive shell in the built image.
# PROJECT and BUCKET are forwarded with -e because the python command below is
# typed inside the container, where the host's exported variables are not set.
docker run --rm -it -e PROJECT -e BUCKET --entrypoint=/bin/bash "$SDK_CONTAINER_IMAGE"

# Inside the container shell. Paths use the bucket, matching the parameters
# passed to `gcloud dataflow flex-template run`.
# NOTE(review): --pipeline_temp_location is kept as $PROJECT as in the original;
# confirm what value main.py expects here.
python main.py \
  --project="$PROJECT" \
  --schema_directory="gs://$BUCKET/test_transformation/result.parquet" \
  --target_file_location="gs://$BUCKET/test_transformation/result.parquet" \
  --result_file_location="gs://$BUCKET/test_transformation/output2/encoded" \
  --pipeline_temp_location="$PROJECT" \
  --codec=snappy
