In [None]:
import apache_beam as beam

In [6]:
# Sample data: a header row followed by one comma-separated record per line.
data = """Name,Age,City
Alice,30,New York
Bob,25,San Francisco
Charlie,35,Los Angeles"""

# splitlines() also copes with "\r\n" line endings, which split("\n") would
# leave as a trailing "\r" on every value in the last column.
split_data = data.splitlines()

# Apache Beam pipeline: read the sample records once, then write them to a
# plain-text sink and a CSV sink.
with beam.Pipeline() as p:
    sample_data = (
        p
        # split_data[0] is the header row, so only the records are read here.
        | "Read from sample data" >> beam.Create(split_data[1:])
    )

    # WriteToText appends a shard marker to the path prefix, so the extension
    # is passed as file_name_suffix to keep it at the end of the file name.
    sample_data | "write to txt file" >> beam.io.WriteToText(
        "gs://source-bucket-data/sink/output.txt",
        file_name_suffix=".txt",
    )

    # Write to csv file, re-attaching the header row that was sliced off above
    # so the output keeps its column names.
    sample_data | "write to csv file" >> beam.io.WriteToText(
        "gs://source-bucket-data/sink/output.csv",
        file_name_suffix=".csv",
        header=split_data[0],
    )



In [4]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Sample data: a header row followed by one comma-separated record per line.
data = """Name,Age,City
Alice,30,New York
Bob,25,San Francisco
Charlie,35,Los Angeles"""

# splitlines() also copes with "\r\n" line endings, which split("\n") would
# leave as a trailing "\r" on every value in the last column.
split_data = data.splitlines()

# Convert one CSV record into a dict keyed by column name (JSON-ready).
def json_parse(record, fields=None):
    """Map a comma-separated record onto its column names.

    Args:
        record: One data line, e.g. ``"Alice,30,New York"``.
        fields: Optional sequence of column names. When omitted, the names are
            read from the header row in the module-level ``split_data``, which
            is what the original single-argument form always did.

    Returns:
        A dict with one entry per column name. Values are kept as strings.

    Raises:
        IndexError: If ``record`` holds fewer values than there are names.
    """
    if fields is None:
        # Backward-compatible default: use the notebook's header row.
        fields = split_data[0].split(",")
    values = record.split(",")
    # Positional lookup keeps the original contract: a short record raises
    # IndexError, and any values beyond the last column are ignored.
    return {name: values[position] for position, name in enumerate(fields)}

# Define the pipeline options if needed
# options = PipelineOptions(
#     project='gcp-instance-cloud-sql',
#     temp_location='gs://source-bucket-dataset/temp',
#     runner='DataflowRunner',  # Or DirectRunner for local
#     region='us-central1'
# )

import json  # stdlib; imported in this cell so the cell still runs on its own

# Create pipeline object: read the sample records once, then fan them out to
# the text, CSV, JSON and Avro sinks.
with beam.Pipeline() as p:

    sample_data = (
        p
        | "read from sample data" >> beam.Create(split_data[1:])  # Skip the header row
    )

    # Write to text file. WriteToText appends a shard marker to the path
    # prefix, so the extension is passed as file_name_suffix to keep it at the
    # end of the generated file name.
    sample_data | "write to txt file" >> beam.io.WriteToText(
        "gs://source-bucket-data/sink/output.txt",
        file_name_suffix=".txt",
    )

    # Write to csv file, re-attaching the header row that was sliced off above
    # so the output keeps its column names.
    sample_data | "write to csv file" >> beam.io.WriteToText(
        "gs://source-bucket-data/sink/output.csv",
        file_name_suffix=".csv",
        header=split_data[0],
    )

    # Turn each CSV line into a dict keyed by column name.
    json_data = (
        sample_data
        | "parse the json" >> beam.Map(json_parse)
    )

    # Write to json file (as text, since Beam doesn't have native JSON sink).
    # The dicts are serialised first: writing them directly would store their
    # Python repr (single-quoted keys), which is not valid JSON.
    (
        json_data
        | "serialize to json" >> beam.Map(json.dumps)
        | "write to json file" >> beam.io.WriteToText(
            "gs://source-bucket-data/sink/output.json",
            file_name_suffix=".json",
        )
    )

    # Write to avro file. WriteToAvro is referenced through beam.io.avroio
    # because the bare name is never imported in this notebook (NameError).
    (
        json_data
        | "Write to avro" >> beam.io.avroio.WriteToAvro(
            "gs://source-bucket-data/sink/output.avro",
            schema={
                'type': 'record',
                'name': 'AvroRecord',
                'fields': [
                    {'name': 'Name', 'type': 'string'},
                    # Age stays a string because json_parse does no type conversion.
                    {'name': 'Age', 'type': 'string'},
                    {'name': 'City', 'type': 'string'}
                ]
            },
            file_name_suffix=".avro",
        )
    )



In [6]:
import apache_beam as beam

# Sample data: a header row followed by one comma-separated record per line.
data = """Name,Age,City
Alice,30,New York
Bob,25,San Francisco
Charlie,35,Los Angeles"""

# splitlines() also copes with "\r\n" line endings, which split("\n") would
# leave as a trailing "\r" on every value in the last column.
split_data = data.splitlines()

# Define a function to convert CSV lines to JSON-ready dicts
def json_parse(record, fields=None):
    """Map a comma-separated record onto its column names.

    Args:
        record: One data line, e.g. ``"Alice,30,New York"``.
        fields: Optional sequence of column names. When omitted, the names are
            read from the header row in the module-level ``split_data``, which
            is what the original single-argument form always did.

    Returns:
        A dict with one entry per column name. Values are kept as strings.

    Raises:
        IndexError: If ``record`` holds fewer values than there are names.
    """
    if fields is None:
        # Backward-compatible default: use the notebook's header row.
        fields = split_data[0].split(",")
    values = record.split(",")
    # Positional lookup keeps the original contract: a short record raises
    # IndexError, and any values beyond the last column are ignored.
    return {name: values[position] for position, name in enumerate(fields)}

import json  # stdlib; imported in this cell so the cell still runs on its own

# Apache Beam pipeline: read the sample records once, then fan them out to the
# text, CSV, JSON, Avro and BigQuery sinks.
with beam.Pipeline() as p:
    sample_data = (
        p
        # split_data[0] is the header row, so only the records are read here.
        | "Read from sample data" >> beam.Create(split_data[1:])
    )

    # Write to text file. The suffix keeps the extension at the end of the
    # generated shard file name, matching the CSV and JSON sinks below.
    sample_data | "write to txt file" >> beam.io.WriteToText(
        "gs://source-bucket-data/sink/output.txt",
        file_name_suffix=".txt"
    )

    # Write to text (CSV style), re-attaching the header row that was sliced
    # off above so the output keeps its column names.
    sample_data | "Write to CSV" >> beam.io.WriteToText(
        "gs://source-bucket-data/sink/output.csv",
        file_name_suffix=".csv",
        header=split_data[0]
    )

    # Convert each CSV line to a dict keyed by column name
    json_data = (
        sample_data
        | "Parse to JSON" >> beam.Map(json_parse)
    )

    # Write JSON to text file. The dicts are serialised first: writing them
    # directly would store their Python repr (single-quoted keys), which is
    # not valid JSON.
    (
        json_data
        | "Serialize to JSON" >> beam.Map(json.dumps)
        | "Write to JSON" >> beam.io.WriteToText(
            "gs://source-bucket-data/sink/output.json",
            file_name_suffix=".json"
        )
    )

    # Write to Avro
    json_data | "Write to Avro" >> beam.io.avroio.WriteToAvro(
        "gs://source-bucket-data/sink/output.avro",
        schema={
            'type': 'record',
            'name': 'AvroRecord',
            'fields': [
                {'name': 'Name', 'type': 'string'},
                # Age stays a string because json_parse does no type conversion.
                {'name': 'Age', 'type': 'string'},
                {'name': 'City', 'type': 'string'}
            ]
        },
        file_name_suffix=".avro"
    )

    # Write to BigQuery. The table is given in the documented
    # "PROJECT:DATASET.TABLE" form; it names the same table as the dotted
    # spelling but does not rely on newer table-spec parsing.
    json_data | "Write to BigQuery" >> beam.io.WriteToBigQuery(
        table="new-gcp-cloud-sql-project:master_ds.df_table",
        schema="Name:STRING,Age:STRING,City:STRING",
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        custom_gcs_temp_location="gs://source-bucket-data/sink/temp"
    )


# TODO (assignment): also write the records to a Parquet file.

