In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Reference: https://kashif-sohail.medium.com/read-files-from-google-cloud-storage-bucket-using-local-pyspark-and-jupyter-notebooks-f8bd43f4b42e

## Build Spark Session

In [None]:
# Build a SparkSession against the standalone master at localhost:7077 with
# the GCS connector jar attached, so that gs:// paths can be read.
GCS_CONNECTOR_JAR = "./spark-jars/gcs-connector-hadoop3-latest.jar"
GCP_KEYFILE = "./infra/creds.json"  # service-account key file (relative path)

conf = (
    SparkConf()
    .setMaster("spark://localhost:7077")
    .setAppName("GCSRead")
    # GCS connector + service-account authentication
    .set("spark.jars", GCS_CONNECTOR_JAR)
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GCP_KEYFILE)
    # Legacy Parquet option: expose nanosecond timestamps as long values
    .set("spark.sql.legacy.parquet.nanosAsLong", "true")
    # Fixed-size allocation: no external shuffle service, no dynamic scaling
    .set("spark.shuffle.service.enabled", "false")
    .set("spark.dynamicAllocation.enabled", "false")
    # Small footprint: one single-core executor, 512 MB for executor and driver
    .set("spark.executor.memory", "512m")
    .set("spark.executor.cores", "1")
    .set("spark.executor.instances", "1")
    .set("spark.driver.memory", "512m")
)

# getOrCreate keeps this cell re-runnable: a plain SparkContext(conf=conf)
# raises if a context is already active in the kernel. Note that when a
# context already exists it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

# Register the gs:// filesystem implementations on the live Hadoop
# configuration. The two auth entries repeat the spark.hadoop.* values set
# on `conf` above, without the "spark.hadoop." prefix.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("google.cloud.auth.service.account.enable", "true")
hadoop_conf.set("google.cloud.auth.service.account.json.keyfile", GCP_KEYFILE)

# Session built from the context's effective configuration.
spark: SparkSession = SparkSession.builder.config(conf=sc.getConf()).getOrCreate()

## Read DataFrame

In [None]:
# Location of one weather_data.parquet object in the GCS bucket.
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
path = "gs://weather_data_de_bucket/Actuele10mindataKNMIstations/2/2024/04/11/11/10/weather_data.parquet"

# Read through the gs:// filesystem registered on the Hadoop configuration.
df = spark.read.parquet(path)

In [None]:
# Print the first five rows to check that the read worked.
df.show(n=5)

### Read with an Explicit Schema

In [None]:
# Explicit column types to pass to the CSV reader as its schema.
from pyspark.sql import types

# Every field is nullable (the default for `add`); SR_Flag states it explicitly.
schema = (
    types.StructType()
    .add("dispatching_base_num", types.StringType())
    .add("pickup_datetime", types.TimestampType())
    .add("dropoff_datetime", types.TimestampType())
    .add("PULocationID", types.IntegerType())
    .add("DOLocationID", types.IntegerType())
    .add("SR_Flag", types.IntegerType(), nullable=True)
)

In [None]:
# Load the FHV trip CSV using the explicit schema and treating the first line
# as a header. Note: this rebinds `df`, replacing the Parquet frame read above.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("fhv_tripdata_2019-10.csv")
)

In [None]:
# Return the first five rows as a list of Row objects for a quick look.
df.head(n=5)

## Save to BigQuery