<a href="https://colab.research.google.com/github/PedroTechy/CarrisInsight/blob/streaming_development/spark_jobs/extract_carris_vehicles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Authenticate with Google Cloud


In [2]:
!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2

Collecting google-auth
  Downloading google_auth-2.37.0-py2.py3-none-any.whl.metadata (4.8 kB)
Downloading google_auth-2.37.0-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.8/209.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-auth
  Attempting uninstall: google-auth
    Found existing installation: google-auth 2.27.0
    Uninstalling google-auth-2.27.0:
      Successfully uninstalled google-auth-2.27.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires google-auth==2.27.0, but you have google-auth 2.37.0 which is incompatible.[0m[31m
[0mSuccessfully installed google-auth-2.37.0


In [1]:
# Colab-only helper module; this cell cannot run outside a Google Colab runtime.
from google.colab import auth
# Starts Colab's sign-in flow for the current user. Presumably this is what supplies the
# Google Cloud credentials used by the later GCS / BigQuery steps — TODO confirm.
auth.authenticate_user()

MessageError: Error: credential propagation was unsuccessful

# Step 2: Install Java, Spark and findspark

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.mirror.digitalpacific.com.au/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark

# Step 3: Set environment variables for Spark and Java

In [None]:
import os

# Point the process environment at the Java runtime and the unpacked Spark distribution.
# Both entries are written in a single update call.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
    }
)

# Step 4: Download the BigQuery connector for Spark

In [None]:
!wget -q https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.29.0.jar


# Step 5: Initialize Spark session

In [None]:
import findspark

# Keep this call ahead of the pyspark import: pyspark is not pip-installed in this
# notebook, so the import relies on findspark having located the Spark install first.
findspark.init()

from pyspark.sql import SparkSession

# Session shared by the rest of the notebook, with the BigQuery connector jar attached
# so that the "bigquery" format is available to the writer.
spark = (
    SparkSession.builder
    .appName("GCS to BigQuery Streaming with Auto-Table Creation")
    .config("spark.jars", "/content/spark-bigquery-with-dependencies_2.12-0.29.0.jar")
    .getOrCreate()
)

# Step 6: Define GCS input path and output BigQuery table

In [None]:
# GCS location the streaming reader loads from (passed to readStream ... load()).
input_path = "gs://edit-de-project-streaming-data/carris-vehicles"
# Destination passed as the writer's "table" option. Looks like a fully qualified
# project.dataset.table identifier — TODO confirm.
output_table = "data-eng-dev-437916.data_eng_project_group3.raw_vehicles"

# Step 7: Read streaming data from GCS

In [None]:
# File-based streaming sources (json, csv, parquet, ...) refuse to start without a schema
# unless schema inference is switched on. No schema is supplied here, so enable inference
# before building the reader. Spark then derives the schema from the files already present
# under input_path when the stream is defined.
# NOTE(review): for a long-running job an explicit .schema(...) is the more robust choice.
spark.conf.set("spark.sql.streaming.schemaInference", "true")

# Streaming DataFrame over the JSON files that land under input_path.
streaming_df = (
    spark.readStream
    .format("json")
    .load(input_path)
)

# Step 8: Write streaming data to BigQuery with auto-table creation

In [None]:
# Start the streaming write of streaming_df into the BigQuery table named by output_table.
# Data is staged in GCS before being loaded, and progress is tracked in the checkpoint
# location so a restarted query can resume.
streaming_query = (
    streaming_df.writeStream
    .format("bigquery")
    .option("table", output_table)
    .option("checkpointLocation", "gs://edit-data-eng-project-group3/streaming_data/checkpoints")
    # The connector documents temporaryGcsBucket as a bucket *name*. The previous value was
    # a gs:// URI with an object path ("gs://.../streaming_data/temp-bucket"), which is not
    # a valid bucket name.
    # NOTE(review): if staging under a sub-path is required, the connector's
    # persistentGcsBucket / persistentGcsPath options look like the fit — confirm.
    .option("temporaryGcsBucket", "edit-data-eng-project-group3")
    .option("writeDisposition", "WRITE_APPEND")
    .option("createDisposition", "CREATE_IF_NEEDED")
    .outputMode("append")
    .start()
)

# Step 9: Block until the streaming query terminates

In [None]:
# Block this cell while the stream runs. If the wait ends with an exception (for example
# the kernel is interrupted, or the query fails), stop the query explicitly so it does not
# keep running in the background after the cell has returned.
try:
    streaming_query.awaitTermination()
finally:
    streaming_query.stop()