<a href="https://colab.research.google.com/github/PedroTechy/CarrisInsight/blob/streaming_development/spark_jobs/extract_carris_vehicles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Authenticate with Google Cloud


In [8]:
# `os` is imported here because this is the first cell to use it; relying on
# the import in a later cell breaks a fresh-kernel "Run All".
import os

from google.colab import auth

# Interactive Google login for the current Colab user.
auth.authenticate_user()

# Export the application-default-credentials path for Google client libraries.
# NOTE(review): presumably this file is produced by the login above, and the
# variable has to be set before the Spark JVM is launched for it to be
# inherited -- confirm both against the connector's auth requirements.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/root/.config/gcloud/application_default_credentials.json"

# Step 2: Install Spark and BigQuery connector

In [2]:
# Install OpenJDK 8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark from a reliable source
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz

# Extract the downloaded Spark tarball
!tar xf spark-3.2.1-bin-hadoop3.2.tgz

!rm -rf spark-3.2.1-bin-hadoop3.2.tgz # No longer needed

# Install the GCS Connector
!rm -rf /content/spark-3.2.1-bin-hadoop3.2/jars/gcs-connector-hadoop3-latest.jar
!wget -q https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar -P /content/spark-3.2.1-bin-hadoop3.2/jars/

# Install the BigQuery Connector
!rm -rf /content/spark-3.2.1-bin-hadoop3.2/jars/spark-bigquery-with-dependencies_2.12-0.29.0.jar
!wget -q https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.29.0.jar -P /content/spark-3.2.1-bin-hadoop3.2/jars/

# Install Findspark
!pip install -q findspark


# Step 3: Set environment variables for Spark and Java

In [3]:
import os

# Export the Java and Spark installation directories in a single update.
os.environ.update(
    {
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    }
)

# Step 4: Initialize Spark session

In [4]:
import findspark

# findspark.init() has to run before the pyspark import that follows.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session: register the gs:// filesystem classes and put
# the GCS and BigQuery connector jars on spark.jars.
spark = (
    SparkSession.builder
    .appName("GCS to BigQuery Streaming")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.jars", "/content/spark-3.2.1-bin-hadoop3.2/jars/gcs-connector-hadoop3-latest.jar,/content/spark-3.2.1-bin-hadoop3.2/jars/spark-bigquery-with-dependencies_2.12-0.29.0.jar")
    .getOrCreate()
)

print("Spark Session successfully created!")


Spark Session successfully created!


# Step 5: Define GCS input path and output BigQuery table

In [5]:
# Destination BigQuery table for the vehicle records (fully qualified, dot-separated).
output_table = "data-eng-dev-437916.data_eng_project_group3_raw.vehicles"

# GCS location the stream reads its input from.
input_path = "gs://edit-de-project-streaming-data/carris-vehicles"

# Step 6: Read streaming data from GCS

In [7]:

# Structured Streaming file sources require an explicit schema unless
# spark.sql.streaming.schemaInference is enabled; without one, load() fails
# with "Schema must be specified when creating a streaming source DataFrame".
# Take the schema from a one-off batch read of the files already in the
# input location (this needs at least one JSON file to be present there).
vehicles_schema = spark.read.format("json").load(input_path).schema

# Streaming reader over the same location, using the schema pinned above.
streaming_df = (
    spark.readStream
    .format("json")
    .schema(vehicles_schema)
    .load(input_path)
)

IllegalArgumentException: No valid credential configuration discovered:  [CredentialOptions{serviceAccountEnabled=false, serviceAccountPrivateKeyId=null, serviceAccountPrivateKey=null, serviceAccountEmail=null, serviceAccountKeyFile=null, serviceAccountJsonKeyFile=null, nullCredentialEnabled=false, transportType=JAVA_NET, tokenServerUrl=https://oauth2.googleapis.com/token, proxyAddress=null, proxyUsername=null, proxyPassword=null, authClientId=null, authClientSecret=null, authRefreshToken=null}]

# Step 7: Write streaming data to BigQuery with auto-table creation

In [None]:
# Start the streaming write of streaming_df into the BigQuery table in append
# mode. Checkpoints and the connector's temporary files go to the GCS
# locations given below.
streaming_query = (
    streaming_df.writeStream
    .format("bigquery")
    .outputMode("append")
    .option("table", output_table)
    .option("createDisposition", "CREATE_IF_NEEDED")
    .option("writeDisposition", "WRITE_APPEND")
    .option("temporaryGcsBucket", "gs://edit-data-eng-project-group3/streaming_data/temp-bucket")
    .option("checkpointLocation", "gs://edit-data-eng-project-group3/streaming_data/checkpoints")
    .start()
)

# Step 8: Wait for the streaming query to finish

In [None]:
# Block this cell for as long as the streaming query is running.
try:
    streaming_query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the cell only unblocks the Python side; stop the query
    # explicitly so it does not keep running in the background.
    streaming_query.stop()