In [1]:
# Print the version of the `python` executable that the shell resolves.
!python --version

Python 3.7.12


In [2]:
import os
import json


In [3]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) captured once when this cell runs;
# later cells embed it in generated resource names.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

In [4]:
# Smoke test: confirm the kernel executes code and show the captured timestamp.
print("Hello world!", f"Todays date:  {TIMESTAMP}", sep="\n")

Hello world!
Todays date:  20220822170753


In [5]:
# Extra flag interpolated into the `pip install` commands of this notebook.
# (The former `USER_FLAG = ""` line was dead code: it was overwritten immediately.)
USER_FLAG = "--user"

In [6]:
# Test-only bootstrap: everything below runs only when the IS_TESTING
# environment variable is set to a non-empty value.
if os.getenv("IS_TESTING"):
    """
    The testing suite does not currently support testing on Dataproc clusters,
    so the testing environment is setup to replicate Dataproc via the following steps.
    """
    # Coordinates of the OpenJDK build to download and the folder it is unpacked into.
    JAVA_VER = "8u332-b09"
    JAVA_FOLDER = "/tmp/java"
    FILE_NAME = f"openlogic-openjdk-{JAVA_VER}-linux-x64"
    TAR_FILE = f"{JAVA_FOLDER}/{FILE_NAME}.tar.gz"
    DOWNLOAD_LINK = f"https://builds.openlogic.com/downloadJDK/openlogic-openjdk/{JAVA_VER}/openlogic-openjdk-{JAVA_VER}-linux-x64.tar.gz"
    PYSPARK_VER = "3.1.3"

    # Download Open JDK 8. Spark requires Java to execute.
    # The folder is removed and recreated first, so each run starts from an empty directory.
    ! rm -rf $JAVA_FOLDER
    ! mkdir $JAVA_FOLDER
    ! wget -P $JAVA_FOLDER $DOWNLOAD_LINK
    # JAVA_HOME is set in os.environ (not as a Python variable) so the shell commands
    # below can read it. Presumably the archive unpacks into a directory named FILE_NAME
    # -- verify against the tarball layout.
    os.environ["JAVA_HOME"] = f"{JAVA_FOLDER}/{FILE_NAME}"
    ! tar -zxf $TAR_FILE -C $JAVA_FOLDER
    ! echo $JAVA_HOME

    # Pin the Spark version to match that of the Dataproc 2.0 cluster.
    ! pip install {USER_FLAG} pyspark==$PYSPARK_VER -q

In [7]:
!pip install {USER_FLAG} pyspark=="3.1.3"

Collecting pyspark==3.1.3
  Using cached pyspark-3.1.3-py2.py3-none-any.whl
Installing collected packages: pyspark
Successfully installed pyspark-3.1.3


In [8]:
import pyspark

In [9]:
# NOTE(review): wildcard import. pyspark.sql.functions exports `round` (another cell of
# this notebook imports it by name), so after this line the builtin `round` is shadowed;
# other builtins may be affected as well -- consider explicit imports.
from pyspark.sql.functions import *

In [10]:
# List the installed Jupyter kernelspecs as JSON to inspect how the kernel is launched.
!jupyter kernelspec list --json

{
  "kernelspecs": {
    "python3": {
      "resource_dir": "/opt/conda/share/jupyter/kernels/python3",
      "spec": {
        "argv": [
          "/bin/bash",
          "-c",
          "PYSPARK_DRIVER_PYTHON_OPTS='kernel -f {connection_file}' pyspark"
        ],
        "env": {
          "PYSPARK_DRIVER_PYTHON": "/opt/conda/bin/ipython",
          "PYSPARK_PYTHON": "/opt/conda/bin/python"
        },
        "display_name": "PySpark",
        "language": "python",
        "interrupt_mode": "signal",
        "metadata": {}
      }
    }
  }
}


In [11]:
# Dataproc cluster settings; only defined when IS_TESTING is not set.
if not os.getenv("IS_TESTING"):
    CLUSTER_NAME = "my-dataproc-cluster-01"  # @param {type: "string"}
    CLUSTER_REGION = "[your-region]"  # @param {type: "string"}

    # Substitute the default region when the placeholder was left unchanged.
    CLUSTER_REGION = (
        "us-central1" if CLUSTER_REGION == "[your-region]" else CLUSTER_REGION
    )

    print(
        f"CLUSTER_NAME: {CLUSTER_NAME}",
        f"CLUSTER_REGION: {CLUSTER_REGION}",
        sep="\n",
    )

CLUSTER_NAME: my-dataproc-cluster-01
CLUSTER_REGION: us-central1


In [12]:
import os

# Google Cloud project ID. Outside the test harness it is fixed to the demo project;
# under IS_TESTING it stays empty. It could instead be read from gcloud via
# `gcloud config list --format 'value(core.project)'`.
PROJECT_ID = "" if os.getenv("IS_TESTING") else "vertex-and-spark-demo"

# Only report the project when one was actually assigned above.
if PROJECT_ID:
    print("Project ID: ", PROJECT_ID)

Project ID:  vertex-and-spark-demo


In [13]:
# Fall back to the demo project when no project ID has been set yet.
if PROJECT_ID is None or PROJECT_ID == "":
    PROJECT_ID = "vertex-and-spark-demo"  # @param {type: "string"}

In [14]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of ``length`` randomly chosen characters.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Random suffix generated once per run and embedded in generated resource names.
UUID = generate_uuid()

In [15]:
# Choose the name of the BigQuery dataset used by this run.
if not os.getenv("IS_TESTING"):
    DATASET_NAME = f"spark_dataset_{TIMESTAMP}"  # @param {type:"string"}

    # Fall back to a generated name when the field was cleared or left as the placeholder.
    if (
        DATASET_NAME == ""
        or DATASET_NAME is None
        or DATASET_NAME == "[your-dataset-name]"
    ):
        # BigQuery dataset IDs may only contain letters, digits and underscores, while
        # project IDs can contain hyphens, so hyphens are replaced before reuse.
        DATASET_NAME = f"{PROJECT_ID}{UUID}".replace("-", "_")
else:
    DATASET_NAME = f"python_docs_samples_tests_spark_{UUID}"

In [16]:
# Display the resolved dataset name.
DATASET_NAME

'spark_dataset_20220822170753'

In [17]:
# Location passed to `bq mk` when the dataset is created.
REGION = "us-central1"

In [18]:
from google.cloud import bigquery
import pandas

In [21]:
!gcloud services enable bigquery.googleapis.com

In [22]:
# Create the dataset DATASET_NAME in project PROJECT_ID at location REGION.
# This fails if the BigQuery API is not (yet) enabled for the project; after enabling
# it, the change may need a few minutes to propagate before this command succeeds.
!bq mk --location=$REGION $PROJECT_ID:$DATASET_NAME 



BigQuery error in mk operation: BigQuery API has not been used in project
457198359346 before or it is disabled. Enable it by visiting https://console.dev
elopers.google.com/apis/api/bigquery.googleapis.com/overview?project=45719835934
6 then retry. If you enabled this API recently, wait a few minutes for the
action to propagate to our systems and retry.


In [None]:
# You use Spark SQL in a "SparkSession" to create DataFrames
from pyspark.sql import SparkSession
# PySpark functions
from pyspark.sql.functions import avg, col, count, desc, round, size, udf
# These allow us to create a schema for our data
from pyspark.sql.types import ArrayType, IntegerType, StringType

In [30]:
# Initialize the "SparkSession" with the following config.
VER = "0.26.0"
FILE_NAME = f"spark-bigquery-with-dependencies_2.12-{VER}.jar"

# Pick the connector jar the same way as the other session-initialisation cell of this
# notebook: under IS_TESTING it is fetched from the GitHub release, otherwise from the
# gs://spark-lib bucket. Previously this cell always used the gs:// location.
if os.getenv("IS_TESTING"):
    connector = f"https://github.com/GoogleCloudDataproc/spark-bigquery-connector/releases/download/{VER}/{FILE_NAME}"
else:
    connector = f"gs://spark-lib/bigquery/{FILE_NAME}"

# getOrCreate() returns an already-running session if one exists.
spark = (
    SparkSession.builder.appName("spark-bigquery-polyglot-language-demo")
    .config("spark.jars", connector)
    .config("spark.sql.debug.maxToStringFields", "500")
    .getOrCreate()
)

In [23]:
# Create (or reuse) the Spark session used by the BigQuery word-count example.
"""BigQuery I/O PySpark example."""
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("spark-bigquery-demo")
    .getOrCreate()
)


In [24]:
# The connector stages temporary BigQuery export data in this Cloud Storage bucket.
bucket = "vertex-and-spark-demo-bucket"
spark.conf.set("temporaryGcsBucket", bucket)

In [25]:
# Read the public Shakespeare sample table from BigQuery and register it as the
# temporary view `words` so it can be queried with Spark SQL.
words = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data:samples.shakespeare")
    .load()
)
words.createOrReplaceTempView("words")

In [26]:
# Sum `word_count` per word over the `words` view, then preview the rows and the schema.
word_count = spark.sql(
    "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word"
)
word_count.show()
word_count.printSchema()

22/08/22 17:19:57 WARN org.apache.hadoop.hive.metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException

+---------+----------+
|     word|word_count|
+---------+----------+
|     XVII|         2|
|    spoil|        28|
|    Drink|         7|
|forgetful|         5|
|   Cannot|        46|
|    cures|        10|
|   harder|        13|
|  tresses|         3|
|      few|        62|
|  steel'd|         5|
| tripping|         7|
|   travel|        35|
|   ransom|        55|
|     hope|       366|
|       By|       816|
|     some|      1169|
|    those|       508|
|    still|       567|
|      art|       893|
|    feign|        10|
+---------+----------+
only showing top 20 rows

root
 |-- word: string (nullable = false)
 |-- word_count: long (nullable = true)



                                                                                

In [27]:
# Write the aggregated counts to the BigQuery table wordcount_dataset.wordcount_output.
(
    word_count.write.format("bigquery")
    .option("table", "wordcount_dataset.wordcount_output")
    .save()
)

                                                                                

In [None]:
# Initialize the "SparkSession" with the following config.
VER = "0.26.0"
FILE_NAME = f"spark-bigquery-with-dependencies_2.12-{VER}.jar"

# Under IS_TESTING the connector jar is taken from the GitHub release; otherwise it is
# taken from the gs://spark-lib bucket.
connector = (
    f"https://github.com/GoogleCloudDataproc/spark-bigquery-connector/releases/download/{VER}/{FILE_NAME}"
    if os.getenv("IS_TESTING")
    else f"gs://spark-lib/bigquery/{FILE_NAME}"
)

# getOrCreate() returns an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("spark-bigquery-polyglot-language-demo")
    .config("spark.jars", connector)
    .config("spark.sql.debug.maxToStringFields", "500")
    .getOrCreate()
)