In [1]:
# Install Java and Spark on the Colab VM and point the environment at them.
import os
# Pick a Spark 3.x release listed at http://www.apache.org/dist/spark/ and enter it as the spark version.
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
# Export the version so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Java, download and unpack the Spark tarball, and install findspark.
# NOTE(review): the tarball name assumes a '-bin-hadoop3.2' build is published for the chosen version.
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
# SPARK_HOME assumes the tarball above was unpacked under /content.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Initialise findspark (presumably so pyspark can be imported from SPARK_HOME — confirm).
# The SparkSession itself is created in a later cell.
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Waiting for headers] [Connecting to security.ubuntu.com] [1 InRelease 0 B/30% [Waiting for headers] [Connecting to security.ubuntu.com] [Waiting for heade                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.38)] [Wa0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Connecting to security.ubu                                                                               Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
0% [1 InRelease gpgv 3,626 B] [3 InRelease 14.2 kB/88.7 kB 16%] [Waiting for he                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InR

In [2]:
# Install great expectations
!pip install great_expectations

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting great_expectations
  Downloading great_expectations-0.15.17-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 5.2 MB/s 
Collecting Ipython>=7.16.3
  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 47.5 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 53.7 MB/s 
Collecting jsonpatch>=1.22
  Downloading jsonpatch-1.32-py2.py3-none-any.whl (12 kB)
Collecting notebook>=6.4.10
  Downloading notebook-6.4.12-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 35.1 MB/s 
[?25hCollecting ruamel.yaml<0.17.18,>=0.16
  Downloading ruamel.yaml-0.17.17-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 43.1 MB/s 
Collecting pyparsing<3,>=2.4
  Downloading pyp

In [3]:
# Import SparkSession and Great Expectations (aliased as `ge`).
from pyspark.sql import SparkSession
import great_expectations as ge

# Create (or reuse) the SparkSession, named "SparkSQL", used by every later cell as `spark`.
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
# NOTE(review): SparkFiles is not used in the cells visible here — confirm it is still needed.
from pyspark import SparkFiles

In [4]:
# Load the customer CSV (comma-separated, first row is the header) into a Spark DataFrame
# and print a preview of its rows.
customer_df = (
    spark.read
    .options(sep=',', header=True)
    .csv('/content/customer_data.csv')
)
customer_df.show()

+---+-----------+---------+--------------------+-------------+
| id| first_name|last_name|               email|          car|
+---+-----------+---------+--------------------+-------------+
|  0|       Jeff|  Gregory|johnsonmichael@ex...|        Buick|
|  1|      Robin|  Johnson|ernest54@example.net|        Lexus|
|  2|    Shannon| Thompson| jmathis@example.com|        Honda|
|  3|      Sandy|  Collier|thomaskaren@examp...|       Toyota|
|  4|     Morgan|  Simpson|allendakota@examp...|        Dodge|
|  5|  Katherine|  Rosario|tateamanda@exampl...|      Porsche|
|  6|      Riley| Castillo|brittneypeterson@...|        Honda|
|  7|     Justin|      Lee| lprince@example.org|          BMW|
|  8|    Suzanne|    Smith| bbarnes@example.com|         Jeep|
|  9|    Charles| Marshall|heididiaz@example...|      Porsche|
| 10|    Gregory|  Shelton|   tward@example.org|       Subaru|
| 11|Christopher|   Hudson|larrypeters@examp...|      Porsche|
| 12|        Kim|   Knight| xnelson@example.net|       

In [5]:
# Wrap the Spark DataFrame in a Great Expectations SparkDFDataset so that
# expectation methods (e.g. expect_column_values_to_not_be_null) can be called on it.
customer_df_ge = ge.dataset.SparkDFDataset(customer_df)
# Last expression of the cell, so its result is displayed as the cell output.
customer_df_ge.head()

Unnamed: 0,id,first_name,last_name,email,car
0,0,Jeff,Gregory,johnsonmichael@example.net,Buick
1,1,Robin,Johnson,ernest54@example.net,Lexus
2,2,Shannon,Thompson,jmathis@example.com,Honda
3,3,Sandy,Collier,thomaskaren@example.com,Toyota
4,4,Morgan,Simpson,allendakota@example.org,Dodge


In [6]:
# Run a not-null expectation against every column and print whether each one passed.
columns_to_check = ['id', 'first_name', 'last_name', 'email', 'car']
for column in columns_to_check:
  expectation = customer_df_ge.expect_column_values_to_not_be_null(column=column)
  print(expectation['success'])

True
True
True
False
False


In [7]:
# Split out the customers that are missing a car or an email: expose them as a temporary view and save them to a parquet file.
def separate_nulls():
  """Isolate the rows of `customer_df` whose `car` or `email` is null.

  Side effects:
    * registers `customer_df` as the temporary view 'customers';
    * registers the null rows as the temporary view 'removed_customers';
    * overwrites the parquet output 'customers_null_car_or_email' with the null rows;
    * prints a confirmation message.
  """
  null_rows_query = """
      SELECT
        *
      FROM customers
      WHERE car IS NULL OR email IS NULL
      """
  # The source DataFrame must be registered before Spark SQL can resolve it by name.
  customer_df.createOrReplaceTempView('customers')
  rows_with_nulls = spark.sql(null_rows_query)
  # Keep the rejected rows queryable by name for later cells.
  rows_with_nulls.createOrReplaceTempView('removed_customers')
  # Persist the rejected rows, replacing any output from a previous run.
  parquet_writer = rows_with_nulls.write.mode("overwrite")
  parquet_writer.parquet("customers_null_car_or_email")
  print("Null values have been found and written to a temporary view, 'removed_customers', and a parquet file,'customers_null_car_or_email'.")

In [8]:
# Collect the "success" flag of the not-null expectation for the "email" and "car" columns.
result = [
    customer_df_ge.expect_column_values_to_not_be_null(column_name)['success']
    for column_name in ('email', 'car')
]
# When every check passed there is nothing to separate, so just print "success";
# otherwise call separate_nulls() to set the offending rows aside.
if False not in result:
  print("success")
else:
  separate_nulls()

Null values have been found and written to a temporary view, 'removed_customers', and a parquet file,'customers_null_car_or_email'.


In [9]:
# Use a function to transform the customers temporary view, skipping any id found in removed_customers.
def transform_data():
  """Build the upper-cased customer DataFrame with Spark SQL.

  Selects from the 'customers' temporary view, upper-casing the name, email
  and car columns and adding CREATED_AT / UPDATED_AT (UNIX_TIMESTAMP()) and a
  constant SOURCE column. Rows whose id appears in the 'removed_customers'
  temporary view are excluded.

  Both views are otherwise only registered by separate_nulls(), which the
  notebook calls only when null values were detected. To keep this function
  usable on a dataset without nulls:
    * 'customers' is registered from `customer_df` when it is missing;
    * the exclusion filter is skipped when 'removed_customers' is missing
      (nothing was removed, so every row is kept).
  When both views already exist the query is the same as before.

  Returns:
    A Spark DataFrame with the columns FIRST_NAME, LAST_NAME, EMAIL, CAR,
    CREATED_AT, UPDATED_AT and SOURCE.
  """
  # listTables() also reports temporary views registered in this session.
  registered_views = {table.name for table in spark.catalog.listTables()}
  if 'customers' not in registered_views:
    customer_df.createOrReplaceTempView('customers')

  query = """
      SELECT
        UPPER(first_name) AS FIRST_NAME,
        UPPER(last_name) AS LAST_NAME,
        UPPER(email) AS EMAIL,
        UPPER(car) AS CAR,
        UNIX_TIMESTAMP() AS CREATED_AT,
        UNIX_TIMESTAMP() AS UPDATED_AT,
        'FROM CLIENT' AS SOURCE
      FROM customers
      """
  if 'removed_customers' in registered_views:
    # NOTE(review): NOT IN matches no rows at all if removed_customers ever
    # holds a NULL id; kept as-is to preserve the existing results.
    query += "WHERE id not in (select id from removed_customers)\n"

  transformed_df = spark.sql(query)
  return transformed_df

In [10]:
# Run the transformation and wrap its result in a Great Expectations dataset for validation.
transformed_df = transform_data()
transformed_df_ge = ge.dataset.SparkDFDataset(transformed_df)
# Collect the "success" flag of the not-null expectation for each checked column.
# NOTE(review): FIRST_NAME and LAST_NAME are not checked, and 'source' is spelled
# in lower case although the column is aliased SOURCE; both kept as in the original.
transformed_result = [
    transformed_df_ge.expect_column_values_to_not_be_null(column_name)['success']
    for column_name in ('EMAIL', 'CAR', 'CREATED_AT', 'UPDATED_AT', 'source')
]
# Only write the transformed DataFrame to parquet when every check passed; otherwise print "Failed".
if False not in transformed_result:
  parquet_writer = transformed_df.write.mode("overwrite")
  parquet_writer.parquet("customers")
  print("Success: Wrote the transformed data to a parquet file, 'customers'.")
else:
  print('Failed')


Success: Wrote the transformed data to a parquet file, 'customers'.


In [11]:
# Load the 'customers' parquet output back into a Spark DataFrame and
# display it without truncating long column values.
customers_p_df = spark.read.format('parquet').load('customers')
customers_p_df.show(truncate=False)

+-----------+---------+----------------------------+-------------+----------+----------+-----------+
|FIRST_NAME |LAST_NAME|EMAIL                       |CAR          |CREATED_AT|UPDATED_AT|SOURCE     |
+-----------+---------+----------------------------+-------------+----------+----------+-----------+
|JEFF       |GREGORY  |JOHNSONMICHAEL@EXAMPLE.NET  |BUICK        |1660247133|1660247133|FROM CLIENT|
|ROBIN      |JOHNSON  |ERNEST54@EXAMPLE.NET        |LEXUS        |1660247133|1660247133|FROM CLIENT|
|SHANNON    |THOMPSON |JMATHIS@EXAMPLE.COM         |HONDA        |1660247133|1660247133|FROM CLIENT|
|SANDY      |COLLIER  |THOMASKAREN@EXAMPLE.COM     |TOYOTA       |1660247133|1660247133|FROM CLIENT|
|MORGAN     |SIMPSON  |ALLENDAKOTA@EXAMPLE.ORG     |DODGE        |1660247133|1660247133|FROM CLIENT|
|KATHERINE  |ROSARIO  |TATEAMANDA@EXAMPLE.NET      |PORSCHE      |1660247133|1660247133|FROM CLIENT|
|RILEY      |CASTILLO |BRITTNEYPETERSON@EXAMPLE.COM|HONDA        |1660247133|1660247133|FRO