In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease [15.9 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu b

In [2]:
# Install great expectations
!pip install great_expectations

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting great_expectations
  Downloading great_expectations-0.15.17-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 5.1 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 42.2 MB/s 
[?25hCollecting colorama>=0.4.3
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting notebook>=6.4.10
  Downloading notebook-6.4.12-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 33.1 MB/s 
[?25hCollecting jsonpatch>=1.22
  Downloading jsonpatch-1.32-py2.py3-none-any.whl (12 kB)
Collecting Ipython>=7.16.3
  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 56.3 MB/s 
Collecting ruamel.yaml<0.17.18,>=0.16
  Downloading ruamel.yaml-0.17.17-py3-none-any.whl (109 kB)
[K     |█████████████

In [3]:
# Imports: Spark entry points first, then Great Expectations.
from pyspark import SparkFiles
from pyspark.sql import SparkSession
import great_expectations as ge

# Build the SparkSession for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [4]:
# Load the heart-health CSV into a Spark DataFrame
# (comma-separated, first row holds the column names).
heart_health_df = spark.read.csv(
    '/content/heart_health.csv',
    header=True,
    sep=',',
)
# Print the first rows as a text table.
heart_health_df.show()

+----+----------+------------------+---------------+--------------------+-------------+----------+--------------------+
|Year|     State|      LocationDesc|GeographicLevel|               Class|        Topic|Death_Rate|     Data_Value_Unit|
+----+----------+------------------+---------------+--------------------+-------------+----------+--------------------+
|2015|  New York|   Saratoga County|         County|Cardiovascular Di...|Heart Disease|    363.48|per 100,000 popul...|
|2015|     Texas|     Travis County|         County|Cardiovascular Di...|Heart Disease|     435.4|per 100,000 popul...|
|2015|California|     Tulare County|         County|Cardiovascular Di...|Heart Disease|    394.27|per 100,000 popul...|
|2015|      Ohio|      Miami County|         County|Cardiovascular Di...|Heart Disease|    525.12|per 100,000 popul...|
|2015|      Ohio|   Mahoning County|         County|Cardiovascular Di...|Heart Disease|    416.35|per 100,000 popul...|
|2015|  New York|Schenectady County|    

In [5]:
# Wrap the Spark DataFrame in a Great Expectations dataset so that
# expect_* checks can be run against it.
heart_health_df_ge = ge.dataset.SparkDFDataset(heart_health_df)
# Preview the first five rows (the default preview size).
heart_health_df_ge.head(5)

Unnamed: 0,Year,State,LocationDesc,GeographicLevel,Class,Topic,Death_Rate,Data_Value_Unit
0,2015,New York,Saratoga County,County,Cardiovascular Diseases,Heart Disease,363.48,"per 100,000 population"
1,2015,Texas,Travis County,County,Cardiovascular Diseases,Heart Disease,435.4,"per 100,000 population"
2,2015,California,Tulare County,County,Cardiovascular Diseases,Heart Disease,394.27,"per 100,000 population"
3,2015,Ohio,Miami County,County,Cardiovascular Diseases,Heart Disease,525.12,"per 100,000 population"
4,2015,Ohio,Mahoning County,County,Cardiovascular Diseases,Heart Disease,416.35,"per 100,000 population"


In [7]:
# Validate the raw data and keep only each expectation's boolean "success" field:
#   1. the table has exactly 799 rows;
#   2. every "State" value is one of the five expected states.
result = [
    heart_health_df_ge.expect_table_row_count_to_equal(799)['success'],
    heart_health_df_ge.expect_column_values_to_be_in_set(
        'State',
        ['New York', 'Texas', 'California', 'Ohio', 'Washington'],
    )['success'],
]

print(result)

[True, True]


In [8]:
# Transform the heart_health_df DataFrame: keep the "State" and "LocationDesc" columns for rows where the state is 'New York'.
def transform_data():
  """Return the New York rows of ``heart_health_df`` as a Great Expectations dataset.

  Registers ``heart_health_df`` as the temp view ``heart_health`` (replacing any
  existing view of that name), selects ``State`` and ``LocationDesc`` where
  ``State = 'New York'``, and wraps the result in a ``SparkDFDataset``.
  """
  new_york_query = """
      SELECT
        State,
        LocationDesc
      FROM heart_health
      WHERE State = 'New York'
      """

  # The view must exist before the query that reads from it is executed.
  heart_health_df.createOrReplaceTempView('heart_health')
  new_york_rows = spark.sql(new_york_query)

  return ge.dataset.SparkDFDataset(new_york_rows)

In [9]:
# Run tests against the transformed DataFrame only if the checks on the original DataFrame passed.
if False in result:
  print("Failed")
else:
  transformed_df = transform_data()
  # Store the boolean "success" field of every expectation. Appending the raw
  # validation-result object instead would make the `False in ...` test below
  # unable to see a failed expectation, so failures would be reported as success.
  transformed_result = [
      # Only New York rows should remain after the transform.
      transformed_df.expect_column_values_to_be_in_set('State', ['New York'])['success'],
      # None of the other states from the source data may be present.
      transformed_df.expect_column_values_to_not_be_in_set(
          'State', ['Texas', 'California', 'Ohio', 'Washington'])['success'],
      # The filtered table must be non-empty and hold at most 500 rows.
      transformed_df.expect_table_row_count_to_be_between(1, 500)['success'],
  ]

  if False in transformed_result:
    print('Failed')
  else:
    print('success')


success
