In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [903 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelea

In [2]:
# Install great expectations
!pip install great_expectations

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting great_expectations
  Downloading great_expectations-0.15.17-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 7.2 MB/s 
Collecting cryptography>=3.2
  Downloading cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 35.0 MB/s 
Collecting ruamel.yaml<0.17.18,>=0.16
  Downloading ruamel.yaml-0.17.17-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 71.1 MB/s 
Collecting notebook>=6.4.10
  Downloading notebook-6.4.12-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 43.9 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 69.5 MB/s 
[?25hCollecting Ipython>=7.16.3
  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)
[K     |████████████████████

In [3]:
# Bring in the Spark entry points and Great Expectations.
# (SparkFiles is imported here as before; none of the visible cells reference it.)
from pyspark import SparkFiles
from pyspark.sql import SparkSession
import great_expectations as ge

# Get (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [5]:
# Load the comma-separated orgs file, taking column names from its header row,
# then print a preview of the resulting Spark DataFrame.
orgs_reader = spark.read.options(sep=',', header=True)
orgs_df = orgs_reader.csv('/content/orgs.csv')
orgs_df.show()

+-----------+------------+--------------------+--------------------+------------+-------------+--------------+-----+
|        EIN|ORGANIZATION|                TYPE|            ACTIVITY|ASSET_AMOUNT|INCOME_AMOUNT|REVENUE_AMOUNT|STATE|
+-----------+------------+--------------------+--------------------+------------+-------------+--------------+-----+
|Corporation|     1049459|Parent Teacher Group|Educational Activ...|       86473|       187507|        266375|   CA|
|Corporation|     1049595|Professional Serv...|Extracurricular A...|       90718|       129918|        155113|   CA|
|Corporation|     1050054|Continuing Education|Educational Activ...|      102330|       167472|        175640|   IL|
|Corporation|     1050036| School Organization|Educational Activ...|       85930|       214223|        164111|   NY|
|Corporation|     1049229| School Organization|Extracurricular A...|      103675|       189733|        224577|   CA|
|Corporation|     1050409|Parent Teacher Group|Extracurricular A

In [6]:
# Create the Great Expectations DataFrame:
# wrap the Spark DataFrame in a SparkDFDataset so the expect_* checks used in the
# following cells can be called on it.
orgs_df_ge = ge.dataset.SparkDFDataset(orgs_df)
# Display the first rows of the wrapped dataset as the cell output.
orgs_df_ge.head()

Unnamed: 0,EIN,ORGANIZATION,TYPE,ACTIVITY,ASSET_AMOUNT,INCOME_AMOUNT,REVENUE_AMOUNT,STATE
0,Corporation,1049459,Parent Teacher Group,Educational Activities,86473,187507,266375,CA
1,Corporation,1049595,Professional Services,Extracurricular Activities,90718,129918,155113,CA
2,Corporation,1050054,Continuing Education,Educational Activities,102330,167472,175640,IL
3,Corporation,1050036,School Organization,Educational Activities,85930,214223,164111,NY
4,Corporation,1049229,School Organization,Extracurricular Activities,103675,189733,224577,CA


In [8]:
# Run the three table-level checks on the raw data (column count, row count, and
# exact column order) and keep only each check's "success" flag.
expected_columns = [
    'EIN',
    'ORGANIZATION',
    'TYPE',
    'ACTIVITY',
    'ASSET_AMOUNT',
    'INCOME_AMOUNT',
    'REVENUE_AMOUNT',
    'STATE',
]
base_checks = (
    orgs_df_ge.expect_table_column_count_to_equal(8),
    orgs_df_ge.expect_table_row_count_to_equal(337),
    orgs_df_ge.expect_table_columns_to_match_ordered_list(expected_columns),
)
result = [check['success'] for check in base_checks]

print(result)

[True, True, True]


In [9]:
# Register the orgs data as a temporary view and keep only the "School Organization" rows.
def transform_data():
    """Return the School Organization rows as a Great Expectations Spark dataset.

    Registers the notebook-level ``orgs_df`` as the temporary view ``orgs``,
    selects ORGANIZATION, TYPE, ACTIVITY and STATE for rows whose TYPE is
    'School Organization', and wraps the query result in a ``SparkDFDataset``.
    Reads the ``orgs_df``, ``spark`` and ``ge`` names defined by earlier cells.
    """
    orgs_df.createOrReplaceTempView('orgs')

    school_orgs_query = """
        SELECT
          ORGANIZATION,
          TYPE,
          ACTIVITY,
          STATE
        FROM ORGS
        WHERE TYPE = 'School Organization'
        """
    school_orgs_df = spark.sql(school_orgs_query)

    # Wrap the result so expect_* checks can be run against it.
    return ge.dataset.SparkDFDataset(school_orgs_df)

In [10]:
# Run tests against the transformed DataFrame only if the column count, row count,
# and column-order checks all passed on the original DataFrame.
if False in result:
  # Record the failure explicitly. Otherwise `transformed_result` is undefined on a
  # fresh kernel (NameError in the next cell) or still holds values from an earlier
  # run, which would let the next cell report a stale outcome.
  transformed_result = [False]
  print("Failed")
else:
  transformed_df = transform_data()
  transformed_columns = ['ORGANIZATION', 'TYPE', 'ACTIVITY', 'STATE']
  transformed_result = [
      # The transformed data must expose exactly these columns, in this order.
      transformed_df.expect_table_columns_to_match_ordered_list(transformed_columns)['success'],
      # Refer to the column by its schema name ('TYPE'), so the check does not depend
      # on case-insensitive column resolution.
      transformed_df.expect_column_values_to_be_in_set('TYPE', ['School Organization'])['success'],
  ]

In [11]:
# Report the overall outcome: any False among the transformed-data checks means failure.
outcome = 'Failed' if False in transformed_result else 'success'
print(outcome)

success
