In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [903 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:13 http://security.ubuntu.com/ubuntu bionic-security/universe am

In [2]:
# Install great expectations
!pip install great_expectations

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting great_expectations
  Downloading great_expectations-0.15.17-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 4.2 MB/s 
Collecting pyparsing<3,>=2.4
  Downloading pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.5 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 45.1 MB/s 
Collecting notebook>=6.4.10
  Downloading notebook-6.4.12-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 38.3 MB/s 
Collecting Ipython>=7.16.3
  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 57.4 MB/s 
[?25hCollecting jsonpatch>=1.22
  Downloading jsonpatch-1.32-py2.py3-none-any.whl (12 kB)
Collecting cryptography>=3.2
  Downloading cryptography-37.0.4-c

In [3]:
# Imports: Spark entry points and Great Expectations.
from pyspark import SparkFiles
from pyspark.sql import SparkSession
import great_expectations as ge

# Get (or create) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [5]:
# Load the consultants CSV (comma-separated, first row is the header)
# into a Spark DataFrame and print a preview of its rows.
consultants_df = (
    spark.read
    .options(sep=',', header=True)
    .csv('/content/consultants.csv')
)
consultants_df.show()

+----------------+---+-----+--------------------+--------------------+
|       full_name|age|state|hours_worked_in_2021|     consultant_type|
+----------------+---+-----+--------------------+--------------------+
|  Nicholas Smith| 47|   CA|                1883|Senior Software E...|
|      Joel Davis| 50|   WA|                1877|      Technical Lead|
|    Scott Norton| 31|   WA|                1875|      Technical Lead|
|   Chad Robinson| 50|   NY|                1868|   Software Engineer|
|      Ryan Glenn| 50|   FL|                1907|Senior Software E...|
|   Patrick Lewis| 48|   FL|                1866|      Technical Lead|
|   Richard Huang| 48|   CA|                1852|      Technical Lead|
| Valerie Estrada| 42|   WA|                1878|Senior Software E...|
|    William Hill| 52|   CA|                1889| Technical Architect|
|  Charles Mendez| 38|   WA|                1873|      Technical Lead|
|      Nancy Hill| 38|   WA|                1892|   Software Engineer|
|   Da

In [6]:
# Wrap the Spark DataFrame in a Great Expectations SparkDFDataset so the
# expect_* checks in the following cells can be called on it.
consultants_df_ge = ge.dataset.SparkDFDataset(
    consultants_df
)

In [7]:
# Preview the first rows of the Great Expectations dataset.
consultants_df_ge.head()

Unnamed: 0,full_name,age,state,hours_worked_in_2021,consultant_type
0,Nicholas Smith,47,CA,1883,Senior Software Engineer
1,Joel Davis,50,WA,1877,Technical Lead
2,Scott Norton,31,WA,1875,Technical Lead
3,Chad Robinson,50,NY,1868,Software Engineer
4,Ryan Glenn,50,FL,1907,Senior Software Engineer


In [None]:
# Expect the table to have exactly 3 columns.
# NOTE: in the recorded run this expectation fails (success is false,
# observed column count is 5).
print(
    consultants_df_ge.expect_table_column_count_to_equal(value=3)
)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 5
  },
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_table_column_count_to_equal",
    "meta": {},
    "kwargs": {
      "value": 3,
      "result_format": "BASIC"
    }
  },
  "meta": {}
}


In [8]:
# Expect the table to have exactly 5 columns.
print(
    consultants_df_ge.expect_table_column_count_to_equal(value=5)
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "meta": {},
    "kwargs": {
      "value": 5,
      "result_format": "BASIC"
    },
    "expectation_type": "expect_table_column_count_to_equal"
  },
  "result": {
    "observed_value": 5
  }
}


In [9]:
# Expect the table to contain exactly 8831 rows.
print(
    consultants_df_ge.expect_table_row_count_to_equal(value=8831)
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "meta": {},
    "kwargs": {
      "value": 8831,
      "result_format": "BASIC"
    },
    "expectation_type": "expect_table_row_count_to_equal"
  },
  "result": {
    "observed_value": 8831
  }
}


In [10]:
# Expect the table's columns to appear in exactly this order.
print(
    consultants_df_ge.expect_table_columns_to_match_ordered_list(
        column_list=[
            'full_name',
            'age',
            'state',
            'hours_worked_in_2021',
            'consultant_type',
        ]
    )
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "meta": {},
    "kwargs": {
      "column_list": [
        "full_name",
        "age",
        "state",
        "hours_worked_in_2021",
        "consultant_type"
      ],
      "result_format": "BASIC"
    },
    "expectation_type": "expect_table_columns_to_match_ordered_list"
  },
  "result": {
    "observed_value": [
      "full_name",
      "age",
      "state",
      "hours_worked_in_2021",
      "consultant_type"
    ]
  }
}


In [11]:
# Expect every consultant_type value to be one of these four job titles.
print(
    consultants_df_ge.expect_column_values_to_be_in_set(
        column='consultant_type',
        value_set=[
            'Senior Software Engineer',
            'Technical Lead',
            'Software Engineer',
            'Technical Architect',
        ],
    )
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "meta": {},
    "kwargs": {
      "column": "consultant_type",
      "value_set": [
        "Senior Software Engineer",
        "Technical Lead",
        "Software Engineer",
        "Technical Architect"
      ],
      "result_format": "BASIC"
    },
    "expectation_type": "expect_column_values_to_be_in_set"
  },
  "result": {
    "element_count": 8831,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}
