In [1]:
import datetime
import pandas as pd
import great_expectations as gx
import great_expectations.jupyter_ux

2023-02-08T12:42:23-0500 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.


In [2]:
import sqlite3
import boto3
import yaml
import csv

In [3]:
conn = sqlite3.connect("noaa_goes_date.db")
cursor = conn.cursor()

In [4]:
# Execute your SELECT statement
cursor.execute("SELECT * FROM noaa_goes_date")

# Fetch all the rows from the result set
rows = cursor.fetchall()

columns = [column[0] for column in cursor.description]

# The name of the CSV file you want to create
filename = "data.csv"

# Open the CSV file for writing
with open(filename, "w", newline="") as file:
    # Create a CSV writer
    writer = csv.writer(file)

    # Write the header row
    writer.writerow(columns)

    # Loop through the rows and write each row to the CSV file
    for row in rows:
        writer.writerow(row)

# Close the cursor and connection
cursor.close()
conn.close()

In [5]:
my_df = gx.read_csv("data.csv")

In [6]:
my_df.head()

Unnamed: 0,year,day,hour
0,2023,22,2
1,2022,264,5
2,2022,308,2
3,2023,21,12
4,2022,316,20


#### GREAT EXPECTATION 1: Checking if the year column in the dataset is between 2022 and 2023 

In [7]:
my_df.expect_column_values_to_be_between("year", 2022, 2023)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 4517,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

#### GREAT EXPECTATION 2: Checking if the days column in the dataset is between 1 and 366(considering leap years also)

In [8]:
my_df.expect_column_values_to_be_between("day", 1, 366)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 4517,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

#### GREAT EXPECTATION 3: Checking if the hour column in the dataset is between 0 and 24 

In [9]:
my_df.expect_column_values_to_be_between("hour", 0, 24)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 4517,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

#### GREAT EXPECTATION 4: Checking if the year, day and hour column in the dataset is of type integer

In [10]:
my_df.expect_column_values_to_be_of_type(column="year", type_="int")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "observed_value": "int64"
  },
  "success": false
}

In [11]:
my_df.expect_column_values_to_be_of_type(column="day", type_="int")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "observed_value": "int64"
  },
  "success": false
}

In [12]:
my_df.expect_column_values_to_be_of_type(column="hour", type_="int")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "observed_value": "int64"
  },
  "success": false
}