# Performs Data Quality Checks

In [1]:
import os
import sys
import unittest
import configparser
import psycopg2
import psycopg2.extras
from sql_queries import prod_tables

config = configparser.ConfigParser()
config.read('config.ini')

['config.ini']

In [2]:
# host = config.get('RDS','host')
# username = config.get('RDS','user')
# password = config.get('RDS','password')
# database = config.get('RDS','database')      
# db = psycopg2.connect(host=host, user=username, password=password, database=database)
# curr = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
# curr.execute("SELECT COUNT(*) FROM {}".format('prod_police_shootings'))
# record_count = (self.curr).fetchone()
# record_count
# curr.close()

In [3]:
host = config.get('postgres','host')
username = config.get('postgres','user')
password = config.get('postgres','password')
database = config.get('postgres','database')      
db = psycopg2.connect(host=host, user=username, password=password, database=database)
curr = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
curr.execute("SELECT COUNT(*) FROM {}".format('prod_police_shootings'))
record_count = (curr).fetchone()
record_count
curr.close()

[8775]

In [4]:
def run_tests(test_class):
    suite = unittest.TestLoader().loadTestsFromTestCase(test_class)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite)

In [5]:
class TestQualityCheck(unittest.TestCase):
    curr = None
    
    @classmethod
    def setUpClass(cls):
        # Set up db connections
        host = config.get('postgres','host')
        username = config.get('postgres','user')
        password = config.get('postgres','password')
        database = config.get('postgres','database')      
        db = psycopg2.connect(host=host, user=username, password=password, database=database)
        cls.curr = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
        
    @classmethod
    def tearDownClass(cls):
        if (cls.curr):
            (cls.curr).close()
        
    def test_connection(self):
        self.assertNotEqual(self.curr, None)
        
    def test_distinct_records(self):
        """
        The following test ensures consistency, 
        making sure duplicate records doesn't exist
        """        
        for table in prod_tables:
            self.curr.execute("SELECT COUNT(*) FROM {}".format(table))
            record_count = (self.curr).fetchone()
            
            self.curr.execute("SELECT COUNT(*) FROM (SELECT DISTINCT * FROM {}) AS temp".format(table))
            distinct_record_count = (self.curr).fetchone()
            
            print("{}: Record Count {}, Distinct Count {}".format(table, record_count, distinct_record_count))
            self.assertEqual(record_count, distinct_record_count)
            
    def test_date_formatting(self):
        """
        This following test ensures that the date value
        is of the following format YYYY-MM-DD. 
        Makes parsing out the year consistent.
        """
        
        self.curr.execute("SELECT count(date) FROM prod_police_shootings")
        control_count = (self.curr).fetchone()
        
        self.curr.execute("""
            SELECT count(date) FROM prod_police_shootings WHERE CAST(date AS varchar) ~ '^\d\d\d\d-\d\d-\d\d$'
        """)
        record_count = (self.curr).fetchone()
        
        print("Date in  this format YYYY-MM-DD: Control Count {}, Record Count {}" \
              .format(control_count, record_count))
        self.assertEqual(control_count, record_count)
        
    def test_duplicate_records(self):
        """
        This following test ensures that there are no null rows
        """
        control_count = 0
        
        self.curr.execute("""
            SELECT count(*) AS count FROM 
            (
                SELECT name, age, count(*) AS count
                FROM prod_police_shootings
                GROUP BY name, age
            ) AS a
            WHERE count > 1 AND name IS NOT null
        """)
        record_count = (self.curr).fetchone()        
        
        print("Number of Duplicate Entries: Record Count {}" \
              .format(control_count, record_count))
        self.assertEqual(control_count, record_count)        

In [6]:
run_tests(TestQualityCheck)

test_connection (__main__.TestQualityCheck) ... ok
test_date_formatting (__main__.TestQualityCheck) ... ok
test_distinct_records (__main__.TestQualityCheck) ... ok
test_duplicate_records (__main__.TestQualityCheck) ... 

Date in  this format YYYY-MM-DD: Control Count [8775], Record Count [8775]
prod_police_agencies: Record Count [3325], Distinct Count [3325]
prod_us_cities: Record Count [28338], Distinct Count [28338]
prod_us_demographics: Record Count [87976], Distinct Count [87976]
prod_unemployment: Record Count [3238], Distinct Count [3238]
prod_police_shootings: Record Count [8775], Distinct Count [8775]
Number of Duplicate Entries: Record Count 0


FAIL

FAIL: test_duplicate_records (__main__.TestQualityCheck)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-0f625455a58f>", line 75, in test_duplicate_records
    self.assertEqual(control_count, record_count)
AssertionError: 0 != [2]

----------------------------------------------------------------------
Ran 4 tests in 0.145s

FAILED (failures=1)
