In [1]:
import pandas as pd
import dask.dataframe as dd
import pyodbc
import os
import glob
import numpy as np

## Database

In [2]:
# Open a connection to DDAMWSQL16 (socioec_data) and pull the staging table
# into one pandas DataFrame per year.
conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=socioec_data;'
    'Trusted_Connection=yes;'
)

# tbl = 'socioec_data.lehd_lodes.od_7_5'
tbl = '[dpoe_stage].[lehd_lodes].[rac_8_0]'

# 2002 through 2020 inclusive (range() excludes its stop value)
yrs = range(2002, 2021)

# database[year] holds every row of `tbl` whose yr column equals that year
database = {}

for x in yrs:
    qry = "SELECT * FROM {0} WHERE yr={1}".format(tbl, x)
    database[x] = pd.read_sql_query(qry, conn)

In [16]:
# Open a connection to DDAMWSQL16 (socioec_data) and, for each year, fetch
# only the yr column of the staging table, printing the year once it is read.
conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=socioec_data;'
    'Trusted_Connection=yes;'
)

# tbl = 'socioec_data.lehd_lodes.od_7_5'
tbl = '[dpoe_stage].[lehd_lodes].[rac_8_0]'

# 2002 through 2020 inclusive (range() excludes its stop value)
yrs = range(2002, 2021)

# result[year] holds the single-column (yr) frame returned for that year
result = {}
for x in yrs:
    qry = "SELECT yr FROM {0} WHERE yr={1}".format(tbl, x)
    result_yr = pd.read_sql_query(qry, conn)
    result[x] = result_yr
    # Progress marker: one line per completed year
    print(x)

2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


# Check data totals

In [4]:
# Build one QC frame per year from the SQL data: numeric columns only (with
# `yr` removed), plus a final row labelled 'Total' holding each column's sum.
sql_qc = {}

for x in yrs:
    # Remove non-numeric columns, and drop `yr` so it is not summed
    sql_qcyr = database[x].select_dtypes(include=np.number).drop(columns='yr')

    # Summary row: per-column sums, labelled 'Total'
    total = sql_qcyr.sum()
    total.name = 'Total'

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # Concatenating the Series as a one-row frame yields the same result:
    # the Series name becomes the new row's index label.
    sql_qc[x] = pd.concat([sql_qcyr, total.to_frame().T])

## Source data

In [5]:
# Read each year's source CSVs with dask and keep, as a pandas DataFrame,
# only the rows whose h_geocode (as text) starts with '6073'.
sources = dict()
for year in yrs:
    # source data: every CSV in the folder whose file name ends with the year
    csv = glob.glob(r"R:\DPOE\LEHD LODES\8.0\Source\rac" + "/*" + str(year) + ".csv")

    # One lazy dask frame per file, stacked row-wise into a single frame
    dfs = [dd.read_csv(filename) for filename in csv]
    dfs_year = dd.concat(dfs, axis=0)

    # Boolean mask on the string form of h_geocode; compute() materialises
    # the filtered rows
    starts_with_6073 = dfs_year['h_geocode'].astype(str).str.startswith('6073')
    sources[year] = pd.DataFrame(dfs_year[starts_with_6073].compute())
    print('----complete: ' + str(year))

----complete: 2002
----complete: 2003
----complete: 2004
----complete: 2005
----complete: 2006
----complete: 2007
----complete: 2008
----complete: 2009
----complete: 2010
----complete: 2011
----complete: 2012
----complete: 2013
----complete: 2014
----complete: 2015
----complete: 2016
----complete: 2017
----complete: 2018
----complete: 2019
----complete: 2020


In [7]:
# Build one QC frame per year from the CSV data: the value columns plus a
# final row labelled 'Total' holding each column's sum.
csv_qc = {}

for x in yrs:
    # Select only numeric columns. This is positional: it drops the first and
    # last columns (presumably the geocode and a trailing non-numeric column;
    # confirm against the CSV layout).
    csv_qcyr = sources[x].iloc[:, 1:-1]

    # Summary row: per-column sums, labelled 'Total'
    total = csv_qcyr.sum()
    total.name = 'Total'

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # Concatenating the Series as a one-row frame yields the same result:
    # the Series name becomes the new row's index label.
    csv_qc[x] = pd.concat([csv_qcyr, total.to_frame().T])

In [20]:
# Display the 2002 SQL QC frame as the cell output (the rendered view elides the middle rows and columns)
sql_qc[2002]

Unnamed: 0,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,...,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811542,25,13,7,5,0,25,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
811543,49,19,24,6,0,49,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
811544,11,5,2,4,0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
811545,46,2,35,9,0,0,46,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# For every year, compare the SQL and CSV QC frames: shapes first, then the
# last row of each (the summary row appended when the frames were built).
for x in yrs:
    sql_shape = sql_qc[x].shape
    csv_shape = csv_qc[x].shape

    # Compare dataframe shapes
    shape_qc = sql_shape == csv_shape

    # Totals are only compared when both frames have the same dimensions
    if not shape_qc:
        print('{0}: data frame shape error'.format(x))
        continue

    # Get only total rows from each data frame (kept as one-row frames)
    sql_tot = sql_qc[x].iloc[-1:]
    csv_tot = csv_qc[x].iloc[-1:]

    # Check summary totals
    tot_check = csv_tot.equals(sql_tot)

    print('{0} - sql:{1} || csv:{2}'.format(x, sql_shape, csv_shape))
    # Grand total across all columns, shown side by side for a quick eyeball check
    print('sql:{0} || csv:{1}'.format(sql_tot.sum().sum(), csv_tot.sum().sum()))
    print('{0} - Shapes match: {1} | Totals match: {2}'.format(x, shape_qc, tot_check))
    print('----------------------------------------------------------------------------')
        

2002 - sql:(811547, 41) || csv:(811547, 41)
sql:66944832 || csv:66944832
2002 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2003 - sql:(814823, 41) || csv:(814823, 41)
sql:68549200 || csv:68549200
2003 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2004 - sql:(820053, 41) || csv:(820053, 41)
sql:69964096 || csv:69964096
2004 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2005 - sql:(821573, 41) || csv:(821573, 41)
sql:71022480 || csv:71022480
2005 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2006 - sql:(824503, 41) || csv:(824503, 41)
sql:72262976 || csv:72262976
2006 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2007 - sql:(802

In [9]:
# Connection to DDAMWSQL16
# Compare, per year, the number of distinct h_geocode values in the SQL table
# with the number in the CSV-derived frame.

conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=socioec_data;'
    'Trusted_Connection=yes;'
)

tbl = '[dpoe_stage].[lehd_lodes].[rac_8_0]'

# sql_blocks[year] holds the distinct h_geocode rows returned for that year
sql_blocks = {}

for x in yrs:
    # Distinct census blocks in the SQL table for this year
    qry = "SELECT DISTINCT h_geocode FROM {0} WHERE yr={1}".format(tbl, x)
    sql_blocks[x] = pd.read_sql_query(qry, conn)
    sql_blocksyr = len(sql_blocks[x])

    # Distinct census blocks in this year's CSV data
    csv_unique = sources[x].drop_duplicates(subset=['h_geocode'])
    csv_blockssdyr = len(csv_unique)

    # True when both sources report the same number of distinct blocks
    sql_csv_blocks = csv_blockssdyr == sql_blocksyr

    print('{0}: {1} - SQL: {2}; CSV: {3}'.format(str(x), sql_csv_blocks, sql_blocksyr, csv_blockssdyr))

2002: True - SQL: 22275; CSV: 22275
2003: True - SQL: 22334; CSV: 22334
2004: True - SQL: 22412; CSV: 22412
2005: True - SQL: 22417; CSV: 22417
2006: True - SQL: 22469; CSV: 22469
2007: True - SQL: 22331; CSV: 22331
2008: True - SQL: 22649; CSV: 22649
2009: True - SQL: 22518; CSV: 22518
2010: True - SQL: 22627; CSV: 22627
2011: True - SQL: 22175; CSV: 22175
2012: True - SQL: 22149; CSV: 22149
2013: True - SQL: 22202; CSV: 22202
2014: True - SQL: 22220; CSV: 22220
2015: True - SQL: 22464; CSV: 22464
2016: True - SQL: 22438; CSV: 22438
2017: True - SQL: 22411; CSV: 22411
2018: True - SQL: 22416; CSV: 22416
2019: True - SQL: 22436; CSV: 22436
2020: True - SQL: 22797; CSV: 22797


In [10]:
# Connection to sql2014b8 (GeoDepot database)

conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=sql2014b8.sandag.org;'
    'Database=GeoDepot;'
    'Trusted_Connection=yes;'
)

# Count the distinct CTBLOCK values in the CENSUSBLOCKS layer
qry = "SELECT DISTINCT CTBLOCK FROM GeoDepot.gis.CENSUSBLOCKS"
gis = pd.read_sql_query(qry, conn)

# One row per distinct CTBLOCK, so the row count is the block count
gis_blocks = gis.shape[0]

print('Total Census Blocks in GeoDepot: {0}'.format(gis_blocks))

Total Census Blocks in GeoDepot: 28474
