In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import os
import pyodbc 
import glob

# R Drive Source

In [2]:
# Data years covered by this notebook, both ends inclusive.
FIRST_YEAR = 2002
LAST_YEAR = 2020
yrs = range(FIRST_YEAR, LAST_YEAR + 1)  # +1 because range() excludes its stop value

In [3]:
# Build one pandas DataFrame per year from the WAC source CSVs, keeping only
# rows whose w_geocode (rendered as text) starts with '6073'.
# NOTE(review): '6073' presumably identifies San Diego County (state 06,
# county 073) with the leading zero lost because the geocode is parsed as a
# number - confirm.
wac_dir = r"R:\DPOE\LEHD LODES\8.0\Source\wac"
sources = {}

for year in yrs:
    # Every CSV in the source folder whose file name ends with this year
    year_files = glob.glob(f"{wac_dir}/*{year}.csv")

    # One lazy dask frame per file, stacked row-wise into a single frame
    year_ddf = dd.concat([dd.read_csv(path) for path in year_files], axis=0)

    # Build the row filter lazily; compute() then materialises the matches
    keep_rows = year_ddf['w_geocode'].astype(str).str.startswith('6073')
    sources[year] = pd.DataFrame(year_ddf[keep_rows].compute())

    print(f'----complete: {year}')

----complete: 2002
----complete: 2003
----complete: 2004
----complete: 2005
----complete: 2006
----complete: 2007
----complete: 2008
----complete: 2009
----complete: 2010
----complete: 2011
----complete: 2012
----complete: 2013
----complete: 2014
----complete: 2015
----complete: 2016
----complete: 2017
----complete: 2018
----complete: 2019
----complete: 2020


# Database

In [4]:
# Read the staged WAC table from DDAMWSQL16 into one DataFrame per year.
conn_str = (
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=socioec_data;'
    'Trusted_Connection=yes;'
)
conn = pyodbc.connect(conn_str)

# Fully qualified (database.schema.table), so the query reads from
# dpoe_stage even though the connection string names socioec_data.
tbl = '[dpoe_stage].[lehd_lodes].[wac_8_0]'

# Keyed by year; each value holds every row of tbl with that yr.
database = {}
for year in yrs:
    database[year] = pd.read_sql_query(f"SELECT * FROM {tbl} WHERE yr={year}", conn)

# Check data totals

In [5]:
def _with_total_row(frame):
    """Return ``frame`` plus one trailing row labelled 'Total' with each column's sum.

    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0, so
    the summary row is attached with ``pd.concat`` instead.
    """
    total = frame.sum()
    total.name = 'Total'
    # to_frame().T turns the Series of column sums into a one-row frame whose
    # index label is the Series name, ready to be stacked under ``frame``.
    return pd.concat([frame, total.to_frame().T])


# SQL side: numeric columns only, without the 'yr' column.
sql_qc = {}

for x in yrs:
    sql_qcyr = database[x].select_dtypes(include=np.number).drop(columns='yr')
    sql_qc[x] = _with_total_row(sql_qcyr)

# CSV side: drop the first and last columns by position.
# NOTE(review): assumes those two are the only non-numeric columns - confirm.
csv_qc = {}

for x in yrs:
    csv_qcyr = sources[x].iloc[:, 1:-1]
    csv_qc[x] = _with_total_row(csv_qcyr)

In [9]:
# Year by year, compare the SQL and CSV QC frames: shapes first, then the
# trailing 'Total' summary row of each.
for year in yrs:
    sql_frame = sql_qc[year]
    csv_frame = csv_qc[year]
    shape_qc = sql_frame.shape == csv_frame.shape

    # Totals are only compared when both frames have the same shape
    if not shape_qc:
        print(f'{year}: data frame shape error')
        continue

    # Last row of each frame, kept as a one-row DataFrame
    sql_tot = sql_frame.iloc[-1:]
    csv_tot = csv_frame.iloc[-1:]

    # True only when the two one-row frames are equal according to DataFrame.equals
    tot_check = csv_tot.equals(sql_tot)

    print(f'{year} - sql:{sql_frame.shape} || csv:{csv_frame.shape}')
    print(f'sql:{sql_tot.sum().sum()} || csv:{csv_tot.sum().sum()}')
    print(f'{year} - Shapes match: {shape_qc} | Totals match: {tot_check}')
    print('----------------------------------------------------------------------------')

2002 - sql:(369501, 51) || csv:(369501, 51)
sql:67094160 || csv:67094160
2002 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2003 - sql:(376805, 51) || csv:(376805, 51)
sql:67698848 || csv:67698848
2003 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2004 - sql:(383423, 51) || csv:(383423, 51)
sql:68507376 || csv:68507376
2004 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2005 - sql:(389881, 51) || csv:(389881, 51)
sql:69654928 || csv:69654928
2005 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2006 - sql:(395457, 51) || csv:(395457, 51)
sql:70606336 || csv:70606336
2006 - Shapes match: True | Totals match: True
----------------------------------------------------------------------------
2007 - sql:(438

In [7]:
# For each year, compare the number of distinct w_geocode values in the SQL
# table on DDAMWSQL16 with the number in the source CSV rows.
conn_str = (
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=socioec_data;'
    'Trusted_Connection=yes;'
)
conn = pyodbc.connect(conn_str)

tbl = '[dpoe_stage].[lehd_lodes].[wac_8_0]'

# Keyed by year; each value is a one-column frame of distinct w_geocode.
sql_blocks = {}

for year in yrs:
    # Distinct w_geocode values stored in SQL for this year
    sql_blocks[year] = pd.read_sql_query(
        f"SELECT DISTINCT w_geocode FROM {tbl} WHERE yr={year}", conn
    )
    n_sql = len(sql_blocks[year])

    # Distinct w_geocode values among this year's CSV rows
    n_csv = len(sources[year].drop_duplicates(subset=['w_geocode']))

    # Line format: year, whether the counts agree, then both counts
    print(f'{year}: {n_csv == n_sql} - SQL: {n_sql}; CSV: {n_csv}')

2002: True - SQL: 12983; CSV: 12983
2003: True - SQL: 13218; CSV: 13218
2004: True - SQL: 13400; CSV: 13400
2005: True - SQL: 13574; CSV: 13574
2006: True - SQL: 13726; CSV: 13726
2007: True - SQL: 15750; CSV: 15750
2008: True - SQL: 16104; CSV: 16104
2009: True - SQL: 16119; CSV: 16119
2010: True - SQL: 16115; CSV: 16115
2011: True - SQL: 14262; CSV: 14262
2012: True - SQL: 14844; CSV: 14844
2013: True - SQL: 14863; CSV: 14863
2014: True - SQL: 15074; CSV: 15074
2015: True - SQL: 15303; CSV: 15303
2016: True - SQL: 15760; CSV: 15760
2017: True - SQL: 15922; CSV: 15922
2018: True - SQL: 16106; CSV: 16106
2019: True - SQL: 16287; CSV: 16287
2020: True - SQL: 15675; CSV: 15675


In [8]:
# Count the distinct CTBLOCK values in the GeoDepot CENSUSBLOCKS layer,
# read from server sql2014b8.
geodepot_conn_str = (
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=sql2014b8.sandag.org;'
    'Database=GeoDepot;'
    'Trusted_Connection=yes;'
)
conn = pyodbc.connect(geodepot_conn_str)

qry = "SELECT DISTINCT CTBLOCK FROM GeoDepot.gis.CENSUSBLOCKS"
gis = pd.read_sql_query(qry, conn)

# One row per distinct CTBLOCK, so the row count is the block count
gis_blocks = gis.shape[0]

print(f'Total Census Blocks in GeoDepot: {gis_blocks}')

Total Census Blocks in GeoDepot: 28474
