# 2021-58 LEHD LODES ETL QC - RAC (DLE)

RAC = Residence Area Characteristics data: jobs totaled by home **Census Block**

**Source data:** R:\DPOE\LEHD LODES\7.5\Source \
**ETL files:** R:\DPOE\LEHD LODES\7.5\ETL

**Server:** ddamwsql16 \
**Database:** socioec_data.lehd_lodes.rac_7_5

In [1]:
import pandas as pd
import pyodbc
import os
import glob
import numpy as np

## Access SQL database table

**Server:** ddamwsql16 \
**Database:** socioec_data.lehd_lodes.rac_7_5 (the cell below queries the staging table dpoe_stage.lehd_lodes.rac_7_5_20211018)

In [2]:
%%time

# Connect to DDAMWSQL16 to access staging table and create separate dataframes for each year
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')

tbl = 'dpoe_stage.lehd_lodes.rac_7_5_20211018'

yrs = ['2019']

sql = {}

for x in yrs:
    qry = "SELECT * FROM " + tbl + " WHERE yr=" + x
    sql[x] = pd.read_sql_query(qry, conn)

Wall time: 1min 3s


In [3]:
# SQL table query result; `sql` holds one dataframe per year, keyed by the year string
sql['2019']

Unnamed: 0,h_geoid,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,...,CT02,CD01,CD02,CD03,CD04,CS01,CS02,type,segment,yr
0,060730002011044,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,JT04,SI03,2019
1,060730003003005,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,JT04,SI03,2019
2,060730007003001,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,JT04,SI03,2019
3,060730008001006,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,JT04,SI03,2019
4,060730008002005,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,JT04,SI03,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091737,060730132051022,13,4,7,2,2,6,5,0,0,...,13,1,3,4,1,6,7,JT01,S000,2019
1091738,060730191014083,4,1,2,1,1,0,3,0,0,...,2,1,1,1,0,3,1,JT01,S000,2019
1091739,060730186082004,2,0,1,1,2,0,0,0,0,...,0,1,1,0,0,1,1,JT00,SE01,2019
1091740,060730168022067,5,0,4,1,1,1,3,0,0,...,0,1,0,2,2,3,2,JT01,S000,2019


In [4]:
%%time
#totals
sql_qc = {}

for x in yrs:       
    # Remove non-numeric columns
    sql_qcyr = sql[x].select_dtypes(include=np.number)
    del sql_qcyr['yr']
    
    # Summary row
    total = sql_qcyr.sum()
    total.name = 'Total'
    
    sql_qc[x] = sql_qcyr.append(total.transpose())

Wall time: 1.05 s


In [5]:
# SQL QC dataframe for 2019 (sql_qc is keyed by year string): numeric columns only,
# with 'yr' removed and a 'Total' summary row appended
sql_qc['2019']

Unnamed: 0,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,...,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
0,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
1,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
2,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
3,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
4,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091738,4,1,2,1,1,0,3,0,0,0,...,0,0,2,2,1,1,1,0,3,1
1091739,2,0,1,1,2,0,0,0,0,0,...,0,0,2,0,1,1,0,0,1,1
1091740,5,0,4,1,1,1,3,0,0,0,...,0,0,5,0,1,0,2,2,3,2
1091741,4,0,3,1,1,0,3,0,0,0,...,0,0,4,0,1,2,1,0,3,1


## Access Original RAC data

**Source data:** R:\DPOE\LEHD LODES\7.5\Source

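Source data contains all census blocks in California. San Diego County records are indicated by h_geocode IDs starting with 6073 (state FIPS 06 + county FIPS 073; the leading zero is absent because the CSV column is read as an integer, whereas the SQL h_geoid values keep it).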

In [6]:
%%time

csv_sd ={}

# Aggregate each year's CSV files and output separate dataframe for each year
for x in yrs:
    csv_yr = '*_' + x + '.csv'
    
    # Join all the CSV files in the folder
    files = os.path.join(r'R:/DPOE/LEHD LODES/7.5_2021_11Nov/Source/LODES_Download_2021-11-19-20-07-45/rac', csv_yr)
    
    # Return list of all joined file names
    files_list = glob.glob(files)
    
    # Join all the csv into single dataframe
    joined_csv = pd.concat(map(pd.read_csv, files_list), ignore_index=True)
    
    # Extract only SD County records (6073*)
    csv_sd[x] = joined_csv[joined_csv['h_geocode'].astype(str).str.startswith('6073')]

Wall time: 8min 38s


In [7]:
# CSV join result filtered to h_geocode values starting with '6073'; `csv_sd` is keyed by year string
csv_sd['2019']

Unnamed: 0,h_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,...,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02,createdate
261774,60730001001000,140,23,82,35,28,19,93,0,0,...,8,129,11,6,20,32,59,74,66,20211018
261775,60730001001001,2,1,1,0,1,0,1,0,0,...,0,2,0,0,0,0,1,2,0,20211018
261776,60730001001002,6,1,2,3,2,1,3,0,0,...,0,6,0,0,1,1,3,3,3,20211018
261777,60730001001003,12,4,5,3,5,1,6,0,0,...,0,12,0,0,2,2,4,6,6,20211018
261778,60730001001004,6,1,4,1,0,0,6,0,0,...,0,4,2,2,1,1,1,2,4,20211018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14563197,60730221003003,1,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,20211018
14563198,60730221003004,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,1,0,20211018
14563199,60730221003007,1,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,1,1,0,20211018
14563200,60730221003012,1,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,20211018


In [8]:
%%time

csv_qc = {}

for x in yrs:
    # Select only numeric columns
    csv_qcyr = csv_sd[x].iloc[:, 1:-1]
    
    # Summary row
    total = csv_qcyr.sum()
    total.name = 'Total'
    
    csv_qc[x] = csv_qcyr.append(total.transpose())

Wall time: 392 ms


In [9]:
# CSV QC dataframe for 2019 (csv_qc is keyed by year string): first and last columns
# dropped by position, and a 'Total' summary row appended

csv_qc['2019']

Unnamed: 0,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,...,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
261774,140,23,82,35,28,19,93,0,0,1,...,0,8,129,11,6,20,32,59,74,66
261775,2,1,1,0,1,0,1,0,0,0,...,0,0,2,0,0,0,0,1,2,0
261776,6,1,2,3,2,1,3,0,0,0,...,0,0,6,0,0,1,1,3,3,3
261777,12,4,5,3,5,1,6,0,0,0,...,0,0,12,0,0,2,2,4,6,6
261778,6,1,4,1,0,0,6,0,0,0,...,0,0,4,2,2,1,1,1,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14563198,1,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
14563199,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
14563200,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0
14563201,1,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1


## ✅ QC Test: Compare SQL tables vs CSV tables

**DLE QC Notes:** Pass
1. Data frame dimensions match
2. Summary totals match

In [10]:
%%time

for x in yrs:   
    # Compare dataframe shapes
    shape_qc = sql_qc[x].shape == csv_qc[x].shape
    
    if shape_qc == True:
        # Get only total rows from each data frame
        sql_tot = sql_qc[x].iloc[-1:]
        csv_tot = csv_qc[x].iloc[-1:]
        
        # Check summary totals
        tot_check = csv_tot.equals(sql_tot)
        
        print('{0} - Shapes match: {1} | Totals match: {2}'.format(x, shape_qc, tot_check))
    else:
        print('{0}: data frame shape error'.format(x))

2019 - Shapes match: True | Totals match: True
Wall time: 4.98 ms


## ⚠️ QC Test: Check Census Blocks Included

**DLE QC Notes:** SME response needed
1. The number of LEHD census blocks included each year varies - is this expected?
2. The number of LEHD census blocks is less than the total number of census blocks - is this expected?

This is also the case for 7.4 (2014, 2015)

In [15]:
# Connection to DDAMWSQL16
# Compare the number of distinct census blocks in the SQL table with the number in the CSV data

conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')

tbl = 'dpoe_stage.lehd_lodes.rac_7_5_20211018'

sql_blocks = {}

try:
    for x in yrs:
        # Distinct census blocks in the SQL table for this year
        qry = "SELECT DISTINCT h_geoid FROM " + tbl + " WHERE yr=" + x
        sql_blocks[x] = pd.read_sql_query(qry, conn)

        sql_blocksyr = len(sql_blocks[x])

        # Distinct census blocks in this year's CSV data
        csv_blockssdyr = len(csv_sd[x].drop_duplicates(subset=['h_geocode']))

        # Only the counts are compared, not the individual block ids
        sql_csv_blocks = csv_blockssdyr == sql_blocksyr

        print('{0}: {1} - SQL: {2}; CSV: {3}'.format(x, sql_csv_blocks, sql_blocksyr, csv_blockssdyr))
finally:
    # Release the server connection even if a query fails.
    conn.close()

2019: True - SQL: 28429; CSV: 28429


In [12]:
# Connection to sql2014b8
# NOTE(review): every statement in this cell is commented out, so the GeoDepot
# census block count is not computed when the notebook runs.

# conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
#                       'Server=sql2014b8.sandag.org;'
#                       'Database=GeoDepot;'
#                       'Trusted_Connection=yes;')

# # Check number of distinct census blocks in CENSUSBLOCKS layer
# qry = "SELECT DISTINCT CTBLOCK FROM GeoDepot.gis.CENSUSBLOCKS"
# gis = pd.read_sql_query(qry, conn)

# gis_blocks = len(gis)

# print('Total Census Blocks in GeoDepot: {0}'.format(gis_blocks))