# 2021-58 LEHD LODES ETL QC - RAC (DLE)

RAC = Residence Area Characteristics data: jobs totaled by home (residence) **Census Block**

**Source data:** R:\DPOE\LEHD LODES\7.5\Source \
**ETL files:** R:\DPOE\LEHD LODES\7.5\ETL

**Server:** ddamwsql16 \
**Database:** socioec_data.lehd_lodes.rac_7_5

In [1]:
import pandas as pd
import pyodbc
import os
import glob
import numpy as np

## Access SQL database table

**Server:** ddamwsql16 \
**Database:** socioec_data.lehd_lodes.rac_7_5

In [2]:
%%time

# Connect to DDAMWSQL16 to access staging table and create separate dataframes for each year
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=socioec_data;'
                      'Trusted_Connection=yes;')

# RAC table under QC. (A stale assignment to the OD table that was
# immediately overwritten has been removed.)
tbl = 'socioec_data.lehd_lodes.rac_7_5'

# Data years to check; kept as strings because they are concatenated into
# the query text and used as dictionary keys
yrs = ['2016', '2017', '2018']

# One DataFrame per year, keyed by the year string
sql = {}

for x in yrs:
    # Pull every row of the table for this year
    qry = "SELECT * FROM " + tbl + " WHERE yr=" + x
    sql[x] = pd.read_sql_query(qry, conn)

Wall time: 8min 21s


In [3]:
# Preview the SQL query result for one year; `sql` holds one DataFrame per
# year in `yrs`, keyed by the year string
sql['2017']

Unnamed: 0,h_geoid,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,...,CT02,CD01,CD02,CD03,CD04,CS01,CS02,type,segment,yr
0,060730171061005,9,0,9,0,0,1,8,0,0,...,2,1,3,4,1,7,2,JT00,SA02,2017
1,060730171061010,10,0,10,0,0,3,7,0,0,...,0,2,0,2,6,6,4,JT00,SA02,2017
2,060730171061011,2,0,2,0,0,0,2,0,0,...,0,0,1,0,1,1,1,JT00,SA02,2017
3,060730171061012,20,0,20,0,2,5,13,1,0,...,2,5,4,7,4,14,6,JT00,SA02,2017
4,060730171061015,6,0,6,0,1,1,4,0,0,...,0,3,0,2,1,4,2,JT00,SA02,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089133,060730083293003,24,4,18,2,3,3,18,0,0,...,2,3,2,3,12,8,16,JT00,SI03,2017
1089134,060730083293004,66,13,45,8,6,16,44,0,0,...,9,4,13,8,28,34,32,JT00,SI03,2017
1089135,060730083301000,265,41,156,68,40,51,174,0,0,...,30,14,45,59,106,117,148,JT00,SI03,2017
1089136,060730083301001,28,5,15,8,2,7,19,0,0,...,0,2,2,8,11,17,11,JT00,SI03,2017


In [4]:
%%time

# Per-year QC frames built from the SQL data: numeric count columns only,
# with a 'Total' row of column sums appended as the last row
sql_qc = {}

for x in yrs:
    # Remove non-numeric columns; `yr` is numeric but is not a count,
    # so it is dropped by name
    sql_qcyr = sql[x].select_dtypes(include=np.number).drop(columns='yr')

    # Summary row: column sums, labelled 'Total'
    total = sql_qcyr.sum()
    total.name = 'Total'

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # concatenating the Series as a one-row frame gives the same result
    sql_qc[x] = pd.concat([sql_qcyr, total.to_frame().T])

Wall time: 11.5 s


In [5]:
# SQL QC data frame for one year (e.g. sql_qc['2016']): non-numeric columns and `yr` removed,
# and a total summary row appended
sql_qc['2017']

Unnamed: 0,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,...,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
0,9,0,9,0,0,1,8,0,0,0,...,0,0,7,2,1,3,4,1,7,2
1,10,0,10,0,0,3,7,0,0,0,...,0,0,10,0,2,0,2,6,6,4
2,2,0,2,0,0,0,2,0,0,0,...,0,0,2,0,0,1,0,1,1,1
3,20,0,20,0,2,5,13,1,0,0,...,0,0,18,2,5,4,7,4,14,6
4,6,0,6,0,1,1,4,0,0,0,...,0,0,6,0,3,0,2,1,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089134,66,13,45,8,6,16,44,0,0,0,...,1,1,57,9,4,13,8,28,34,32
1089135,265,41,156,68,40,51,174,0,0,0,...,0,5,235,30,14,45,59,106,117,148
1089136,28,5,15,8,2,7,19,0,0,0,...,0,0,28,0,2,2,8,11,17,11
1089137,43,3,26,14,9,7,27,0,0,0,...,0,1,42,1,3,4,15,18,19,24


## Access Original RAC data

**Source data:** R:\DPOE\LEHD LODES\7.5\Source

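Source data contains all census blocks in California. San Diego County records are indicated by h_geocode IDs starting with 6073 (state FIPS 06 + county FIPS 073; the leading zero is dropped because h_geocode is read from the CSV as an integer).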

In [6]:
%%time

# Folder holding the unzipped RAC source CSVs
rac_dir = r'R:\DPOE\LEHD LODES\7.5\Source\RAC\Unzipped_RD'

# San Diego County CSV records, one DataFrame per year keyed by year string
csv_sd = {}

for x in yrs:
    # Every CSV whose file name ends in "_<year>.csv"
    pattern = os.path.join(rac_dir, '*_' + x + '.csv')

    # Read each matching file and stack them into one frame with a fresh index
    frames = [pd.read_csv(path) for path in glob.glob(pattern)]
    statewide = pd.concat(frames, ignore_index=True)

    # Keep only SD County records (h_geocode text starting with 6073)
    is_sd = statewide['h_geocode'].astype(str).str.startswith('6073')
    csv_sd[x] = statewide[is_sd]

Wall time: 24min 19s


In [7]:
# Preview the joined, San Diego-only CSV records for one year; `csv_sd` holds
# one DataFrame per year in `yrs`, keyed by the year string
csv_sd['2017']

Unnamed: 0,h_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,...,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02,createdate
261641,60730001001000,135,23,83,29,26,31,78,1,0,...,2,112,23,14,21,31,46,67,68,20201110
261642,60730001001001,2,0,2,0,0,1,1,0,0,...,0,1,1,0,0,0,2,0,2,20201110
261643,60730001001002,5,1,3,1,1,0,4,0,0,...,0,4,1,0,1,0,3,1,4,20201110
261644,60730001001003,9,1,4,4,2,0,7,0,0,...,0,9,0,0,1,5,2,3,6,20201110
261645,60730001001004,12,1,6,5,0,3,9,0,0,...,0,3,9,3,2,3,3,6,6,20201110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14557263,60730221002022,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,1,0,20201110
14557264,60730221003001,2,0,1,1,0,0,2,0,0,...,1,1,1,0,0,0,2,0,2,20201110
14557265,60730221003004,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,1,0,20201110
14557266,60730221003005,1,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,20201110


In [8]:
%%time

# Per-year QC frames built from the CSV data: job-count columns only,
# with a 'Total' row of column sums appended as the last row
csv_qc = {}

for x in yrs:
    # Drop the block ID and file creation date by name. Both are numeric in
    # the CSV, so they cannot be excluded by dtype, and naming them avoids
    # relying on their being the first and last columns.
    csv_qcyr = csv_sd[x].drop(columns=['h_geocode', 'createdate'])

    # Summary row: column sums, labelled 'Total'
    total = csv_qcyr.sum()
    total.name = 'Total'

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # concatenating the Series as a one-row frame gives the same result
    csv_qc[x] = pd.concat([csv_qcyr, total.to_frame().T])

Wall time: 2.55 s


In [9]:
# CSV QC data frame for one year (e.g. csv_qc['2016']): first and last columns removed,
# and a total summary row appended

csv_qc['2017']

Unnamed: 0,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,...,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
261641,135,23,83,29,26,31,78,1,0,1,...,0,2,112,23,14,21,31,46,67,68
261642,2,0,2,0,0,1,1,0,0,0,...,0,0,1,1,0,0,0,2,0,2
261643,5,1,3,1,1,0,4,0,0,0,...,0,0,4,1,0,1,0,3,1,4
261644,9,1,4,4,2,0,7,0,0,0,...,0,0,9,0,0,1,5,2,3,6
261645,12,1,6,5,0,3,9,0,0,0,...,0,0,3,9,3,2,3,3,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14557264,2,0,1,1,0,0,2,0,0,0,...,0,1,1,1,0,0,0,2,0,2
14557265,1,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
14557266,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0
14557267,1,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,1


## ✅ QC Test: Compare SQL tables vs CSV tables

**DLE QC Notes:** Pass
1. Data frame dimensions match
2. Summary totals match

In [10]:
%%time

for x in yrs:
    # The two QC frames must have the same (rows, columns) before totals are compared
    shape_qc = sql_qc[x].shape == csv_qc[x].shape

    if not shape_qc:
        print('{0}: data frame shape error'.format(x))
        continue

    # The summary row is the last row of each frame; keep it as a one-row frame
    last_sql_row = sql_qc[x].iloc[-1:]
    last_csv_row = csv_qc[x].iloc[-1:]

    # True only when the two summary rows are identical
    tot_check = last_csv_row.equals(last_sql_row)

    print('{0} - Shapes match: {1} | Totals match: {2}'.format(x, shape_qc, tot_check))

2016 - Shapes match: True | Totals match: True
2017 - Shapes match: True | Totals match: True
2018 - Shapes match: True | Totals match: True
Wall time: 120 ms


## ⚠️ QC Test: Check Census Blocks Included

**DLE QC Notes:** SME response needed
1. The number of LEHD census blocks included each year varies - is this expected?
2. The number of LEHD census blocks is less than the total number of census blocks - is this expected?

This is also the case for 7.4 (2014, 2015)

In [11]:
# Compare the number of distinct census blocks per year between the SQL table
# and the source CSVs (opens a new connection to DDAMWSQL16)

conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=socioec_data;'
    'Trusted_Connection=yes;'
)

tbl = 'socioec_data.lehd_lodes.rac_7_5'

# Distinct block IDs returned by SQL, one single-column DataFrame per year
sql_blocks = {}

for x in yrs:
    # Distinct census blocks stored in the SQL table for this year
    block_qry = "SELECT DISTINCT h_geoid FROM " + tbl + " WHERE yr=" + x
    sql_blocks[x] = pd.read_sql_query(block_qry, conn)
    n_sql = sql_blocks[x].shape[0]

    # Distinct census blocks in this year's CSV records
    # (dropna=False so a missing ID would still count once, as it did with drop_duplicates)
    n_csv = csv_sd[x]['h_geocode'].nunique(dropna=False)

    counts_agree = n_csv == n_sql

    print('{0}: {1} - SQL: {2}; CSV: {3}'.format(x, counts_agree, n_sql, n_csv))

2016: True - SQL: 28422; CSV: 28422
2017: True - SQL: 28394; CSV: 28394
2018: True - SQL: 28418; CSV: 28418


In [12]:
# Count the distinct census blocks in the GeoDepot CENSUSBLOCKS layer
# (connection to sql2014b8)

conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=sql2014b8.sandag.org;'
    'Database=GeoDepot;'
    'Trusted_Connection=yes;'
)

# One row per distinct CTBLOCK value
qry = "SELECT DISTINCT CTBLOCK FROM GeoDepot.gis.CENSUSBLOCKS"
gis = pd.read_sql_query(qry, conn)

# Row count of the distinct query is the number of blocks
gis_blocks = gis.shape[0]

print('Total Census Blocks in GeoDepot: {0}'.format(gis_blocks))

Total Census Blocks in GeoDepot: 39932
