In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc
import sys
import dask.dataframe as dd

In [2]:
# Folder containing the LODES 8.0 "od" source CSVs.
path = r'R:\DPOE\LEHD LODES\8.0\Source\od'

In [3]:
# Build one lazy dask frame per year from the od CSVs, keeping only rows whose
# work or home geocode (as a string) starts with '6073'.
sources = {}
for year in range(2002, 2021):
    # every CSV in the od folder whose file name ends with this year
    year_files = glob.glob(path + "/*" + str(year) + ".csv")

    kept = []
    for csv_file in year_files:
        frame = dd.read_csv(csv_file)
        works_in = frame['w_geocode'].astype(str).str.startswith('6073')
        lives_in = frame['h_geocode'].astype(str).str.startswith('6073')
        kept.append(frame[works_in | lives_in])

    # stack the per-file subsets row-wise into a single frame for the year
    sources[year] = dd.concat(kept, axis=0)
    print('----complete: ' + str(year))

----complete: 2020


In [4]:
# Preview the first rows of the filtered 2020 od source frame.
sources[2020].head()

Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,createdate
128743,60730001001021,220050303012005,1,0,1,0,0,0,1,1,0,0,20230321
128744,60730001002025,171859573001020,1,0,1,0,1,0,0,0,0,1,20230321
128745,60730002011000,420171004021021,1,0,1,0,1,0,0,0,0,1,20230321
128746,60730002011008,511076112041001,1,0,1,0,0,0,1,0,0,1,20230321
128747,60730002012004,80410045151023,1,0,1,0,0,0,1,0,0,1,20230321


In [5]:
# Same per-year load as the od files, but reading from the "oos" folder.
oos_path = r'R:\DPOE\LEHD LODES\8.0\Source\oos'
oos_sources = {}
for year in range(2002, 2021):
    # every CSV in the oos folder whose file name ends with this year
    year_files = glob.glob(oos_path + "/*" + str(year) + ".csv")

    kept = []
    for csv_file in year_files:
        frame = dd.read_csv(csv_file)
        works_in = frame['w_geocode'].astype(str).str.startswith('6073')
        lives_in = frame['h_geocode'].astype(str).str.startswith('6073')
        # keep a row when either geocode carries the '6073' prefix
        kept.append(frame[lives_in | works_in])

    oos_sources[year] = dd.concat(kept, axis=0)
    print('----complete: ' + str(year))

----complete: 2020


In [6]:
# Preview the first rows of the filtered 2020 oos source frame.
oos_sources[2020].head()

Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,createdate
2389,10030114112049,60730083472001,1,0,0,1,0,0,1,0,1,0,20230321
6613,10139529001016,60730185093004,1,0,1,0,1,0,0,0,1,0,20230321
6642,10139529002036,60730200303013,1,0,0,1,0,0,1,0,1,0,20230321
6830,10150007002017,60730136044005,1,1,0,0,0,1,0,0,0,1,20230321
7450,10150012012011,60730029031000,1,1,0,0,1,0,0,0,1,0,20230321


**Loading the staging tables** 

In [7]:
# Open a trusted (Windows-auth) ODBC connection to the socioec_data database.
conn = pyodbc.connect(';'.join((
    'Driver={ODBC Driver 17 for SQL Server}',
    'Server=DDAMWSQL16.sandag.org',
    'Database=socioec_data',
    'Trusted_Connection=yes',
    '',  # empty last item yields the trailing ';'
)))

In [8]:
# Read every staged od_8_0 row for each year into its own pandas frame.
# The year is appended to this fixed query prefix inside the loop.
od_query_prefix = """
    SELECT *
    FROM [dpoe_stage].[lehd_lodes].[od_8_0]
    WHERE yr = """

database = {}
for year in range(2002, 2021):
    database[year] = pd.read_sql_query(od_query_prefix + str(year), conn)
    print('----complete: ' + str(year))

----complete: 2020


### QC checks
1. Compare the row counts
2. Compare data types<br>
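**Note** - The source (od) data only contains records for people who work within San Diego (geoid starting with 06073) or live within California (06). The OOS data contains people who work outside of California but live in San Diego. The load cells above match on the prefix `6073` rather than `06073`, presumably because the geocodes are read as numbers and lose their leading zero.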

In [None]:
# Row-count QC: the staging table for a year should hold exactly the od rows
# plus the oos rows kept from the CSVs, so `diff` is expected to be 0.
for year in range(2002, 2021):
    # dask frames report shape[0] lazily; compute both so the arithmetic below
    # works on plain integers
    source_rows = int(sources[year].shape[0].compute())
    oos_rows = int(oos_sources[year].shape[0].compute())
    database_rows = database[year].shape[0]

    # show both CSV-side counts so the printed numbers reconcile with the diff
    print(str(year) + ': database sub - {0} || source sub - {1} || oos sub - {2} || diff - {3}'.format(
        database_rows, source_rows, oos_rows, database_rows - (source_rows + oos_rows)))

### Compare data types

In [None]:
# Column dtypes of the 2020 CSV-side frame. `sources` is the dict built by the
# load cell; the previous name `source_subs` is not defined in this notebook.
sources[2020].dtypes

In [None]:
# Column dtypes of the 2020 frame read from the staging table, for comparison
# with the CSV-side dtypes.
database[2020].dtypes

### Compare sum totals

In [None]:
# Sum the job-count columns per year on the CSV side so they can be compared
# with the SQL totals query.
# The columns are selected by name rather than by position: a positional slice
# silently shifts if the frame carries an extra leading column (the preview
# header shows an 'Unnamed: 0' field), which would pull h_geocode into the sums.
count_cols = ['S000', 'SA01', 'SA02', 'SA03',
              'SE01', 'SE02', 'SE03',
              'SI01', 'SI02', 'SI03']

for year in range(2002, 2021):
    od_tot = sources[year][count_cols].sum().compute()
    oos_tot = oos_sources[year][count_cols].sum().compute()
    # per-column totals, then the grand total across all count columns
    print('{0} - od_tot: {1} || oos_tot: {2}'.format(year, od_tot, oos_tot))
    print('{0} - od_tot: {1} || oos_tot: {2}'.format(year, od_tot.sum(), oos_tot.sum()))
    print('-------------------------------')

```sql

-- Get Table Totals
SELECT yr, 
SUM(S000) AS S000,
SUM(SA01) as SA01,
SUM(SA02) AS SA02,
SUM(SA03) AS SA03,
SUM(SE01) AS SE01,
SUM(SE02) AS SE02,
SUM(SE03) AS SE03,
SUM(SI01) AS SI01,
SUM(SI02) AS SI02,
SUM(SI03) AS SI03,
SUM(S000)+SUM(SA01)+SUM(SA02)+SUM(SA03)+SUM(SE01)+SUM(SE02)+SUM(SE03)+SUM(SI01)+SUM(SI02)+SUM(SI03) AS TOTAL
FROM [dpoe_stage].[lehd_lodes].[od_8_0]
GROUP BY yr
ORDER BY yr
```

### Check Census Blocks included

In [None]:
# Count the distinct home / work census blocks per year in the staging table.
db_count_h = {}
db_count_w = {}

# (query prefix, print template, target dict) for the home and work geocodes
block_queries = (
    ("SELECT DISTINCT h_geocode FROM [dpoe_stage].[lehd_lodes].[od_8_0] WHERE yr = ",
     'h_geoid: {0} - {1}', db_count_h),
    ("SELECT DISTINCT w_geocode FROM [dpoe_stage].[lehd_lodes].[od_8_0] WHERE yr = ",
     'w_geoid: {0} - {1}', db_count_w),
)

for year in range(2013, 2021):
    for query_prefix, template, counts in block_queries:
        db_blocks = pd.read_sql_query(query_prefix + str(year), conn)
        # one row per distinct geocode, so the row count is the block count
        counts[year] = db_blocks.shape[0]
        print(template.format(year, counts[year]))

In [None]:
# Distinct home blocks in the 2017 od CSV subset. `sources` is the dict built
# by the load cell (`source_subs` is not defined in this notebook), and the
# dask result must be computed to display an actual number.
sources[2017]['h_geocode'].nunique().compute()

In [None]:
## CHECKING CSV
# Count the distinct home / work census blocks per year across the od and oos
# CSV subsets combined, for comparison with the staging-table counts.

source_count_h = {}
source_count_w = {}

for year in range(2013, 2021):
    # h_geocode blocks
    # np.concatenate takes ONE sequence of arrays; passing the two arrays as
    # separate positional arguments makes the second one collide with `axis`.
    # Each dask series is reduced to its unique values and computed first so
    # NumPy receives concrete arrays.
    od_count_h = len(np.unique(np.concatenate([
        sources[year]['h_geocode'].unique().compute().to_numpy(),
        oos_sources[year]['h_geocode'].unique().compute().to_numpy(),
    ], axis=0)))
    print('h_geocode: {0} - {1}'.format(year, od_count_h))
    source_count_h[year] = od_count_h

    # w_geocode blocks
    od_count_w = len(np.unique(np.concatenate([
        sources[year]['w_geocode'].unique().compute().to_numpy(),
        oos_sources[year]['w_geocode'].unique().compute().to_numpy(),
    ], axis=0)))
    print('w_geocode: {0} - {1}'.format(year, od_count_w))
    source_count_w[year] = od_count_w

In [None]:
# Per-year gap between CSV-side and database-side distinct block counts
# (source minus database), home blocks first, then work blocks.
for year in range(2013, 2021):
    for template, csv_counts, db_counts in (
        ('h difference: {0} - {1}', source_count_h, db_count_h),
        ('w difference: {0} - {1}', source_count_w, db_count_w),
    ):
        print(template.format(year, csv_counts[year] - db_counts[year]))