In [8]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Loading Source Data (From R Drive)

In [9]:
# Folder holding the unzipped LODES origin-destination CSV files.
# The network-drive location stays the default, but it can be overridden with the
# LODES_OD_SOURCE_DIR environment variable so the notebook can run on a machine
# where the share is mounted elsewhere.
path = os.environ.get('LODES_OD_SOURCE_DIR', r'R:/DPOE/LEHD LODES/7.5/Source/OD_Data/Unzipped_RD')

In [10]:
# 2018 source data: read every CSV under `path` whose name ends in "2018.csv".
# glob returns matches in arbitrary order, so sort them to make the load order
# (and the row order of anything built from list_2018) reproducible.
csv_2018 = sorted(glob.glob(path + "/*2018.csv"))

# Fail here with a clear message: concatenating an empty list of frames would
# otherwise raise a much less helpful "No objects to concatenate" error.
if not csv_2018:
    raise FileNotFoundError(f"No '*2018.csv' files found in {path!r}")

# One DataFrame per file; the first row of each file is the header.
list_2018 = [pd.read_csv(filename, index_col=None, header=0) for filename in csv_2018]

In [11]:
# Stack the per-file frames row-wise (pd.concat's default axis) and renumber the index.
od_2018 = pd.concat(list_2018, ignore_index=True)

In [12]:
# Show the dtype pandas inferred for each column of the concatenated source frame.
od_2018.dtypes

w_geocode     float64
h_geocode     float64
S000            int64
SA01            int64
SA02            int64
SA03            int64
SE01            int64
SE02            int64
SE03            int64
SI01            int64
SI02            int64
SI03            int64
createdate      int64
dtype: object

# Loading The Staging Table (SQL Server)

In [13]:
# Open a Windows-authenticated (trusted) connection to the SQL Server staging database.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=socioec_data;'
                      'Trusted_Connection=yes;')

#2018
# Adjacent string literals are joined with no separator, so every fragment ends with
# a space to keep the table name and the WHERE keyword apart in the final statement.
od_db18 = ("SELECT * "
           "FROM [socioec_data].[lehd_lodes].[od_7_5] "
           "WHERE yr = 2018")
try:
    # Load the 2018 rows of the staging table into a DataFrame.
    db18 = pd.read_sql_query(od_db18, conn)
finally:
    # Release the database connection even if the query fails.
    conn.close()

In [14]:
# Show the dtype of each column returned by the staging-table query.
db18.dtypes

w_geoid    object
h_geoid    object
S000        int64
SA01        int64
SA02        int64
SA03        int64
SE01        int64
SE02        int64
SE03        int64
SI01        int64
SI02        int64
SI03        int64
type       object
yr          int64
dtype: object

# Cleaning Source Data

In [15]:
# Give the source geocode columns the same names the staging table uses.
od_2018 = od_2018.rename(columns={'w_geocode': 'w_geoid', 'h_geocode': 'h_geoid'})

In [16]:
# Remove the createdate column from the source frame before comparing.
od_2018 = od_2018.drop(columns=['createdate'])

# Cleaning Staging Data

In [17]:
# Remove the type and yr columns from the staging frame before comparing.
db18 = db18.drop(columns=['type', 'yr'])

In [18]:
# Strip leading zeros from the workplace geoid using the vectorized string accessor.
# The result stays index-aligned with db18, and a missing value remains missing
# instead of raising AttributeError as a per-element Python call would.
db18['w_geoid'] = db18['w_geoid'].str.lstrip('0')

In [19]:
# Strip leading zeros from the home geoid using the vectorized string accessor.
# The result stays index-aligned with db18, and a missing value remains missing
# instead of raising AttributeError as a per-element Python call would.
db18['h_geoid'] = db18['h_geoid'].str.lstrip('0')

# Filtering for just SD Data (Source Data)

In [20]:
# Keep source rows whose workplace OR home geoid, rendered as text, begins with '6073'.
src_work_match = od_2018['w_geoid'].astype(str).str.startswith('6073')
src_home_match = od_2018['h_geoid'].astype(str).str.startswith('6073')
f_od_2018 = od_2018[src_work_match | src_home_match]

# Filtering for just SD Data (Staging Data)

In [21]:
# Keep staging rows whose workplace OR home geoid, rendered as text, begins with '6073'.
stg_work_match = db18['w_geoid'].astype(str).str.startswith('6073')
stg_home_match = db18['h_geoid'].astype(str).str.startswith('6073')
f_db18 = db18[stg_work_match | stg_home_match]

# Comparison 

In [22]:
# Position-by-position comparison of the two frames' column labels
# (True where the labels at the same position match).
f_db18.columns == f_od_2018.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

# Checking the shape of the data

In [23]:
# Source shape on the first line, staging shape on the second.
print(f_od_2018.shape, f_db18.shape, sep='\n')

(5685834, 12)
(5685834, 12)


In [24]:
# Difference in row counts (source minus staging); 0 means the same number of rows.
len(f_od_2018) - len(f_db18)

0

# Checking Length

In [25]:
# Row count of the source frame, then of the staging frame, one per line.
print(len(f_od_2018), len(f_db18), sep='\n')

5685834
5685834


# To Numeric and Sort by Geoid

In [26]:
# Coerce every column to a numeric dtype, then order rows by the full
# (workplace, home) geoid pair. Sorting on w_geoid alone leaves rows that share a
# workplace in arbitrary relative order (the default sort is not stable), and
# order-sensitive float summaries such as mean/std of the other columns can then
# differ in their last digits between frames holding the same data.
f_od_2018 = f_od_2018.apply(pd.to_numeric)
f_od_2018 = f_od_2018.sort_values(by=['w_geoid', 'h_geoid'])

In [27]:
# Coerce every column to a numeric dtype (the geoids arrive as text), then order rows
# by the full (workplace, home) geoid pair. Sorting on w_geoid alone leaves rows that
# share a workplace in arbitrary relative order (the default sort is not stable), and
# order-sensitive float summaries such as mean/std of the other columns can then
# differ in their last digits between frames holding the same data.
f_db18 = f_db18.apply(pd.to_numeric)
f_db18 = f_db18.sort_values(by=['w_geoid', 'h_geoid'])

# Count the number of unique IDs

In [28]:
# Number of distinct workplace geoids in the source frame, then in the staging frame.
print(f_od_2018['w_geoid'].nunique(), f_db18['w_geoid'].nunique(), sep='\n')

60759
60759


# Check if stats are the same

In [29]:
# Exact, cell-by-cell equality of the two summary-statistics tables.
# mean and std are floating-point results, so a False in those rows may reflect
# rounding noise rather than a real data difference.
# NOTE(review): confirm any False with a tolerance-based comparison.
f_od_2018.describe() == f_db18.describe()

Unnamed: 0,w_geoid,h_geoid,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
count,True,True,True,True,True,True,True,True,True,True,True,True
mean,True,False,True,True,True,True,True,True,True,True,True,True
std,True,False,False,False,False,False,False,False,False,False,False,False
min,True,True,True,True,True,True,True,True,True,True,True,True
25%,True,True,True,True,True,True,True,True,True,True,True,True
50%,True,True,True,True,True,True,True,True,True,True,True,True
75%,True,True,True,True,True,True,True,True,True,True,True,True
max,True,True,True,True,True,True,True,True,True,True,True,True


# Round std to make sure they are identical

In [30]:
# Nothing is printed when every column's standard deviation agrees between the source
# and staging frames; one line starting with False is printed per column that disagrees.
# The comparison uses a relative tolerance instead of rounding to whole numbers:
# rounding hides real differences smaller than about 1, and it can also separate two
# nearly identical values that fall on either side of a .5 boundary.
# .std() is called directly so the full describe() table is not rebuilt twice.
src_std = f_od_2018.std()
stg_std = f_db18.std()

# Compare positionally on the raw arrays; atol=0 keeps the test purely relative so it
# behaves the same for the large geoid columns and the small count columns.
std_close = np.isclose(src_std.values, stg_std.values, rtol=1e-9, atol=0.0)

for col, same in zip(src_std.index, std_close):
    if not same:
        print(False, col, src_std[col], stg_std[col])