In [19]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Loading Source Data (From R Drive)

In [20]:
path = r'R:/DPOE/LEHD LODES/7.5/Source/OD_Data/Unzipped_RD'

In [21]:
# 2018 source data
csv_2018 = glob.glob(path + "/*2018.csv")
list_2018 = []

for filename in csv_2018:
    df = pd.read_csv(filename, index_col=None, header=0)
    list_2018.append(df)

In [22]:
od_2018 = pd.concat(list_2018, axis=0, ignore_index=True)

In [23]:
od_2018.dtypes

w_geocode     float64
h_geocode     float64
S000            int64
SA01            int64
SA02            int64
SA03            int64
SE01            int64
SE02            int64
SE03            int64
SI01            int64
SI02            int64
SI03            int64
createdate      int64
dtype: object

# Loading The Staging Table (SQL Server)

In [24]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=socioec_data;'
                      'Trusted_Connection=yes;')

#2018
od_db18 ="SELECT * "\
                   "FROM [socioec_data].[lehd_lodes].[od_7_5]"\
                    "WHERE yr= 2018"
db18 = pd.read_sql_query(od_db18, conn)

In [25]:
db18.dtypes

w_geoid    object
h_geoid    object
S000        int64
SA01        int64
SA02        int64
SA03        int64
SE01        int64
SE02        int64
SE03        int64
SI01        int64
SI02        int64
SI03        int64
type       object
yr          int64
dtype: object

# Cleaning Source Data

In [26]:
od_2018 = od_2018.rename({'w_geocode': 'w_geoid', 'h_geocode': 'h_geoid'}, axis=1)

In [29]:
od_2018 = od_2018.drop(['createdate',], axis=1)

MemoryError: Unable to allocate 4.24 GiB for an array with shape (10, 56867864) and data type int64

# Cleaning Staging Data

In [None]:
db18 = db18.drop(['type','yr'], axis=1)

In [None]:
db18['w_geoid'] = [i.lstrip('0') for i in db18['w_geoid']]

In [None]:
db18['h_geoid'] = [i.lstrip('0') for i in db18['h_geoid']]

# Filtering for just SD Data (Source Data)

In [None]:
f_od_2018 = od_2018 [od_2018['w_geoid'].astype(str).str.startswith('6073') | od_2018['h_geoid'].astype(str).str.startswith('6073')]

# Filtering for just SD Data (Staging Data)

In [None]:
f_db18 = db18[db18['w_geoid'].astype(str).str.startswith('6073') | db18['h_geoid'].astype(str).str.startswith('6073')]

# Comparison 

In [None]:
f_db18.columns == f_od_2018.columns

# Checking the shape of the data

In [None]:
print(f_od_2018.shape)
print(f_db18.shape)

In [None]:
f_od_2018.shape[0] - f_db18.shape[0]

# Checking Length

In [None]:
print(len(f_od_2018))
print(len(f_db18))

# To Numeric and Filter by Geoid

In [None]:
f_od_2018 = f_od_2018.apply(pd.to_numeric)
f_od_2018 = f_od_2018.sort_values(by=['w_geoid'])

In [None]:
f_db18 = f_db18.apply(pd.to_numeric)
f_db18 = f_db18.sort_values(by=['w_geoid'])

# Count the number of unique IDs

In [None]:
print(f_od_2018['w_geoid'].nunique())
print(f_db18['w_geoid'].nunique())

# Check if stats are the same

In [None]:
f_od_2018.describe() == f_db18.describe()

# Round std to make sure they are identical

In [None]:
# Since this prints out false we see that the rounded version of the standard deviations are all the same
for i in f_od_2018.describe().loc['std',:].round() == f_db18.describe().loc['std',:].round():
    if i == False:
        print(False)