In [1]:
import pandas as pd
import numpy as np
import sys
import sqlalchemy as sql
import re
from tqdm import tqdm
import zipfile
import os
import glob
import polars as pl
import json
import glob

In [2]:
# Connection settings for the SQL Server instance queried by the helpers in later cells.
server = 'ddamwsql16'
database = 'acs'
driver = 'ODBC Driver 17 for SQL Server'

# trusted_connection=yes uses Windows-integrated authentication, so no credentials appear in the URL.
engine = sql.create_engine('mssql+pyodbc://@{}/{}?trusted_connection=yes&driver={}'.format(server, database, driver))
# Single shared connection; passed as `conn` to every query helper below.
conn = engine.connect()

In [3]:
# Scope of the validation run.
# '1y' / '5y' select the "<year> 1yr PUMS" and "<start>-<end> 5yr PUMS" source folders and the matching tables.
year_ranges = ['1y', '5y']
# Release years checked in this run; swap in one of the commented batches to check other years.
years = ['2010', '2011']
# years = ['2012', '2013']
# years = ['2014', '2015']
# years = ['2016', '2017']
# years = ['2018', '2019']
# years = ['2020', '2021', '2022']
# Each release has a persons table and a households table.
data_types = ['persons', 'households']

# Row and Column Comparisons

In [None]:
def data_dictionary(r, y, d='both'):
    """Download a Census PUMS data dictionary and split it into housing and person variables.

    Parameters
    ----------
    r : str
        '5y' selects the dictionary named ``{y-4}-{y}``; any other value selects
        the single-year dictionary named ``{y}``.
    y : str or int
        Final year of the release (converted with ``int``).
    d : str, default 'both'
        'households' or 'persons' returns only that section; any other value
        returns both sections.

    Returns
    -------
    tuple
        For a single section: ``(names, casting)`` where ``names`` is a set of
        ``(Var, Dtypes, Length)`` tuples and ``casting`` maps ``Var`` to ``str``
        (Type 'C') or ``float`` (anything else).
        For both: ``((housing_names, person_names), (housing_casting, person_casting))``.

    Raises
    ------
    IndexError
        If the dictionary does not contain a second 'RT' NAME row, which is used
        as the boundary between the housing and person sections.
    """
    pums_year = int(y)

    # Only the year span in the file name differs between the 1-year and 5-year dictionaries.
    span = f'{pums_year-4}-{pums_year}' if r == '5y' else f'{pums_year}'
    dd = pd.read_csv(f'https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_{span}.csv',
                     header=None, names=['Name/Val', 'Var', 'Type', 'Length', 'Desc', 'Values', 'Val Desc'])

    # Python casting type and SQL-style type name for every row.
    dd['cast_dtypes'] = dd['Type'].apply(lambda x: str if x == 'C' else float)
    dd['Dtypes'] = dd['Type'].replace({'C': 'varchar', 'N': 'float'})

    # Keep only variable-definition rows (as opposed to value rows).
    name_rows = dd[dd['Name/Val'] == 'NAME']
    dd_names = name_rows[['Var', 'Dtypes', 'Length']].reset_index(drop=True)
    dd_cast_types = name_rows[['Var', 'cast_dtypes']].reset_index(drop=True)

    # Get index of second RT variable to split between households and persons
    p_ind = dd_names[dd_names['Var'] == 'RT'].index[1]

    def _rows(frame):
        # Iterate the frame directly: .squeeze() would turn a one-row slice into a
        # Series, which has no itertuples().
        return frame.itertuples(index=False, name=None)

    dd_housing = set(_rows(dd_names[:p_ind]))
    # Built straight from the row tuples so a repeated Var deterministically keeps its last entry.
    dd_housing_casting = dict(_rows(dd_cast_types[:p_ind]))

    dd_persons = set(_rows(dd_names[p_ind:]))
    dd_persons_casting = dict(_rows(dd_cast_types[p_ind:]))

    if d == 'households':
        return dd_housing, dd_housing_casting
    elif d == 'persons':
        return dd_persons, dd_persons_casting
    return (dd_housing, dd_persons), (dd_housing_casting, dd_persons_casting)

def get_pums_database_data(r, y, d, conn):
    """Return the row count and column names of one PUMS table in the database.

    Parameters
    ----------
    r : str
        '1y' or '5y'; with '1y' the table is ``{r}_{y}_{d}``, otherwise
        ``{r}_{y-4}_{y}_{d}``.
    y : str
        Final year of the release.
    d : str
        Table suffix ('persons' or 'households' in this notebook).
    conn
        Connection handed to ``pd.read_sql_query``.

    Returns
    -------
    tuple
        ``(rows, columns)``: ``rows`` is ``count(serialno)`` as an int,
        ``columns`` is the list of COLUMN_NAME values from INFORMATION_SCHEMA.
    """
    table = '{}_{}_{}'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d)

    rows_query = """
    SELECT count(serialno)
    FROM [acs].[pums].[{}]
    """.format(table)

    columns_query = """
    SELECT COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_CATALOG = 'acs'
        AND TABLE_SCHEMA = 'pums'
        AND TABLE_NAME = '{}'
    """.format(table)

    # Pick the single result column by position: .squeeze() would collapse a one-row
    # result to a bare scalar that has no .to_list().
    columns = pd.read_sql_query(columns_query, con=conn).iloc[:, 0].to_list()
    # .iloc[0, 0] instead of .iloc[0][0]: integer keys on a labelled Series are deprecated.
    rows = pd.read_sql_query(rows_query, con=conn).iloc[0, 0]

    print('----database complete: [{}]'.format(table))
    return (int(rows), columns)
          

def get_pums_source_data(r, y, d, path=None):
    """Read every CSV in a raw PUMS zip archive and record its row count and columns.

    Parameters
    ----------
    r : str
        '1y' or '5y'; selects the "<y> 1yr PUMS" or "<y-4>-<y> 5yr PUMS" folder.
    y : str
        Final year of the release.
    d : str
        'households' reads ``csv_hus.zip``; any other value reads ``csv_pus.zip``.
    path : str, optional
        Directory holding the per-release folders. Defaults to the project's
        OneDrive data directory.

    Returns
    -------
    dict
        Maps each ``.csv`` member name to ``(row_count, column_names)``.
    """
    if path is None:
        path = r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2024\2024-009 ACS PUMS ETL 1Y and 5Y 2010 - 2022\Data'

    # Build the archive path and the log label once instead of re-formatting them per use.
    archive = path + r"\{} {}r PUMS\raw\csv_{}.zip".format(y if r == '1y' else '{}-{}'.format(str(int(y)-4), y), r, 'hus' if d =='households' else 'pus')
    label = '{}_{}_{}'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d)
    print(archive)

    dfs = {}
    with zipfile.ZipFile(archive) as z:
        for text_file in z.infolist():
            if not text_file.filename.endswith('.csv'):
                continue
            # ZipFile.read() returns the member's bytes and closes the member handle itself.
            df = pl.read_csv(z.read(text_file),
                             dtypes = {"SERIALNO":str},
                             null_values = ['N.A.'])
            dfs[text_file.filename] = (int(df.shape[0]), df.columns)
            print(f"--------source complete: [{label}] - {text_file.filename}")
    print(f"----source complete: [{label}]")
    return dfs

In [None]:
# Gather (row count, column names) from the database and from the raw archives,
# keyed as outputs[side][year_range][year][data_type].
outputs = {'database': {}, 'source': {}}

for year_range in year_ranges:
    for side in outputs:
        outputs[side][year_range] = {}
    for year in years:
        # Release/year combinations this notebook excludes from the check.
        if (year_range, year) in (('1y', '2020'), ('5y', '2020'), ('5y', '2021')):
            continue
        db_results = outputs['database'][year_range][year] = {}
        source_results = outputs['source'][year_range][year] = {}
        for data_type in data_types:
            db_results[data_type] = get_pums_database_data(year_range, year, data_type, conn)
            source_results[data_type] = get_pums_source_data(year_range, year, data_type)

In [None]:
# Save the collected results as JSON, then reload them so the comparison cells
# below work from the file contents (tuples come back as lists).
with open('output.txt', 'w') as convert_file:
    json.dump(outputs, convert_file)

with open('output.txt') as f_in:
    outputs = json.loads(f_in.read())

In [None]:
# Row check: the table's row count must equal the summed row counts of all source CSVs.
for year_range in year_ranges:
    for year in years:
        # Release/year combinations this notebook excludes from the check.
        if (year_range, year) in (('1y', '2020'), ('5y', '2020'), ('5y', '2021')):
            continue
        for data_type in data_types:
            database_sum = outputs['database'][year_range][year][data_type][0]
            source_sum = sum(
                entry[0] for entry in outputs['source'][year_range][year][data_type].values()
            )
            if database_sum != source_sum:
                print('-'*8 + f'Mismatch for [{year_range}_{year}_{data_type}]' + '-'*8)
            else:
                print(f'Row Match for [{year_range}_{year}_{data_type}]' )

In [None]:
# Column check: every column found in the source CSVs must exist in the database table.
for year_range in year_ranges:
    for year in years:
        # Release/year combinations this notebook excludes from the check.
        if (year_range, year) in (('1y', '2020'), ('5y', '2020'), ('5y', '2021')):
            continue
        for data_type in data_types:
            database_cols = set(outputs['database'][year_range][year][data_type][1])
            source_cols = set()
            for key in outputs['source'][year_range][year][data_type].keys():
                # In-place union. A bare `source_cols | ...` discards its result, which left
                # source_cols empty and made every table report a match.
                source_cols |= set(outputs['source'][year_range][year][data_type][key][1])
            missing_cols = source_cols - database_cols
            if not missing_cols:
                print(f'Match for [{year_range}_{year}_{data_type}]' )
            else:
                print('-'*8 + f'Mismatch for [{year_range}_{year}_{data_type}]' + '-'*8)
                print('source columns missing from database:', sorted(missing_cols))

# Row Value Comparisons

In [None]:
def get_source_sample(r, y, d):
    """Return the first 10,000 rows of every CSV in a raw PUMS archive as one pandas frame.

    Parameters
    ----------
    r : str
        '1y' or '5y'; selects the "<y> 1yr PUMS" or "<y-4>-<y> 5yr PUMS" folder.
    y : str
        Final year of the release.
    d : str
        'households' reads ``csv_hus.zip``; any other value reads ``csv_pus.zip``.
        For 'persons', SPORDER is read as text in addition to SERIALNO.

    Returns
    -------
    pandas.DataFrame
        Concatenated samples with lower-cased column names (index not reset).

    Raises
    ------
    ValueError
        If the archive contains no ``.csv`` member.
    """
    path = r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2024\2024-009 ACS PUMS ETL 1Y and 5Y 2010 - 2022\Data'
    archive = path + r"\{} {}r PUMS\raw\csv_{}.zip".format(y if r == '1y' else '{}-{}'.format(str(int(y)-4), y), r, 'hus' if d =='households' else 'pus')

    # The same overrides apply to every member, so build them once.
    dtype_convert = {"SERIALNO":str}
    if d == 'persons':
        dtype_convert['SPORDER'] = str

    samples = []
    with zipfile.ZipFile(archive) as z:
        for text_file in z.infolist():
            if text_file.filename.endswith('.csv'):
                # ZipFile.read() returns the member's bytes and closes the member handle itself.
                df = pl.read_csv(z.read(text_file),
                                 dtypes = dtype_convert,
                                 null_values = ['N.A.']).lazy()
                samples.append(df.limit(n=10000).collect().to_pandas())

    if not samples:
        # Previously this fell through to `None.columns` and raised an opaque AttributeError.
        raise ValueError('no .csv members found in ' + archive)

    # One concat at the end instead of re-copying the accumulated frame for every member.
    sample_df = pd.concat(samples)
    sample_df.columns = sample_df.columns.str.lower()
    print('----source sample: [{}_{}_{}]'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d))
    return sample_df
    
def get_db_sample(r, y, d, serialnos, conn):
    """Fetch the database rows that correspond to a set of sampled source rows.

    Parameters
    ----------
    r, y, d : str
        Identify the table ``{r}_{y}_{d}`` ('1y') or ``{r}_{y-4}_{y}_{d}`` (otherwise).
    serialnos : iterable of str
        Already-quoted SQL string literals. For 'persons' each literal is
        ``'SERIALNO|SPORDER'``; otherwise it is just ``'SERIALNO'``.
    conn
        Connection handed to ``pd.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        Matching rows with lower-cased column names.
    """
    serialnos = list(serialnos)
    table = '{}_{}_{}'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d)

    if not serialnos:
        # `IN ()` is not valid SQL; an always-false predicate still returns the table's columns.
        predicate = '1 = 0'
    elif d == 'persons':
        # The persons query used to have no WHERE clause at all and pulled the entire table.
        # Filter on the composite key in the same 'SERIALNO|SPORDER' form compare_samples builds.
        # NOTE(review): assumes SPORDER renders identically on both sides - confirm.
        predicate = "CONCAT(SERIALNO, '|', SPORDER) IN ({})".format(','.join(serialnos))
    else:
        predicate = 'SERIALNO IN ({})'.format(','.join(serialnos))

    rows_query = """
    SELECT *
    FROM [acs].[pums].[{}]
    WHERE {}
    """.format(table, predicate)
    rows = pd.read_sql_query(rows_query, con=conn)

    print('----database sample: [{}]'.format(table))
    rows.columns = rows.columns.str.lower()
    return rows

def compare_samples(r, y, d, conn):
    """Compare a sample of raw source rows against the same rows in the database.

    The source sample comes from ``get_source_sample`` and the matching rows from
    ``get_db_sample``. Both frames are sorted and indexed on the key
    (``serialno``, plus ``sporder`` for persons). Any column that differs in a
    first cell-by-cell pass is converted to numeric on both sides and compared
    again, presumably so formatting-only differences are not counted.

    Parameters
    ----------
    r, y, d : str
        Release type, final year and data type identifying archive and table.
    conn
        Connection forwarded to ``get_db_sample``.

    Returns
    -------
    str
        ``"<n> mismatches found in <r>_<y>_<d>"``.
    """
    source = get_source_sample(r, y, d)

    key = ['serialno']
    if d == 'persons':
        key.append('sporder')

    # Quoted SQL literals identifying the sampled rows.
    if d == "persons":
        serialnos = map(lambda x: "'"+str(x[0])+'|'+str(x[1])+"'", source[key].values)
    else:
        serialnos = map(lambda x: "'" + x + "'", source['serialno'].to_list())
    print('serialno completed')

    db = get_db_sample(r, y, d, serialnos, conn)

    compare_source = source.sort_values(key).set_index(key)
    # Cast the database rows to the source dtypes so equal values compare equal.
    compare_db = db.astype(compare_source.dtypes).sort_values(key).set_index(key)

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    # NaN is swapped for a sentinel because NaN != NaN would count as a mismatch.
    compare = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA'))
    change_cols = [col for col in compare.columns if compare[col].sum() > 0]

    # Second pass: re-compare the differing columns as numbers.
    compare_db[change_cols] = compare_db[change_cols].apply(pd.to_numeric)
    compare_source[change_cols] = compare_source[change_cols].apply(pd.to_numeric)
    mismatch = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA')).sum().sum()

    return "{} mismatches found in {}_{}_{}".format(mismatch, r, y, d)

In [None]:
# Run the sampled row-value comparison for the 1-year releases only.
for year_range in ['1y']:
    for year in years:
        # Release/year combinations this notebook excludes from the check.
        if (year_range, year) in (('1y', '2020'), ('5y', '2020'), ('5y', '2021')):
            continue
        for data_type in data_types:
            print(compare_samples(year_range, year, data_type, conn))

# Summary value comparisons

In [4]:
def get_db_summary(r, y, d, conn):
    """Return column totals for one PUMS table, for comparison with the source totals.

    Households are summarised by NP and VEH, persons by SPORDER and WKHP. These
    are the same columns ``get_pums_summary`` totals on the source side. The
    persons query previously summed OIP, so ``compare_summaries`` could not find
    a 'SPORDER' entry in this function's result.

    Parameters
    ----------
    r, y, d : str
        Identify the table ``{r}_{y}_{d}`` ('1y') or ``{r}_{y-4}_{y}_{d}`` (otherwise).
    conn
        Connection handed to ``pd.read_sql_query``.

    Returns
    -------
    dict
        Maps each summarised column name to its ``SUM(CAST(col AS INT))``.

    Raises
    ------
    ValueError
        If ``d`` is neither 'households' nor 'persons'.
    """
    summary_columns = {'households': ('NP', 'VEH'), 'persons': ('SPORDER', 'WKHP')}
    if d not in summary_columns:
        # Previously an unknown d left rows_query unbound and failed with UnboundLocalError.
        raise ValueError("d must be 'households' or 'persons', got {!r}".format(d))

    table = '{}_{}_{}'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d)
    select_list = ', '.join('SUM(CAST({0} AS INT)) AS {0}'.format(col) for col in summary_columns[d])
    rows_query = """
        SELECT {}
          FROM [acs].[pums].[{}]
        """.format(select_list, table)

    rows = pd.read_sql_query(rows_query, con=conn)
    # The query returns a single row; take each total by position.
    summary_counts = {col: rows[col].iloc[0] for col in rows.columns}

    print('----database sample: [{}]'.format(table))
    return summary_counts


def get_pums_summary(r, y, d):
    """Total two check columns across every CSV in a raw PUMS archive.

    Households are totalled on NP and VEH, persons on SPORDER and WKHP. A total
    that comes back as None for a file is counted as 0.

    Parameters
    ----------
    r : str
        '1y' or '5y'; selects the "<y> 1yr PUMS" or "<y-4>-<y> 5yr PUMS" folder.
    y : str
        Final year of the release.
    d : str
        'households' reads ``csv_hus.zip``; any other value reads ``csv_pus.zip``.

    Returns
    -------
    dict or None
        Column name -> running total, or None when the archive has no ``.csv``
        member or ``d`` is neither 'households' nor 'persons'.
    """
    path = r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2024\2024-009 ACS PUMS ETL 1Y and 5Y 2010 - 2022\Data'
    span = y if r == '1y' else '{}-{}'.format(str(int(y)-4), y)
    suffix = 'hus' if d =='households' else 'pus'

    # Columns to total for each data type; None for anything else.
    tracked = {'households': ['NP', 'VEH'], 'persons': ['SPORDER', 'WKHP']}.get(d)

    summary_counts = None
    with zipfile.ZipFile(path + r"\{} {}r PUMS\raw\csv_{}.zip".format(span, r, suffix)) as z:
        for member in z.infolist():
            if not member.filename.endswith('.csv'):
                continue
            lazy_frame = pl.read_csv(z.open(member, mode='r').read(),
                                     dtypes = {'SERIALNO':str},
                                     null_values=['N.A.']).lazy()
            if tracked is None:
                continue

            counts = lazy_frame.select(tracked).sum().collect()
            if summary_counts is None:
                summary_counts = dict.fromkeys(tracked, 0)

            for column in tracked:
                total = counts[column][0]
                summary_counts[column] += total if total is not None else 0
    return summary_counts


def compare_summaries(r, y, d, conn):
    """Check whether the source and database column totals agree.

    Prints both summaries, then returns a dict mapping every key of the source
    summary to True when the database summary holds the same value for it.
    """
    source_summary = get_pums_summary(r, y, d)
    db_summary = get_db_summary(r, y, d, conn)

    print(source_summary)
    print(db_summary)
    return {
        column: source_summary[column] == db_summary[column]
        for column in source_summary.keys()
    }

In [None]:
# Re-scope the run for the summary check: 5-year releases for 2021 and 2022.
# Note that the loop in the next cell skips the ('5y', '2021') combination.
# year_ranges = ['1y']
year_ranges = ['5y']
# years = ['2010', '2011']
# years = ['2012', '2013']
# years = ['2014', '2015']
# years = ['2016', '2017']
# years = ['2018', '2019']
# years = ['2020']
years = ['2021', '2022']
data_types = ['persons', 'households']

In [None]:
# Print the source-side column totals for every selected release and data type.
for year_range in year_ranges:
    for year in years:
        # Release/year combinations this notebook excludes from the check.
        if (year_range, year) in (('1y', '2020'), ('5y', '2020'), ('5y', '2021')):
            continue
        for data_type in data_types:
            print(get_pums_summary(year_range, year, data_type))
            span = year if year_range == '1y' else '{}_{}'.format(str(int(year)-4), year)
            print('----source sample: [{}_{}_{}]'.format(year_range, span, data_type))

# Specific checks for the outlier J: drive data (2017-2021 5-year persons files)

In [64]:
# Lazily scan the four person files (psam_pusa ... psam_pusd) with identical options.
raw_dir = r'J:\DataScience\DataScience&Analytics\PUMS ETL\2017-2021 5yr PUMS\raw'
a, b, c, d_df = (
    pl.scan_csv(raw_dir + '\\psam_pus' + part + '.csv',
                dtypes = {'SERIALNO':str},
                null_values=['N.A.'])
    for part in 'abcd'
)

In [65]:
# Materialise file A to get its (rows, columns) shape.
a.collect().shape

(4579396, 287)

In [66]:
# Materialise file B to get its (rows, columns) shape.
b.collect().shape

(3382643, 287)

In [67]:
# Materialise file C to get its (rows, columns) shape.
c.collect().shape

(3640306, 287)

In [68]:
# Materialise file D to get its (rows, columns) shape.
d_df.collect().shape

(3935440, 287)

In [69]:
# All four person files are expected to expose the same column list as file A.
print(a.columns == b.columns)
print(a.columns == c.columns)
print(a.columns == d_df.columns)


# Column names recorded in INFORMATION_SCHEMA for the 5y_2017_2021_persons table.
q = """
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = '5y_2017_2021_persons'
"""

dbq = set(pd.read_sql_query(q, con=conn).COLUMN_NAME)

# True when the union adds nothing, i.e. every source column already exists in the table.
len(dbq) == len(dbq.union(set(a.columns)))

True
True
True


True

In [70]:
# Total the check columns across the four lazily scanned person files.
d = 'persons'
summary_counts = None
# Columns to total for each data type; None for anything else.
tracked = {'households': ['NP', 'VEH'], 'persons': ['SPORDER', 'WKHP']}.get(d)
for df in [a, b, c, d_df]:
    if tracked is None:
        continue
    counts = df.select(tracked).sum().collect()
    if summary_counts is None:
        summary_counts = dict.fromkeys(tracked, 0)

    # A total that comes back as None is counted as 0.
    for column in tracked:
        total = counts[column][0]
        summary_counts[column] += total if total is not None else 0
summary_counts

{'SPORDER': 32172156, 'WKHP': 304592879}

In [71]:
def get_source_sample(lazies):
    """Take an equal share of a 10,000-row budget from the head of each lazy frame.

    Each element of ``lazies`` must support ``.limit(n=...).collect().to_pandas()``.
    The per-frame pandas samples are concatenated in order (index not reset) and
    the column names are lower-cased.
    """
    per_frame = 10000//len(lazies)
    pieces = (lazy.limit(n=per_frame).collect().to_pandas() for lazy in lazies)

    # Start from the first sample, then append the remaining ones one at a time.
    sample_df = next(pieces)
    for piece in pieces:
        sample_df = pd.concat([sample_df, piece])

    sample_df.columns = sample_df.columns.str.lower()
    return sample_df
    
def get_db_sample(r, y, d, serialnos, conn):
    """Fetch the database rows that correspond to a set of sampled source rows.

    ``serialnos`` holds already-quoted SQL literals. They are matched against
    SERIALNO, or for 'persons' against ``CONCAT(SERIALNO, '|', SPORDER)``. The
    query text is printed before it runs. Returns the rows as a pandas frame
    with lower-cased column names.
    """
    id_list = ','.join(list(serialnos))
    table = '{}_{}_{}'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d)

    household_sql = """
    SELECT * FROM [acs].[pums].[{0}] WHERE SERIALNO IN
    (SELECT SERIALNO
    FROM [acs].[pums].[{0}]
    WHERE SERIALNO IN ({1}))
    """
    person_sql = """
        SELECT *
        FROM [acs].[pums].[{0}]
        WHERE CONCAT(SERIALNO, '|', SPORDER) IN ({1})
        """

    template = person_sql if d == 'persons' else household_sql
    rows_query = template.format(table, id_list)
    print(rows_query)
    rows = pd.read_sql_query(rows_query, con=conn)

    print('----database sample: [' + table + ']')
    rows.columns = rows.columns.str.lower()
    return rows

def compare_samples(r, y, d, conn, lazies=None):
    """Compare a sample of raw source rows against the same rows in the database.

    Parameters
    ----------
    r, y, d : str
        Release type, final year and data type identifying the table (and the
        archive, when ``lazies`` is not given).
    conn
        Connection forwarded to ``get_db_sample``.
    lazies : list, optional
        Lazy frames to sample with ``get_source_sample(lazies)``, the signature
        defined alongside this function. When omitted, the archive-based
        ``get_source_sample(r, y, d)`` signature is used, as before.

    Returns
    -------
    str
        ``"<n> mismatches found in <r>_<y>_<d>"``.
    """
    # get_source_sample takes a list of lazy frames in this cell and (r, y, d) in
    # others; pick the call that matches what the caller supplied.
    if lazies is not None:
        source = get_source_sample(lazies)
    else:
        source = get_source_sample(r, y, d)

    key = ['serialno']
    if d == 'persons':
        key.append('sporder')

    # Quoted SQL literals identifying the sampled rows.
    if d == "persons":
        serialnos = map(lambda x: "'"+str(x[0])+'|'+str(x[1])+"'", source[key].values)
    else:
        serialnos = map(lambda x: "'" + x + "'", source['serialno'].to_list())
    print('serialno completed')
    db = get_db_sample(r, y, d, serialnos, conn)

    compare_source = source.sort_values(key).set_index(key)
    # Cast the database rows to the source dtypes so equal values compare equal.
    compare_db = db.astype(compare_source.dtypes).sort_values(key).set_index(key)

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    # NaN is swapped for a sentinel because NaN != NaN would count as a mismatch.
    compare = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA'))
    change_cols = [col for col in compare.columns if compare[col].sum() > 0]

    # Second pass: re-compare the differing columns as numbers.
    compare_db[change_cols] = compare_db[change_cols].apply(pd.to_numeric)
    compare_source[change_cols] = compare_source[change_cols].apply(pd.to_numeric)
    mismatch = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA')).sum().sum()

    return "{} mismatches found in {}_{}_{}".format(mismatch, r, y, d)

In [72]:
# NOTE: despite its name, `lazies` holds the concatenated pandas sample returned by
# get_source_sample, not the lazy frames themselves.
lazies = get_source_sample([a, b, c, d_df])

In [73]:
# Target table: 5-year release ending in 2021, persons (5y_2017_2021_persons).
r = '5y'
y = '2021'
d = 'persons'


# serialnos = list(map(lambda x: "'" + str(x) + "'", lazies['serialno'].to_list()))
# db_lazies = get_db_sample(r, y, d, serialnos[:5000], conn)

# Build quoted 'SERIALNO|SPORDER' literals (SPORDER rendered as an int) for the sampled
# rows; only the first 5000 are sent to the database.
serialnos = list(map(lambda x: "'"+str(x[0])+'|'+str(int(x[1]))+"'", lazies[['serialno', 'sporder']].values))
db_lazies = get_db_sample(r, y, d, serialnos[:5000], conn)


        SELECT *
        FROM [acs].[pums].[5y_2017_2021_persons]
        WHERE CONCAT(SERIALNO, '|', SPORDER) IN ('2017000000016|1','2017000000031|1','2017000000061|1','2017000000061|2','2017000000061|3','2017000000158|1','2017000000158|2','2017000000158|3','2017000000158|4','2017000000158|5','2017000000159|1','2017000000159|2','2017000000159|3','2017000000174|1','2017000000174|2','2017000000174|3','2017000000222|1','2017000000436|1','2017000000436|2','2017000000573|1','2017000000573|2','2017000000573|3','2017000000645|1','2017000000654|1','2017000000654|2','2017000000654|3','2017000000654|4','2017000000654|5','2017000000691|1','2017000000691|2','2017000000691|3','2017000000691|4','2017000000691|5','2017000000695|1','2017000000695|2','2017000000695|3','2017000000695|4','2017000000695|5','2017000000776|1','2017000000776|2','2017000000780|1','2017000000799|1','2017000000799|2','2017000000812|1','2017000000812|2','2017000001036|1','2017000001036|2','2017000001036|3','2017000001331|1','2

----database sample: [5y_2017_2021_persons]


In [74]:
# Compare the source sample (`lazies`) with the rows fetched from the database (`db_lazies`).
key = ['serialno']
if d == 'persons':
    key.append('sporder')

compare_source = lazies.sort_values(key).set_index(key)
# Cast the database rows to the source dtypes so equal values compare equal.
compare_db = db_lazies.astype(compare_source.dtypes).sort_values(key).set_index(key)

# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
# NaN is swapped for a sentinel because NaN != NaN would count as a mismatch.
compare = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA'))
change_cols = [col for col in compare.columns if compare[col].sum() > 0]

# Second pass: re-compare the differing columns as numbers.
compare_db[change_cols] = compare_db[change_cols].apply(pd.to_numeric)
compare_source[change_cols] = compare_source[change_cols].apply(pd.to_numeric)
mismatch = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA')).sum().sum()

"{} mismatches found in {}_{}_{}".format(mismatch, r, y, d)

'0 mismatches found in 5y_2021_persons'

In [75]:
def get_source_sample(r, y, d):
    """Sample about 10,000 rows in total from the CSVs of a raw PUMS archive.

    The 10,000-row budget is split evenly across the archive's ``.csv`` members
    and the head of each member is taken.

    Parameters
    ----------
    r : str
        '1y' or '5y'; selects the "<y> 1yr PUMS" or "<y-4>-<y> 5yr PUMS" folder.
    y : str
        Final year of the release.
    d : str
        'households' reads ``csv_hus.zip``; any other value reads ``csv_pus.zip``.
        For 'persons', SPORDER is read as text in addition to SERIALNO.

    Returns
    -------
    pandas.DataFrame
        Concatenated samples with lower-cased column names (index not reset).

    Raises
    ------
    ValueError
        If the archive contains no ``.csv`` member.
    """
    path = r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2024\2024-009 ACS PUMS ETL 1Y and 5Y 2010 - 2022\Data'
    archive = path + r"\{} {}r PUMS\raw\csv_{}.zip".format(y if r == '1y' else '{}-{}'.format(str(int(y)-4), y), r, 'hus' if d =='households' else 'pus')

    # The same overrides apply to every member, so build them once.
    dtype_convert = {"SERIALNO":pl.String}
    if d == 'persons':
        dtype_convert['SPORDER'] = pl.String

    samples = []
    with zipfile.ZipFile(archive) as z:
        csv_members = [member for member in z.infolist() if member.filename.endswith('.csv')]
        if not csv_members:
            # Previously this surfaced as a ZeroDivisionError from the per-file row budget.
            raise ValueError('no .csv members found in ' + archive)
        N = 10000//len(csv_members)

        for text_file in csv_members:
            # ZipFile.read() returns the member's bytes and closes the member handle itself.
            df = pl.read_csv(z.read(text_file),
                             dtypes = dtype_convert,
                             null_values = ['N.A.', ' ']).lazy()
            samples.append(df.limit(n=N).collect().to_pandas())

    # One concat at the end instead of re-copying the accumulated frame for every member.
    sample_df = pd.concat(samples)
    sample_df.columns = sample_df.columns.str.lower()
    print('----source sample: [{}_{}_{}]'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d))
    return sample_df
    
def get_db_sample(r, y, d, serialnos, conn):
    """Fetch the database rows that correspond to a set of sampled source rows.

    Parameters
    ----------
    r, y, d : str
        Identify the table ``{r}_{y}_{d}`` ('1y') or ``{r}_{y-4}_{y}_{d}`` (otherwise).
    serialnos : iterable of str
        Already-quoted SQL string literals. For 'persons' each literal is
        ``'SERIALNO|SPORDER'`` with SPORDER rendered as an integer; otherwise it
        is just ``'SERIALNO'``.
    conn
        Connection handed to ``pd.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        Matching rows with lower-cased column names.
    """
    serialnos = list(serialnos)
    table = '{}_{}_{}'.format(r, y if r == '1y' else '{}_{}'.format(str(int(y)-4), y), d)

    if not serialnos:
        # `IN ()` is not valid SQL; an always-false predicate still returns the table's columns.
        predicate = '1 = 0'
    elif d == 'persons':
        # CAST drops any zero padding or decimal part so SPORDER matches the integer in the literal.
        predicate = "CONCAT(SERIALNO, '|', CAST(SPORDER AS INT)) IN ({})".format(','.join(serialnos))
    else:
        # A direct IN filter selects the same rows as the former nested
        # `SERIALNO IN (SELECT SERIALNO ... WHERE SERIALNO IN (...))`.
        predicate = 'SERIALNO IN ({})'.format(','.join(serialnos))

    rows_query = """
    SELECT *
    FROM [acs].[pums].[{}]
    WHERE {}
    """.format(table, predicate)
    print(rows_query)
    rows = pd.read_sql_query(rows_query, con=conn)

    print('----database sample: [{}]'.format(table))
    rows.columns = rows.columns.str.lower()
    return rows

def compare_samples(r, y, d, conn):
    """Compare a sample of raw source rows against the same rows in the database.

    Follows the steps the manual check cells in this notebook use for persons:
    SPORDER is normalised to an integer on both sides (the database filter
    compares ``CAST(SPORDER AS INT)``), and literal "N.A." strings in the
    database values are treated as missing before the numeric re-comparison.

    Parameters
    ----------
    r, y, d : str
        Release type, final year and data type identifying archive and table.
    conn
        Connection forwarded to ``get_db_sample``.

    Returns
    -------
    str
        ``"<n> mismatches found in <r>_<y>_<d>"``.
    """
    source = get_source_sample(r, y, d)

    key = ['serialno']
    if d == 'persons':
        key.append('sporder')
        # SPORDER is read from the CSV as text; make it an int so the key literals
        # match CAST(SPORDER AS INT) in the database filter.
        source = source.astype({'sporder': int})
        serialnos = ["'" + str(serial) + '|' + str(order) + "'"
                     for serial, order in zip(source['serialno'], source['sporder'])]
    else:
        serialnos = ["'" + serial + "'" for serial in source['serialno'].to_list()]
    print('serialno completed')
    db = get_db_sample(r, y, d, serialnos, conn)
    if d == 'persons':
        db = db.astype({'sporder': int})

    compare_source = source.sort_values(key).set_index(key)
    # Cast the database rows to the source dtypes so equal values compare equal.
    compare_db = db.astype(source.dtypes).sort_values(key).set_index(key)

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    # NaN is swapped for a sentinel because NaN != NaN would count as a mismatch.
    compare = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA'))
    change_cols = [col for col in compare.columns if compare[col].sum() > 0]

    # Second pass: re-compare the differing columns as numbers. "N.A." in the
    # database is mapped to NaN first, mirroring null_values on the source read.
    compare_db[change_cols] = compare_db[change_cols].replace("N.A.", np.nan).apply(pd.to_numeric)
    compare_source[change_cols] = compare_source[change_cols].apply(pd.to_numeric)
    mismatch = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA')).sum().sum()

    return "{} mismatches found in {}_{}_{}".format(mismatch, r, y, d)

In [89]:
# Sample the 5-year release ending in 2016, persons (5y_2012_2016_persons).
# NOTE: `a` is rebound here from the lazy frame scanned earlier to a pandas sample.
r = '5y'
y = '2016'
d = 'persons'
a = get_source_sample(r, y, d)

----source sample: [5y_2012_2016_persons]


In [90]:
# serialnos = list(map(lambda x: "'" + str(x) + "'", a['serialno'].to_list()))
# b = get_db_sample(r, y, d, serialnos[:1000], conn)

# Build quoted 'SERIALNO|SPORDER' literals with SPORDER converted to int, then fetch the
# matching rows. NOTE: `b` is rebound here from a lazy frame to the database sample.
serialnos = list(map(lambda x: "'" +str(x[0])+'|'+str(int(x[1]))+"'", a[['serialno', 'sporder']].values))
b = get_db_sample(r, y, d, serialnos[:10000], conn)


        SELECT *
        FROM [acs].[pums].[5y_2012_2016_persons]
        WHERE CONCAT(SERIALNO, '|', CAST(SPORDER AS INT)) IN ('2012000000002|1','2012000000009|1','2012000000009|2','2012000000010|1','2012000000010|2','2012000000010|3','2012000000010|4','2012000000010|5','2012000000010|6','2012000000010|7','2012000000010|8','2012000000010|9','2012000000010|10','2012000000010|11','2012000000011|1','2012000000011|2','2012000000017|1','2012000000020|1','2012000000020|2','2012000000020|3','2012000000020|4','2012000000022|1','2012000000024|1','2012000000025|1','2012000000025|2','2012000000028|1','2012000000031|1','2012000000031|2','2012000000037|1','2012000000037|2','2012000000039|1','2012000000040|1','2012000000040|2','2012000000045|1','2012000000047|1','2012000000047|2','2012000000050|1','2012000000052|1','2012000000057|1','2012000000057|2','2012000000059|1','2012000000059|2','2012000000060|1','2012000000067|1','2012000000069|1','2012000000069|2','2012000000070|1','2012000000071|1','2012

----database sample: [5y_2012_2016_persons]


In [91]:
# Compare the source sample `a` with the database sample `b` on (serialno, sporder).
key = ['serialno']
if d == 'persons':
    key.append('sporder')

# sporder is normalised to int on both sides before the rows are aligned.
compare_source = a.astype({'sporder':int}).sort_values(key).set_index(key)
compare_db = b.astype({'sporder':int}).astype(a.dtypes).sort_values(key).set_index(key)

# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
# NaN is swapped for a sentinel because NaN != NaN would count as a mismatch.
compare = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA'))
change_cols = [col for col in compare.columns if compare[col].sum() > 0]

# Second pass: re-compare the differing columns as numbers, with literal "N.A."
# strings from the database treated as missing.
compare_db[change_cols] = compare_db[change_cols].replace("N.A.", np.nan).apply(pd.to_numeric)
compare_source[change_cols] = compare_source[change_cols].apply(pd.to_numeric)
mismatch = (compare_db.replace(np.nan, 'NA') != compare_source.replace(np.nan, 'NA')).sum().sum()

print("{} mismatches found in {}_{}_{}".format(mismatch, r, y, d))

0 mismatches found in 5y_2016_persons
