In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Functions

In [2]:
def download_data(path, year):
    '''Load the source CSV files and the staging SQL rows for a single year.

    Parameters
    ----------
    path : str
        Directory that is searched (non-recursively) for source files.
    year : str or int
        Year to load. It is used as a substring to match in the source file
        names and, converted to an integer, as the value of ``yr`` in the
        staging query.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(source_data, sql_data)``: every matching CSV concatenated into one
        frame, and the rows of ``[dpoe_stage].[veh_reg_dmv].[fact]`` whose
        ``yr`` equals ``year``.

    Raises
    ------
    ValueError
        If no file in ``path`` contains ``year`` in its name, or if ``year``
        cannot be converted to an integer.
    '''
    # Source Data (R Drive)
    # Sorted so that the row order of the concatenated frame is deterministic.
    files_list = sorted(glob.glob(path + f"/*{year}*"))
    if not files_list:
        # Fail before opening a database connection; pd.concat would raise a
        # far less helpful ValueError ("No objects to concatenate") otherwise.
        raise ValueError(f"No source files matching '*{year}*' found in '{path}'")
    df_list = [pd.read_csv(filename, index_col=None, header=0) for filename in files_list]
    source_data = pd.concat(df_list, axis=0, ignore_index=True)

    # Staging Data (SQL)
    # The year is bound as a parameter ("?") instead of being formatted into
    # the SQL text, and each clause ends with a space so the adjacent string
    # literals do not run together.
    query = ("SELECT * "
             "FROM [dpoe_stage].[veh_reg_dmv].[fact] "
             "WHERE yr = ?;")

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')
    try:
        sql_data = pd.read_sql_query(query, conn, params=[int(year)])
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    return source_data, sql_data

In [3]:
def clean_data(source_data, sql_data):
    '''Make the source and staging frames directly comparable.

    Parameters
    ----------
    source_data : pandas.DataFrame
        Data read from the source files. It is not modified; a relabelled
        shallow copy is returned instead.
    sql_data : pandas.DataFrame
        Data read from the staging table. Must contain the columns
        ``dmv_registration_id`` and ``own``. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(source_data, sql_data)`` where ``sql_data`` has had
        ``dmv_registration_id`` dropped and trailing carriage returns stripped
        from ``own``, and ``source_data`` carries the same column labels as
        the cleaned ``sql_data`` (assigned by position).

    Raises
    ------
    ValueError
        If the two frames do not have the same number of columns once
        ``dmv_registration_id`` has been dropped.
    '''
    sql_data = sql_data.drop('dmv_registration_id', axis=1) #This column doesn't exist in source data

    # Column labels are copied by position, which assumes the source layout
    # matches the staging table. A different column count cannot be relabelled,
    # so report it explicitly.
    if source_data.shape[1] != sql_data.shape[1]:
        raise ValueError(
            f"Column count mismatch: source data has {source_data.shape[1]} columns, "
            f"staging data has {sql_data.shape[1]} after dropping 'dmv_registration_id'"
        )

    # Relabel a shallow copy so the caller's frame keeps its original labels.
    source_data = source_data.copy(deep=False)
    source_data.columns = sql_data.columns

    sql_data['own'] = sql_data['own'].str.rstrip("\r") #Documented in findings
    return source_data, sql_data

In [4]:
def analyze_data(source_data, sql_data):
    '''Run the QC comparison between the source and staging frames.

    Returns ``True`` only when every check passes:
      * both frames have the same number of rows,
      * the number of rows whose ``own`` value is 'Personal', 'Commercial'
        or 'Government' matches between the two frames (checked per value),
      * both frames have the same shape.
    '''
    def _rows_owned_by(frame, owner):
        # Number of rows in `frame` whose 'own' column equals `owner`.
        return len(frame[frame['own'] == owner])

    # Every check is evaluated up front (no short-circuiting) so that a
    # missing 'own' column raises regardless of the other results.
    checks = [len(source_data) == len(sql_data)]
    for owner in ('Personal', 'Commercial', 'Government'):
        checks.append(_rows_owned_by(source_data, owner) == _rows_owned_by(sql_data, owner))
    checks.append(sql_data.shape == source_data.shape)

    return all(checks)

In [5]:
# Legacy code that was used to run a month check. The check has since changed slightly, so the commented-out function below can be disregarded.


# def month_check(df, year, source_or_sql):
#     if source_or_sql == 'source':
#         df = df[df['reg_date'] > int(f"{int(year)-2000}0000")]
#         df['reg_date'] = pd.to_datetime(df['reg_date'], format='%y%m%d')
#     else:
#         df['reg_date'] = pd.to_datetime(df['reg_date'], format='%Y-%m-%d')
#         df = sql_data[sql_data['reg_date'] >= f'{year}-01-01']
#     output = True
#     for month in range(1,13):
#         length_of_df = len(df.loc[df['reg_date'].dt.month == month])
#         if length_of_df == 0:
#             print(month)
#             output = False
#             break
#     return output

# Initialize Data:

In [6]:
# Years covered by the QC run (2010 through 2020 inclusive), stored as strings.
all_years = list(map(str, range(2010, 2021)))
# Directory that download_data searches for the source files.
path = r'R:/DPOE/Vehicle Registration/DMV/Restricted/Source_2020/2010-2020'
# One row per year; the 'Status' column starts empty and is filled in by the run loop.
results = pd.DataFrame(columns=['Status'], index=all_years)

# Run Code

In [7]:
# Run the QC check for each year and record 'Pass' or 'Fail' in `results`.
for year in all_years:
    source_data, sql_data = download_data(path, year)
    source_data, sql_data = clean_data(source_data, sql_data)
    # Assign through a single .loc call. Chained indexing
    # (results['Status'][year] = ...) can write to a temporary object and
    # leave `results` unchanged (always the case under pandas copy-on-write).
    results.loc[year, 'Status'] = 'Pass' if analyze_data(source_data, sql_data) else 'Fail'
    #print(results) # Uncomment if you would like status updates throughout
    del source_data, sql_data #Clear Outputs From Memory to make space
    gc.collect() # Reclaim the freed frames before loading the next year

results



Unnamed: 0,Status
2010,Pass
2011,Pass
2012,Pass
2013,Pass
2014,Pass
2015,Pass
2016,Pass
2017,Pass
2018,Pass
2019,Pass
