In [1]:
import os
import pandas as pd
import numpy as np
import pyodbc

# Data Pull Functions

In [2]:
def standard_format_sql_download(sql_file_name, geo_level, estimates_version):
    '''Run a templated SQL file against the estimates database and return the result.

    The query text is read from ``sql_queries/<sql_file_name>.sql``. Any
    ``{geo_level}`` and ``{estimates_version}`` placeholders in it are filled
    in with ``str.format`` before the query is executed.

    Args:
        sql_file_name: Name of the query file, without the ``.sql`` extension.
        geo_level: Value substituted for ``{geo_level}``; also used as the new
            name of the returned ``geo_level`` column.
        estimates_version: Value substituted for ``{estimates_version}``.

    Returns:
        pandas.DataFrame with the query result, where a column called
        ``geo_level`` (if present) has been renamed to the value of ``geo_level``.
    '''
    # Read the template first so a missing file fails before a connection is opened.
    with open(os.path.join('sql_queries', f'{sql_file_name}.sql'), 'r') as sql_file:
        sql_query = sql_file.read()

    # NOTE: the values are interpolated straight into the SQL text, so they must
    # come from trusted code (never from user input).
    sql_query = sql_query.format(geo_level=geo_level, estimates_version=estimates_version)

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
    try:
        df = pd.read_sql_query(sql_query, conn)
    finally:
        # Always release the connection; this function is called once per
        # table/geography pair and previously left every connection open.
        conn.close()

    return df.rename(columns={'geo_level': geo_level})

In [3]:
def standard_format_pivot(df, geo_level):
    '''Reshape a long table into one row per geography/year.

    Each distinct ``breakdown_value`` becomes its own column holding the
    matching ``value``; the ``geo_level`` column and ``yr_id`` are returned as
    ordinary columns and the columns axis is given an empty name.
    '''
    wide = df.pivot(index=[geo_level, 'yr_id'], columns=['breakdown_value'], values='value')
    # Replace the 'breakdown_value' label on the columns axis with an empty name.
    wide = wide.rename_axis(columns='')
    return wide.reset_index()

In [4]:
def export_to_j_drive(df, geo_level, estimates_version, sql_file_name):
    '''Write ``df`` as an Excel workbook (index omitted) into the QA folder on the J drive.

    The file goes into the ``individual_files\\<estimates_version>`` folder and is
    named ``<geo_level>_est_<estimates_version>_<sql_file_name>_ind_QA.xlsx``.
    '''
    destination = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}\{geo_level}_est_{estimates_version}_{sql_file_name}_ind_QA.xlsx'
    df.to_excel(destination, index=False)

In [5]:
def standard_format_output_creation(sql_file_name, geo_level, estimates_version):
    '''Download, pivot and export one standard-format table.

    NOTE: a later cell in this notebook defines another function with this same
    name; once that cell runs, it replaces this definition.

    Args:
        sql_file_name: Query file name (without ``.sql``); also used in the
            exported workbook's file name.
        geo_level: Geography level passed to the download, pivot and export steps.
        estimates_version: Estimates version passed to the download and export steps.

    Returns:
        The pivoted DataFrame that was exported.
    '''
    sql_output = standard_format_sql_download(sql_file_name=sql_file_name, geo_level=geo_level, estimates_version=estimates_version)

    pivoted_output = standard_format_pivot(df=sql_output, geo_level=geo_level)

    # export_to_j_drive requires sql_file_name to build the file name; leaving
    # it out made this call raise TypeError.
    export_to_j_drive(df=pivoted_output, geo_level=geo_level, estimates_version=estimates_version, sql_file_name=sql_file_name)

    return pivoted_output

# Specific Manipulations

In [6]:
def age_manipulations(df, geo_level, estimates_version):
    '''Return the geography, year and age-group columns of ``df`` in ascending age order.

    Only the listed columns are kept. ``estimates_version`` is not used here.
    '''
    age_groups = [
        'Under 5', '5 to 9', '10 to 14', '15 to 17', '18 and 19',
        '20 to 24', '25 to 29', '30 to 34', '35 to 39', '40 to 44',
        '45 to 49', '50 to 54', '55 to 59', '60 and 61', '62 to 64',
        '65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 and Older',
    ]
    ordered_columns = [geo_level, 'yr_id', *age_groups]
    return df[ordered_columns]

In [7]:
def households_manipulations(df, geo_level, estimates_version):
    '''Return a copy of ``df`` with a ``Total Households`` column added.

    The total is the row-wise sum of every column other than the geography
    column, ``yr_id`` and ``Total Households`` itself. The input frame is left
    untouched, and calling the function on its own output gives the same total
    again instead of double counting. ``estimates_version`` is not used here.

    Args:
        df: Table with one column per household breakdown.
        geo_level: Name of the geography column in ``df``.
        estimates_version: Unused.

    Returns:
        A new DataFrame with ``Total Households`` added (or refreshed).
    '''
    # Exclude the key columns and any total from an earlier call from the sum.
    component_mask = ~df.columns.isin([geo_level, 'yr_id', 'Total Households'])

    result = df.copy()
    result['Total Households'] = df.loc[:, component_mask].sum(axis=1)
    return result

In [8]:
def housing_manipulations(df, geo_level, estimates_version):
    '''Append the columns returned by the ``housing_p2`` query to the housing table.

    The second query is downloaded for the same geography level and estimates
    version, then left-joined onto ``df`` by geography and ``yr_id``, so every
    row of ``df`` is kept.
    '''
    housing_part_two = standard_format_sql_download(
        sql_file_name='housing_p2',
        geo_level=geo_level,
        estimates_version=estimates_version,
    )
    return pd.merge(df, housing_part_two, how='left', on=[geo_level, 'yr_id'])

In [9]:
def income_manipulations(df, geo_level, estimates_version):
    '''Return the geography, year and income-bracket columns of ``df`` in ascending bracket order.

    Only the listed columns are kept. ``estimates_version`` is not used here.

    Args:
        df: Table with one column per income bracket.
        geo_level: Name of the geography column in ``df``.
        estimates_version: Unused.

    Returns:
        DataFrame restricted to the expected columns, in order.

    Raises:
        KeyError: If any expected column is absent. The message lists both the
            missing labels and the columns that are actually available.
    '''
    # NOTE(review): the recorded run of this notebook failed here with every
    # bracket label reported missing, so the labels produced by the income
    # query presumably differ from these. Confirm against the query output.
    income_brackets = [
        'Less than $15,000',
        '$15,000 to $29,999',
        '$30,000 to $44,999',
        '$45,000 to $59,999',
        '$60,000 to $74,999',
        '$75,000 to $99,999',
        '$100,000 to $124,999',
        '$125,000 to $149,999',
        '$150,000 to $199,999',
        '$200,000 or more',
    ]
    ordered_columns = [geo_level, 'yr_id'] + income_brackets

    missing = [column for column in ordered_columns if column not in df.columns]
    if missing:
        # Same exception type pandas would raise, but with enough context to
        # see how the actual labels differ from the expected ones.
        raise KeyError(f"{missing} not in index; available columns: {list(df.columns)}")

    return df[ordered_columns]

In [10]:
def population_manipulations(df, geo_level, estimates_version):
    '''Return a copy of ``df`` with ``Total Population`` and ``Total GQ Population`` added.

    ``Total Population`` is the row-wise sum of every column other than the
    geography column, ``yr_id`` and the two derived totals.
    ``Total GQ Population`` is the same sum with ``Household Population`` also
    left out. The input frame is left untouched, and calling the function on
    its own output reproduces the same totals instead of double counting.
    ``estimates_version`` is not used here.

    Args:
        df: Table with one column per population breakdown.
        geo_level: Name of the geography column in ``df``.
        estimates_version: Unused.

    Returns:
        A new DataFrame with both total columns added (or refreshed).
    '''
    derived_columns = ['Total Population', 'Total GQ Population']

    # Columns that are real population components (not keys, not totals).
    component_mask = ~df.columns.isin([geo_level, 'yr_id'] + derived_columns)
    # Group-quarters components: everything except the household population.
    gq_mask = component_mask & ~df.columns.isin(['Household Population'])

    result = df.copy()
    result['Total Population'] = df.loc[:, component_mask].sum(axis=1)
    result['Total GQ Population'] = df.loc[:, gq_mask].sum(axis=1)
    return result

# Information Table

In [11]:
# Per-table settings, keyed by table name (also used as the SQL file name).
# 'manipulation_function' is the table-specific post-processing step, or None
# when the table needs none. Key order is the order the tables are processed in.
table_information = {
    table_name: {'manipulation_function': manipulation}
    for table_name, manipulation in {
        'age': age_manipulations,
        'children': None,
        'ethnicity': None,
        'households': households_manipulations,
        'housing': housing_manipulations,
        'income': income_manipulations,
        'population': population_manipulations,
        'sex': None,
        'workers': None,
    }.items()
}

# Output Function

In [12]:
# Tables that standard_format_output_creation passes through
# standard_format_pivot; tables not listed here are used as downloaded.
pivot_needed = [
    'age',
    'ethnicity',
    'households',
    'housing',
    'income',
    'population',
    'sex',
]

In [13]:
def standard_format_output_creation(table_key_name, geo_level, estimates_version):
    '''Download, shape and export one table for one geography level.

    Steps: run the ``<table_key_name>.sql`` query, pivot the result when the
    table is listed in ``pivot_needed``, apply the table's
    ``manipulation_function`` from ``table_information`` when one is
    configured, then export the result with ``export_to_j_drive``.

    Args:
        table_key_name: Key in ``table_information``; also the SQL file name
            and part of the exported file name.
        geo_level: Geography level passed to every step.
        estimates_version: Estimates version passed to the download,
            manipulation and export steps.

    Returns:
        The DataFrame that was exported.

    Raises:
        KeyError: If ``table_key_name`` is not a key of ``table_information``.
    '''
    output = standard_format_sql_download(sql_file_name=table_key_name, geo_level=geo_level, estimates_version=estimates_version)

    if table_key_name in pivot_needed:
        output = standard_format_pivot(df=output, geo_level=geo_level)

    # Look the hook up once; identity comparison is the correct test for None.
    manipulation_function = table_information[table_key_name]['manipulation_function']
    if manipulation_function is not None:
        output = manipulation_function(df=output, geo_level=geo_level, estimates_version=estimates_version)

    export_to_j_drive(df=output, geo_level=geo_level, estimates_version=estimates_version, sql_file_name=table_key_name)

    return output

# Create Output

In [14]:
# Build and export every configured table at every geography level,
# reporting each table/geography pair as it finishes.
for table_key_name in table_information:
    for geo_level in ('mgra', 'census_tract', 'luz', 'cpa', 'jurisdiction', 'sra', 'region'):
        standard_format_output_creation(
            table_key_name=table_key_name,
            geo_level=geo_level,
            estimates_version='2022_04',
        )
        print(f"{table_key_name}-{geo_level} is complete.")

age-mgra is complete.
age-census_tract is complete.
age-luz is complete.
age-cpa is complete.
age-jurisdiction is complete.
age-sra is complete.
age-region is complete.
children-mgra is complete.
children-census_tract is complete.
children-luz is complete.
children-cpa is complete.
children-jurisdiction is complete.
children-sra is complete.
children-region is complete.
ethnicity-mgra is complete.
ethnicity-census_tract is complete.
ethnicity-luz is complete.
ethnicity-cpa is complete.
ethnicity-jurisdiction is complete.
ethnicity-sra is complete.
ethnicity-region is complete.
households-mgra is complete.
households-census_tract is complete.
households-luz is complete.
households-cpa is complete.
households-jurisdiction is complete.
households-sra is complete.
households-region is complete.
housing-mgra is complete.
housing-census_tract is complete.
housing-luz is complete.
housing-cpa is complete.
housing-jurisdiction is complete.
housing-sra is complete.
housing-region is complete.


KeyError: "['Less than $15,000', '$15,000 to $29,999', '$30,000 to $44,999', '$45,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', '$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more'] not in index"

In [16]:
# Build and export only the population, sex and workers tables at every
# geography level, reporting each table/geography pair as it finishes.
for table_key_name in ('population', 'sex', 'workers'):
    for geo_level in ('mgra', 'census_tract', 'luz', 'cpa', 'jurisdiction', 'sra', 'region'):
        standard_format_output_creation(
            table_key_name=table_key_name,
            geo_level=geo_level,
            estimates_version='2022_04',
        )
        print(f"{table_key_name}-{geo_level} is complete.")

population-mgra is complete.
population-census_tract is complete.
population-luz is complete.
population-cpa is complete.
population-jurisdiction is complete.
population-sra is complete.
population-region is complete.
sex-mgra is complete.
sex-census_tract is complete.
sex-luz is complete.
sex-cpa is complete.
sex-jurisdiction is complete.
sex-sra is complete.
sex-region is complete.
workers-mgra is complete.
workers-census_tract is complete.
workers-luz is complete.
workers-cpa is complete.
workers-jurisdiction is complete.
workers-sra is complete.
workers-region is complete.


# Age Sex Ethnicity Table

In [17]:
def age_sex_ethnciity_manipulations(df, geo_level):
    '''Reshape the long age/sex/ethnicity table so each ``race`` value is a column.

    Rows are identified by geography, ``yr_id``, ``age group`` and ``sex``;
    cell values come from ``population``. The identifying fields are returned
    as ordinary columns and the columns axis is given an empty name.
    '''
    row_keys = [geo_level, 'yr_id', 'age group', 'sex']
    wide = df.pivot(index=row_keys, columns=['race'], values='population')
    # Replace the 'race' label on the columns axis with an empty name.
    wide = wide.rename_axis(columns='')
    return wide.reset_index()

In [18]:
def age_sex_ethnicity_table_creation(geo_level, estimates_version):
    '''Download, pivot and export the age/sex/ethnicity table for one geography level.

    Prints a message before and after the download, pivots the result with
    ``age_sex_ethnciity_manipulations``, exports it with ``export_to_j_drive``
    and returns the pivoted DataFrame.
    '''
    table_name = 'age_sex_ethnicity'

    print(f"Donwloading {geo_level} data.")
    long_table = standard_format_sql_download(
        sql_file_name=table_name,
        geo_level=geo_level,
        estimates_version=estimates_version,
    )
    print(f"Completed donwloading {geo_level} data.")

    wide_table = age_sex_ethnciity_manipulations(long_table, geo_level)
    export_to_j_drive(
        df=wide_table,
        geo_level=geo_level,
        estimates_version=estimates_version,
        sql_file_name=table_name,
    )
    return wide_table

In [20]:
# Age/sex/ethnicity tables for estimates version 2022_04, at the jurisdiction
# and region levels only.
for geo_level in ('jurisdiction', 'region'):
    age_sex_ethnicity_table_creation(
        geo_level=geo_level,
        estimates_version='2022_04',
    )
    print(f"{geo_level} is complete.")

Donwloading jurisdiction data.
Completed donwloading jurisdiction data.
jurisdiction is complete.
Donwloading region data.
Completed donwloading region data.
region is complete.


In [18]:
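# Disabled: earlier run for estimates version 2022_02, kept for reference only.
# The output recorded below this cell comes from that earlier run.
# for geo_level in ['census_tract', 'luz', 'cpa', 'jurisdiction', 'sra', 'region']:
#     age_sex_ethnicity_table_creation(geo_level=geo_level, estimates_version='2022_02')
#     print(f"{geo_level} is complete.")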

Donwloading census_tract data.
Completed donwloading census_tract data.
census_tract is complete.
Donwloading luz data.
Completed donwloading luz data.
luz is complete.
Donwloading cpa data.
Completed donwloading cpa data.
cpa is complete.
Donwloading jurisdiction data.
Completed donwloading jurisdiction data.
jurisdiction is complete.
Donwloading sra data.
Completed donwloading sra data.
sra is complete.
Donwloading region data.
Completed donwloading region data.
region is complete.


In [16]:
# Age/sex/ethnicity tables for estimates version 2022_03.
# The mgra level is left out: it is too big for now.
for geo_level in ('census_tract', 'luz', 'cpa', 'jurisdiction', 'sra', 'region'):
    age_sex_ethnicity_table_creation(
        geo_level=geo_level,
        estimates_version='2022_03',
    )
    print(f"{geo_level} is complete.")

Donwloading census_tract data.
Completed donwloading census_tract data.
census_tract is complete.
Donwloading luz data.
Completed donwloading luz data.
luz is complete.
Donwloading cpa data.
Completed donwloading cpa data.
cpa is complete.
Donwloading jurisdiction data.
Completed donwloading jurisdiction data.
jurisdiction is complete.
Donwloading sra data.
Completed donwloading sra data.
sra is complete.
Donwloading region data.
Completed donwloading region data.
region is complete.
