In [48]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

In [49]:
def standard_format_sql_download(sql_file_name, geo_level, estimates_version):
    """Run a templated SQL file against the ``estimates`` database.

    The query text is read from the ``sql_queries`` folder (file
    ``<sql_file_name>.sql``), its ``{geo_level}`` and ``{estimates_version}``
    placeholders are filled in with ``str.format``, and the result set is
    loaded into a DataFrame.

    Parameters
    ----------
    sql_file_name : str
        Base name (no extension) of the query file inside ``sql_queries``.
    geo_level : str
        Value substituted for ``{geo_level}`` in the query; also the name the
        returned ``geo_level`` column is renamed to.
    estimates_version : str
        Value substituted for ``{estimates_version}`` in the query.

    Returns
    -------
    pandas.DataFrame
        The query result, with a ``geo_level`` column (if present) renamed to
        the value of ``geo_level``.

    Raises
    ------
    FileNotFoundError
        If the query file does not exist.
    """
    # Read and fill in the template first so that a missing file or a bad
    # placeholder fails before a database connection is opened.
    query_path = os.path.join('sql_queries', f'{sql_file_name}.sql')
    with open(query_path, 'r') as sql_file:
        sql_query = sql_file.read()

    # NOTE(review): values are interpolated straight into the SQL text, so they
    # must come from trusted code, never from user input.
    sql_query = sql_query.format(geo_level=geo_level, estimates_version=estimates_version)

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
    try:
        df = pd.read_sql_query(sql_query, conn)
    finally:
        # Always release the connection, even when the query fails; otherwise
        # every call leaves a connection open for the life of the kernel.
        conn.close()

    return df.rename(columns={'geo_level': geo_level})

In [51]:
def standard_format_pivot(df, geo_level):
    """Reshape a long table into one column per ``breakdown_value``.

    Rows are keyed by ``geo_level`` and ``yr_id``; each cell takes its content
    from the ``value`` column. The keys come back as ordinary columns and the
    column axis is given an empty name.
    """
    wide = df.pivot(index=[geo_level, 'yr_id'], columns=['breakdown_value'], values='value')
    # Replace the axis label ('breakdown_value') left behind by the pivot.
    wide.columns.name = ''
    return wide.reset_index()

In [52]:
def export_to_j_drive(df, geo_level, estimates_version,
                      output_dir=r'J:\DataScience\DataQuality\QAQC\estimates_automation\aggregated_data'):
    """Write ``df`` to an Excel workbook, without the index.

    The file is named ``<geo_level>_est_<estimates_version>_ind_QA.xlsx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export.
    geo_level : str
        Geography label used as the file-name prefix.
    estimates_version : str
        Estimates version used in the file name.
    output_dir : str, optional
        Folder the workbook is written to. Defaults to the shared
        ``aggregated_data`` folder on the J: drive.

    NOTE(review): the file name does not include the table name, so exporting
    two different tables for the same geography and version writes to the same
    path. Confirm whether that is intended.
    """
    file_name = f'{geo_level}_est_{estimates_version}_ind_QA.xlsx'
    df.to_excel(os.path.join(output_dir, file_name), index=False)

In [53]:
def standard_format_output_creation(sql_file_name, geo_level, estimates_version):
    """Download a table, pivot it, export it, and return the pivoted frame.

    NOTE(review): a later cell in this notebook defines another function with
    this exact name (taking ``table_key_name``). Once that cell has run, this
    definition is shadowed and no longer reachable.
    """
    raw_table = standard_format_sql_download(
        sql_file_name=sql_file_name,
        geo_level=geo_level,
        estimates_version=estimates_version,
    )
    wide_table = standard_format_pivot(df=raw_table, geo_level=geo_level)
    export_to_j_drive(
        df=wide_table,
        geo_level=geo_level,
        estimates_version=estimates_version,
    )
    return wide_table

# Specific Manipulations

In [112]:
def age_manipulations(df, geo_level, estimates_version):
    '''Select the key columns followed by the age-group columns, in a fixed order.

    ``estimates_version`` is accepted but not used here.
    '''
    age_groups = [
        'Under 5', '5 to 9', '10 to 14', '15 to 17', '18 and 19',
        '20 to 24', '25 to 29', '30 to 34', '35 to 39', '40 to 44',
        '45 to 49', '50 to 54', '55 to 59', '60 and 61', '62 to 64',
        '65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 and Older',
    ]
    ordered_columns = [geo_level, 'yr_id'] + age_groups
    return df[ordered_columns]

In [113]:
def households_manipulations(df, geo_level, estimates_version):
    """Return a copy of ``df`` with a ``Total Households`` column added.

    The total is the row-wise sum of every column other than the geography
    column, ``yr_id`` and ``Total Households`` itself.

    The input frame is not modified. An existing ``Total Households`` column
    is left out of the sum, so calling this twice gives the same totals
    instead of doubling them.

    ``estimates_version`` is accepted but not used here.
    """
    total_column = 'Total Households'
    excluded = [geo_level, 'yr_id', total_column]

    result = df.copy()
    result[total_column] = result.loc[:, ~result.columns.isin(excluded)].sum(axis=1)
    return result

In [128]:
def housing_manipulations(df, geo_level, estimates_version):
    """Attach the columns of the ``housing_p2`` query to ``df``.

    The extra data is downloaded for the same geography and version, then
    left-joined onto ``df`` by the geography column and ``yr_id``, so every
    row of ``df`` is kept.
    """
    extra_columns = standard_format_sql_download(
        sql_file_name='housing_p2',
        geo_level=geo_level,
        estimates_version=estimates_version,
    )
    join_keys = [geo_level, 'yr_id']
    return df.merge(extra_columns, how='left', on=join_keys)

In [143]:
def income_manipulations(df, geo_level, estimates_version):
    '''Select the key columns followed by the income-bracket columns, in a fixed order.

    ``estimates_version`` is accepted but not used here.
    '''
    income_brackets = [
        'Less than $15,000',
        '$15,000 to $29,999',
        '$30,000 to $44,999',
        '$45,000 to $59,999',
        '$60,000 to $74,999',
        '$75,000 to $99,999',
        '$100,000 to $124,999',
        '$125,000 to $149,999',
        '$150,000 to $199,999',
        '$200,000 or more',
    ]
    ordered_columns = [geo_level, 'yr_id'] + income_brackets
    return df[ordered_columns]

# Information Table

In [144]:
# Per-table settings, keyed by table name. 'manipulation_function' is the
# table-specific post-processing step, or None when the table needs none.
table_information = {
    'age': {'manipulation_function': age_manipulations},
    'children': {'manipulation_function': None},
    'ethnicity': {'manipulation_function': None},
    'households': {'manipulation_function': households_manipulations},
    'housing': {'manipulation_function': housing_manipulations},
    'income': {'manipulation_function': income_manipulations},
}

# Output Function

In [134]:
# Tables whose downloaded data is pivoted before any table-specific
# manipulation is applied.
pivot_needed = [
    'age',
    'ethnicity',
    'households',
    'housing',
    'income',
]

In [136]:
def standard_format_output_creation(table_key_name, geo_level, estimates_version):
    """Build, export and return the output table for one table key.

    Steps:
    1. Download the data using the SQL file named after ``table_key_name``.
    2. Pivot it when ``table_key_name`` is listed in ``pivot_needed``.
    3. Apply the table's ``manipulation_function`` from
       ``table_information`` when one is configured.
    4. Export the result to Excel.

    Parameters
    ----------
    table_key_name : str
        Key into ``table_information``; also the SQL file base name.
    geo_level : str
        Geography level passed through to every step.
    estimates_version : str
        Estimates version passed through to every step.

    Returns
    -------
    pandas.DataFrame
        The table that was exported.

    Raises
    ------
    KeyError
        If ``table_key_name`` has no entry in ``table_information``.
    """
    output = standard_format_sql_download(sql_file_name=table_key_name, geo_level=geo_level, estimates_version=estimates_version)

    if table_key_name in pivot_needed:
        output = standard_format_pivot(df=output, geo_level=geo_level)

    # Look the hook up once; compare to None by identity rather than equality.
    manipulation_function = table_information[table_key_name]['manipulation_function']
    if manipulation_function is not None:
        output = manipulation_function(df=output, geo_level=geo_level, estimates_version=estimates_version)

    export_to_j_drive(df=output, geo_level=geo_level, estimates_version=estimates_version)

    return output

In [146]:
# Example run: build and export the income table at the CPA level.
standard_format_output_creation(
    table_key_name='income',
    geo_level='cpa',
    estimates_version='2022_01',
)

Unnamed: 0,cpa,yr_id,"Less than $15,000","$15,000 to $29,999","$30,000 to $44,999","$45,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more"
0,*Not in a CPA*,2020,31505,40149,38839,38170,38970,57949,49236,37940,53348,81293
1,*Not in a CPA*,2021,33888,40901,41995,34283,39609,54492,53839,38053,53208,82091
2,*Not in a CPA*,2022,32389,41950,40520,39293,38744,58141,52208,38115,53374,82156
3,32nd Street Naval Station,2020,0,0,0,0,0,0,0,0,0,0
4,32nd Street Naval Station,2021,0,0,0,0,0,0,0,0,0,0
5,32nd Street Naval Station,2022,0,0,0,0,0,0,0,0,0,0
6,Alpine,2020,198,552,463,285,370,935,847,503,944,1252
7,Alpine,2021,240,508,427,246,435,865,814,604,862,1484
8,Alpine,2022,236,517,392,295,422,920,799,605,860,1471
9,Balboa Park,2020,0,0,0,0,0,0,0,0,0,0


In [58]:
'''
- In the table I could list all the standard tables; if one needs no manipulations, just say None and I can go from there in the actual function
- Age sex ethnicity will need to be its own product
- replace value
'''

'\n- In the table I could list all the standard tables; if one needs no manipulations, just say None and I can go from there in the actual function\n- Age sex ethnicity will need to be its own product\n- replace value\n'