# SANDAG Median Calculation

Replicates the SANDAG procedure for calculating median values of categories reported in classification bins (e.g., income, age), using each bin's lower and upper bounds.

Functions were coded by Calvin Raab. Execution/outputs coded by Dante Lee.

Last revised 4/7/2023

In [1]:
import os
import pandas as pd
import numpy as np
import pyodbc

# Functions

In [2]:
def median_class_number(num):
    '''Return the (1-based) position of the median observation among `num` observations.

    For an even count the two middle positions, n/2 and n/2 + 1, are averaged;
    for an odd count the single middle position is used. Both cases reduce to
    (n + 1) / 2, so no parity check is needed.

    Note: the reference article is inconsistent on this point. Its detailed
    steps give the rule implemented here, while its initial instructions use
    plain n/2.

    Parameters
    ----------
    num : int or float
        Total number of observations (the final cumulative frequency).

    Returns
    -------
    float
        (num + 1) / 2
    '''
    return (num + 1) / 2

In [3]:
def find_position(lst, num):
    '''Return the index of the first entry in `lst` that is greater than or equal to `num`.

    `lst` is the cumulative distribution list and `num` is the median
    frequency value being located in it.

    Parameters
    ----------
    lst : list
        Cumulative frequencies; must contain at least one element.
    num : int or float
        Value to locate.

    Returns
    -------
    int
        Index of the first element >= `num`. When `num` exceeds the last
        element, the last valid index is returned so the caller never indexes
        past the end of the dataframe. When no element qualifies, 0.
    '''
    # Clamp to the final index when the target lies beyond the last cumulative value
    if num > lst[-1]:
        return len(lst) - 1

    # First index whose value reaches the target; default to 0 if none does
    return next((idx for idx, value in enumerate(lst) if value >= num), 0)

In [4]:
def find_median_grouped(df):
    '''Estimate the median of binned data by interpolating inside the median class.

    The estimate is ``i + ((n/2 - c) / f) * h`` where
        i = lower bound of the median class
        n = total count across all classes
        c = cumulative count of the classes before the median class
        f = count of the median class
        h = width of the median class (upper bound - lower bound)

    Parameters
    ----------
    df : pandas.DataFrame
        One row per class, ordered from the lowest to the highest bin, with
        the columns 'count', 'lower_bound' and 'upper_bound'.

    Returns
    -------
    float
        The interpolated median, or NaN when `df` has no rows or the total
        count is zero (the median is undefined in both cases).
    '''
    # Running totals in a single pass instead of re-summing a slice per class
    cumulative_freq = df['count'].cumsum().tolist()

    # No classes, or no observations: return NaN explicitly rather than
    # dividing 0 by 0 further down
    if not cumulative_freq or cumulative_freq[-1] == 0:
        return float('nan')

    n = cumulative_freq[-1]
    median_class_freq_val = median_class_number(n)
    position = find_position(cumulative_freq, median_class_freq_val)

    # `position` is a positional offset into the cumulative list, so select the
    # row by position; this does not depend on the frame having a 0..n-1 index
    median_class_values = df.iloc[position]

    i = median_class_values['lower_bound']
    # Nothing precedes the first class. Indexing with position - 1 == -1 would
    # wrap around and pick up the grand total instead of 0.
    c = cumulative_freq[position - 1] if position > 0 else 0
    f = median_class_values['count']
    h = median_class_values['upper_bound'] - median_class_values['lower_bound']

    return i + (((n / 2) - c) / f) * h

In [5]:
def find_median_values(df):
    '''Compute the grouped median for every (geography, year) pair in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by a two-level MultiIndex (geography, year) with exactly three
        data columns holding, in order, the class count, the class lower bound
        and the class upper bound. Each (geography, year) pair has one row per
        class.

    Returns
    -------
    pandas.DataFrame
        Indexed by ['geo_zone', 'yr_id'] (sorted) with one column, 'median'.
    '''
    rows = []

    # Group on the two index levels rather than calling df.loc once per key:
    # this avoids the MultiIndex PerformanceWarning and never writes to a slice.
    for (geo_zone, yr_id), group in df.groupby(level=[0, 1], sort=False):
        # reset_index returns a new frame, so renaming its columns is safe
        classes = group.reset_index(drop=True)
        classes.columns = ['count', 'lower_bound', 'upper_bound']

        rows.append([geo_zone, yr_id, find_median_grouped(classes)])

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copies the whole frame on every call.
    output = pd.DataFrame(rows, columns=['geo_zone', 'yr_id', 'median'])

    # Each (geo_zone, yr_id) pair occurs once, so the sum only moves the keys
    # into a sorted index.
    # NOTE(review): groupby sum skips NaN by default, so an undefined median
    # would be reported as 0 here - confirm that is intended.
    return output.groupby(['geo_zone', 'yr_id']).sum()

In [6]:
def standard_format_sql_download(sql_file_name, geo_level, estimates_version):
    '''Run a templated SQL query against the estimates database and return the result.

    Parameters
    ----------
    sql_file_name : str
        Name (without extension) of the .sql file inside the 'sql_queries' folder.
    geo_level : str
        Substituted for the {geo_level} placeholder in the query; also the new
        name given to a returned column called 'geo_level', if one exists.
    estimates_version : str
        Substituted for the {estimates_version} placeholder in the query.

    Returns
    -------
    pandas.DataFrame
        The query result.

    Notes
    -----
    The query text is filled in with str.format, so the arguments must come
    from trusted configuration, never from user input.
    '''
    # Read and fill in the query before connecting, so a missing or malformed
    # file cannot leave a connection open
    query_path = os.path.join('sql_queries', f'{sql_file_name}.sql')
    with open(query_path, 'r') as sql_file:
        sql_query = sql_file.read()

    sql_query = sql_query.format(geo_level=geo_level, estimates_version=estimates_version)

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
    try:
        df = pd.read_sql_query(sql_query, conn)
    finally:
        # Always release the connection, even when the query fails
        conn.close()

    return df.rename(columns={'geo_level': geo_level})

# Create Output

In [11]:
# SET PARAMETERS
# Estimates release to process; it is passed to the SQL download and used in the output path
estimates_version = "2022_03"

# Median categories to compute; each name is also the SQL file name and the output folder name
categories = [
    "median_income",
    "median_age",
]

# Geography levels to run; each one is written to its own sheet in the output workbook
geo_levels = [
    "tract",
    "region",
    "jurisdiction",
    "cpa",
    "sra",
]

In [12]:
print(estimates_version)

for category in categories:
    # Folder/file output name
    folder_output = 'J:/DataScience/DataQuality/QAQC/Estimates QC Automation/v_series15/' + category + '/'
    file_name = folder_output + estimates_version + '/' + category + '_' + estimates_version + '.xlsx'

    # Make sure the version folder exists before any work is done, so a new
    # estimates version does not fail at write time after the queries have run
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    print('Processing ' + category)

    for geo_level in geo_levels:
        # Read median sql file
        df = standard_format_sql_download(sql_file_name=category, geo_level=geo_level, estimates_version=estimates_version)
        df = df.set_index(['geozone', 'yr_id'])

        # Execute median calculation
        median = find_median_values(df).reset_index()

        # Write results to single median file: append to (and replace the sheet
        # in) an existing workbook, otherwise create a new one.
        # `if_sheet_exists` is only accepted in append mode.
        if os.path.exists(file_name):
            writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
        else:
            writer_options = {}

        # The context manager saves and closes the workbook on exit
        with pd.ExcelWriter(file_name, engine='openpyxl', **writer_options) as writer:
            median.to_excel(writer, sheet_name=geo_level, index=False)

        print('...' + geo_level + ' complete')

2022_03
Processing median_income


  return i + (((n/2)-c)/f) * h


...tract complete
...region complete
...jurisdiction complete


  temp_df = df.loc[index]


...cpa complete
...sra complete
Processing median_age
...tract complete
...region complete
...jurisdiction complete
...cpa complete
...sra complete


# Archive

## Example

In [8]:
# Example input: three (geozone, yr) groups that share the same ten income bins

# Dataframe 1
test_input_1 = pd.DataFrame({
    'yr': 2010,
    'geozone': 'National City',
    'count': [2749, 3634, 2922, 2146, 1129, 1417, 737, 430, 199, 139],
    'lower_bound': [0, 15000, 30000, 45000, 60000, 75000, 100000, 125000, 150000, 200000],
    'upper_bound': [14999, 29999, 44999, 59999, 74999, 99999, 124999, 149999, 199999, 349999],
})

# Dataframe 2
# DataFrame.copy() makes a deep copy by default, so the `copy` module (which
# this notebook never imports) is not needed
test_input_2 = test_input_1.copy()
test_input_2['yr'] = 2010
test_input_2['geozone'] = 'Encinitas'
test_input_2['count'] = [3000, 3734, 3922, 3146, 4129, 12417, 837, 130, 899, 739]

# Dataframe 3
test_input_3 = test_input_1.copy()
test_input_3['yr'] = 2015
test_input_3['geozone'] = 'Poway'
test_input_3['count'] = [2000, 1734, 3982, 2146, 4729, 14417, 236, 430, 813, 1739]

# Concatenate dfs and index by (geozone, yr), the layout find_median_values() is called with
final_test_input = pd.concat([test_input_1, test_input_2, test_input_3])
final_test_input = final_test_input.set_index(['geozone', 'yr'])

final_test_input

Unnamed: 0_level_0,Unnamed: 1_level_0,count,lower_bound,upper_bound
geozone,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
National City,2010,2749,0,14999
National City,2010,3634,15000,29999
National City,2010,2922,30000,44999
National City,2010,2146,45000,59999
National City,2010,1129,60000,74999
National City,2010,1417,75000,99999
National City,2010,737,100000,124999
National City,2010,430,125000,149999
National City,2010,199,150000,199999
National City,2010,139,200000,349999


In [9]:
# Run the median calculation on the example input; the result has one row per (geozone, yr) pair
find_median_values(final_test_input)

  temp_df = df.loc[index]


Unnamed: 0_level_0,Unnamed: 1_level_0,median
geo_zone,yr_id,Unnamed: 2_level_1
Encinitas,2010,69715.385202
National City,2010,37022.119097
Poway,2015,77639.139766


### Age

In [16]:
# Pull the raw age breakdown by jurisdiction (note: this call uses estimates version 2022_02)
standard_format_sql_download(sql_file_name='age', geo_level='jurisdiction', estimates_version='2022_02')

Unnamed: 0,jurisdiction,yr_id,breakdown_value,value
0,Carlsbad,2020,10 to 14,9152
1,Carlsbad,2020,15 to 17,5837
2,Carlsbad,2020,18 and 19,2198
3,Carlsbad,2020,20 to 24,5066
4,Carlsbad,2020,25 to 29,4888
...,...,...,...,...
1135,Vista,2022,70 to 74,2765
1136,Vista,2022,75 to 79,2002
1137,Vista,2022,80 to 84,1447
1138,Vista,2022,85 and Older,1797


## CRA notes

In [None]:
'''How to proceed:
- The goal is to create the above dataframe to look like the 'final_test_input' dataframe
- I would create a dictionary whose keys are the breakdown values and whose values are the lower (or upper) bounds. Ex for lower bound: {'Under 5': 0, '75 to 79': 75}
- I would then use the map() feature to map those lower bounds to a column called 'lower_bound'
- Then do the same for upper bound
- Drop the breakdown_value column 
- Change column names to match exactly the 'final_test_input' dataframe ['geo_level', 'yr_id', 'count', 'lower_bound', 'upper_bound']
- Set the index to ['geo_level', 'yr_id']
- Then pass that dataframe into find_median_values()
- I recommend writing a function that does this and takes in the estimates version and geo_level as input that way you can write a for loop and run through it all

* The same thing will be done for income 

'''