# SANDAG Median Calculation

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import copy

# Intermediary Functions

In [3]:
def median_class_number(num):
    '''Return the position of the median observation among ``num`` observations.

    For an odd count this is the single middle position, ``(num + 1) / 2``.
    For an even count it is the midpoint between the two middle positions,
    ``num / 2`` and ``num / 2 + 1``.

    Note: the reference article appears to contradict itself on this step --
    its opening instructions use plain ``n / 2``, whereas its detailed steps
    use the rule implemented here.
    '''
    if num % 2 != 0:  # odd count: exactly one middle position
        return (num + 1) / 2

    # even count: average the two middle positions
    lower_middle = num / 2
    upper_middle = lower_middle + 1
    return (lower_middle + upper_middle) / 2

In [4]:
def find_position(lst, num):
    '''Return the index of the first entry of ``lst`` that is >= ``num``.

    ``lst`` is the cumulative-frequency list and ``num`` the median frequency
    value, so the returned index identifies the class holding the median.

    If ``num`` exceeds the last (largest) entry, the last valid index is
    returned instead of ``len(lst)``, so the result can always be used to
    index into the data without running past the end. If no entry satisfies
    the comparison, 0 is returned.
    '''
    final_index = len(lst) - 1

    # Target lies beyond the end of the list: clamp to the last valid index.
    if num > lst[-1]:
        return final_index

    # First index whose cumulative value reaches the target; 0 if none does.
    return next(
        (idx for idx, cumulative in enumerate(lst) if cumulative >= num),
        0,
    )

In [5]:
def find_median_grouped(df):
    '''Estimate the median of grouped (binned) data by linear interpolation.

    Applies the grouped-data median formula

        median = i + ((n / 2 - c) / f) * h

    where
        i -- lower bound of the median class
        n -- total number of observations
        c -- cumulative count of all classes *before* the median class
        f -- count inside the median class
        h -- width of the median class, taken as upper_bound - lower_bound

    Parameters
    ----------
    df : pandas.DataFrame
        One row per class with columns ``count``, ``lower_bound`` and
        ``upper_bound``. Rows are assumed to already be ordered from the
        lowest class to the highest; this function does not sort them.

    Returns
    -------
    float
        The interpolated median, or NaN when ``df`` has no rows or every
        count is zero (there is no median to estimate).
    '''
    # Running totals in one pass instead of re-summing a growing slice per row.
    cumulative_freq = df['count'].cumsum().tolist()

    # No observations at all: avoid indexing an empty list / dividing by zero.
    if not cumulative_freq or cumulative_freq[-1] == 0:
        return float('nan')

    n = cumulative_freq[-1]
    median_class_freq_val = median_class_number(n)
    position = find_position(cumulative_freq, median_class_freq_val)

    # Positional lookup so the result does not depend on df's index labels.
    median_class_values = df.iloc[position]

    i = median_class_values['lower_bound']
    # When the median falls in the first class nothing precedes it, so c is 0.
    # (Indexing with position - 1 == -1 would wrap around to the grand total.)
    c = cumulative_freq[position - 1] if position > 0 else 0
    f = median_class_values['count']
    h = median_class_values['upper_bound'] - median_class_values['lower_bound']

    return i + (((n / 2) - c) / f) * h

# Putting it all together

In [None]:
def find_median_values(df):
    '''Compute the grouped-data median for every (geo zone, year) in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by a two-level MultiIndex whose first level is the geo zone
        and whose second level is the year. It must have exactly three
        columns holding, in this order, the class count, the class lower
        bound and the class upper bound (the columns are renamed by position,
        so their original names do not matter). ``df`` is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (geo zone, year), indexed by a MultiIndex named
        ``geo_zone`` / ``yr_id`` in sorted order, with a single ``median``
        column produced by ``find_median_grouped``.
    '''
    class_columns = ['count', 'lower_bound', 'upper_bound']

    # Collect plain records and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and growing a frame
    # row by row is quadratic.
    records = []

    # groupby on the two index levels yields each (geo zone, year) block as a
    # DataFrame, in sorted key order, without label lookups on an unsorted
    # MultiIndex.
    for (geo_zone, yr_id), group in df.groupby(level=[0, 1], sort=True):
        # reset_index returns a new frame, so renaming its columns leaves the
        # caller's data untouched and gives find_median_grouped a 0..n-1 index.
        classes = group.reset_index(drop=True)
        classes.columns = class_columns

        records.append({
            'geo_zone': geo_zone,
            'yr_id': yr_id,
            'median': find_median_grouped(classes),
        })

    output = pd.DataFrame(records, columns=['geo_zone', 'yr_id', 'median'])

    # Each (geo_zone, yr_id) pair occurs exactly once, so setting the index
    # gives the same table the previous groupby(...).sum() produced.
    return output.set_index(['geo_zone', 'yr_id'])

# Example

In [17]:
# Test frame 1: National City, 2010 -- defines the class bounds reused below
test_input_1 = pd.DataFrame(
    {
        'yr': 2010,
        'geozone': 'National City',
        'count': [2749, 3634, 2922, 2146, 1129, 1417, 737, 430, 199, 139],
        'lower_bound': [0, 15000, 30000, 45000, 60000, 75000, 100000, 125000, 150000, 200000],
        'upper_bound': [14999, 29999, 44999, 59999, 74999, 99999, 124999, 149999, 199999, 349999],
    }
)

# Test frame 2: same class bounds, relabelled as Encinitas 2010 with its own counts
test_input_2 = test_input_1.assign(
    yr=2010,
    geozone='Encinitas',
    count=[3000, 3734, 3922, 3146, 4129, 12417, 837, 130, 899, 739],
)

# Test frame 3: same class bounds, relabelled as Poway 2015 with its own counts
test_input_3 = test_input_1.assign(
    yr=2015,
    geozone='Poway',
    count=[2000, 1734, 3982, 2146, 4729, 14417, 236, 430, 813, 1739],
)

# Concatenate the three frames and index the result by (geozone, yr)
final_test_input = (
    pd.concat([test_input_1, test_input_2, test_input_3])
    .set_index(['geozone', 'yr'])
)

final_test_input

Unnamed: 0_level_0,Unnamed: 1_level_0,count,lower_bound,upper_bound
geozone,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
National City,2010,2749,0,14999
National City,2010,3634,15000,29999
National City,2010,2922,30000,44999
National City,2010,2146,45000,59999
National City,2010,1129,60000,74999
National City,2010,1417,75000,99999
National City,2010,737,100000,124999
National City,2010,430,125000,149999
National City,2010,199,150000,199999
National City,2010,139,200000,349999


In [18]:
# Run the median calculation on the example input; the resulting table
# (one median per geo zone and year) is displayed as this cell's output.
find_median_values(final_test_input)

  temp_df = df.loc[index]


Unnamed: 0_level_0,Unnamed: 1_level_0,median
geo_zone,yr_id,Unnamed: 2_level_1
Encinitas,2010,69715.385202
National City,2010,37022.119097
Poway,2015,77639.139766


# Create Output

In [None]:
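# TODO: load the real input into `df` (same layout as `final_test_input` in the
# example above: indexed by geo zone and year, with count / lower bound /
# upper bound columns), then uncomment:
# find_median_values(df)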