In [1]:
import os
import pandas as pd
import numpy as np
import pyodbc
from scipy.stats import shapiro
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from collections import OrderedDict

# Donwnload the data

In [28]:
# Ind File
mgra_data = pd.read_excel(r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\2022_03\mgra_est_2022_03_ethnicity_ind_QA.xlsx')
mgra_data_21 = mgra_data[mgra_data['yr_id']==2021]
mgra_data_21 = mgra_data_21[['mgra', 'Hispanic']]
mgra_data_21

# Diff File 
mgra_diff_data = pd.read_excel(r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\diff_files\2022_03\mgra_ethnicity_est_2022_03_minus_2022_02_QA.xlsx', sheet_name='Diff')
mgra_diff_data = mgra_diff_data[mgra_diff_data['yr_id']==2021]
mgra_diff_data_21 = mgra_diff_data[['mgra', 'Hispanic']]
mgra_diff_data_21

combined_data = mgra_data_21.merge(mgra_diff_data_21, on='mgra', suffixes=['_Tot', '_Diff'])
#combined_data['Hispanic_Diff_Abs'] = abs(combined_data['Hispanic_Diff'])
#combined_data = combined_data[combined_data['Hispanic_Tot'] != 0]
combined_data

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff
0,1,126,13
1,2,36,6
2,3,126,-3
3,4,0,0
4,5,19,1
...,...,...,...
24316,24317,0,0
24317,24318,0,-3
24318,24319,0,0
24319,24320,0,0


# Bootstrap Equation

In [3]:
def bootstrap_mean(data, n_bootstraps=1000):
    means = np.zeros(n_bootstraps)
    for i in range(n_bootstraps):
        sample = np.random.choice(data, size=len(data), replace=True)
        means[i] = np.mean(sample)
    return means

In [4]:
# Example of how it works 
bootstrapped_means = bootstrap_mean(combined_data['Hispanic_Diff_Abs'])

mean_of_means = np.mean(bootstrapped_means)
se_of_means = np.std(bootstrapped_means)

print(f"Mean of means: {mean_of_means}")
print(f"Standard error of means: {se_of_means}")

Mean of means: 2.5796339377492705
Standard error of means: 0.03653268429847169


# Dynamic Binning

In [5]:
def original_lower_and_upper_index(data_series, index):
    if index != 0:
        lower_index = index - 1
    else:
        lower_index = index 
    
    if index != len(data_series)-1:
        upper_index = index + 1
    else:
        upper_index = index
    
    return lower_index, upper_index

In [6]:
def new_lower_and_upper_index(data_series, lower_index, upper_index):
    if lower_index != 0:
        lower_index = lower_index - 1
    else:
        lower_index = lower_index 
    
    if upper_index != len(data_series)-1:
        upper_index = upper_index + 1
    else:
        upper_index = upper_index
    
    return lower_index, upper_index

In [7]:
def dynamic_binning(data_series, target_num):
    '''This function accepts a pandas DataFrame column containing a series of data and a target value as input. The target value must exist within the series. The function returns the lower and upper bounds that need to be filtered so that the number of records in the column is greater than 30. If there are enough records at the target value alone, the target value is returned.

    Parameters:
    data (pandas DataFrame column): The series of data in question.
    target_value (any): The value to filter the data by.
    Returns:

    If the number of records at the target value is greater than or equal to 30, the target value is returned.
    Otherwise, the function returns a tuple containing the lower and upper bounds (inclusive) that need to be filtered in order to obtain at least 30 records in the column.'''
    freq_table_sorted = data_series.value_counts().sort_index()

    freq_table_dict = freq_table_sorted.to_dict()
    list_of_sorted_dict = OrderedDict(sorted(freq_table_dict.items()))

    if target_num not in list(data_series):
        return f"{target_num} is not found in the data series given."

    index = None
    for i, key in enumerate(list_of_sorted_dict.keys()):
        if key == target_num:
            index = i
            break

    freq_table_sorted = freq_table_sorted.reset_index(drop=True)

    if (freq_table_sorted.loc[[index]] > 30).all():
        return sorted(freq_table_dict.items())[index][0]
    else:
        lower, upper = original_lower_and_upper_index(data_series=freq_table_sorted, index=index)
        while sum(freq_table_sorted.loc[[x for x in range(lower, upper)]]) < 30:
            lower, upper = new_lower_and_upper_index(data_series=freq_table_sorted, lower_index=lower, upper_index=upper)
        return (sorted(freq_table_dict.items())[lower][0], sorted(freq_table_dict.items())[upper][0])


In [29]:
dynamic_binning(data_series=combined_data['Hispanic_Tot'], target_num=5)

5

In [64]:
combined_data#[(combined_data['Hispanic_Tot'] == 5)]

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,abs_diff
0,1,126,13,13
1,2,36,6,6
2,3,126,-3,3
3,4,0,0,0
4,5,19,1,1
...,...,...,...,...
24316,24317,0,0,0
24317,24318,0,-3,3
24318,24319,0,0,0
24319,24320,0,0,0


In [65]:
combined_data[(combined_data['Hispanic_Tot'] >= 128) & (combined_data['Hispanic_Tot'] <= 130)]

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,abs_diff
126,127,129,-2,2
143,144,130,-1,1
269,270,129,4,4
308,309,129,3,3
490,491,129,-67,67
...,...,...,...,...
20576,20577,129,-8,8
20813,20814,130,2,2
20958,20959,129,-5,5
23867,23868,130,2,2


# Example

# Finding Outliers

In [66]:
def find_base_outliers(df, control_series_col, diff_col):
    df['abs_diff'] = abs(df[diff_col])
    #df = df.head(1000)

    # Calculate the z-score of each 'Total Population' value based on similar cpas
    df['pop_diff_zscore'] = np.nan
    df['group_mean'] = np.nan
    df['group_std'] = np.nan


    total_rows = len(df.index)
    counter = 0
    progress_interval = total_rows // 10
    progress_threshold = progress_interval
    
    for i, row in df.iterrows():
        cpa_population = row[control_series_col]
        range_of_values = dynamic_binning(data_series=df[control_series_col], target_num=cpa_population)
        
        if isinstance(range_of_values, tuple):
            similar_cpas = df[(df[control_series_col] >= range_of_values[0]) & (df[control_series_col] <= range_of_values[1])]
        else:
            similar_cpas = df[df[control_series_col] == range_of_values]


        means = bootstrap_mean(similar_cpas['abs_diff'], n_bootstraps=100)
        mean_of_means = np.mean(means)
        se_of_means = np.std(means)

        zscore = (row['abs_diff'] - mean_of_means) / se_of_means
        df.at[i, 'pop_diff_zscore'] = zscore
        df.at[i, 'group_mean'] = round(mean_of_means, 2)
        df.at[i, 'group_std'] = round(se_of_means,2)

        counter += 1
        if counter >= progress_threshold:
            progress_percent = round(counter / total_rows * 100)
            print(f"{progress_percent}% complete")
            progress_threshold += progress_interval
    return df
    

In [68]:
combined_data.shape

(24321, 4)

In [69]:
base_df = find_base_outliers(df=combined_data, control_series_col='Hispanic_Tot', diff_col='Hispanic_Diff')
base_df

10% complete
20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
100% complete


Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,abs_diff,pop_diff_zscore,group_mean,group_std
0,1,126,13,13,7.392324,5.68,0.99
1,2,36,6,6,7.842190,3.21,0.36
2,3,126,-3,3,-3.203723,5.97,0.93
3,4,0,0,0,-8.947543,0.11,0.01
4,5,19,1,1,-5.856212,2.50,0.26
...,...,...,...,...,...,...,...
24316,24317,0,0,0,-11.155263,0.11,0.01
24317,24318,0,-3,3,234.487107,0.11,0.01
24318,24319,0,0,0,-9.547189,0.11,0.01
24319,24320,0,0,0,-8.846857,0.11,0.01


# Further Cleaning Steps
The above outlier function needs further cleaning in order to narrow down the outliers

In [70]:
def remove_small_changes(df, number_of_standard_deviations_acceptable):
    '''Removing rows where the change was less than the mean change plus a set number of standard deviations. This is to remove small changes liek 1 or 2 from being flagged as they may be off from a large mean change.'''
    df['max_acceptable_change'] = df['group_mean'] + (number_of_standard_deviations_acceptable*df['group_std'])

    df = df[df['abs_diff'] > df['max_acceptable_change']]
    df = df.drop('max_acceptable_change', axis=1)
    return df

In [71]:
base_df_2 = remove_small_changes(df=base_df, number_of_standard_deviations_acceptable=2.5)
base_df_2

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,abs_diff,pop_diff_zscore,group_mean,group_std
0,1,126,13,13,7.392324,5.68,0.99
1,2,36,6,6,7.842190,3.21,0.36
6,7,24,8,8,16.079583,2.92,0.32
12,13,31,9,9,9.889840,4.28,0.48
17,18,4,3,3,12.549896,1.12,0.15
...,...,...,...,...,...,...,...
24290,24291,619,-45,45,19.523244,9.53,1.82
24291,24292,91,-15,15,5.297449,5.31,1.83
24292,24293,16,-8,8,21.303750,2.45,0.26
24313,24314,0,-7,7,541.086251,0.11,0.01


In [74]:
def remove_small_weight_changes(df, control_column, max_acceptable_weighted_change):
    '''If a change is a small percetnage change of the final df, it will be dropped. Most of these are taken care of in the function above, however, groups with very small changes (mean and std) will still be flagged'''
    df['weighted_change'] = (df['abs_diff']/df[control_column]*100)

    df = df[df['weighted_change'] > max_acceptable_weighted_change]
    df = df.drop('weighted_change', axis=1)
    return df

In [77]:
base_df_3 = remove_small_weight_changes(df=base_df_2, control_column='Hispanic_Tot', max_acceptable_weighted_change = 10)
base_df_3

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,abs_diff,pop_diff_zscore,group_mean,group_std
0,1,126,13,13,7.392324,5.68,0.99
1,2,36,6,6,7.842190,3.21,0.36
6,7,24,8,8,16.079583,2.92,0.32
12,13,31,9,9,9.889840,4.28,0.48
17,18,4,3,3,12.549896,1.12,0.15
...,...,...,...,...,...,...,...
24284,24285,15,-14,14,36.848424,2.95,0.30
24291,24292,91,-15,15,5.297449,5.31,1.83
24292,24293,16,-8,8,21.303750,2.45,0.26
24313,24314,0,-7,7,541.086251,0.11,0.01


In [78]:
def minimum_abs_changes(df, max_acceptable_abs_change):
    '''This function filters out all values that are below the minimum acceptabble absolute value change.'''

    df = df[df['abs_diff'] > max_acceptable_abs_change]
    return df

In [79]:
base_df_4 = minimum_abs_changes(df=base_df_3, max_acceptable_abs_change=15)
base_df_4

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,abs_diff,pop_diff_zscore,group_mean,group_std
63,64,46,-18,18,20.527530,4.09,0.68
74,75,12,-17,17,49.946843,2.34,0.29
109,110,1,-17,17,168.468732,0.86,0.10
185,186,81,22,22,50.552560,2.63,0.38
481,482,50,-33,33,35.334313,5.26,0.79
...,...,...,...,...,...,...,...
23923,23924,58,-29,29,38.921713,3.72,0.65
23924,23925,38,-36,36,55.521128,3.41,0.59
24064,24065,17,-33,33,107.642127,2.36,0.28
24097,24098,5,-24,24,158.096572,1.24,0.14


# Manual Binning Example
Binning every 30 rows after 100

In [133]:
# Ind File
mgra_data = pd.read_excel(r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\2022_03\mgra_est_2022_03_ethnicity_ind_QA.xlsx')
mgra_data_21 = mgra_data[mgra_data['yr_id']==2021]
mgra_data_21 = mgra_data_21[['mgra', 'Hispanic']]
mgra_data_21

# Diff File 
mgra_diff_data = pd.read_excel(r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\diff_files\2022_03\mgra_ethnicity_est_2022_03_minus_2022_02_QA.xlsx', sheet_name='Diff')
mgra_diff_data = mgra_diff_data[mgra_diff_data['yr_id']==2021]
mgra_diff_data_21 = mgra_diff_data[['mgra', 'Hispanic']]
mgra_diff_data_21

combined_data = mgra_data_21.merge(mgra_diff_data_21, on='mgra', suffixes=['_Tot', '_Diff'])
combined_data['Hispanic_Diff_Abs'] = abs(combined_data['Hispanic_Diff'])
combined_data

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,Hispanic_Diff_Abs
0,1,126,13,13
1,2,36,6,6
2,3,126,-3,3
3,4,0,0,0
4,5,19,1,1
...,...,...,...,...
24316,24317,0,0,0
24317,24318,0,-3,3
24318,24319,0,0,0
24319,24320,0,0,0


In [145]:
# Sort from lowest to highest
combined_data = combined_data.sort_values('Hispanic_Tot')

# Less than 100
combined_data_less_than_100 = combined_data[combined_data['Hispanic_Tot'] <= 100]

# Greater than 100
combined_data_greater_than_100 = combined_data[combined_data['Hispanic_Tot'] > 100]

combined_data_greater_than_100

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,Hispanic_Diff_Abs
11670,11671,101,-2,2
11804,11805,101,0,0
9191,9192,101,0,0
18962,18963,101,-2,2
19882,19883,101,1,1
...,...,...,...,...
13594,13595,1246,-46,46
19498,19499,1402,2,2
1379,1380,1435,2,2
19716,19717,1622,-5,5


## Every 30 

In [140]:
df_less_than_100 = combined_data_less_than_100
print(df_less_than_100.shape)

# Calculate the z-score of each 'Total Population' value based on similar cpas
df_less_than_100['pop_diff_zscore'] = np.nan
df_less_than_100['group_mean'] = np.nan
df_less_than_100['group_std'] = np.nan

final_df_less_than_100 = pd.DataFrame()

for pop_val in range(0,101):
    temp_df = df_less_than_100[df_less_than_100['Hispanic_Tot'] == pop_val]
    temp_df = temp_df.reset_index(drop=True)
    for i, row in temp_df.iterrows():
        print(f"Pop Num: {pop_val}")
        means = bootstrap_mean(temp_df['Hispanic_Diff_Abs'], n_bootstraps=100)
        mean_of_means = np.mean(means)
        se_of_means = np.std(means)

        zscore = (row['Hispanic_Diff_Abs'] - mean_of_means) / se_of_means
        temp_df.at[i, 'pop_diff_zscore'] = zscore
        temp_df.at[i, 'group_mean'] = round(mean_of_means, 2)
        temp_df.at[i, 'group_std'] = round(se_of_means,2)
    final_df_less_than_100 = pd.concat([final_df_less_than_100, temp_df])

(21080, 7)
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0
Pop Num: 0

In [146]:
df_greater_than_100 = combined_data_greater_than_100
print(df_greater_than_100.shape)

# Calculate the z-score of each 'Total Population' value based on similar cpas
df_greater_than_100['pop_diff_zscore'] = np.nan
df_greater_than_100['group_mean'] = np.nan
df_greater_than_100['group_std'] = np.nan

batch_size = 30

final_df_greater_than_100 = pd.DataFrame()

for i in range(0, len(df_greater_than_100), batch_size):
    temp_df = df_greater_than_100.iloc[i:i+batch_size]
    temp_df = temp_df.reset_index(drop=True)
    for i, row in temp_df.iterrows():
        print(f"Pop Num: {pop_val}")
        means = bootstrap_mean(temp_df['Hispanic_Diff_Abs'], n_bootstraps=100)
        mean_of_means = np.mean(means)
        se_of_means = np.std(means)

        zscore = (row['Hispanic_Diff_Abs'] - mean_of_means) / se_of_means
        temp_df.at[i, 'pop_diff_zscore'] = zscore
        temp_df.at[i, 'group_mean'] = round(mean_of_means, 2)
        temp_df.at[i, 'group_std'] = round(se_of_means,2)
    final_df_greater_than_100 = pd.concat([final_df_greater_than_100, temp_df])

final_df_greater_than_100

(3241, 4)
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Pop Num: 100
Po

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,Hispanic_Diff_Abs,pop_diff_zscore,group_mean,group_std
0,11671,101,-2,2,-3.092600,6.74,1.53
1,11805,101,0,0,-4.461835,6.70,1.50
2,9192,101,0,0,-4.618333,6.90,1.49
3,18963,101,-2,2,-3.040308,6.56,1.50
4,19883,101,1,1,-3.635610,6.84,1.61
...,...,...,...,...,...,...,...
26,13595,1246,-46,46,21.858354,7.20,1.77
27,19499,1402,2,2,-2.566721,7.34,2.08
28,1380,1435,2,2,-2.777368,7.44,1.96
29,19717,1622,-5,5,-1.179433,7.33,1.98


In [147]:
final_output = pd.concat([final_df_less_than_100, final_df_greater_than_100])
final_output

Unnamed: 0,mgra,Hispanic_Tot,Hispanic_Diff,Hispanic_Diff_Abs,pop_diff_zscore,group_mean,group_std
0,12161,0,0,0,-8.037967,0.11,0.01
1,7899,0,0,0,-8.766591,0.11,0.01
2,19073,0,0,0,-8.600329,0.11,0.01
3,7907,0,0,0,-10.534158,0.11,0.01
4,19070,0,0,0,-9.914229,0.11,0.01
...,...,...,...,...,...,...,...
26,13595,1246,-46,46,21.858354,7.20,1.77
27,19499,1402,2,2,-2.566721,7.34,2.08
28,1380,1435,2,2,-2.777368,7.44,1.96
29,19717,1622,-5,5,-1.179433,7.33,1.98
