# HMDA Data Testing

## TODO:
### Data Collection

### Data Cleaning
- [ ] merge like columns together, ex: 'denial_reason-1', 'denial_reason-2', 'denial_reason-3',
       'denial_reason-4'
- [X] fix interest rate column
- [X] fix loan term column

### Statistics
- [x] summary statistics table: interest_rate by race
- [x] ANOVA test: interest_rate by race

### Documentation
- [ ] data exploration and cleanup process
- [ ] other data (show the download, df load, and df) and how poor it was
- [ ] API fixes - was downloading nationwide, and it was too big for jupyterlab/pandas/computer
- [ ] print example, found problem with 'state'.value_counts()
- [ ] data exploration - print columns, print value_counts for each
- [ ] data exploration - pull out a DF and show it of just the primary columns

## Setup
-----

In [1]:
import gzip
import os
import requests
import subprocess
import pandas as pd
import numpy as np
import scipy.stats as stats
from pathlib import Path
from hmda_lib import valid_state_codes
from hmda_lib import valid_years

In [2]:
def download_hmda_data(fd, state, year):
    url = f'https://ffiec.cfpb.gov/v2/data-browser-api/view/csv?states={state}&years={year}'

    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(output_file, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        fd.write(chunk)
        return True

    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        return False

In [3]:
def compress_hmda_data(f):
    subprocess.run(['gzip', f])    

## Data Collection
-----

In [4]:
# download HMDA data from API

state = 'MN'
years = ['2018', '2019', '2020', '2021', '2022']

for year in years:
    output_file = Path('hmda_data', f'hmda-{state}-{year}.csv')
    if os.path.exists(f'{output_file}.gz'):
        print('File exists already! Skipping!')
        continue
    else:
        print(f'Downloading HMDA data for: {year} {state}.....', end='')
        download_hmda_data(output_file, state, year)
        print(' compressing.....', end='')
        compress_hmda_data(output_file)
        print(' done!')

File exists already! Skipping!
File exists already! Skipping!
File exists already! Skipping!
File exists already! Skipping!
File exists already! Skipping!


In [5]:
# load the HMDA data into Pandas DataFrames

data_path = 'hmda_data'
filenames = os.listdir(data_path)
all_dataframes = []

for filename in filenames:
    if filename.endswith('.csv.gz'):
        filepath = Path(data_path, filename)
        with gzip.open(filepath, 'rt') as file:
            df = pd.read_csv(filepath)
        all_dataframes.append(df)

unclean_df = pd.concat(all_dataframes, ignore_index=True)

  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


## Data Cleaning
-----

In [6]:
# remove null values and 'Exempt' interest rate from dataframe

df = unclean_df[unclean_df['interest_rate'].notnull()]
df = df.query('interest_rate != "Exempt"')
df = df.query('loan_to_value_ratio != "Exempt"')

In [7]:
# remove null loan terms from dataframe

df = df[df['loan_term'].notnull()]

In [8]:
# data type conversions

df['interest_rate'] = pd.to_numeric(df['interest_rate'], errors='raise')
df['loan_to_value_ratio'] = pd.to_numeric(df['loan_to_value_ratio'], errors='raise')

In [9]:
# rename values

df['derived_race'] = df['derived_race'].replace({
    'Black or African American': 'Black',
    'American Indian or Alaska Native': 'Native',
    'Native Hawaiian or Other Pacific Islander': 'Pacific Islander'
})

## Data Exploration
-----

In [10]:
df['derived_race'].value_counts()

derived_race
White                       882643
Race Not Available          288832
Asian                        47390
Black                        30486
Joint                        23764
Native                        3857
Pacific Islander              1106
2 or more minority races       879
Free Form Text Only             66
Name: count, dtype: int64

In [11]:
df.head()

Unnamed: 0,activity_year,lei,derived_msa-md,state_code,county_code,census_tract,conforming_loan_limit,derived_loan_product_type,derived_dwelling_category,derived_ethnicity,...,denial_reason-2,denial_reason-3,denial_reason-4,tract_population,tract_minority_population_percent,ffiec_msa_md_median_family_income,tract_to_msa_income_percentage,tract_owner_occupied_units,tract_one_to_four_family_homes,tract_median_age_of_housing_units
2,2018,KB1H1DSPRFMYMCUFXT09,33460,MN,27053.0,27053110000.0,NC,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,4248,16.43,93600,131,1270,1552,76
3,2018,KB1H1DSPRFMYMCUFXT09,33460,MN,27019.0,27019090000.0,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,5956,4.37,93600,108,1898,2158,28
6,2018,KB1H1DSPRFMYMCUFXT09,33460,MN,27037.0,27037060000.0,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,5631,28.61,93600,106,1589,1905,37
9,2018,KB1H1DSPRFMYMCUFXT09,33460,MN,27019.0,27019090000.0,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,4225,14.67,93600,118,1228,1327,31
13,2018,KB1H1DSPRFMYMCUFXT09,33460,MN,27037.0,27037060000.0,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,5814,39.15,93600,78,1778,1888,32


In [12]:
for c in df.columns:
    print(c)

activity_year
lei
derived_msa-md
state_code
county_code
census_tract
conforming_loan_limit
derived_loan_product_type
derived_dwelling_category
derived_ethnicity
derived_race
derived_sex
action_taken
purchaser_type
preapproval
loan_type
loan_purpose
lien_status
reverse_mortgage
open-end_line_of_credit
business_or_commercial_purpose
loan_amount
loan_to_value_ratio
interest_rate
rate_spread
hoepa_status
total_loan_costs
total_points_and_fees
origination_charges
discount_points
lender_credits
loan_term
prepayment_penalty_term
intro_rate_period
negative_amortization
interest_only_payment
balloon_payment
other_nonamortizing_features
property_value
construction_method
occupancy_type
manufactured_home_secured_property_type
manufactured_home_land_property_interest
total_units
multifamily_affordable_units
income
debt_to_income_ratio
applicant_credit_score_type
co-applicant_credit_score_type
applicant_ethnicity-1
applicant_ethnicity-2
applicant_ethnicity-3
applicant_ethnicity-4
applicant_ethnicit

In [13]:
for c in df.columns:
    print(f'Examining column: {c}')
    print(df[c].value_counts())
    print()

Examining column: activity_year
activity_year
2020    352175
2021    340417
2019    226539
2018    188313
2022    171579
Name: count, dtype: int64

Examining column: lei
lei
6BYL5QZYBDK8S7L73M02    117008
KB1H1DSPRFMYMCUFXT09     90161
549300WYBPIWKK6SQC06     56778
549300FGXN1K3HLB1R50     55076
549300HW662MN1WU8550     33033
                         ...  
5493008WOGGD8641UC13         1
549300PEWJWBNEOSS013         1
549300UXZ76FJVM3UC43         1
549300LTT1PNSPD2JT50         1
5493006KQF5UKZ3Y9N07         1
Name: count, Length: 1018, dtype: int64

Examining column: derived_msa-md
derived_msa-md
33460    922913
99999    194118
40340     46655
20260     40437
41060     37052
22020     14455
31860     13873
24220      4538
29100      3301
0          1662
38060         3
36084         2
12980         1
41700         1
16974         1
28140         1
36740         1
47300         1
38940         1
23580         1
19740         1
39580         1
36100         1
32580         1
31084       

## Statistics Summaries
-----

In [14]:
# Statistics Summary Table - Interest Rates by Race
race_group = df.groupby('derived_race')

summary_table = pd.DataFrame({
    "Mean Interest Rate": race_group['interest_rate'].mean(),
    "Median Interest Rate": race_group['interest_rate'].median(),
    "Interest Rate Variance": race_group['interest_rate'].var(),
    "Interest Rate Std. Dev.": race_group['interest_rate'].std(),
    "Interest Rate Std. Err.": race_group['interest_rate'].sem()
})

summary_table

Unnamed: 0_level_0,Mean Interest Rate,Median Interest Rate,Interest Rate Variance,Interest Rate Std. Dev.,Interest Rate Std. Err.
derived_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2 or more minority races,3.947778,3.625,2.205659,1.485146,0.050093
Asian,3.687475,3.375,1.426056,1.194176,0.005486
Black,3.820617,3.5,10.695898,3.270458,0.018731
Free Form Text Only,4.121212,3.75,2.153253,1.467397,0.180624
Joint,3.803048,3.5,1.382663,1.175867,0.007628
Native,3.94349,3.625,2.024991,1.423022,0.022913
Pacific Islander,3.903947,3.625,1.77162,1.331022,0.040023
Race Not Available,4.013744,3.625,425.640704,20.631062,0.038388
White,3.788169,3.5,63.506752,7.969112,0.008482


In [15]:
# Statistics Summary Table - Loan Amount by Race
race_group = df.groupby('derived_race')

summary_table = pd.DataFrame({
    "Mean Loan Amount": race_group['loan_amount'].mean(),
    "Median Loan Amount": race_group['loan_amount'].median(),
    "Loan Amount Variance": race_group['loan_amount'].var(),
    "Loan Amount Std. Dev.": race_group['loan_amount'].std(),
    "Loan Amount Std. Err.": race_group['loan_amount'].sem()
})

summary_table

Unnamed: 0_level_0,Mean Loan Amount,Median Loan Amount,Loan Amount Variance,Loan Amount Std. Dev.,Loan Amount Std. Err.
derived_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2 or more minority races,222792.94653,215000.0,15097630000.0,122872.4,4144.383518
Asian,259132.306394,245000.0,21599100000.0,146966.3,675.109934
Black,233876.533491,225000.0,16725600000.0,129327.5,740.697008
Free Form Text Only,177121.212121,160000.0,19140050000.0,138347.6,17029.407105
Joint,263156.455142,245000.0,30467220000.0,174548.6,1132.287305
Native,202814.363495,185000.0,17107700000.0,130796.4,2106.060973
Pacific Islander,188860.759494,165000.0,127509500000.0,357084.7,10737.266542
Race Not Available,299401.347496,225000.0,1590348000000.0,1261090.0,2346.515169
White,231484.173103,205000.0,25255680000.0,158920.3,169.155828


In [16]:
# Statistics Summary Table - loan_to_value_ratio by Race
race_group = df.groupby('derived_race')

summary_table = pd.DataFrame({
    "Mean Loan to Value Ratio": race_group['loan_to_value_ratio'].mean(),
    "Median Loan to Value Ratio": race_group['loan_to_value_ratio'].median(),
    "Loan to Value Ratio Variance": race_group['loan_to_value_ratio'].var(),
    "Loan to Value Ratio Std. Dev.": race_group['loan_to_value_ratio'].std(),
    "Loan to Value Ratio Std. Err.": race_group['loan_to_value_ratio'].sem()
})

summary_table

Unnamed: 0_level_0,Mean Loan to Value Ratio,Median Loan to Value Ratio,Loan to Value Ratio Variance,Loan to Value Ratio Std. Dev.,Loan to Value Ratio Std. Err.
derived_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2 or more minority races,83.870818,90.0,326.2467,18.062301,0.639399
Asian,78.711334,80.0,332.1952,18.226223,0.087076
Black,84.992076,90.001,308.8362,17.573736,0.10573
Free Form Text Only,74.098766,79.615,481.2053,21.936393,2.742049
Joint,77.88171,80.0,326.0366,18.056485,0.121906
Native,80.468634,81.875,372.5801,19.302335,0.328339
Pacific Islander,76.671769,79.9615,406.2736,20.15623,0.634233
Race Not Available,73.672802,77.0,363.4651,19.06476,0.053932
White,99.269989,79.764,400962500.0,20024.049157,22.232933


## Hypothesis Testing
-----

### ANOVA Tests

In [17]:
# Extract individual groups
group0 = df[df["derived_race"].str.fullmatch('White')]["interest_rate"]
group1 = df[df["derived_race"].str.fullmatch('Race Not Available')]["interest_rate"]
group2 = df[df["derived_race"].str.fullmatch('Asian')]["interest_rate"]
group3 = df[df["derived_race"].str.fullmatch('Joint')]["interest_rate"]
group4 = df[df["derived_race"].str.fullmatch('Black')]["interest_rate"]
group5 = df[df["derived_race"].str.fullmatch('Native')]["interest_rate"]
group6 = df[df["derived_race"].str.fullmatch('2 or more minority races')]["interest_rate"]
group7 = df[df["derived_race"].str.fullmatch('Pacific Islander')]["interest_rate"]
group8 = df[df["derived_race"].str.fullmatch('Free Form Text Only')]["interest_rate"]

# Perform the ANOVA test
stats.f_oneway(group0, group1, group2, group3, group4, group5, group6, group7, group8)

F_onewayResult(statistic=10.948260161132053, pvalue=1.4375699707682176e-15)

#### Interpretation
A p-value of 1.4375699707682176e-15 (approximately 1.44 x 10^-15) is extremely small. In the context of statistical hypothesis testing, this indicates:

Strong Evidence Against the Null Hypothesis: The observed results are highly unlikely to have occurred by random chance alone if the null hypothesis (the assumption of no effect or no difference) were true.
Statistical Significance: The result is highly statistically significant. This means you can confidently reject the null hypothesis and conclude that there is a significant effect or difference present in your data.

Source: Google Gemini

In [18]:
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################


In [24]:
# Set the maximum number of columns to None (displays all columns)
pd.options.display.max_columns = None


In [26]:
# Only use a copy of df
# do Not mess with original df past this point

map_df = df.copy()

print(map_df)
# 1279023 rows x 99 columns


         activity_year                   lei  derived_msa-md state_code  \
2                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
3                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
6                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
9                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
13                2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
...                ...                   ...             ...        ...   
1787485           2022  549300RBJCM5B02O5U05           33460         MN   
1787487           2022  549300RBJCM5B02O5U05           99999         MN   
1787488           2022  549300RBJCM5B02O5U05           99999         MN   
1787490           2022  549300RBJCM5B02O5U05           99999         MN   
1787491           2022  549300Y7HCU10KIRLG30           99999         MN   

         county_code  census_tract conforming_loan_limit  \
2            27053.0  2.705311e+10     

In [27]:
# ##################
# #  What are those outliers ?
# ##################

##########  Can we throw away the three highest "interest_rate" outliers 

print(map_df)
# 1279023 rows x 99 columns

clean_df = map_df.copy()

non_outlier_df = clean_df[ clean_df['interest_rate'] <= 20 ]

map_df = non_outlier_df.copy()

print(map_df)
# 1278959 rows x 99 columns


         activity_year                   lei  derived_msa-md state_code  \
2                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
3                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
6                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
9                 2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
13                2018  KB1H1DSPRFMYMCUFXT09           33460         MN   
...                ...                   ...             ...        ...   
1787485           2022  549300RBJCM5B02O5U05           33460         MN   
1787487           2022  549300RBJCM5B02O5U05           99999         MN   
1787488           2022  549300RBJCM5B02O5U05           99999         MN   
1787490           2022  549300RBJCM5B02O5U05           99999         MN   
1787491           2022  549300Y7HCU10KIRLG30           99999         MN   

         county_code  census_tract conforming_loan_limit  \
2            27053.0  2.705311e+10     

In [28]:
# ##################
# #  Something wrong with '2018', '2019' data ... dont know what it is yet ... so dropping data for those years
# ##################

# print(map_df)
# # 1278959 rows x 99 columns

# Define the list of valid years
valid_years = [2020, 2021, 2022]

# Filter rows where 'year' is in the list of valid years
new_df = map_df[map_df['activity_year'].isin(valid_years)].copy()

map_df = new_df.copy()

print(map_df)
# 864168 rows x 99 columns


         activity_year                   lei  derived_msa-md state_code  \
586715            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
586716            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
586717            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
586718            2020  AD6GFRVSDT01YPT1CS68           99999         MN   
586719            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
...                ...                   ...             ...        ...   
1787485           2022  549300RBJCM5B02O5U05           33460         MN   
1787487           2022  549300RBJCM5B02O5U05           99999         MN   
1787488           2022  549300RBJCM5B02O5U05           99999         MN   
1787490           2022  549300RBJCM5B02O5U05           99999         MN   
1787491           2022  549300Y7HCU10KIRLG30           99999         MN   

         county_code  census_tract conforming_loan_limit  \
586715       27123.0  2.712304e+10     

In [29]:
# ########## explore derived_dwelling_category

der_dwelling_category_counts = map_df['derived_dwelling_category'].value_counts()

# Display the result
print(der_dwelling_category_counts)


derived_dwelling_category
Single Family (1-4 Units):Site-Built      855430
Single Family (1-4 Units):Manufactured      6637
Multifamily:Site-Built                      2049
Multifamily:Manufactured                      52
Name: count, dtype: int64


In [30]:

# ########## restrict analysis to ...
# ##########     derived_dwelling_category = "Single Family (1-4 Units):Site-Built"

temp_df = map_df[ (map_df['derived_dwelling_category'] == "Single Family (1-4 Units):Site-Built") ].copy()

map_df = temp_df.copy()

# clear out temp_df
temp_df = pd.DataFrame()

print(map_df)
# 855430 rows x 99 columns


         activity_year                   lei  derived_msa-md state_code  \
586715            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
586716            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
586717            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
586718            2020  AD6GFRVSDT01YPT1CS68           99999         MN   
586719            2020  AD6GFRVSDT01YPT1CS68           33460         MN   
...                ...                   ...             ...        ...   
1787484           2022  549300RBJCM5B02O5U05           33460         MN   
1787485           2022  549300RBJCM5B02O5U05           33460         MN   
1787487           2022  549300RBJCM5B02O5U05           99999         MN   
1787488           2022  549300RBJCM5B02O5U05           99999         MN   
1787490           2022  549300RBJCM5B02O5U05           99999         MN   

         county_code  census_tract conforming_loan_limit  \
586715       27123.0  2.712304e+10     

In [31]:
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################


In [32]:
county_data = {
	27173: {"county": "Wadena County"       , "state": "Minnesota", "latitude": 46.6532, "longitude": -94.9907},
	27155: {"county": "Todd County"         , "state": "Minnesota", "latitude": 46.0207, "longitude": -94.8968},
	27149: {"county": "St. Louis County"    , "state": "Minnesota", "latitude": 47.5207, "longitude": -92.5207},
	27145: {"county": "Sherburne County"    , "state": "Minnesota", "latitude": 45.4658, "longitude": -93.7478},
	27127: {"county": "Pennington County"   , "state": "Minnesota", "latitude": 48.0653, "longitude": -96.0725},
	27089: {"county": "Mahnomen County"     , "state": "Minnesota", "latitude": 47.3322, "longitude": -95.8209},
	27087: {"county": "Marshall County"     , "state": "Minnesota", "latitude": 48.3627, "longitude": -96.3625},
	27073: {"county": "Kittson County"      , "state": "Minnesota", "latitude": 48.7684, "longitude": -96.8188},
	27071: {"county": "Koochiching County"  , "state": "Minnesota", "latitude": 48.2524, "longitude": -93.7832},
	27065: {"county": "Lake County"         , "state": "Minnesota", "latitude": 47.6172, "longitude": -91.4177},
	27055: {"county": "Hubbard County"      , "state": "Minnesota", "latitude": 47.0705, "longitude": -94.9281},
	27033: {"county": "Cook County"         , "state": "Minnesota", "latitude": 47.9349, "longitude": -90.4108},
	27031: {"county": "Cook County"         , "state": "Minnesota", "latitude": 47.9349, "longitude": -90.4108},
	27029: {"county": "Clearwater County"   , "state": "Minnesota", "latitude": 47.5738, "longitude": -95.3641},
	27023: {"county": "Carlton County"      , "state": "Minnesota", "latitude": 46.5567, "longitude": -92.4995},
	27021: {"county": "Clearwater County"   , "state": "Minnesota", "latitude": 47.5738, "longitude": -95.3641},
	27007: {"county": "Beltrami County"     , "state": "Minnesota", "latitude": 47.9439, "longitude": -94.8502},
	27169: {"county": "Steele County"       , "state": "Minnesota", "latitude": 44.0275, "longitude": -93.2079},
	27167: {"county": "Steele County"       , "state": "Minnesota", "latitude": 44.0275, "longitude": -93.2079},
	27153: {"county": "Stearns County"      , "state": "Minnesota", "latitude": 45.5798, "longitude": -94.5979},
	27147: {"county": "Stearns County"      , "state": "Minnesota", "latitude": 45.5798, "longitude": -94.5979},
	27137: {"county": "Polk County"         , "state": "Minnesota", "latitude": 47.7506, "longitude": -96.0039},
	27135: {"county": "Rice County"         , "state": "Minnesota", "latitude": 44.3355, "longitude": -93.283 },
	27133: {"county": "Rice County"         , "state": "Minnesota", "latitude": 44.3355, "longitude": -93.283 },
	27121: {"county": "Otter Tail County"   , "state": "Minnesota", "latitude": 46.3703, "longitude": -95.6982},
	27117: {"county": "Norman County"       , "state": "Minnesota", "latitude": 47.4578, "longitude": -96.4347},
	27103: {"county": "Morrison County"     , "state": "Minnesota", "latitude": 46.0204, "longitude": -94.2214},
	27097: {"county": "Morrison County"     , "state": "Minnesota", "latitude": 46.0204, "longitude": -94.2214},
	27095: {"county": "Mille Lacs County"   , "state": "Minnesota", "latitude": 45.8363, "longitude": -93.5926},
	27093: {"county": "Meeker County"       , "state": "Minnesota", "latitude": 45.1159, "longitude": -94.5128},
	27085: {"county": "McLeod County"       , "state": "Minnesota", "latitude": 44.8994, "longitude": -94.3233},
	27083: {"county": "Martin County"       , "state": "Minnesota", "latitude": 43.6711, "longitude": -94.6015},
	27077: {"county": "Le Sueur County"     , "state": "Minnesota", "latitude": 44.3632, "longitude": -93.6814},
	27075: {"county": "Le Sueur County"     , "state": "Minnesota", "latitude": 44.3632, "longitude": -93.6814},
	27061: {"county": "Kanabec County"      , "state": "Minnesota", "latitude": 45.9137, "longitude": -93.3555},
	27059: {"county": "Isanti County"       , "state": "Minnesota", "latitude": 45.5549, "longitude": -93.3142},
	27049: {"county": "Grant County"        , "state": "Minnesota", "latitude": 45.9657, "longitude": -96.0174},
	27037: {"county": "Crow Wing County"    , "state": "Minnesota", "latitude": 46.5396, "longitude": -94.0643},
	27027: {"county": "Clay County"         , "state": "Minnesota", "latitude": 46.9016, "longitude": -96.4115},
	27019: {"county": "Carver County"       , "state": "Minnesota", "latitude": 44.8223, "longitude": -93.7808},
	27017: {"county": "Brown County"        , "state": "Minnesota", "latitude": 44.2388, "longitude": -94.7238},
	27015: {"county": "Brown County"        , "state": "Minnesota", "latitude": 44.2388, "longitude": -94.7238},
	27013: {"county": "Blue Earth County"   , "state": "Minnesota", "latitude": 44.0601, "longitude": -94.1017},
	27009: {"county": "Benton County"       , "state": "Minnesota", "latitude": 45.6859, "longitude": -94.0297},
	27005: {"county": "Becker County"       , "state": "Minnesota", "latitude": 46.9818, "longitude": -95.7989},
	27001: {"county": "Aitkin County"       , "state": "Minnesota", "latitude": 46.6742, "longitude": -93.4087},
	27143: {"county": "Scott County"        , "state": "Minnesota", "latitude": 44.6618, "longitude": -93.5461},
	27129: {"county": "Ramsey County"       , "state": "Minnesota", "latitude": 45.0134, "longitude": -93.0966},
	27057: {"county": "Hennepin County"     , "state": "Minnesota", "latitude": 44.9244, "longitude": -93.4226},
	27053: {"county": "Hennepin County"     , "state": "Minnesota", "latitude": 44.9244, "longitude": -93.4226},
	27039: {"county": "Dakota County"       , "state": "Minnesota", "latitude": 44.6764, "longitude": -93.0623},
	27003: {"county": "Anoka County"        , "state": "Minnesota", "latitude": 45.3311, "longitude": -93.2778},
	27171: {"county": "Traverse County"     , "state": "Minnesota", "latitude": 45.8237, "longitude": -96.4295},
	27165: {"county": "Swift County"        , "state": "Minnesota", "latitude": 45.1673, "longitude": -95.7552},
	27163: {"county": "Sibley County"       , "state": "Minnesota", "latitude": 44.5745, "longitude": -94.2046},
	27161: {"county": "Waseca County"       , "state": "Minnesota", "latitude": 44.0246, "longitude": -93.5479},
	27159: {"county": "Wabasha County"      , "state": "Minnesota", "latitude": 44.3674, "longitude": -92.1205},
	27157: {"county": "Swift County"        , "state": "Minnesota", "latitude": 45.1673, "longitude": -95.7552},
	27151: {"county": "Stevens County"      , "state": "Minnesota", "latitude": 45.6001, "longitude": -96.0772},
	27141: {"county": "Rock County"         , "state": "Minnesota", "latitude": 43.6773, "longitude": -96.2364},
	27139: {"county": "Renville County"     , "state": "Minnesota", "latitude": 44.7549, "longitude": -94.8989},
	27131: {"county": "Redwood County"      , "state": "Minnesota", "latitude": 44.3899, "longitude": -95.2289},
	27125: {"county": "Pipestone County"    , "state": "Minnesota", "latitude": 43.8584, "longitude": -96.3087},
	27123: {"county": "Pipestone County"    , "state": "Minnesota", "latitude": 43.8584, "longitude": -96.3087},
	27119: {"county": "Olmsted County"      , "state": "Minnesota", "latitude": 44.0216, "longitude": -92.4544},
	27115: {"county": "Nobles County"       , "state": "Minnesota", "latitude": 43.6826, "longitude": -95.7634},
	27113: {"county": "Murray County"       , "state": "Minnesota", "latitude": 44.0036, "longitude": -95.7545},
	27111: {"county": "Murray County"       , "state": "Minnesota", "latitude": 44.0036, "longitude": -95.7545},
	27109: {"county": "Mower County"        , "state": "Minnesota", "latitude": 43.6663, "longitude": -92.7798},
	27107: {"county": "Nicollet County"     , "state": "Minnesota", "latitude": 44.4137, "longitude": -94.1644},
	27105: {"county": "Nobles County"       , "state": "Minnesota", "latitude": 43.6826, "longitude": -95.7634},
	27101: {"county": "Murray County"       , "state": "Minnesota", "latitude": 44.0036, "longitude": -95.7545},
	27099: {"county": "Nicollet County"     , "state": "Minnesota", "latitude": 44.4137, "longitude": -94.1644},
	27091: {"county": "Martin County"       , "state": "Minnesota", "latitude": 43.6711, "longitude": -94.6015},
	27081: {"county": "Lincoln County"      , "state": "Minnesota", "latitude": 44.3737, "longitude": -96.3093},
	27079: {"county": "Lyon County"         , "state": "Minnesota", "latitude": 44.4072, "longitude": -95.7508},
	27069: {"county": "Kandiyohi County"    , "state": "Minnesota", "latitude": 45.1796, "longitude": -95.0212},
	27067: {"county": "Lac qui Parle County", "state": "Minnesota", "latitude": 44.9895, "longitude": -96.0875},
	27063: {"county": "Jackson County"      , "state": "Minnesota", "latitude": 43.6594, "longitude": -95.1528},
	27051: {"county": "Goodhue County"      , "state": "Minnesota", "latitude": 44.4201, "longitude": -92.7238},
	27047: {"county": "Freeborn County"     , "state": "Minnesota", "latitude": 43.6883, "longitude": -93.3484},
	27045: {"county": "Fillmore County"     , "state": "Minnesota", "latitude": 43.6318, "longitude": -92.2454},
	27043: {"county": "Faribault County"    , "state": "Minnesota", "latitude": 43.6767, "longitude": -93.9497},
	27041: {"county": "Dodge County"        , "state": "Minnesota", "latitude": 44.0193, "longitude": -92.8542},
	27035: {"county": "Cottonwood County"   , "state": "Minnesota", "latitude": 43.8877, "longitude": -95.2054},
	27025: {"county": "Chippewa County"     , "state": "Minnesota", "latitude": 45.0386, "longitude": -95.5472},
	27011: {"county": "Big Stone County"    , "state": "Minnesota", "latitude": 45.4219, "longitude": -96.4086}
}


def get_county_info(fips_code):
    typeString = type(fips_code)
    if fips_code in county_data:
        county_info = county_data[fips_code]
        county_name = county_info["county"]
        # state = county_info["state"]
        latitude = county_info["latitude"]
        longitude = county_info["longitude"]
        return county_name, latitude, longitude
        # return county_name, state, latitude, longitude

    else:
        return None, None, None
        # return None, None, None, None


In [33]:
# ####################################################################################################
# ####################################################################################################

In [34]:

# only two columns are of interest
#
clean_df = map_df[["county_code", "interest_rate"]].copy()

# Check the updated DataFrame
print(clean_df)
# 855430 rows x 2 columns

# remove rows where 'interest_rate' is not numeric
clean_df['interest_rate'] = pd.to_numeric(clean_df['interest_rate'], errors='coerce')

# Drop rows with NaN values in any column
clean_df = clean_df.dropna()

# Check the updated DataFrame
print(clean_df)
# 854609 rows x 2 columns


         county_code  interest_rate
586715       27123.0          3.000
586716       27053.0          3.250
586717       27163.0          3.500
586718       27035.0          2.750
586719       27139.0          3.000
...              ...            ...
1787484      27053.0          5.240
1787485      27123.0          4.000
1787487      27091.0          4.990
1787488      27173.0          7.125
1787490      27033.0          5.875

[855430 rows x 2 columns]
         county_code  interest_rate
586715       27123.0          3.000
586716       27053.0          3.250
586717       27163.0          3.500
586718       27035.0          2.750
586719       27139.0          3.000
...              ...            ...
1787484      27053.0          5.240
1787485      27123.0          4.000
1787487      27091.0          4.990
1787488      27173.0          7.125
1787490      27033.0          5.875

[854609 rows x 2 columns]


In [35]:

# Group by 'county_code' ... and calculate the mean of 'interest_rate' for each group
mean_interest_rate_by_county = clean_df.groupby('county_code')['interest_rate'].mean()

# Convert the result to a DataFrame for better readability
mean_interest_rate_by_county_df = mean_interest_rate_by_county.reset_index()

# Rename the 'interest_rate' column to 'mean_interest_rate'
mean_interest_rate_by_county_df = mean_interest_rate_by_county_df.rename(columns={'interest_rate': 'mean_interest_rate'})

# Check the updated DataFrame
print(mean_interest_rate_by_county_df)
# 87 rows x 2 columns


    county_code  mean_interest_rate
0       27001.0            3.509389
1       27003.0            3.442619
2       27005.0            3.343704
3       27007.0            3.485990
4       27009.0            3.442437
..          ...                 ...
82      27165.0            3.537129
83      27167.0            3.594771
84      27169.0            3.411404
85      27171.0            3.435158
86      27173.0            3.554130

[87 rows x 2 columns]


In [36]:

# fix "county_code" values , such as "27001.0" to be "27001"
mean_interest_rate_by_county_df['county_code'] = mean_interest_rate_by_county_df['county_code'].astype(int)

# add three new columns ... will populate these further below
mean_interest_rate_by_county_df['lat'] = ""
mean_interest_rate_by_county_df['lon'] = ""
mean_interest_rate_by_county_df['county_name'] = ""

# Check the updated DataFrame
print(mean_interest_rate_by_county_df)
# 87 rows x 5 columns


    county_code  mean_interest_rate lat lon county_name
0         27001            3.509389                    
1         27003            3.442619                    
2         27005            3.343704                    
3         27007            3.485990                    
4         27009            3.442437                    
..          ...                 ...  ..  ..         ...
82        27165            3.537129                    
83        27167            3.594771                    
84        27169            3.411404                    
85        27171            3.435158                    
86        27173            3.554130                    

[87 rows x 5 columns]


In [37]:

# for all rows , add "latitude", "longitude", "county_name" from table defined further above
#
for index, row in mean_interest_rate_by_county_df.iterrows():
    county_code = row['county_code']
    
    # Call get_county_info function to get latitude, longitude, county_name for county_code
    county_name, latitude, longitude = get_county_info(county_code)
    
    # Update 'lat', 'lon', 'county_name' columns fetched values
    mean_interest_rate_by_county_df.at[index, 'lat'] = latitude
    mean_interest_rate_by_county_df.at[index, 'lon'] = longitude
    mean_interest_rate_by_county_df.at[index, 'county_name'] = county_name

# Check the updated DataFrame
print(mean_interest_rate_by_county_df)
# 87 rows x 5 columns


    county_code  mean_interest_rate      lat      lon      county_name
0         27001            3.509389  46.6742 -93.4087    Aitkin County
1         27003            3.442619  45.3311 -93.2778     Anoka County
2         27005            3.343704  46.9818 -95.7989    Becker County
3         27007            3.485990  47.9439 -94.8502  Beltrami County
4         27009            3.442437  45.6859 -94.0297    Benton County
..          ...                 ...      ...      ...              ...
82        27165            3.537129  45.1673 -95.7552     Swift County
83        27167            3.594771  44.0275 -93.2079    Steele County
84        27169            3.411404  44.0275 -93.2079    Steele County
85        27171            3.435158  45.8237 -96.4295  Traverse County
86        27173            3.554130  46.6532 -94.9907    Wadena County

[87 rows x 5 columns]


In [38]:
import hvplot.pandas


In [39]:

# Configure the map plot for Minnesota County Mean Interest Rate
#
mn_map_plot = mean_interest_rate_by_county_df.hvplot.points(
    "lon",
    "lat",
    geo   = True,
    tiles = "OSM",                                           # Set the background map image
    title = "Mean Interest Rate (%) by Minnesota County",    # Set the title of the plot
    width  = 600,
    height = 600,
    fontsize = 15,

    color = "mean_interest_rate",
    cmap = ["darkgreen", "green", "red", "darkred"],    # Set the colormap as a list of colors
    
    size="mean_interest_rate",                          # Set the size of each dot based on mean_interest_rate
    scale=5,

    hover_cols=["county_name","county_code"]
)

# Display the map plot
mn_map_plot


  "class": algorithms.Blowfish,


In [None]:
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
# ####################################################################################################
