# Quantitative analysis: which distance metric is best?
---  
Methodology
* 1 Curate 4 meaningful feature sets (Applicable combinations of features)
* 2 Stratify the original dataset to pick 30 target districts to run nearest neighbor on (anchor districts)
* 3 Find neighbors for combinations of anchor district, feature sets, and distance metrics (30*4*4=480)
* 4 Examine variance of standardized columns (standardize all columns to get all on same scale)

Importing necessary packages, lists, and functions 

In [4]:
#Importing necessary packages
# `os` must be imported before os.chdir() is called below; otherwise a fresh
# kernel raises NameError on the first line of this cell.
import os
import pandas as pd
import numpy as np
# The plotting API lives in matplotlib.pyplot; aliasing the top-level
# `matplotlib` package as `plt` would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt

# Switch the working directory before the project-local imports that follow
# (presumably so those modules resolve from this folder - confirm).
# NOTE: this is a machine-specific absolute path; adjust when running elsewhere.
current_directory = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\4_Data_Modeling\4.2 Nearest Neighbor Model Validation\1. Nearest Neighbors Model"
os.chdir(current_directory)

#Importing demographic buckets
from Demographic_Buckets import student_teacher_ratio
from Demographic_Buckets import student_count
from Demographic_Buckets import staff_count
from Demographic_Buckets import race_ethnicity_percent
from Demographic_Buckets import economically_disadvantaged
from Demographic_Buckets import special_ed_504
from Demographic_Buckets import language_education_percent
from Demographic_Buckets import special_populations_percent
from Demographic_Buckets import gifted_students
from Demographic_Buckets import district_identifiers

#Importing modeling functions 
from KNN_Model import calculate_missing_percentage
from KNN_Model import drop_columns
from KNN_Model import preprocess_data
from KNN_Model import knn_distance
from KNN_Model import knn_cosine
from KNN_Model import knn_canberra
from KNN_Model import find_nearest_districts
from KNN_Model import get_neighbor_data

#Importing diagnostic plot functions 
from KNN_Diagnostic_Plots import plot_texas_districts
from KNN_Diagnostic_Plots import plot_race_ethnicity_stacked_bar
from KNN_Diagnostic_Plots import plot_class_size_k6_bar
from KNN_Diagnostic_Plots import plot_special_ed_504_bar

Reading in Data & Cleaning

In [7]:
#Reading in the data
df = pd.read_csv(r"https://raw.githubusercontent.com/RiceD2KLab/HERC_Sp25/refs/heads/main/0_Datasets/1.0MergedData/merged_2023.csv")

# Keep only rows whose charter flag is 'N'. The explicit .copy() makes `df` an
# independent frame, so the in-place column assignment below does not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['Charter School (Y/N)'] == 'N'].copy()

# Select only numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# Replace negative values with NaN only in numeric columns
df[numeric_cols] = df[numeric_cols].mask(df[numeric_cols] < 0, np.nan)

# Build the demographic subset AFTER cleaning so it contains the NaN-masked
# values rather than the raw negative entries.
demographic_df = df[student_teacher_ratio + student_count + staff_count + race_ethnicity_percent + economically_disadvantaged +
                    special_ed_504 + language_education_percent + special_populations_percent + gifted_students +
                    district_identifiers]

Step 1: Curate 4 meaningful feature sets:
* Basic 
* Demographic heavy feature set 
* Support Services heavy feature set 
* All features 

In [8]:
# Column groups reused by more than one feature set. They are joined with `+`
# exactly as before (assumes each bucket is a list of column names - confirm
# against Demographic_Buckets).
_basic_cols = student_teacher_ratio + student_count + staff_count
_language_and_special_pops = language_education_percent + special_populations_percent

# Maps a feature-set label to the list of columns used for the neighbor search.
feature_sets = {
    # Ratio and head-count buckets only
    'basic': _basic_cols,

    # Race/ethnicity, economic, language and special-population buckets
    'demographics': (
        race_ethnicity_percent
        + economically_disadvantaged
        + _language_and_special_pops
    ),

    # Special-ed/504 and gifted buckets plus language and special populations
    'support_services': (
        special_ed_504
        + gifted_students
        + _language_and_special_pops
    ),

    # Every bucket, in the same column order as the original definition
    'all_features': (
        _basic_cols
        + race_ethnicity_percent
        + economically_disadvantaged
        + special_ed_504
        + _language_and_special_pops
        + gifted_students
    ),
}

Step 2: Select 30 anchor districts.  
Anchor districts are chosen by stratified sampling across the entire 2023 dataset: one district is drawn at random from each (Region, TEA Description) group, and 30 of those districts are then drawn at random. Stratification takes a representative sample of a whole dataframe based on specified columns. It works because it keeps the sample small enough to compute on, yet diverse enough that the results can be trusted.
   
Districts are stratified based on
* TEA Description: 8 categories describing the type of area a district is in (Suburban, Urban, etc.)
* Region: 1-20 major state/region groupings determined by the TEA (San Antonio, Houston, etc) 

In [9]:
# Number of anchor districts to keep for the comparison
n_anchor_districts = 30

# Strip quotes and whitespace from REGION column if needed
df['REGION'] = df['REGION'].str.strip().str.replace("'", "")

# One stratum per (REGION, TEA Description) combination
grouped = df.groupby(['REGION', 'TEA Description'])

# Sample 1 district from each stratum. Iterating over the groups and
# concatenating replaces groupby.apply, which emits a DeprecationWarning when
# it operates on the grouping columns. Each group is still sampled with its own
# random_state=42 and groups are visited in sorted key order, so the selected
# rows and their order are unchanged.
anchor_districts = pd.concat(
    [group.sample(1, random_state=42) for _, group in grouped],
    ignore_index=True,
)

# Optional: Limit to n_anchor_districts districts for feasibility
if len(anchor_districts) > n_anchor_districts:
    anchor_df = anchor_districts.sample(n=n_anchor_districts, random_state=42)
else:
    anchor_df = anchor_districts

# Get district IDs
anchor_ids = anchor_df['DISTRICT_id'].tolist()



  anchor_districts = grouped.apply(lambda x: x.sample(1, random_state=42)).reset_index(drop=True)


Step 3: Loop over distance metrics + feature sets 
* Iterate over each feature set (step 1)
* For each feature set, try each distance metric 
* For each combo, run find_nearest_districts() on every anchor district 
* Collect the neighbor district ids and names

In [10]:
# Distance metrics under comparison
distance_metrics = ['euclidean', 'manhattan', 'mahalanobis', 'cosine']

# One record per (anchor district, feature set, distance metric) that succeeds
results = []

for set_label, set_columns in feature_sets.items():  # every feature set
    for dist_metric in distance_metrics:  # every distance metric
        for district in anchor_ids:  # every anchor district
            try:
                # 5 and "median" are passed positionally; presumably the
                # neighbor count and the imputation strategy - confirm
                # against KNN_Model.find_nearest_districts.
                nearest = find_nearest_districts(df, district, set_columns, 5, dist_metric, "median")
                # Flatten the result into a plain record for easy tabulation
                record = {
                    'anchor_id': district,
                    'feature_set': set_label,
                    'distance_metric': dist_metric,
                    'neighbor_ids': list(nearest['DISTRICT_id']),
                    'neighbor_distname': list(nearest['DISTNAME']),
                }
                results.append(record)
            except Exception as err:
                # Best effort: report the failing combination and keep going
                print(f"Failed for anchor {district} with {dist_metric} on {set_label}: {err}")

neighbor_results = pd.DataFrame(results)
neighbor_results


Unnamed: 0,anchor_id,feature_set,distance_metric,neighbor_ids,neighbor_distname
0,233901,basic,euclidean,"[233901, 15911, 101924, 227913, 159901]","[SAN FELIPE-DEL RIO CISD, EAST CENTRAL ISD, SH..."
1,13901,basic,euclidean,"[13901, 146906, 84908, 187907, 71904]","[BEEVILLE ISD, LIBERTY ISD, HITCHCOCK ISD, LIV..."
2,34905,basic,euclidean,"[34905, 127901, 157901, 183901, 34907]","[LINDEN-KILDARE CISD, ANSON ISD, MASON ISD, BE..."
3,227901,basic,euclidean,"[227901, 170902, 79907, 220905, 101902]","[AUSTIN ISD, CONROE ISD, FORT BEND ISD, FORT W..."
4,158901,basic,euclidean,"[158901, 15909, 247903, 161906, 247901]","[BAY CITY ISD, SOMERSET ISD, LA VERNIA ISD, LA..."
...,...,...,...,...,...
475,94902,all_features,cosine,"[94902, 46902, 20908, 43907, 199901]","[SCHERTZ-CIBOLO-U CITY ISD, COMAL ISD, PEARLAN..."
476,15907,all_features,cosine,"[15907, 31901, 101911, 101903, 240903]","[SAN ANTONIO ISD, BROWNSVILLE ISD, GOOSE CREEK..."
477,123910,all_features,cosine,"[123910, 57907, 212905, 246906, 57910]","[BEAUMONT ISD, DUNCANVILLE ISD, TYLER ISD, HUT..."
478,39902,all_features,cosine,"[39902, 102906, 221904, 161918, 234903]","[HENRIETTA ISD, ELYSIAN FIELDS ISD, MERKEL ISD..."


Step 4: Examine Variance of Standardized Features

* Standardization: To make variance comparisons fair across features, we standardized all columns used in any feature set across the entire dataset using StandardScaler.

* Intra-Group Variance Calculation: For each neighbor group, we computed the average variance across its selected features. This serves as a proxy for how “tight” or internally similar each group is.

* Distance Metric Comparison: We grouped the results by distance metric and computed the average intra-group variance for each one. Lower variance indicates tighter neighbor clusters.

In [12]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# 1. Standardize the full dataset
# Union of every column that appears in at least one feature set
all_feature_columns = sorted(
    {column for columns in feature_sets.values() for column in columns}
)

# Scale a copy so the original frame keeps its raw values
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[all_feature_columns] = scaler.fit_transform(df[all_feature_columns])


# 2. Compute intra-group variance for each neighbor set
def _mean_feature_variance(district_ids, columns):
    """Return the mean per-column variance of one neighbor group.

    Rows of ``df_scaled`` whose ``DISTRICT_id`` is in ``district_ids`` are
    restricted to ``columns``, and rows containing any NaN are dropped.
    Returns NaN when fewer than two rows remain.
    """
    in_group = df_scaled['DISTRICT_id'].isin(district_ids)
    group_rows = df_scaled.loc[in_group, columns].dropna()

    # A variance needs at least two observations
    if group_rows.shape[0] < 2:
        return np.nan

    # Variance of each column (np.var with its default ddof), then averaged
    return np.var(group_rows, axis=0).mean()


variances = [
    _mean_feature_variance(record['neighbor_ids'], feature_sets[record['feature_set']])
    for _, record in neighbor_results.iterrows()
]

# 3. Store results
neighbor_results['intra_group_variance'] = variances

# Mean intra-group variance per distance metric, smallest first
variance_metric_summary = (
    neighbor_results
    .groupby('distance_metric')['intra_group_variance']
    .mean()
    .sort_values()
)
print(variance_metric_summary)


distance_metric
euclidean      0.099982
manhattan      0.118397
mahalanobis    0.130806
cosine         0.282296
Name: intra_group_variance, dtype: float64


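Takeaway: Euclidean distance performs best. It has the lowest average intra-group variance (about 0.100), meaning it produces the tightest neighbor groups of the four metrics compared.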