In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA

In [4]:
data = pd.read_csv('./../../data/data.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25139 entries, 0 to 25138
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          25139 non-null  int64  
 1   FIPS                          25139 non-null  int64  
 2   STATE_FIPS                    25139 non-null  int64  
 3   COUNTY_FIPS                   25139 non-null  int64  
 4   MEDIAN_HOUSEHOLD_INCOME       25137 non-null  float64
 5   POP_POVERTY_DETERMINED        25138 non-null  float64
 6   POP_BELOW_POVERTY             25138 non-null  float64
 7   POP_16_PLUS                   25138 non-null  float64
 8   POP_UNEMPLOYED                25138 non-null  float64
 9   HOUSEHOLDS_TOTAL              25139 non-null  int64  
 10  HOUSEHOLDS_SNAP               25139 non-null  int64  
 11  POVERTY_RATE                  25138 non-null  float64
 12  UNEMPLOYMENT_RATE             25138 non-null  float64
 13  S

In [4]:
data.head()

Unnamed: 0,YEAR,FIPS,STATE_FIPS,COUNTY_FIPS,MEDIAN_HOUSEHOLD_INCOME,POP_POVERTY_DETERMINED,POP_BELOW_POVERTY,POP_16_PLUS,POP_UNEMPLOYED,HOUSEHOLDS_TOTAL,...,High_Threshold_Type,Pct_FI_Below_Low_Threshold,Pct_FI_Between_Thresholds,Pct_FI_Above_High_Threshold,Child_Food_Insecurity_Rate,Num_Food_Insecure_Children,Pct_FI_Children_Below_185FPL,Pct_FI_Children_Above_185FPL,Cost_Per_Meal,Annual_Food_Budget_Shortfall
0,2011,37043,37,43,36711.0,10380.0,2262.0,8680.0,4258.0,4464,...,SNAP,0.683,0.0,0.317,0.312,620.0,0.841,0.159,2.74,719180.0
1,2011,37051,37,51,44861.0,302057.0,50175.0,240096.0,78349.0,118117,...,SNAP,0.718,0.0,0.282,0.237,20110.0,0.7,0.3,2.52,26420430.0
2,2011,37081,37,81,46288.0,469463.0,76141.0,382682.0,125748.0,192064,...,SNAP,0.671,0.0,0.329,0.231,26250.0,0.669,0.331,2.64,41093880.0
3,2011,37099,37,99,36826.0,35995.0,7028.0,33235.0,13872.0,15759,...,SNAP,0.67,0.0,0.33,0.261,1830.0,0.72,0.28,2.82,2933120.0
4,2011,37139,37,139,45298.0,38228.0,7589.0,32283.0,11352.0,14550,...,SNAP,0.663,0.0,0.337,0.25,2350.0,0.658,0.342,2.66,3690390.0


In [5]:
data.isna().sum().sort_values(ascending=False)

Pct_FI_Between_Thresholds       4133
MEDIAN_HOUSEHOLD_INCOME            2
POP_POVERTY_DETERMINED             1
POP_BELOW_POVERTY                  1
POP_16_PLUS                        1
POP_UNEMPLOYED                     1
POVERTY_RATE                       1
UNEMPLOYMENT_RATE                  1
YEAR                               0
Child_Food_Insecurity_Rate         0
High_Threshold_Type                0
Pct_FI_Below_Low_Threshold         0
Pct_FI_Above_High_Threshold        0
Pct_FI_Children_Below_185FPL       0
Num_Food_Insecure_Children         0
Low_Threshold_Type                 0
Pct_FI_Children_Above_185FPL       0
Cost_Per_Meal                      0
High_Threshold_State               0
State                              0
Low_Threshold_State                0
Num_Food_Insecure_Persons          0
Food_Insecurity_Rate               0
FIPS                               0
County                             0
SNAP_RECEIPT_RATE                  0
HOUSEHOLDS_SNAP                    0
H

In [6]:
data = data.dropna(subset=[col for col in data.columns if col != "Pct_FI_Between_Thresholds"])

In [7]:
data.isna().sum().sort_values(ascending=False)

Pct_FI_Between_Thresholds       4133
FIPS                               0
YEAR                               0
COUNTY_FIPS                        0
MEDIAN_HOUSEHOLD_INCOME            0
POP_POVERTY_DETERMINED             0
STATE_FIPS                         0
POP_16_PLUS                        0
POP_UNEMPLOYED                     0
HOUSEHOLDS_TOTAL                   0
HOUSEHOLDS_SNAP                    0
POVERTY_RATE                       0
UNEMPLOYMENT_RATE                  0
SNAP_RECEIPT_RATE                  0
POP_BELOW_POVERTY                  0
County                             0
State                              0
Num_Food_Insecure_Persons          0
Food_Insecurity_Rate               0
Low_Threshold_Type                 0
High_Threshold_State               0
High_Threshold_Type                0
Low_Threshold_State                0
Pct_FI_Below_Low_Threshold         0
Pct_FI_Above_High_Threshold        0
Child_Food_Insecurity_Rate         0
Num_Food_Insecure_Children         0
P

In [8]:
data["Food_Insecurity_Rate"].describe()

count    25137.000000
mean         0.141839
std          0.041016
min          0.024000
25%          0.115000
50%          0.138000
75%          0.164000
max          0.379000
Name: Food_Insecurity_Rate, dtype: float64

In [10]:
bins = [0, 0.115, 0.138, 0.164, 1]  # 1 is just a safe upper bound
labels = ["Low", "Moderate", "Elevated", "High"]

data["FI_Category"] = pd.cut(
    data["Food_Insecurity_Rate"],
    bins=bins,
    labels=labels,
    include_lowest=True
)

data["FI_Category"].value_counts()

FI_Category
Low         6504
Moderate    6247
Elevated    6217
High        6169
Name: count, dtype: int64

In [11]:
y=data['FI_Category']
X = data[['MEDIAN_HOUSEHOLD_INCOME',
         'POP_POVERTY_DETERMINED',
         'POP_BELOW_POVERTY',
         'POP_16_PLUS',
         'POP_UNEMPLOYED',
         'HOUSEHOLDS_TOTAL',
         'HOUSEHOLDS_SNAP',
         'POVERTY_RATE',
         'UNEMPLOYMENT_RATE',
         'SNAP_RECEIPT_RATE',
         'Cost_Per_Meal',
         'Annual_Food_Budget_Shortfall']]

In [12]:
county_series = data["County"]
state_series = data["State"]
fips_series = data["FIPS"]

In [13]:
X_train, X_test, y_train, y_test, county_train, county_test, state_train, state_test, fips_train, fips_test = train_test_split(
    X, y, county_series, state_series, fips_series,
    test_size=0.25,
    random_state=42,
    stratify=y
)

In [13]:
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(weights="distance"))
])


In [14]:
param_grid = {"knn__n_neighbors": range(1, 41, 2)}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring="balanced_accuracy", n_jobs=-1)
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...'distance'))])
,param_grid,"{'knn__n_neighbors': range(1, 41, 2)}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,7
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [15]:
results_df = pd.DataFrame(grid.cv_results_)

results_df["k"] = results_df["param_knn__n_neighbors"]
results_df["mean_score"] = results_df["mean_test_score"]

best_k = grid.best_params_["knn__n_neighbors"]
best_score = grid.best_score_

fig = px.line(
    results_df,
    x="k",
    y="mean_score",
    title=f"Cross-Validated Balanced Accuracy vs. K (best k = {best_k})",
    markers=True,
    labels={"k": "Number of Neighbors (k)", "mean_score": "Mean CV Balanced Accuracy"}
)


fig.add_scatter(
    x=[best_k],
    y=[best_score],
    mode="markers+text",
    text=[f"Best k = {best_k}"],
    textposition="top center",
    name="Best k"
)

fig.update_layout(hovermode="x unified")
fig.show()

In [16]:
pipe2 = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=best_k,
    weights="distance"))
])

In [17]:
pipe2.fit(X_train, y_train)
y_pred = pipe2.predict(X_test)

In [18]:
acc = accuracy_score(y_test, y_pred)
bal_acc = balanced_accuracy_score(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"Balanced accuracy: {bal_acc:.3f}")

Accuracy: 0.640
Balanced accuracy: 0.639


In [19]:
import pandas as pd
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred, labels=pipe2.classes_)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {c}" for c in pipe2.classes_],
    columns=[f"Predicted {c}" for c in pipe2.classes_]
)

cm_df

Unnamed: 0,Predicted Elevated,Predicted High,Predicted Low,Predicted Moderate
Actual Elevated,831,297,84,343
Actual High,301,1155,14,72
Actual Low,65,20,1252,289
Actual Moderate,336,72,369,785


In [20]:
scaler_pca = StandardScaler()
X_scaled_full = scaler_pca.fit_transform(X)

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled_full)

pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
pca_df["FI_Category"] = y.values

fig = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="FI_Category",
    title="PCA (2D) of MMG Features Labeled by FI Category",
    hover_data=["FI_Category"],
)
fig.show()

In [21]:
idx = np.random.randint(0, len(X_test))
x_sample = X_test.iloc[[idx]]
y_true = y_test.iloc[idx]
y_pred_sample = pipe2.predict(x_sample)[0]

print("Random test index:", idx)
print("True FI category:", y_true)
print("Predicted FI category:", y_pred_sample)

Random test index: 6122
True FI category: Moderate
Predicted FI category: Elevated


In [22]:
for _ in range(5):
    idx = np.random.randint(0, len(X_test))
    x_sample = X_test.iloc[[idx]]
    y_true = y_test.iloc[idx]
    y_pred_sample = pipe2.predict(x_sample)[0]
    print(f"idx={idx} | true={y_true} | pred={y_pred_sample}")

idx=5985 | true=Low | pred=Low
idx=5450 | true=Elevated | pred=Elevated
idx=6153 | true=High | pred=Moderate
idx=5914 | true=High | pred=High
idx=3802 | true=Low | pred=Low


In [29]:
# Already defined these earlier
bins = [0, 0.115, 0.138, 0.164, 1]
labels = ["Low", "Moderate", "Elevated", "High"]

category_ranges = {
    "Low": "[0.000, 0.115]",
    "Moderate": "(0.115, 0.138]",
    "Elevated": "(0.138, 0.164]",
    "High": "(0.164, 1.000]"
}

# Mean numeric FI rate for each category (used as predicted numeric)
cat_mean_rate = (
    data.groupby("FI_Category")["Food_Insecurity_Rate"]
    .mean()
    .to_dict()
)





In [30]:
def predict_county(county_name, state_name=None):
    df = data.copy()

    # Filter by county (case-insensitive)
    mask = df["County"].str.contains(county_name, case=False, na=False)

    if state_name:
        mask &= df["State"].str.contains(state_name, case=False, na=False)

    matches = df[mask]

    if matches.empty:
        print("No matching county found.")
        return

    # If we have multiple rows, see if it's really the same county (same FIPS)
    if len(matches) > 1:
        unique_fips = matches["FIPS"].nunique()

        if unique_fips == 1:
            print("Multiple rows for the same county found. Aggregating features.\n")

            # Aggregate features and actual rate
            X_row = matches[X.columns].mean().to_frame().T
            actual = matches["Food_Insecurity_Rate"].mean()

            county = matches["County"].iloc[0]
            state = matches["State"].iloc[0]
        else:
            print("Multiple *different* counties matched your query. Please narrow it down:")
            display(matches[["County", "State", "FIPS"]].reset_index(drop=True))
            return
    else:
        row = matches.iloc[0]
        X_row = row[X.columns].to_frame().T
        actual = row["Food_Insecurity_Rate"]
        county = row["County"]
        state = row["State"]

    # --- MODEL PREDICTION ---
    pred_cat = pipe2.predict(X_row)[0]
    pred_cat_str = str(pred_cat)

    # Predicted numeric rate = mean FI rate for that category in the data
    pred_rate = cat_mean_rate[pred_cat]

    # Actual category from actual numeric rate
    actual_cat = pd.cut(
        pd.Series([actual]),
        bins=bins,
        labels=labels,
        include_lowest=True
    ).iloc[0]
    actual_cat_str = str(actual_cat)

    # --- OUTPUT ---
    print(f"County: {county}, {state}")
    print()
    print(f"Predicted FI Category: {pred_cat_str}")
    print(f"  • Approx. predicted FI rate (category mean): {pred_rate:.3f}")
    print(f"  • Category range: {category_ranges[pred_cat_str]}")
    print()
    print(f"Actual FI rate from dataset: {actual:.3f}")
    print(f"  • Actual FI category (using same bins): {actual_cat_str}")

In [32]:
predict_county("Albemarle")

Multiple rows for the same county found. Aggregating features.

County: albemarle, VA

Predicted FI Category: Low
  • Approx. predicted FI rate (category mean): 0.096
  • Category range: [0.000, 0.115]

Actual FI rate from dataset: 0.094
  • Actual FI category (using same bins): Low


In [33]:
predict_county("Prince William")

Multiple rows for the same county found. Aggregating features.

County: prince william, VA

Predicted FI Category: Low
  • Approx. predicted FI rate (category mean): 0.096
  • Category range: [0.000, 0.115]

Actual FI rate from dataset: 0.067
  • Actual FI category (using same bins): Low


In [38]:
predict_county("Prince George", "MD")

Multiple rows for the same county found. Aggregating features.

County: prince george's, MD

Predicted FI Category: Elevated
  • Approx. predicted FI rate (category mean): 0.151
  • Category range: (0.138, 0.164]

Actual FI rate from dataset: 0.141
  • Actual FI category (using same bins): Elevated


In [39]:
predict_county("Orange", "VA")

Multiple rows for the same county found. Aggregating features.

County: orange, VA

Predicted FI Category: Low
  • Approx. predicted FI rate (category mean): 0.096
  • Category range: [0.000, 0.115]

Actual FI rate from dataset: 0.099
  • Actual FI category (using same bins): Low


In [45]:
predict_county("Stafford", "VA")

Multiple rows for the same county found. Aggregating features.

County: stafford, VA

Predicted FI Category: Low
  • Approx. predicted FI rate (category mean): 0.096
  • Category range: [0.000, 0.115]

Actual FI rate from dataset: 0.070
  • Actual FI category (using same bins): Low


In [46]:
predict_county("Henrico", "VA")

Multiple rows for the same county found. Aggregating features.

County: henrico, VA

Predicted FI Category: Moderate
  • Approx. predicted FI rate (category mean): 0.127
  • Category range: (0.115, 0.138]

Actual FI rate from dataset: 0.121
  • Actual FI category (using same bins): Moderate
