In [7]:
import pandas as pd
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# The two CSV parts that are stacked into a single frame below.
file_path1 = '/Users/sherryzhang/Downloads/psam_pusa.csv'
file_path2 = '/Users/sherryzhang/Downloads/psam_pusb.csv'

# The only columns the rest of this cell works with.
columns_to_keep = ['AGEP', 'ANC1P', 'CIT', 'JWTRNS', 'DIS', 'SCHL', 'ESR', 'GCM', 'HICOV', 'HISP', 'PINCP', 'OCCP', 'ENG', 'MAR', 'MIG', 'MIL', 'RAC1P', 'POBP', 'SEX', 'FOD1P', 'DRATX']

# `usecols` lets the parser skip every other column while reading, so the
# full-width files are never materialised in memory. read_csv returns the
# selected columns in *file* order, hence the explicit re-selection to keep
# the order given in `columns_to_keep`.
data1 = pd.read_csv(file_path1, usecols=columns_to_keep)[columns_to_keep]
data2 = pd.read_csv(file_path2, usecols=columns_to_keep)[columns_to_keep]

# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# two overlapping ranges, so later label-aligned operations cannot be
# confused by duplicate index labels.
merged_data = pd.concat([data1, data2], ignore_index=True)

# Per-column count of missing values in the merged frame.
print("Missing values in Merged Data:")
print(merged_data.isna().sum())

# Step 1: remove every row that has no PINCP value and report how many went.
total_rows_before = len(merged_data)
print(f"Total number of rows before dropping rows for NaN PINCP: {total_rows_before}")

initial_row_count = total_rows_before
merged_data = merged_data.dropna(subset=['PINCP'])
final_row_count = len(merged_data)
rows_dropped_for_PINCP = initial_row_count - final_row_count

print(f"Number of rows dropped for NaN PINCP: {rows_dropped_for_PINCP}")
print(f"Total number of rows after dropping NaN PINCP rows: {final_row_count}")

# Step 2: discard these columns outright, then remove any row that still
# contains a NaN in one of the remaining columns.
columns_to_drop = ['JWTRNS', 'GCM', 'ENG', 'FOD1P', 'DRATX']

initial_row_count = final_row_count  # removing columns leaves the row count unchanged
merged_data = merged_data.drop(columns=columns_to_drop).dropna()
final_row_count = len(merged_data)
rows_dropped_for_any_nan = initial_row_count - final_row_count

print(f"Number of rows dropped for NaN in any column: {rows_dropped_for_any_nan}")
print(f"Total number of rows after dropping rows for NaN in any column: {final_row_count}")

# Binary target: True when PINCP is at or below the threshold.
# NOTE(review): 13590 looks like an annual income cut-off (it matches the 2022
# HHS poverty guideline for a one-person household) -- confirm the source.
poverty_threshold = 13590
is_poor = merged_data['PINCP'] <= poverty_threshold
merged_data['poverty_status'] = is_poor

# Split by class using the boolean mask directly rather than comparing the
# column against True/False.
majority_class = merged_data[~is_poor]
minority_class = merged_data[is_poor]

# The names above assume the at-or-below-threshold group is the smaller one.
# If it is not, swap them: resample(replace=False) raises when asked for more
# rows than the frame it samples from contains.
if len(minority_class) > len(majority_class):
    majority_class, minority_class = minority_class, majority_class

# Undersample the larger class (without replacement, fixed seed) down to the
# size of the smaller one so both classes are equally represented.
majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=42)
balanced_data = pd.concat([minority_class, majority_downsampled])

print("Class distribution after balancing:")
print(balanced_data['poverty_status'].value_counts())

# Rescale every feature column to the [0, 1] range.
#
# A single MinMaxScaler pass is sufficient: min-max scaling gives the same
# result whether or not the column was standardised first (z-scoring is a
# per-column shift and rescale, which min-max scaling undoes), so running
# StandardScaler beforehand only adds work. The values written below are
# therefore the same (up to float rounding) as standardise-then-normalise,
# which is why the output file name and the message are left unchanged.
#
# NOTE(review): the scaler is fitted on the whole balanced data set, before
# the train/test split done in the modelling cell, so test rows influence the
# scaling parameters. Consider fitting on the training split only.
# NOTE(review): several of these columns look like categorical codes
# (e.g. OCCP, POBP) -- confirm that scaling them as numeric values is intended.
columns_to_scale = ['AGEP', 'ANC1P', 'CIT', 'DIS', 'SCHL', 'ESR', 'HICOV', 'HISP', 'OCCP', 'MAR', 'MIG', 'MIL', 'RAC1P', 'POBP', 'SEX']
min_max_scaler = MinMaxScaler()
balanced_data[columns_to_scale] = min_max_scaler.fit_transform(balanced_data[columns_to_scale])

# PINCP and poverty_status are not in columns_to_scale, so they are written
# unscaled. The modelling cell reads the file back from this exact path.
output_file_path = '/Users/sherryzhang/Downloads/Final_Balanced_Data_Normalized_Standardized.csv'
balanced_data.to_csv(output_file_path, index=False)

print(f"The final balanced, normalized, and standardized data has been saved to: {output_file_path}")

Missing values in Merged Data:
AGEP            0
ANC1P           0
CIT             0
JWTRNS    1830018
DIS             0
SCHL        90621
ESR        563988
GCM       3351901
HICOV           0
HISP            0
PINCP      522876
OCCP      1366240
ENG       2763995
MAR             0
MIG         29799
MIL        604751
RAC1P           0
POBP            0
SEX             0
FOD1P     2456369
DRATX     3132909
dtype: int64
Total number of rows before dropping rows for NaN PINCP: 3373378
Number of rows dropped for NaN PINCP: 522876
Total number of rows after dropping NaN PINCP rows: 2850502
Number of rows dropped for NaN in any column: 857505
Total number of rows after dropping rows for NaN in any column: 1992997
Class distribution after balancing:
poverty_status
True     416830
False    416830
Name: count, dtype: int64
The final balanced, normalized, and standardized data has been saved to: /Users/sherryzhang/Downloads/Final_Balanced_Data_Normalized_Standardized.csv


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Reload the balanced, scaled data set written by the preprocessing cell.
file_path = r"/Users/sherryzhang/Downloads/Final_Balanced_Data_Normalized_Standardized.csv"
data = pd.read_csv(file_path)

# Rebuild the target as 0/1 integers from PINCP, overwriting the
# poverty_status column already present in the file.
poverty_threshold = 13590
at_or_below_threshold = data['PINCP'] <= poverty_threshold
data['poverty_status'] = np.where(at_or_below_threshold, 1, 0)

# The label is derived from PINCP, so PINCP must not remain among the features.
data = data.drop(columns=['PINCP'])

# Target vector and feature matrix.
Y = data['poverty_status']
X = data.drop(columns=['poverty_status'])

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=18,
)

# Hyper-parameter grid for the k-nearest-neighbours search.
# 'minkowski' is deliberately not listed: KNeighborsClassifier leaves p at its
# default of 2 here, and Minkowski distance with p=2 *is* the Euclidean
# distance, so having both would fit every (n_neighbors, weights) combination
# twice and add 30 redundant fits without being able to change the winner.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn_model = KNeighborsClassifier()

# Exhaustive search over the grid: 3-fold cross-validation, ranked by
# accuracy, using all available cores.
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# best_estimator_ is the winning configuration refitted on all of X_train.
best_knn_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

def _classification_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, F1, confusion matrix) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
    )


# Predict on both splits with the tuned model.
train_predictions = best_knn_model.predict(X_train)
test_predictions = best_knn_model.predict(X_test)

# Scores on the data the model was fitted on.
(
    train_accuracy,
    train_precision,
    train_recall,
    train_f1,
    train_conf_matrix,
) = _classification_metrics(Y_train, train_predictions)

print(f"Training Accuracy: {train_accuracy}")
print(f"Training Precision: {train_precision}")
print(f"Training Recall: {train_recall}")
print(f"Training F1 Score: {train_f1}")
print(f"Training Confusion Matrix: \n{train_conf_matrix}")

# Scores on the held-out split.
(
    test_accuracy,
    test_precision,
    test_recall,
    test_f1,
    test_conf_matrix,
) = _classification_metrics(Y_test, test_predictions)

print(f"Testing Accuracy: {test_accuracy}")
print(f"Testing Precision: {test_precision}")
print(f"Testing Recall: {test_recall}")
print(f"Testing F1 Score: {test_f1}")
print(f"Testing Confusion Matrix: \n{test_conf_matrix}")

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters found:  {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Training Accuracy: 0.8219552935249382
Training Precision: 0.8394367620768293
Training Recall: 0.7964803864861706
Training F1 Score: 0.8173945939462163
Training Confusion Matrix: 
[[282421  50834]
 [ 67909 265764]]
Testing Accuracy: 0.7932850322673513
Testing Precision: 0.8080811915518267
Testing Recall: 0.7679088952222903
Testing F1 Score: 0.7874830435318781
Testing Confusion Matrix: 
[[68409 15166]
 [19300 63857]]
