In [1]:
import os
import subprocess

def install(package):
    """Install or upgrade packages with pip for the running interpreter.

    Args:
        package: The arguments to ``pip install`` as a single string, e.g.
            ``'scipy'`` or ``'--upgrade pip setuptools wheel'``. The string is
            split with shell-like quoting rules but is never handed to a shell,
            so characters such as ``<``, ``>`` or ``;`` in a requirement are
            passed to pip literally.

    Returns:
        None. The call is best-effort: pip's output is discarded and a failed
        install emits a ``RuntimeWarning`` instead of raising.
    """
    import shlex
    import sys
    import warnings

    # Invoke pip as a module of the current interpreter so the packages land in
    # the environment this code is running in, not whichever `pip` is on PATH.
    command = [sys.executable, '-m', 'pip', 'install', *shlex.split(package)]
    result = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Surface failures without aborting, so later imports do not fail mysteriously.
    if result.returncode != 0:
        warnings.warn(
            f'pip install {package} failed with exit code {result.returncode}',
            RuntimeWarning,
            stacklevel=2,
        )

# Bring the packaging toolchain up to date first, then install the libraries
# this notebook depends on, one pip invocation per entry and in this order.
for requirement in (
    '--upgrade pip setuptools wheel',
    'cython',
    'pyyaml',
    'scipy',
    'scikit-learn',
):
    install(requirement)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Load data
# The CSV location can be overridden through the POVERTY_DATA_PATH environment
# variable; when it is unset the original local path is used unchanged.
file_path = os.environ.get(
    'POVERTY_DATA_PATH',
    '/Users/sherryzhang/Downloads/Final_Balanced_Data.csv',
)
data = pd.read_csv(file_path)

# Define poverty threshold and create target variable
# NOTE(review): 13590 is presumably an annual income cut-off in the same units
# as the PINCP column — confirm the source and year of this figure.
poverty_threshold = 13590

# A missing PINCP compares False against the threshold and would silently be
# labelled 0 (not in poverty). Drop such rows, and say so, instead of
# training on labels that were never observed.
n_missing_income = int(data['PINCP'].isna().sum())
if n_missing_income:
    print(f"Dropping {n_missing_income} rows with missing PINCP before labelling")
    data = data.dropna(subset=['PINCP'])

# poverty_status is 1 when PINCP is at or below the threshold, otherwise 0.
# Drop the PINCP column afterwards so the target cannot be read back from the features.
data = data.assign(
    poverty_status=np.where(data['PINCP'] <= poverty_threshold, 1, 0)
).drop(columns=['PINCP'])

# Target vector is the poverty_status column; the feature matrix is every other column
Y = data['poverty_status']
X = data.drop(columns=['poverty_status'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=18,
)

# Candidate hyper-parameter values; the search tries every combination
param_grid = dict(
    n_estimators=[700, 800, 1000],
    max_depth=[10, 14, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base estimator, seeded so repeated runs build the same forests
rf_model = RandomForestClassifier(random_state=18)

# Exhaustive search scored by accuracy with 3-fold cross-validation;
# n_jobs=-1 asks scikit-learn to run the fits in parallel
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, Y_train)

# Keep the winning estimator and report the parameters that produced it
best_rf_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

# Predict the held-out test rows with the tuned model
predictions = best_rf_model.predict(X_test)

# Score the predictions against the true test labels: the four scalar metrics
# are computed in one pass over the scorer functions, then the confusion matrix
accuracy, precision, recall, f1 = (
    scorer(Y_test, predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(Y_test, predictions)

# Report every metric, one per line
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Confusion Matrix: \n{conf_matrix}",
    sep="\n",
)

Fitting 3 folds for each of 162 candidates, totalling 486 fits




Best parameters found:  {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Accuracy: 0.811188014298395
Precision: 0.8365199270643396
Recall: 0.7723703356301935
F1 Score: 0.8031662467096419
Confusion Matrix: 
[[71023 12552]
 [18929 64228]]
