In [16]:
import pandas as pd
import re
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

In [17]:
# Load, clean, and merge data.
#
# Produces `merged_df`: the inner join of the health-prediction table and the
# demographic table on (census tract, borough), with a numeric `poverty_rate`
# and integer-encoded health predictions that the modelling cells below read.
demo_df = pd.read_csv('Demographic_New_York_withborough_census_trackt.csv')

# The tract key is the first run of digits in the area name; anything after
# that first run (e.g. digits following a separator) is ignored by the pattern.
# NOTE(review): confirm this matches how `census tract` is keyed in tree_df.
demo_df['census tract'] = (
    demo_df['Geographic Area Name']
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Remove the literal '!!Estimate!!' marker from the headers so the poverty
# column can be addressed by a readable name.
demo_df.columns = demo_df.columns.str.replace('!!Estimate!!', ' ', regex=False).str.strip()
demo_df['poverty_rate'] = demo_df['Percent below poverty level Population for whom poverty status is determined']

tree_df = pd.read_csv('DecisionTree_RandomForest_Health_Predictions.csv')

# Lower-case the borough on both sides so the join key is case-insensitive.
demo_df['borough'] = demo_df['Borough'].str.lower()
tree_df['borough'] = tree_df['borough'].str.lower()

merged_df = pd.merge(tree_df, demo_df, on=['census tract', 'borough'], how='inner')

# Values that cannot be parsed as numbers become NaN.
merged_df['poverty_rate'] = pd.to_numeric(merged_df['poverty_rate'], errors='coerce')

# One encoder per column: refitting a single encoder would discard the
# dt label -> integer mapping. `label_encoder` is kept as the encoder fitted
# on `rf_predicted_health`, which is the state it previously ended up in.
dt_label_encoder = LabelEncoder()
merged_df['health_encoded_dt'] = dt_label_encoder.fit_transform(merged_df['dt_predicted_health'])
label_encoder = LabelEncoder()
merged_df['health_encoded_rf'] = label_encoder.fit_transform(merged_df['rf_predicted_health'])

# A second, indented `pd.merge(...)` used to follow here. It was removed:
# the stray indent is a syntax error, and re-merging would drop the numeric
# `poverty_rate` and the `health_encoded_*` columns created above.


In [18]:
# Bucket the poverty rate (a percentage) into three labelled bands:
# [0, 6] -> Low, (6, 13] -> Medium, (13, 100] -> High.
bins = [0,6.0,13.0,100.0]
labels = ['Low', 'Medium', 'High']
# include_lowest=True closes the first interval on the left. Without it the
# first band is (0, 6], so a tract with exactly 0% poverty becomes NaN and is
# silently dropped from the modelling rows below.
merged_df['poverty_category'] = pd.cut(
    merged_df['poverty_rate'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Keep only rows where both features and the target are present. The index is
# kept in `valid_indices` so predictions can be written back to these rows.
valid_indices = merged_df[['health_encoded_dt', 'health_encoded_rf', 'poverty_category']].dropna().index
X = merged_df.loc[valid_indices, ['health_encoded_dt', 'health_encoded_rf']]
y = merged_df.loc[valid_indices, 'poverty_category']

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
# Fit a depth-limited, class-balanced decision tree that predicts the poverty
# category from the two encoded health predictions.
clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    class_weight='balanced',
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Rank the two input features by the tree's importance scores, highest first.
importances = clf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Predict for every valid row (training rows included), write the predictions
# back onto those rows of merged_df, and export the whole frame.
y_pred_all = clf.predict(X)
merged_df.loc[valid_indices, 'dtfr_predicted_poverty'] = y_pred_all
merged_df.to_csv('DecisionTree_Socioeconomic_Predictions.csv', index=False)


Accuracy: 0.4363197189284146
Classification Report:
               precision    recall  f1-score   support

        High       0.65      0.58      0.61      5123
         Low       0.22      0.62      0.32      1575
      Medium       0.45      0.01      0.02      2410

    accuracy                           0.44      9108
   macro avg       0.44      0.40      0.32      9108
weighted avg       0.52      0.44      0.41      9108

             Feature  Importance
1  health_encoded_rf    0.979457
0  health_encoded_dt    0.020543


In [20]:
# Sanity check: show the distinct classes predicted on the test split, then
# on all valid rows.
for predicted_labels in (y_pred, y_pred_all):
    print(np.unique(predicted_labels))

['High' 'Low' 'Medium']
['High' 'Low' 'Medium']


In [22]:
# MSE and R^2 calculations.
#
# Treats the poverty category as an ordinal score (Low=0, Medium=1, High=2)
# and fits a decision-tree regressor on the same two encoded health features,
# reporting mean squared error and R^2 on a 30% held-out split.
from sklearn.tree import DecisionTreeRegressor

label_mapping = {'Low': 0, 'Medium': 1, 'High': 2}

# Note: this narrows `merged_df` itself for any later cells. `poverty_category`
# is part of the guard because the regression target is derived from it.
merged_df = merged_df.dropna(
    subset=['dtfr_predicted_poverty', 'poverty_category', 'health_encoded_dt', 'health_encoded_rf']
)
# Mapping a categorical column yields another categorical; cast to int so the
# regression target is explicitly numeric.
merged_df['status_numeric'] = merged_df['poverty_category'].map(label_mapping).astype(int)

# These reuse (and overwrite) the X / y / split names from the classifier cells.
y = merged_df['status_numeric']
X = merged_df[['health_encoded_dt','health_encoded_rf']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg = DecisionTreeRegressor(random_state=42, max_depth=10)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print("DT Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("DT R^2 Score:", r2_score(y_test, y_pred))

DT Mean Squared Error: 0.5653682520269943
DT R^2 Score: 0.03132326956894871
