In [18]:
import pandas as pd
import re
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the economic census table and the health-prediction table, then join
# them on census tract + borough so each prediction row carries the economic
# columns of its tract.
demo_df = pd.read_csv('Economic_census.csv')
tree_df = pd.read_csv('DecisionTree_RandomForest_Health_Predictions.csv')

# pd.merge defaults to an inner join: rows whose (census tract, borough) pair
# is missing from either table are dropped.
merged_df = pd.merge(tree_df, demo_df, on=['census tract', 'borough'])

# Fit ONE encoder on the labels of both prediction columns so that a given
# health label maps to the same integer in `health_encoded_dt` and
# `health_encoded_rf`. Calling fit_transform once per column assigns codes
# independently, so the two columns disagree whenever one model never predicts
# some class, and `label_encoder.classes_` would only describe the last column.
label_encoder = LabelEncoder()
label_encoder.fit(
    pd.concat(
        [merged_df['dt_predicted_health'], merged_df['rf_predicted_health']],
        ignore_index=True,
    )
)
merged_df['health_encoded_dt'] = label_encoder.transform(merged_df['dt_predicted_health'])
merged_df['health_encoded_rf'] = label_encoder.transform(merged_df['rf_predicted_health'])

In [26]:
# Bucket each row's poverty percentage into three ordered classes.
# include_lowest=True keeps rows whose value is exactly 0: with pandas' default
# right-closed bins the first interval is (0, 6], so a value of 0.0 would become
# NaN and the row would silently drop out of the model.
bins = [0, 6.0, 13.0, 100.0]
labels = ['Low', 'Medium', 'High']
merged_df['poverty_category'] = pd.cut(
    merged_df['Poverty Percent'], bins=bins, labels=labels, include_lowest=True
)

# Feature groups. They have to exist BEFORE X is built; previously they were
# only present as commented-out code below their first use, so the cell raised
# NameError on a fresh kernel.
# NOTE(review): restored from that commented-out code -- it assumes merged_df
# has a `spc_common` column plus one column per species name; confirm.
unique_species = merged_df['spc_common'].unique()
species_lookup = set(unique_species)  # set membership instead of scanning the array per column
species_columns = [col for col in merged_df.columns if col in species_lookup]
binary_analysis = ['root_stone', 'root_grate', 'root_other', 'trunk_wire',
                   'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe', 'brch_other']

feature_columns = (
    ['latitude', 'longitude', 'postcode']
    + species_columns
    + binary_analysis
    + ['curb_loc', 'sidewalk', 'guards', 'steward', 'user_type',
       'health', 'health_encoded_dt', 'health_encoded_rf']
)

# Only rows with both encoded health predictions and a poverty class are usable.
valid_indices = merged_df[['health_encoded_dt', 'health_encoded_rf', 'poverty_category']].dropna().index
X = merged_df.loc[valid_indices, feature_columns]
y = merged_df.loc[valid_indices, 'poverty_category']

# 70/30 split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The previously sketched StandardScaler step stays disabled: the tree models
# fitted below split on per-feature thresholds and do not need standardised inputs.


In [27]:
# Fit a depth-limited, class-weight-balanced decision tree on the training split.
clf = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Score the tree on the held-out split.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Rank the input columns by the importance the fitted tree assigns to them.
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Predict for every row of X (training and test rows alike), store the result
# next to the inputs, and write the whole frame to disk.
y_pred_all = clf.predict(X)
merged_df.loc[valid_indices, 'dtfr_predicted_poverty'] = y_pred_all
merged_df.to_csv('DecisionTree_Socioeconomic_Predictions.csv', index=False)


Accuracy: 0.7737301284497011
Classification Report:
               precision    recall  f1-score   support

        High       0.90      0.85      0.87     96638
         Low       0.61      0.85      0.71     37500
      Medium       0.72      0.61      0.66     62437

    accuracy                           0.77    196575
   macro avg       0.74      0.77      0.75    196575
weighted avg       0.79      0.77      0.77    196575

                Feature  Importance
0              latitude    0.395086
1             longitude    0.360544
2              postcode    0.237893
150           user_type    0.004674
148              guards    0.000836
..                  ...         ...
81      eastern hemlock    0.000000
80   eastern cottonwood    0.000000
79    American hornbeam    0.000000
78   Chinese tree lilac    0.000000
77       tree of heaven    0.000000

[154 rows x 2 columns]


In [12]:
# Show which distinct values occur in the test-split predictions and in the
# full-data predictions, one array per line.
print(np.unique(y_pred), np.unique(y_pred_all), sep='\n')

['High' 'Low']
['High' 'Low']


In [28]:
# MSE and R^2 of a regression tree fitted on the poverty class encoded as 0/1/2.
from sklearn.tree import DecisionTreeRegressor

# Keep only rows that have a stored prediction, both encoded health columns and
# a poverty class. `.copy()` makes the filtered frame independent, so adding a
# column below cannot trigger chained-assignment warnings.
required_columns = ['dtfr_predicted_poverty', 'health_encoded_dt',
                    'health_encoded_rf', 'poverty_category']
merged_df = merged_df.dropna(subset=required_columns).copy()

# Ordinal encoding of the poverty class. `poverty_category` is categorical, so
# `.map` returns a categorical as well; cast to plain integers for the regressor
# and the metrics.
label_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
merged_df['status_numeric'] = merged_df['poverty_category'].map(label_mapping).astype(int)

feature_columns = (
    ['latitude', 'longitude', 'postcode']
    + species_columns
    + binary_analysis
    + ['curb_loc', 'sidewalk', 'guards', 'steward', 'user_type',
       'health', 'health_encoded_dt', 'health_encoded_rf']
)

# Take X and y from the SAME filtered frame. Indexing X with the older
# `valid_indices` while y came from the freshly filtered frame only lined up
# when both happened to contain exactly the same rows; otherwise it raised
# KeyError or produced X/y of different lengths.
X = merged_df[feature_columns]
y = merged_df['status_numeric']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg = DecisionTreeRegressor(random_state=42, max_depth=10)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print("DT Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("DT R^2 Score:", r2_score(y_test, y_pred))

DT Mean Squared Error: 0.1980277872939993
DT R^2 Score: 0.6654200311497476
