In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the census data and the earlier health predictions, then join them.
demo_df = pd.read_csv('Economic_census.csv')

tree_df = pd.read_csv('DecisionTree_RandomForest_Health_Predictions.csv')

# Inner join (pd.merge default): rows without a match on both keys are dropped.
merged_df = pd.merge(tree_df, demo_df, on=['census tract','borough'])

# Turn the predicted health labels into integer codes.
# The encoder is fitted ONCE on the labels of both prediction columns so that a
# given label maps to the same integer in 'health_encoded_dt' and
# 'health_encoded_rf'. Re-fitting per column would number each column
# independently, and the codes would disagree whenever the two columns do not
# contain exactly the same set of labels.
label_encoder = LabelEncoder()
label_encoder.fit(
    pd.concat(
        [merged_df['dt_predicted_health'], merged_df['rf_predicted_health']],
        ignore_index=True,
    )
)
merged_df['health_encoded_dt'] = label_encoder.transform(merged_df['dt_predicted_health'])
merged_df['health_encoded_rf'] = label_encoder.transform(merged_df['rf_predicted_health'])

In [12]:
# Prepare the tree analysis: divide the poverty rate into 3 categories.
# include_lowest=True makes the first bin closed on the left ([0, 6]); with
# pd.cut's default right-closed bins a poverty rate of exactly 0 falls outside
# (0, 6], becomes NaN, and the row would be dropped by the dropna below.
bins = [0,6.0,13.0,100.0]
labels = ['Low', 'Medium', 'High']
merged_df['poverty_category'] = pd.cut(
    merged_df['Poverty Percent'], bins=bins, labels=labels, include_lowest=True
)

# Species columns are the frame columns whose name equals a value of 'spc_common'
# (presumably one column per species created during data cleaning - confirm).
unique_species = merged_df['spc_common'].unique()
species_names = set(unique_species)  # set lookup instead of scanning the array per column
species_columns = [col for col in merged_df.columns if col in species_names]

# Root / trunk / branch problem columns (names come from data cleaning).
binary_analysis = ['root_stone', 'root_grate', 'root_other', 'trunk_wire',
                   'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe', 'brch_other']

# Keep only rows that have both encoded health predictions and a poverty category.
valid_indices = merged_df[['health_encoded_dt', 'health_encoded_rf', 'poverty_category']].dropna().index

# Separate into features (X) and target (y), using the same row labels for both.
feature_columns = (
    ['latitude', 'longitude', 'postcode']
    + species_columns
    + binary_analysis
    + ['curb_loc', 'sidewalk', 'guards', 'steward', 'user_type',
       'health', 'health_encoded_dt', 'health_encoded_rf']
)
X = merged_df.loc[valid_indices, feature_columns]
y = merged_df.loc[valid_indices, 'poverty_category']

# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [13]:
# Build a class-balanced decision tree (depth capped at 20) and fit it on the training split.
clf = DecisionTreeClassifier(max_depth=20, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy plus the per-class report.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Rank the features by the fitted tree's importance scores, largest first.
# These are highlighted in the report and reused by the other methods.
importances = clf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Predict a poverty category for every usable row and store it in a new column
# (possibly for a choropleth map), then save the full frame to CSV.
y_pred_all = clf.predict(X)
merged_df.loc[valid_indices, 'dtfr_predicted_poverty'] = y_pred_all
merged_df.to_csv('DecisionTree_Socioeconomic_Predictions.csv', index=False)


Accuracy: 0.4089787612870406
Classification Report:
               precision    recall  f1-score   support

        High       0.59      0.49      0.54     96638
         Low       0.25      0.58      0.35     37500
      Medium       0.39      0.18      0.25     62437

    accuracy                           0.41    196575
   macro avg       0.41      0.42      0.38    196575
weighted avg       0.46      0.41      0.41    196575

                Feature  Importance
146             steward    0.078963
147           user_type    0.077006
145              guards    0.065792
148              health    0.063410
149   health_encoded_dt    0.059601
..                  ...         ...
88        Norway spruce    0.000000
87       European beech    0.000000
86             boxelder    0.000000
85          river birch    0.000000
75   Chinese tree lilac    0.000000

[151 rows x 2 columns]


In [14]:
# MSE and R^2 calculations.
# Same setup as the classifier, but with a regression tree so that an R^2 and an
# MSE are available for comparison with the other methods.
# Reminder: the accuracy of the classifier is not the same metric as R^2.
# (DecisionTreeRegressor is imported with the other imports at the top of the notebook.)
merged_df = merged_df.dropna(subset=['dtfr_predicted_poverty', 'health_encoded_dt', 'health_encoded_rf'])

# Ordinal encoding of the poverty category for use as a regression target.
label_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
merged_df['status_numeric'] = merged_df['poverty_category'].map(label_mapping)

# Select X and y with the SAME row labels so features and target are guaranteed
# to line up row for row, and cast the target to plain integers instead of
# passing the mapped categorical straight to the regressor.
feature_columns = (
    ['latitude', 'longitude', 'postcode']
    + species_columns
    + binary_analysis
    + ['curb_loc', 'sidewalk', 'guards', 'steward', 'user_type',
       'health', 'health_encoded_dt', 'health_encoded_rf']
)
X = merged_df.loc[valid_indices, feature_columns]
y = merged_df.loc[valid_indices, 'status_numeric'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg = DecisionTreeRegressor(random_state=42, max_depth=20)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print("DT Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("DT R^2 Score:", r2_score(y_test, y_pred))

DT Mean Squared Error: 0.5671117739899375
DT R^2 Score: 0.04183022863117958


In [None]:
# Remove a trailing ".00" from each census tract id.
# The dot is escaped and the pattern anchored at the end of the string: as a regex,
# the unescaped '.00' matches ANY character followed by "00" anywhere in the value
# (e.g. "100" would become "1").
# NOTE(review): `df` is not defined in any visible cell - confirm which frame this should be.
df['census tract'] = df['census tract'].astype(str).str.replace(r'\.00$', '', regex=True)
def striging(census):
    """Return the tract values as strings, minus a trailing one-character decimal suffix.

    Each element is converted with ``str()``. When the second-to-last character
    of that string is a dot (e.g. ``"12.0"``), the dot and the character after
    it are removed; every other value (e.g. ``"12.01"``, ``"305"``) is kept
    unchanged.

    Args:
        census: iterable of tract identifiers; each one is passed through ``str()``.

    Returns:
        list: one cleaned string per input element, in input order.
    """
    modified_tracts = []
    for part in census:
        tract = str(part)
        # Require at least two characters before looking at the second-to-last
        # one: an empty string has none (IndexError), and on a one-character
        # string the computed position len-2 is -1, which wraps to the last char.
        if len(tract) >= 2 and tract[-2] == '.':
            tract = tract[:-2]
        modified_tracts.append(tract)
    return modified_tracts

# Run the suffix cleanup over the whole tract column, store it back, and show the result.
cleaned_tracts = striging(df['census tract'])
df['census tract'] = cleaned_tracts
print(df['census tract'])

In [None]:
#All the cells below were just for extra testing

In [42]:

# Extra experiment: regression tree fitted without the location features
# (latitude, longitude, postcode); only species, problem and site columns are used.
merged_df = merged_df.dropna(subset=['dtfr_predicted_poverty', 'health_encoded_dt', 'health_encoded_rf'])

# Ordinal encoding of the poverty category for use as a regression target.
label_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
merged_df['status_numeric'] = merged_df['poverty_category'].map(label_mapping)

# Select X and y with the SAME row labels so features and target are guaranteed
# to line up row for row, and cast the target to plain integers instead of
# passing the mapped categorical straight to the regressor.
feature_columns = (
    species_columns
    + binary_analysis
    + ['curb_loc', 'sidewalk', 'guards', 'steward', 'user_type',
       'health', 'health_encoded_dt', 'health_encoded_rf']
)
X = merged_df.loc[valid_indices, feature_columns]
y = merged_df.loc[valid_indices, 'status_numeric'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg = DecisionTreeRegressor(random_state=42, max_depth=20)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print("DT Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("DT R^2 Score:", r2_score(y_test, y_pred))

DT Mean Squared Error: 0.5671117739899375
DT R^2 Score: 0.04183022863117958


In [36]:
# Rank the features by the regression tree's importance scores, largest first.
importances = reg.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)


                Feature  Importance
147           user_type    0.092600
145              guards    0.067138
146             steward    0.064255
28         Callery pear    0.054870
148              health    0.050356
..                  ...         ...
113          Scots pine    0.000000
114        tartar maple    0.000000
115         black maple    0.000000
117       quaking aspen    0.000000
75   Chinese tree lilac    0.000000

[151 rows x 2 columns]
