In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Load the tree dataset and discard the columns this analysis does not use.
df = pd.read_csv('Trees_Quantified.csv').drop(
    columns=['created_at', 'address', 'postcode', 'zip_city']
)

# Integer-encode the categorical columns. Values are cast to str first so
# every column is encoded from a single, uniform type.
# NOTE: one encoder instance is refit per column, so after the loop `le`
# only holds the fitted classes of the last column in the list.
le = LabelEncoder()
label_encode_cols = [
    'curb_loc', 'status', 'spc_latin', 'spc_common', 'steward', 'guards',
    'borough', 'nta', 'boro_ct', 'nta_name', 'state',
]
for col in label_encode_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# One-hot encode `sidewalk`; the resulting columns are named `sidewalk_<value>`.
df = pd.get_dummies(df, columns=['sidewalk'])

# `problems` is a comma-separated list per row: expand it into one indicator
# column per distinct entry, then replace the raw text column with them.
problem_types = df['problems'].str.get_dummies(sep=',')
df = pd.concat([df, problem_types], axis=1).drop(columns=['problems'])

# Bin the numeric health score into three ordered classes:
# (-0.1, 0.4] -> Poor, (0.4, 0.6] -> Fair, (0.6, 1.1] -> Good.
df['health'] = pd.cut(
    df['health'],
    bins=[-0.1, 0.4, 0.6, 1.1],
    labels=['Poor', 'Fair', 'Good'],
)

# Feature subset fed to the models; `sidewalk_0` / `sidewalk_1` come from the
# one-hot step above (assumes `sidewalk` holds the values 0 and 1 - TODO confirm).
selected_columns = [
    'tree_dbh', 'curb_loc', 'nta', 'cncldist', 'st_assem',
    'brch_other', 'brch_shoe', 'brch_light',
    'trnk_light', 'trunk_wire',
    'root_other', 'root_grate', 'root_stone',
    'sidewalk_0', 'sidewalk_1',
    'guards', 'steward', 'spc_common',
]
X, y = df[selected_columns], df['health']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Baseline model: a single decision tree fitted on the training split.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the tree on the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Rank the input features by the tree's impurity-based importances.
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Predict for every row of the dataset (training rows included) and keep the
# labels next to the source data.
y_pred_all0 = clf.predict(X)
df['dt_predicted_health'] = y_pred_all0

# Random forest: fit a 200-tree ensemble on the existing train split so its
# test metrics are computed on the same held-out rows as any other model
# evaluated against X_test / y_test.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred1 = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

# Rank features by the forest's OWN importances. The table must be built from
# `importances1` and sorted from `feature_importance_df1`; using the decision
# tree's variables here would silently report the wrong model's ranking.
importances1 = rf_clf.feature_importances_
feature_importance_df1 = pd.DataFrame({'Feature': X.columns, 'Importance': importances1})
feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)
print("Feature Importance:\n", feature_importance_df1)

# Predict for every row of the dataset (training rows included) and keep the
# labels next to the source data.
y_pred_all1 = rf_clf.predict(X)
df['rf_predicted_health'] = y_pred_all1




Accuracy: 0.748653793853685
Classification Report:
               precision    recall  f1-score   support

        Fair       0.26      0.28      0.27     28301
        Good       0.85      0.85      0.85    156767
        Poor       0.66      0.61      0.64     17167

    accuracy                           0.75    202235
   macro avg       0.59      0.58      0.59    202235
weighted avg       0.75      0.75      0.75    202235

       Feature  Importance
0     tree_dbh    0.309200
17  spc_common    0.204364
2          nta    0.136131
4     st_assem    0.081184
3     cncldist    0.069487
16     steward    0.043796
15      guards    0.023448
14  sidewalk_1    0.022624
13  sidewalk_0    0.022038
12  root_stone    0.019656
7   brch_light    0.016791
5   brch_other    0.014215
1     curb_loc    0.012782
10  root_other    0.012124
9   trunk_wire    0.008030
11  root_grate    0.002394
8   trnk_light    0.001185
6    brch_shoe    0.000552
Accuracy: 0.8040151309120578
Classification Report:
  

In [8]:
# Recode columns for plotting: health labels become numeric scores and the
# integer borough codes become readable names.
#
# Each column is translated through a lookup table in one pass instead of
# masked `.loc` assignments. This avoids writing strings into an integer
# column (deprecated silent upcasting in recent pandas), avoids having to
# extend the categorical `health` column with extra categories, and makes the
# cell safe to re-run: values that are not keys of the table are kept as-is.


def _recode(values, mapping):
    """Return `values` with every entry that is a key of `mapping` replaced.

    Entries that are not keys of `mapping` (including missing values and
    entries that were already recoded) are returned unchanged.
    """
    # Cast to object first so categorical / integer columns can hold the new
    # values without any dtype restrictions.
    return values.astype(object).map(lambda item: mapping.get(item, item))


health_scores = {'Poor': 0, 'Fair': 0.5, 'Good': 1}

# Integer code -> borough name, exactly as hard-coded by the original cell.
# NOTE(review): presumably these codes come from LabelEncoder's sorted class
# order for the borough column - confirm against the raw data.
borough_names = {
    0: 'Bronx',
    1: 'Brooklyn',
    2: 'Manhattan',
    3: 'Queens',
    4: 'Staten Island',
}

df['rf_predicted_health'] = _recode(df['rf_predicted_health'], health_scores)
df['health'] = _recode(df['health'], health_scores)
df['borough'] = _recode(df['borough'], borough_names)


In [4]:
# Persist the frame to CSV; the row index is omitted from the file.
df.to_csv(
    path_or_buf='DecisionTree_RandomForest_Health_Predictions.csv',
    index=False,
)

In [9]:
# Quick look at the `census tract` column (pandas truncates the printed Series).
print(df.loc[:, 'census tract'])

0          739.00
1          973.00
2          449.00
3          449.00
4          165.00
           ...   
674110     519.00
674111     707.00
674112     201.00
674113     235.02
674114    1341.00
Name: census tract, Length: 674115, dtype: float64
