In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [41]:
# Read the tree dataset and immediately discard the columns that none of the
# later steps in this cell use.
df = (
    pd.read_csv('Trees_Quantified.csv')
    .drop(columns=['tree_id', 'created_at', 'address', 'postcode', 'zip_city'])
)

# Columns whose values are replaced by integer codes.
label_encode_cols = [
    'curb_loc', 'status', 'spc_latin', 'spc_common', 'steward', 'guards',
    'borough', 'nta', 'boro_ct', 'nta_name', 'state',
]

# A single encoder is re-fitted for every column, so once the loop finishes
# `le` only remembers the classes of the last column it saw; the mappings of
# the earlier columns are not kept.
le = LabelEncoder()

for col in label_encode_cols:
    # Values are cast to str before being encoded.
    df[col] = le.fit_transform(df[col].astype(str))

# One-hot encode `sidewalk`; the new columns are named `sidewalk_<value>`.
df = pd.get_dummies(df, columns=['sidewalk'])

# `problems` holds comma-separated tags: expand them into one indicator column
# per tag, append those to the frame, then remove the raw text column.
problem_types = df['problems'].str.get_dummies(sep=',')
df = pd.concat([df, problem_types], axis=1).drop(columns=['problems'])

# Bucket the numeric health score into three ordered classes. With pd.cut's
# default right-closed intervals this gives
#   (-0.1, 0.4] -> 'Poor', (0.4, 0.6] -> 'Fair', (0.6, 1.1] -> 'Good';
# anything outside (-0.1, 1.1] (or missing) becomes NaN.
# NOTE(review): the saved output of this cell reports 'Poor' as by far the
# largest class -- confirm that the numeric scale really runs from poor (low)
# to good (high) and that the labels are not reversed.
df['health'] = pd.cut(
    df['health'],
    bins=[-0.1, 0.4, 0.6, 1.1],
    labels=['Poor', 'Fair', 'Good'],
)
# --- Decision tree: fit, evaluate, rank features ---------------------------
# Columns used as model inputs; the target is the binned `health` class.
selected_columns = [
    'tree_dbh', 'curb_loc', 'nta', 'cncldist', 'st_assem', 'brch_other', 
    'brch_shoe', 'brch_light', 'trnk_light', 'trunk_wire', 'root_other', 
    'root_grate', 'root_stone', 'sidewalk_0', 'sidewalk_1','guards', 
    'steward', 'spc_common'
]
X = df[selected_columns]
y = df['health']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# DecisionTreeClassifier lives in sklearn.tree and must be imported in the
# imports cell -- without that import this line raises NameError on a fresh
# kernel.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score on the held-out rows only.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature importances of the fitted tree, most important first.
importances = clf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# --- Decision tree: report again and store predictions for every row -------
# `X`, the train/test split and the fitted `clf` from the section above are
# reused here. Rebuilding them with the same data and the same
# random_state=42 would reproduce exactly the same split and the same tree,
# so the second fit was pure duplicate work.
y_pred0 = clf.predict(X_test)

# Both metrics are computed from `y_pred0`, the predictions made in this
# section (the accuracy line previously scored the earlier `y_pred` variable).
print("Accuracy:", accuracy_score(y_test, y_pred0))
print("Classification Report:\n", classification_report(y_test, y_pred0))

# Predict for the full dataset. This includes the rows the tree was trained
# on, so the stored column mixes in-sample and held-out predictions.
y_pred_all0 = clf.predict(X)
df['dt_predicted_health'] = y_pred_all0


# --- Random forest: fit, evaluate, rank features, export -------------------
# Trained on the same train/test split as the decision tree so the two sets
# of metrics are directly comparable.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train, y_train)

y_pred1 = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

# Build the ranking from the forest's own importances (`importances1`) and
# sort that same frame. The previous code reused the decision tree's
# `importances` / `feature_importance_df`, so it printed the tree's ranking
# under the random-forest heading.
importances1 = rf_clf.feature_importances_
feature_importance_df1 = pd.DataFrame({'Feature': X.columns, 'Importance': importances1})
feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)
print("Feature Importance:\n", feature_importance_df1)

# Predict for the full dataset. This includes the rows the forest was trained
# on, so the stored column mixes in-sample and held-out predictions.
y_pred_all1 = rf_clf.predict(X)
df['rf_predicted_health'] = y_pred_all1

# Persist the processed frame together with both models' prediction columns.
df.to_csv('DecisionTree_RandomForest_Health_Predictions.csv', index=False)


Accuracy: 0.8387950592387194
Classification Report:
               precision    recall  f1-score   support

        Fair       0.28      0.32      0.30      1405
        Good       0.77      0.74      0.75      4731
        Poor       0.96      0.96      0.96      9732

    accuracy                           0.84     15868
   macro avg       0.67      0.67      0.67     15868
weighted avg       0.84      0.84      0.84     15868

       Feature  Importance
16     steward    0.682397
0     tree_dbh    0.070613
17  spc_common    0.054784
2          nta    0.043728
4     st_assem    0.030064
3     cncldist    0.026881
15      guards    0.022478
12  root_stone    0.014466
10  root_other    0.009702
13  sidewalk_0    0.009567
14  sidewalk_1    0.009347
5   brch_other    0.009238
7   brch_light    0.006353
9   trunk_wire    0.004384
11  root_grate    0.002574
1     curb_loc    0.002280
8   trnk_light    0.000985
6    brch_shoe    0.000160
Accuracy: 0.8387950592387194
Classification Report:
 