In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
# Load the tree dataset from a CSV file located in the current working directory.
# Later cells read columns such as 'spc_common', 'health', 'borough' and 'census tract' from it.
df = pd.read_csv('Trees_Quantified.csv')

In [8]:

# Species names that occur in the data. Any dataframe column whose name is one
# of these species is added to the feature set below.
unique_species = df['spc_common'].unique()
species_names = set(unique_species)  # set membership instead of scanning the array per column
species_columns = [col for col in df.columns if col in species_names]

# Bin the numeric health score into three labelled classes.
# NOTE: this overwrites df['health'] in place, so this cell is meant to be run
# once per load of df (re-running it would try to bin the labels again).
df['health'] = pd.cut(df['health'], bins=[-0.1, 0.4, 0.6, 1.1], labels=['Poor', 'Fair', 'Good'])
selected_columns = [
    'tree_dbh', 'cncldist', 'st_assem', 'brch_other',
    'brch_shoe', 'brch_light', 'trnk_other', 'trnk_light', 'trunk_wire', 'root_other',
    'root_grate', 'root_stone', 'sidewalk', 'guards', 'steward'] + species_columns
X = df[selected_columns]
y = df['health']

# 70/30 train/test split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Decision tree ---
clf = DecisionTreeClassifier(random_state=42, max_depth=10)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Predict for every row (training rows included) and keep the result on df.
y_pred_all0 = clf.predict(X)
df['dt_predicted_health'] = y_pred_all0

# --- Random forest ---
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10)
rf_clf.fit(X_train, y_train)

y_pred1 = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

# Build the table from the random forest's own importances (importances1) and
# sort that table; previously the decision-tree values were reused here, so the
# "random forest" importances printed were actually the decision tree's.
importances1 = rf_clf.feature_importances_
feature_importance_df1 = pd.DataFrame({'Feature': X.columns, 'Importance': importances1})
feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)
print("Feature Importance:\n", feature_importance_df1)

# Predict for every row (training rows included) and keep the result on df.
y_pred_all1 = rf_clf.predict(X)
df['rf_predicted_health'] = y_pred_all1

Accuracy: 0.8225529705540584
Classification Report:
               precision    recall  f1-score   support

        Fair       0.41      0.02      0.04     28301
        Good       0.82      1.00      0.90    156767
        Poor       0.97      0.56      0.71     17167

    accuracy                           0.82    202235
   macro avg       0.73      0.53      0.55    202235
weighted avg       0.77      0.82      0.76    202235

          Feature  Importance
24          stump    0.449791
25           dead    0.370139
3      brch_other    0.054063
34   Norway maple    0.033098
6      trnk_other    0.027491
..            ...         ...
57         cherry    0.000000
56      sassafras    0.000000
55   crepe myrtle    0.000000
54    Douglas-fir    0.000000
148        mimosa    0.000000

[149 rows x 2 columns]
Accuracy: 0.8216678616461047
Classification Report:
               precision    recall  f1-score   support

        Fair       0.41      0.00      0.00     28301
        Good       0

In [9]:
# MSE and R^2: treat the health classes as numbers and score tree-based
# regressors on the same features and split settings as the classifiers.
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

label_mapping = {'Poor': 0, 'Fair': 1, 'Good': 2}
df['health_numeric'] = df['health'].map(label_mapping)

X, y = df[selected_columns], df['health_numeric']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Each entry: (MSE label, R^2 label, unfitted regressor). Decision tree first,
# random forest second, so `reg` and `y_pred` end up holding the forest results.
regressors = (
    ("DT Mean Squared Error:", "DT R^2 Score:",
     DecisionTreeRegressor(random_state=42, max_depth=10)),
    ("RF Mean Squared Error:", "RF R^2 Score:",
     RandomForestRegressor(random_state=42, max_depth=10)),
)
for mse_label, r2_label, reg in regressors:
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print(mse_label, mean_squared_error(y_test, y_pred))
    print(r2_label, r2_score(y_test, y_pred))

DT Mean Squared Error: 0.22767089739755542
DT R^2 Score: 0.40643310307058844
RF Mean Squared Error: 0.22609793849866588
RF R^2 Score: 0.4105340063625056


In [10]:
# Recode values for display in the graph: health labels become numeric scores
# and borough codes become borough names.
health_scores = {'Poor': 0, 'Fair': 0.5, 'Good': 1}
borough_names = {1: 'Brooklyn', 2: 'Manhattan', 3: 'Queens', 0: 'Bronx', 4: 'Staten Island'}

# Random-forest predictions: label -> score.
for label, score in health_scores.items():
    df.loc[df['rf_predicted_health'] == label, ['rf_predicted_health']] = score

# 'health' is categorical, so the numeric scores have to be registered as
# categories before they can be assigned.
df['health'] = df['health'].cat.add_categories([0, 0.5, 1])

for label, score in health_scores.items():
    df.loc[df['health'] == label, ['health']] = score

# Borough code -> borough name.
for code, name in borough_names.items():
    df.loc[df['borough'] == code, ['borough']] = [name]


In [11]:
# Normalise census tract identifiers to strings and strip float-style suffixes (e.g. "739.0" -> "739")
# Convert tracts to strings and drop a trailing ".00". The dot is escaped and
# the pattern is anchored to the end of the string: an unescaped '.00' is a
# regex meaning "any character followed by 00", which would also delete "100"
# from a value such as "100.0" and corrupt the identifier.
df['census tract'] = df['census tract'].astype(str).str.replace(r'\.00$', '', regex=True)
def striging(census):
    """Return census tract values as strings with a trailing ".0" removed.

    Parameters
    ----------
    census : iterable
        Census tract values; each element is converted with ``str``.

    Returns
    -------
    list of str
        One string per input element, in input order. A value ending in
        ".0" (the float rendering of a whole number, e.g. "739.0") loses
        that suffix; every other value is returned unchanged.

    Notes
    -----
    Only a literal ".0" suffix is stripped. Other one-digit suffixes such as
    "235.1" are kept, because dropping them would merge distinct tract
    identifiers. Empty and one-character strings are handled without error.
    """
    modified_tracts = []
    for part in census:
        tract = str(part)
        if tract.endswith('.0'):
            tract = tract[:-2]
        modified_tracts.append(tract)
    return modified_tracts

# Replace the column with the list of cleaned strings returned by striging,
# then print the column as a quick visual check of the result.
df['census tract'] = striging(df['census tract'])
print(df['census tract'])

0            739
1            973
2            449
3            449
4            165
           ...  
674110       519
674111       707
674112       201
674113    235.02
674114      1341
Name: census tract, Length: 674115, dtype: object


In [12]:
# Write the whole dataframe, including the prediction columns added in earlier
# cells, to a CSV file in the working directory; the row index is not written.
df.to_csv('DecisionTree_RandomForest_Health_Predictions.csv', index=False)


In [None]:
# Additional tests: the cells below were used to tune hyperparameters. Do NOT re-run them (they only take time); just read the recorded results.

In [15]:
# Additional test (no need to re-run, just see the recorded results):
# decision tree with max_depth=15.
# NOTE(review): this reads df['health'], which the graph-preparation cell
# recodes to 0/0.5/1, while the recorded report shows Poor/Fair/Good labels;
# confirm the intended execution order.
X = df[selected_columns]
y = df['health']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Only the depth-15 tree is evaluated. An extra depth-10 fit that was
# immediately overwritten has been removed because its result was never used.
clf = DecisionTreeClassifier(random_state=42, max_depth=15)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8213019507009173
Classification Report:
               precision    recall  f1-score   support

        Fair       0.39      0.04      0.08     28301
        Good       0.82      0.99      0.90    156767
        Poor       0.95      0.57      0.71     17167

    accuracy                           0.82    202235
   macro avg       0.72      0.53      0.56    202235
weighted avg       0.77      0.82      0.77    202235



In [13]:
# Additional test (no need to re-run, just see the recorded results):
# decision tree with max_depth=5.
# NOTE(review): this reads df['health'], which the graph-preparation cell
# recodes to 0/0.5/1, while the recorded report shows Poor/Fair/Good labels;
# confirm the intended execution order.
X, y = df[selected_columns], df['health']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# fit() returns the fitted estimator, so construction and training are chained.
clf = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8220980542438252
Classification Report:
               precision    recall  f1-score   support

        Fair       0.46      0.00      0.01     28301
        Good       0.81      1.00      0.90    156767
        Poor       0.99      0.55      0.71     17167

    accuracy                           0.82    202235
   macro avg       0.75      0.52      0.54    202235
weighted avg       0.78      0.82      0.76    202235



In [14]:
# Additional test (no need to re-run, just see the recorded results):
# decision tree with max_depth=4.
# NOTE(review): this reads df['health'], which the graph-preparation cell
# recodes to 0/0.5/1, while the recorded report shows Poor/Fair/Good labels;
# confirm the intended execution order.
X = df[selected_columns]
y = df['health']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(random_state=42, max_depth=4)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# In the recorded run this shallow tree never predicts 'Fair', so that class's
# precision is undefined. zero_division=0 reports it as 0.00 (the same value
# as before) without emitting a warning for every averaged metric.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8215689667960541


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

        Fair       0.00      0.00      0.00     28301
        Good       0.81      1.00      0.90    156767
        Poor       1.00      0.55      0.71     17167

    accuracy                           0.82    202235
   macro avg       0.60      0.52      0.53    202235
weighted avg       0.72      0.82      0.76    202235



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Hyperparameter experiment: depth-10 decision tree using the entropy split
# criterion (the other decision-tree cells leave `criterion` at its default).
# NOTE(review): this reads df['health'], which the graph-preparation cell
# recodes to 0/0.5/1, while the recorded report shows Poor/Fair/Good labels;
# confirm the intended execution order.
X, y = df[selected_columns], df['health']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# fit() returns the fitted estimator, so construction and training are chained.
clf = DecisionTreeClassifier(
    criterion="entropy", max_depth=10, random_state=42
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8225084678715356
Classification Report:
               precision    recall  f1-score   support

        Fair       0.41      0.03      0.05     28301
        Good       0.82      1.00      0.90    156767
        Poor       0.98      0.55      0.71     17167

    accuracy                           0.82    202235
   macro avg       0.74      0.53      0.55    202235
weighted avg       0.77      0.82      0.76    202235

