In [44]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

In [45]:
# Load the quantified tree dataset; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer='Trees_Quantified.csv')

In [46]:

# Species indicator features: columns whose header equals a value found in `spc_common`.
unique_species = df['spc_common'].unique()
species_lookup = set(unique_species)  # hash lookup instead of scanning the array per column
species_columns = [col for col in df.columns if col in species_lookup]

# Bin the numeric health score into three labels. pd.cut intervals are
# right-closed by default: (-0.1, 0.4] -> Poor, (0.4, 0.6] -> Fair, (0.6, 1.1] -> Good.
# NOTE: this overwrites the raw numeric `health` column, so re-run the load
# cell before re-running this one.
df['health'] = pd.cut(df['health'], bins=[-0.1, 0.4, 0.6, 1.1], labels=['Poor', 'Fair', 'Good'])
selected_columns = [
    'tree_dbh', 'cncldist', 'st_assem', 'brch_other',
    'brch_shoe', 'brch_light', 'trnk_other', 'trnk_light', 'trunk_wire', 'root_other',
    'root_grate', 'root_stone', 'sidewalk', 'guards', 'steward'] + species_columns
X = df[selected_columns]
y = df['health']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


def importance_table(model, feature_names):
    """Return a Feature/Importance frame for a fitted model, most important first.

    `model` must expose `feature_importances_`; `feature_names` supplies the
    matching labels in the same order as the training columns.
    """
    table = pd.DataFrame({'Feature': feature_names, 'Importance': model.feature_importances_})
    return table.sort_values(by='Importance', ascending=False)


# decision tree
clf = DecisionTreeClassifier(random_state=42, max_depth=10)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

importances = clf.feature_importances_
feature_importance_df = importance_table(clf, X.columns)
print(feature_importance_df)

# Predict for every row (training rows included) and keep the labels on the frame.
y_pred_all0 = clf.predict(X)
df['dt_predicted_health'] = y_pred_all0

#random forest
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10)
rf_clf.fit(X_train, y_train)

y_pred1 = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

# Built from the random forest's own importances. Previously this table reused
# the decision tree's `importances` / `feature_importance_df`, so the tree's
# table was printed a second time under the random-forest heading.
importances1 = rf_clf.feature_importances_
feature_importance_df1 = importance_table(rf_clf, X.columns)
print("Feature Importance:\n", feature_importance_df1)

# Predict for every row (training rows included) and keep the labels on the frame.
y_pred_all1 = rf_clf.predict(X)
df['rf_predicted_health'] = y_pred_all1

Accuracy: 0.8778674061003277
Classification Report:
               precision    recall  f1-score   support

        Fair       0.31      0.09      0.14      1405
        Good       0.74      0.95      0.83      4731
        Poor       0.99      0.96      0.98      9732

    accuracy                           0.88     15868
   macro avg       0.68      0.66      0.65     15868
weighted avg       0.86      0.88      0.86     15868

        Feature  Importance
14      steward    0.927807
3    brch_other    0.012823
0      tree_dbh    0.010300
2      st_assem    0.009731
1      cncldist    0.009645
..          ...         ...
133  pitch pine    0.000000
134      mimosa    0.000000
135   smoketree    0.000000
136      spruce    0.000000
137    red pine    0.000000

[138 rows x 2 columns]
Accuracy: 0.8834131585581043
Classification Report:
               precision    recall  f1-score   support

        Fair       1.00      0.00      0.00      1405
        Good       0.72      1.00      0.84 

In [47]:
#mse and r^2 calculations
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Ordinal encoding of the health labels, used as the regression target.
label_mapping = {'Poor': 0, 'Fair': 1, 'Good': 2}

# NOTE(review): the targets below are the classifiers' own predictions
# (`dt_predicted_health` / `rf_predicted_health`), not the observed `health`
# column. The reported MSE / R^2 therefore describe how well a regressor
# reproduces each classifier's output -- confirm this is the intended metric.
regression_runs = [
    ('DT', 'dt_predicted_health', 'dt_health_numeric',
     DecisionTreeRegressor(random_state=42, max_depth=10)),
    ('RF', 'rf_predicted_health', 'rf_health_numeric',
     RandomForestRegressor(random_state=42, max_depth=10)),
]

# One pass per model instead of two copy-pasted pipelines. The loop keeps
# rebinding the same module-level names (X, y, reg, y_pred, ...) so they end
# up holding the random-forest run, as before.
for tag, predicted_column, numeric_column, reg in regression_runs:
    df[numeric_column] = df[predicted_column].map(label_mapping)

    y = df[numeric_column]
    X = df[selected_columns]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)
    print(f"{tag} Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print(f"{tag} R^2 Score:", r2_score(y_test, y_pred))

DT Mean Squared Error: 0.010906075428682402
DT R^2 Score: 0.9882921856091407
RF Mean Squared Error: 3.176833879505924e-05
RF R^2 Score: 0.9999672773309717


In [48]:
# Rescale the health labels to a 0-1 numeric score so they can be plotted.
HEALTH_SCORES = {'Poor': 0, 'Fair': 0.5, 'Good': 1}


def to_health_score(labels):
    """Map 'Poor'/'Fair'/'Good' labels to 0 / 0.5 / 1 and return a float Series.

    Values that are not keys of HEALTH_SCORES (already-converted numbers,
    missing values) pass through unchanged, so re-running the cell is a no-op
    rather than an error. Any other leftover string makes the final float
    conversion raise instead of being silently kept.
    """
    as_objects = labels.astype(object)  # also unwraps a categorical column
    rescaled = as_objects.map(lambda label: HEALTH_SCORES.get(label, label))
    return rescaled.astype(float)


# Both columns become plain float columns. The previous version left `health`
# as a categorical with mixed string/number categories and failed on a second
# run because `add_categories` rejects categories that already exist.
df['rf_predicted_health'] = to_health_score(df['rf_predicted_health'])
df['health'] = to_health_score(df['health'])

# Disabled borough relabelling, kept for reference:
#df.loc[df['borough'] == 1, ['borough']] = ['Brooklyn']
#df.loc[df['borough'] == 2, ['borough']] = ['Manhattan']
#df.loc[df['borough'] == 3, ['borough']] = ['Queens']
#df.loc[df['borough'] == 0, ['borough']] = ['Bronx']
#df.loc[df['borough'] == 4, ['borough']] = ['Staten Island']


In [49]:
# Function to modify the census tract based on its length
# Drop a trailing all-zero fractional part from the tract id ('191.00' -> '191').
# The dot is escaped and the pattern anchored to the end of the string: with
# regex=True an unescaped '.00' means "any character followed by two zeros",
# which turned '100' into '' and '1000' into '1'.
df['census tract'] = df['census tract'].astype(str).str.replace(r'\.0+$', '', regex=True)
def striging(census):
    """Return the tract ids as strings with an all-zero fractional part removed.

    Each element of `census` is converted with `str()`. A value such as
    '58.0' or '157.00' becomes '58' / '157'. Values whose fractional part
    carries information ('157.01', '12.5') and values without a dot are
    returned unchanged, and an empty string is passed through instead of
    raising IndexError.

    Parameters
    ----------
    census : iterable
        Tract identifiers (strings or numbers).

    Returns
    -------
    list of str
        One cleaned string per input element, in the same order.
    """
    modified_tracts = []
    for part in census:
        text = str(part)
        whole, dot, fraction = text.partition('.')
        # Only strip when everything after the first dot is zeros (or nothing).
        if dot and fraction.strip('0') == '':
            text = whole
        modified_tracts.append(text)
    return modified_tracts

# Clean every tract id with striging, store the result back, then echo the column.
cleaned_tracts = striging(df['census tract'])
df['census tract'] = cleaned_tracts
print(df['census tract'])

0        191
1        183
2        128
3        138
4        145
        ... 
52888    395
52889     58
52890    157
52891    157
52892    118
Name: census tract, Length: 52893, dtype: object


In [50]:
# Persist the frame, including both models' predictions, without the row index.
df.to_csv(path_or_buf='DecisionTree_RandomForest_Health_Predictions.csv', index=False)
