In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Load the tree dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Trees_Quantified.csv')

In [20]:

# Species columns: every DataFrame column whose name also occurs as a value
# in the 'spc_common' column.
unique_species = df['spc_common'].unique()
species_columns = []
for column_name in df.columns:
    if column_name in unique_species:
        species_columns.append(column_name)

# Bucket 'health' into three labels. With pd.cut's default right-closed bins
# the intervals are (-0.1, 0.4] -> Poor, (0.4, 0.6] -> Fair, (0.6, 1.1] -> Good.
# NOTE: this overwrites df['health'] in place, so the cell is meant to run once
# per freshly loaded df.
health_bin_edges = [-0.1, 0.4, 0.6, 1.1]
health_labels = ['Poor', 'Fair', 'Good']
df['health'] = pd.cut(df['health'], bins=health_bin_edges, labels=health_labels)

# Hand-picked feature columns, followed by the species columns found above.
base_features = [
    'tree_dbh',
    'cncldist',
    'st_assem',
    'brch_other',
    'brch_shoe',
    'brch_light',
    'trnk_other',
    'trnk_light',
    'trunk_wire',
    'root_other',
    'root_grate',
    'root_stone',
    'sidewalk',
    'guards',
    'steward',
]
selected_columns = base_features + species_columns

X = df[selected_columns]
y = df['health']

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a single decision tree on the training split (fixed seed).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", dt_accuracy)
dt_report = classification_report(y_test, y_pred)
print("Classification Report:\n", dt_report)

# Per-feature importances of the fitted tree, most important first.
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Predict a label for every row of X and store it next to the source data.
y_pred_all0 = clf.predict(X)
df['dt_predicted_health'] = y_pred_all0

# Random forest: 200 trees, fixed seed, fitted on X_train / y_train.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred1 = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

# Per-feature importances of the fitted forest, most important first.
# The table must be built from `importances1` and sorted from
# `feature_importance_df1`; using the un-suffixed names would silently
# re-print another model's importances under the random-forest heading.
importances1 = rf_clf.feature_importances_
feature_importance_df1 = pd.DataFrame({'Feature': X.columns, 'Importance': importances1})
feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)
print("Feature Importance:\n", feature_importance_df1)

# Predict a label for every row of X and store it next to the source data.
y_pred_all1 = rf_clf.predict(X)
df['rf_predicted_health'] = y_pred_all1




Accuracy: 0.7609315894874774
Classification Report:
               precision    recall  f1-score   support

        Fair       0.27      0.24      0.25     28301
        Good       0.84      0.87      0.86    156767
        Poor       0.72      0.60      0.65     17167

    accuracy                           0.76    202235
   macro avg       0.61      0.57      0.59    202235
weighted avg       0.75      0.76      0.76    202235

           Feature  Importance
0         tree_dbh    0.173751
25           stump    0.145363
26            dead    0.119621
2         st_assem    0.118999
1         cncldist    0.100083
..             ...         ...
128     Scots pine    0.000039
95     Atlas cedar    0.000028
142  Virginia pine    0.000013
125   Osage-orange    0.000010
148     pitch pine    0.000002

[149 rows x 2 columns]
Accuracy: 0.8038717333794843
Classification Report:
               precision    recall  f1-score   support

        Fair       0.34      0.16      0.21     28301
        

In [21]:
# Recode the health labels as numbers (Poor -> 0, Fair -> 0.5, Good -> 1);
# per the original note, this is done so the values can be shown in the graph.
label_to_score = {'Poor': 0, 'Fair': 0.5, 'Good': 1}

# Random-forest predictions: replace each label with its numeric score.
for label, score in label_to_score.items():
    df.loc[df['rf_predicted_health'].isin([label]), ['rf_predicted_health']] = score

# 'health' is categorical, so the numeric values have to be registered as
# categories before they can be assigned.
df['health'] = df['health'].cat.add_categories(list(label_to_score.values()))

for label, score in label_to_score.items():
    df.loc[df['health'] == label, ['health']] = score

#df.loc[df['borough'] == 1, ['borough']] = ['Brooklyn']
#df.loc[df['borough'] == 2, ['borough']] = ['Manhattan']
#df.loc[df['borough'] == 3, ['borough']] = ['Queens']
#df.loc[df['borough'] == 0, ['borough']] = ['Bronx']
#df.loc[df['borough'] == 4, ['borough']] = ['Staten Island']


In [23]:
# Convert the census tract values to strings and strip a trailing fractional suffix such as ".0"
# Remove a literal ".00" suffix from the stringified tract values.
# The dot is escaped and the pattern anchored to the end of the string: an
# unescaped '.' is a regex wildcard, so '.00' would also match e.g. the "100"
# in "1000.0" and corrupt the tract number.
df['census tract'] = df['census tract'].astype(str).str.replace(r'\.00$', '', regex=True)
def striging(census):
    """Return the tract values as strings with a one-character fractional suffix removed.

    Each element is converted with ``str()``. If the second-to-last character
    of the result is ``'.'`` (for example ``"739.0"``), the dot and the single
    character after it are dropped; every other value is passed through
    unchanged (for example ``"235.02"`` or ``"1341"``).

    Parameters
    ----------
    census : iterable
        Tract values; elements may be strings or anything ``str()`` accepts.

    Returns
    -------
    list of str
        One entry per input element, in the same order.
    """
    modified_tracts = []
    for part in census:
        text = str(part)
        # Require at least two characters before looking at text[-2]:
        # an empty string would raise IndexError, and for a one-character
        # string the index would wrap around to the last character.
        # NOTE(review): any single character after the dot is stripped, not
        # only "0" (so "235.1" becomes "235") - confirm this is intended.
        if len(text) >= 2 and text[-2] == '.':
            text = text[:-2]
        modified_tracts.append(text)
    return modified_tracts

# Run every tract value through striging() and write the cleaned list back.
cleaned_tracts = striging(df['census tract'])
df['census tract'] = cleaned_tracts
print(df['census tract'])

0            739
1            973
2            449
3            449
4            165
           ...  
674110       519
674111       707
674112       201
674113    235.02
674114      1341
Name: census tract, Length: 674115, dtype: object


In [None]:
# Write the DataFrame, including the prediction columns, to CSV without the index.
df.to_csv(path_or_buf='DecisionTree_RandomForest_Health_Predictions.csv', index=False)
