In [25]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from xgboost import XGBClassifier

In [26]:
DATA_DIR = Path("tree_data_training")
TREE_DATA = Path("final_data/trees/")
FEATURE_COLS = ['avg_height', 'avg_year', 'has_tree',
       'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus',
       'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus',
       'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'apparent_temperature', 'precipitation', 'rain', 'snowfall',
       'snow_depth', 'weather_code', 'pressure_msl', 'surface_pressure',
       'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m',
       'soil_temperature_0_to_7cm', 'soil_temperature_7_to_28cm',
       'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm',
       'soil_moisture_0_to_7cm', 'soil_moisture_7_to_28cm',
       'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm'
       ]



TEST_PERCENTAGE = 0.2 # percentage of total
VALIDATION_PERCENTAGE = 0.25 # percentage of (1-TEST_PERCENTAGE)*total
ID_KEY = "Incident_ID"
LABEL_KEY = "Label"

SOIL_MOISTURE_COLUMNS = [
    'soil_moisture_0_to_7cm',
    'soil_moisture_7_to_28cm', 
    'soil_moisture_28_to_100cm', 
    'soil_moisture_100_to_255cm'
]

SOIL_TEMPERATURE_COLUMNS = [
    'soil_temperature_0_to_7cm',
    'soil_temperature_7_to_28cm',
    'soil_temperature_28_to_100cm',
    'soil_temperature_100_to_255cm',
]


## Merging separate sets

In [27]:
# Load in training and testing sets
positive_path = TREE_DATA / "trees_new_grid_pos_samples.csv"
negative_path_t = TREE_DATA / "trees_new_grid_neg_samples_true.csv"
negative_path_f = TREE_DATA / "trees_new_grid_neg_samples_false.csv" 

positive_samples_df = pd.read_csv(positive_path, sep=",", encoding="utf-8")
negative_samples_df_t = pd.read_csv(negative_path_t, sep=",", encoding="utf-8")
negative_samples_df_f = pd.read_csv(negative_path_f, sep=",", encoding="utf-8")


In [28]:
negative_samples_df = pd.concat([negative_samples_df_t, negative_samples_df_f], axis=0)

In [29]:
negative_samples_df_f

Unnamed: 0.2,Unnamed: 0.1,Date,grid_id,LAT,LON,Unnamed: 0,Hour,has_tree,avg_height,avg_diameter,...,wind_direction_100m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
0,0,2022-11-15,7798,52.302912,5.009427,0,10,False,,,...,180.000000,33.480000,9.287001,8.537001,12.1370,13.637000,0.646,0.621,0.519,0.600
1,1,2023-07-05,1381,52.394804,4.788347,2,9,False,,,...,268.999330,104.760000,13.237000,16.487000,15.9870,10.787001,0.727,0.509,0.547,0.666
2,2,2020-07-05,7837,52.373183,5.009427,3,8,False,,,...,234.819210,65.520004,16.693500,17.093500,15.6935,10.943500,0.434,0.358,0.470,0.629
3,3,2008-09-11,5669,52.281291,4.923943,6,10,False,,,...,143.325560,30.599998,19.850000,16.750000,15.5500,13.100000,0.604,0.599,0.540,0.605
4,4,2023-07-05,4448,52.310120,4.882675,7,8,False,,,...,258.074040,103.320000,13.206500,16.656500,16.0065,10.856501,0.747,0.456,0.538,0.661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337,1337,2023-09-13,2618,52.369579,4.826668,1950,15,False,,,...,22.833694,32.039997,20.130499,19.330500,17.1805,13.280500,0.603,0.554,0.532,0.635
1338,1338,2022-02-21,7296,52.374985,4.985845,1951,12,False,,,...,288.721040,77.400000,5.780500,6.480500,7.4805,9.830501,0.759,0.744,0.712,0.650
1339,1339,2022-11-17,6968,52.299309,4.974054,1952,2,False,,,...,130.297870,65.880000,9.243500,9.893499,11.8435,13.543500,0.741,0.649,0.558,0.608
1340,1340,2022-02-18,4538,52.304714,4.885622,1953,17,False,,,...,249.630600,104.039990,8.419499,8.019500,7.6195,9.919499,0.727,0.720,0.700,0.679


In [30]:
# Make sure both df's have identifiable id
# Not really necessary but makes things easier
positive_samples_df[ID_KEY] = ["P"+str(id_) for id_ in positive_samples_df['Incident_ID']]
negative_samples_df[ID_KEY] = ["N"+str(id_) for id_ in range(len(negative_samples_df))]

In [31]:
# Assign labels
positive_samples_df[LABEL_KEY] = 1
negative_samples_df[LABEL_KEY] = 0

In [32]:
positive_samples_df = positive_samples_df.fillna(0)
negative_samples_df = negative_samples_df.fillna(0)

In [33]:
# Merge df's
pos_columns = positive_samples_df.columns
neg_columns = negative_samples_df.columns
common_cols = pos_columns.intersection(neg_columns)

positive_sub_df = positive_samples_df[common_cols]
negative_sub_df = negative_samples_df[common_cols]

tree_training_df = pd.concat([positive_sub_df, negative_sub_df], axis=0)

In [34]:
positive_samples_df.columns

Index(['grid_id', 'has_tree', 'avg_height', 'avg_diameter', 'avg_year',
       'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus',
       'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus',
       'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'Incident_ID',
       'Service_Area', 'Date', 'Hour', 'temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature',
       'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code',
       'pressure_msl', 'surface_pressure', 'wind_speed_10m',
       'wind_direction_10m', 'wind_gusts_10m', 'soil_temperature_0_to_7cm',
       'soil_temperature_7_to_28cm', 'soil_temperature_28_to_100cm',
       'soil_temperature_100_to_255cm', 'soil_moisture_0_to_7cm',
       'soil_moisture_7_to_28cm', 'soil_moisture_28_to_100cm',
       'soil_moisture_100_to_255cm', 'Label'],
      dtype='object')

In [35]:
def average_across_cols(
    df,
    cols,
    new_col
):
    df[new_col] = df[cols].mean(axis=1)
    return df

In [36]:
tree_training_df = average_across_cols(tree_training_df, SOIL_MOISTURE_COLUMNS, "average_soil_moisture")
tree_training_df = average_across_cols(tree_training_df, SOIL_TEMPERATURE_COLUMNS, "average_soil_temperature")

In [37]:
[tree_training_df.drop(col, axis=1, inplace=True) for col in SOIL_MOISTURE_COLUMNS]
[tree_training_df.drop(col, axis=1, inplace=True) for col in SOIL_TEMPERATURE_COLUMNS]

[None, None, None, None]

## Train - Validate - Test split

In [38]:
df = tree_training_df

In [39]:
# split train - test
train_ids, test_ids, train_labels, test_labels = train_test_split(df[ID_KEY], df[LABEL_KEY], test_size=TEST_PERCENTAGE, stratify=df[LABEL_KEY], random_state=42)
# train_ids, validation_ids = train_test_split(train_ids, test_size=VALIDATION_PERCENTAGE, stratify=train_labels, random_state=35)

In [40]:
train_set = df[df[ID_KEY].isin(train_ids)]
test_set = df[df[ID_KEY].isin(test_ids)]
# validation_set = df[df[ID_KEY].isin(validation_ids)]

In [41]:
feature_cols = [col for col in FEATURE_COLS if col not in SOIL_MOISTURE_COLUMNS and col not in SOIL_TEMPERATURE_COLUMNS]

print(feature_cols)

x_train = train_set[feature_cols]
y_train = train_set[LABEL_KEY]
x_test = test_set[feature_cols]
y_test = test_set[LABEL_KEY]
# x_validate = validation_set[FEATURE_COLS]
# y_validate = validation_set[LABEL_KEY]

['avg_height', 'avg_year', 'has_tree', 'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus', 'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus', 'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code', 'pressure_msl', 'surface_pressure', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m']


In [42]:
def make_train_test(
    df,
    seed = 42
):
    train_ids, test_ids, train_labels, test_labels = train_test_split(df[ID_KEY], df[LABEL_KEY], test_size=TEST_PERCENTAGE, stratify=df[LABEL_KEY], random_state=seed)
    
    feature_cols = [col for col in FEATURE_COLS if col not in SOIL_MOISTURE_COLUMNS and col not in SOIL_TEMPERATURE_COLUMNS]

    train_set = df[df[ID_KEY].isin(train_ids)]
    test_set = df[df[ID_KEY].isin(test_ids)]

    x_train = train_set[feature_cols]
    y_train = train_set[LABEL_KEY]
    x_test = test_set[feature_cols]
    y_test = test_set[LABEL_KEY]

    return x_train, y_train, x_test, y_test


## Random forest

In [19]:
# clf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=10, random_state=42, n_jobs=-1)

clf = RandomForestClassifier(n_estimators=20, max_depth=5, min_samples_split=2, min_samples_leaf=8, random_state=42, n_jobs=-1)

In [20]:
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.69      0.99      0.82       786
           1       0.89      0.12      0.21       393

    accuracy                           0.70      1179
   macro avg       0.79      0.56      0.51      1179
weighted avg       0.76      0.70      0.61      1179



In [21]:
rf_f1 = f1_score(y_true=y_test, y_pred=predictions)
print(rf_f1)

0.21076233183856502


### Optimization

#### grid opt

In [22]:
clf  = RandomForestClassifier(random_state=42, n_jobs=-1)

In [23]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': np.arange(20, 240, 30),
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_split': [2, 22, 2],
    'min_samples_leaf': np.arange(1, 10, 1),
    'max_features': np.arange(0.2, 1.0, 0.2)
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=3, verbose=3)

In [24]:
# Fit the model to the data
grid_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.693 total time=   0.1s
[CV 2/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.668 total time=   0.1s
[CV 3/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.703 total time=   0.1s
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.689 total time=   0.1s
[CV 2/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.667 total time=   0.1s
[CV 3/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.702 total time=   0.1s
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=80;, score=0.687 total time=   0

KeyboardInterrupt: 

## XGBoost

In [46]:
clf = XGBClassifier(verbosity=2, max_depth=15, subsample=0.9)

x_train, y_train, x_test, y_test = make_train_test(df, seed = 42)

clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
print(f1_score(y_pred=predictions, y_true=y_test))



0.7098445595854922


In [47]:
for name, score in zip(clf.feature_names_in_, clf.feature_importances_):
    print(f"{name} : {score}")

avg_height : 0.02556045539677143
avg_year : 0.10686264932155609
has_tree : 0.01237479504197836
Fraxinus : 0.02697938308119774
Salix : 0.029725905507802963
Alnus : 0.027573855593800545
Quercus : 0.031595658510923386
Tilia : 0.03444623947143555
Acer : 0.028514746576547623
Populus : 0.03125520050525665
Betula : 0.028653353452682495
Prunus : 0.031002258881926537
Platanus : 0.03469470515847206
Malus : 0.04737110808491707
Robinia : 0.03827314078807831
Crataegus : 0.03139074146747589
Ulmus : 0.05017649009823799
Carpinus : 0.026593683287501335
Overig : 0.036586929112672806
Onbekend : 0.04751036688685417
temperature_2m : 0.017486436292529106
relative_humidity_2m : 0.0191772673279047
dew_point_2m : 0.01635928265750408
apparent_temperature : 0.01821461319923401
precipitation : 0.026385750621557236
rain : 0.03108418732881546
snowfall : 0.030996670946478844
snow_depth : 0.0
weather_code : 0.017582161352038383
pressure_msl : 0.01956384815275669
surface_pressure : 0.01630496233701706
wind_speed_10m :

In [None]:
# import pickle
# #save model
# with open("models/trees/xgboost_md15_sub90_mixed.pkl", "wb") as f:
#     pickle.dump(clf, f)

In [45]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, f1_score
# Define the f1_score as the scoring metric
scorer = make_scorer(f1_score)

# Perform 3-fold cross-validation
cv_predictions = cross_val_predict(clf, x_train, y_train, cv=3)
print(cv_predictions)

# Print the mean F1 score across all folds
print(f1_score(y_pred=predictions, y_true=y_test))
print(classification_report(y_true=y_test, y_pred=predictions))

[0 0 0 ... 0 0 0]
0.5983827493261457
              precision    recall  f1-score   support

           0       0.76      0.81      0.78       661
           1       0.64      0.56      0.60       393

    accuracy                           0.72      1054
   macro avg       0.70      0.69      0.69      1054
weighted avg       0.71      0.72      0.71      1054



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
from collections import Counter
print(Counter(y_test)[1])

393


In [None]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.71      0.69      0.70       393
           1       0.70      0.72      0.71       393

    accuracy                           0.70       786
   macro avg       0.71      0.70      0.70       786
weighted avg       0.71      0.70      0.70       786



In [None]:
# Define the parameter grid to search
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, None],
    'subsample': [0.1, 0.3, 0.5, 0.7, 0.9]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=3, verbose=3)

# Fit the model to the data
grid_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV 1/3] END ........max_depth=5, subsample=0.1;, score=0.566 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.1;, score=0.568 total time=   0.5s
[CV 3/3] END ........max_depth=5, subsample=0.1;, score=0.548 total time=   0.4s
[CV 1/3] END ........max_depth=5, subsample=0.3;, score=0.580 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.3;, score=0.555 total time=   0.5s
[CV 3/3] END ........max_depth=5, subsample=0.3;, score=0.523 total time=   0.5s
[CV 1/3] END ........max_depth=5, subsample=0.5;, score=0.584 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.5;, score=0.560 total time=   0.5s
[CV 3/3] END ........max_depth=5, subsample=0.5;, score=0.522 total time=   0.5s
[CV 1/3] END ........max_depth=5, subsample=0.7;, score=0.613 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.7;, score=0.530 total time=   0.6s
[CV 3/3] END ........max_depth=5, subsample=0.7;

### LR

In [44]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

x_train, y_train, x_test, y_test = make_train_test(df, seed = 42)

clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
print(f1_score(y_pred=predictions, y_true=y_test))

0.5983827493261457


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
