In [108]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from xgboost import XGBClassifier

In [109]:
DATA_DIR = Path("tree_data_training")
TREE_DATA = Path("final_data/trees/")
FEATURE_COLS = ['avg_height', 'avg_year',
       'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus',
       'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus',
       'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'apparent_temperature', 'precipitation', 'rain', 'snowfall',
       'snow_depth', 'weather_code', 'pressure_msl', 'surface_pressure',
       'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m',
       'soil_temperature_0_to_7cm', 'soil_temperature_7_to_28cm',
       'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm',
       'soil_moisture_0_to_7cm', 'soil_moisture_7_to_28cm',
       'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm'
       ]



TEST_PERCENTAGE = 0.2 # percentage of total
VALIDATION_PERCENTAGE = 0.25 # percentage of (1-TEST_PERCENTAGE)*total
ID_KEY = "Incident_ID"
LABEL_KEY = "Label"

SOIL_MOISTURE_COLUMNS = [
    'soil_moisture_0_to_7cm',
    'soil_moisture_7_to_28cm', 
    'soil_moisture_28_to_100cm', 
    'soil_moisture_100_to_255cm'
]

SOIL_TEMPERATURE_COLUMNS = [
    'soil_temperature_0_to_7cm',
    'soil_temperature_7_to_28cm',
    'soil_temperature_28_to_100cm',
    'soil_temperature_100_to_255cm',
]


## Merging separate sets

In [110]:
# Load in training and testing sets
positive_path = TREE_DATA / "trees_new_grid_pos_samples.csv"
negative_path = TREE_DATA / "trees_new_grid_neg_samples.csv"

positive_samples_df = pd.read_csv(positive_path, sep=",", encoding="utf-8")
negative_samples_df = pd.read_csv(negative_path, sep=",", encoding="utf-8")

In [111]:
# Make sure both df's have identifiable id
# Not really necessary but makes things easier
positive_samples_df[ID_KEY] = ["P"+str(id_) for id_ in positive_samples_df['Incident_ID']]
negative_samples_df[ID_KEY] = ["N"+str(id_) for id_ in range(len(negative_samples_df))]

In [112]:
# Assign labels
positive_samples_df[LABEL_KEY] = 1
negative_samples_df[LABEL_KEY] = 0

In [113]:
positive_samples_df = positive_samples_df.fillna(0)
negative_samples_df = negative_samples_df.fillna(0)

In [114]:
# Merge df's
pos_columns = positive_samples_df.columns
neg_columns = negative_samples_df.columns
common_cols = pos_columns.intersection(neg_columns)

positive_sub_df = positive_samples_df[common_cols]
negative_sub_df = negative_samples_df[common_cols]

tree_training_df = pd.concat([positive_sub_df, negative_sub_df], axis=0)

In [115]:
positive_samples_df.columns

Index(['grid_id', 'has_tree', 'avg_height', 'avg_diameter', 'avg_year',
       'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus',
       'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus',
       'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'Incident_ID',
       'Service_Area', 'Date', 'Hour', 'temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature',
       'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code',
       'pressure_msl', 'surface_pressure', 'wind_speed_10m',
       'wind_direction_10m', 'wind_gusts_10m', 'soil_temperature_0_to_7cm',
       'soil_temperature_7_to_28cm', 'soil_temperature_28_to_100cm',
       'soil_temperature_100_to_255cm', 'soil_moisture_0_to_7cm',
       'soil_moisture_7_to_28cm', 'soil_moisture_28_to_100cm',
       'soil_moisture_100_to_255cm', 'Label'],
      dtype='object')

In [116]:
def average_across_cols(
    df,
    cols,
    new_col
):
    df[new_col] = df[cols].mean(axis=1)
    return df

In [117]:
tree_training_df = average_across_cols(tree_training_df, SOIL_MOISTURE_COLUMNS, "average_soil_moisture")
tree_training_df = average_across_cols(tree_training_df, SOIL_TEMPERATURE_COLUMNS, "average_soil_temperature")

In [118]:
[tree_training_df.drop(col, axis=1, inplace=True) for col in SOIL_MOISTURE_COLUMNS]
[tree_training_df.drop(col, axis=1, inplace=True) for col in SOIL_TEMPERATURE_COLUMNS]

[None, None, None, None]

## Train - Validate - Test split

In [119]:
df = tree_training_df

In [120]:
# split train - test
train_ids, test_ids, train_labels, test_labels = train_test_split(df[ID_KEY], df[LABEL_KEY], test_size=TEST_PERCENTAGE, stratify=df[LABEL_KEY], random_state=42)
# train_ids, validation_ids = train_test_split(train_ids, test_size=VALIDATION_PERCENTAGE, stratify=train_labels, random_state=35)

In [121]:
train_set = df[df[ID_KEY].isin(train_ids)]
test_set = df[df[ID_KEY].isin(test_ids)]
# validation_set = df[df[ID_KEY].isin(validation_ids)]

In [122]:
feature_cols = [col for col in FEATURE_COLS if col not in SOIL_MOISTURE_COLUMNS and col not in SOIL_TEMPERATURE_COLUMNS]

print(feature_cols)

x_train = train_set[feature_cols]
y_train = train_set[LABEL_KEY]
x_test = test_set[feature_cols]
y_test = test_set[LABEL_KEY]
# x_validate = validation_set[FEATURE_COLS]
# y_validate = validation_set[LABEL_KEY]

['avg_height', 'avg_year', 'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus', 'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus', 'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code', 'pressure_msl', 'surface_pressure', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m']


In [137]:
def make_train_test(
    df,
    seed = 42
):
    train_ids, test_ids, train_labels, test_labels = train_test_split(df[ID_KEY], df[LABEL_KEY], test_size=TEST_PERCENTAGE, stratify=df[LABEL_KEY], random_state=seed)
    
    feature_cols = [col for col in FEATURE_COLS if col not in SOIL_MOISTURE_COLUMNS and col not in SOIL_TEMPERATURE_COLUMNS]

    train_set = df[df[ID_KEY].isin(train_ids)]
    test_set = df[df[ID_KEY].isin(test_ids)]

    x_train = train_set[feature_cols]
    y_train = train_set[LABEL_KEY]
    x_test = test_set[feature_cols]
    y_test = test_set[LABEL_KEY]

    return x_train, y_train, x_test, y_test


## Random forest

In [95]:
# clf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=10, random_state=42, n_jobs=-1)

clf = RandomForestClassifier(n_estimators=20, max_depth=5, min_samples_split=2, min_samples_leaf=8, random_state=42, n_jobs=-1)

In [96]:
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.68      0.62      0.65       393
           1       0.65      0.71      0.68       393

    accuracy                           0.67       786
   macro avg       0.67      0.67      0.67       786
weighted avg       0.67      0.67      0.67       786



In [90]:
rf_f1 = f1_score(y_true=y_test, y_pred=predictions)
print(rf_f1)

0.7556109725685786


### Optimization

#### grid opt

In [91]:
clf  = RandomForestClassifier(random_state=42, n_jobs=-1)

In [92]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': np.arange(20, 240, 30),
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_split': [2, 22, 2],
    'min_samples_leaf': np.arange(1, 10, 1),
    'max_features': np.arange(0.2, 1.0, 0.2)
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=3, verbose=3)

In [93]:
# Fit the model to the data
grid_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.621 total time=   0.1s
[CV 2/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.549 total time=   0.0s
[CV 3/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.507 total time=   0.0s
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.614 total time=   0.1s
[CV 2/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.555 total time=   0.1s
[CV 3/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.518 total time=   0.1s
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=80;, score=0.612 total time=   0

## XGBoost

In [145]:
clf = XGBClassifier(verbosity=2, max_depth=15, subsample=0.9)

x_train, y_train, x_test, y_test = make_train_test(df, seed = 42)

clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
print(f1_score(y_pred=predictions, y_true=y_test))

0.7726708074534162


In [143]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, f1_score
# Define the f1_score as the scoring metric
scorer = make_scorer(f1_score)

# Perform 3-fold cross-validation
cv_predictions = cross_val_predict(clf, x_train, y_train, cv=3)
print(cv_predictions)

# Print the mean F1 score across all folds
print(f1_score(y_pred=predictions, y_true=y_test))
print(classification_report(y_true=y_test, y_pred=predictions))

[0.48235294 0.63815789 0.61512605 0.57818182 0.46360153]
Mean F1 Score: 0.5554840474164697


In [106]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.71      0.69      0.70       393
           1       0.70      0.72      0.71       393

    accuracy                           0.70       786
   macro avg       0.71      0.70      0.70       786
weighted avg       0.71      0.70      0.70       786



In [100]:
# Define the parameter grid to search
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, None],
    'subsample': [0.1, 0.3, 0.5, 0.7, 0.9]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=3, verbose=3)

# Fit the model to the data
grid_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV 1/3] END ........max_depth=5, subsample=0.1;, score=0.566 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.1;, score=0.568 total time=   0.5s
[CV 3/3] END ........max_depth=5, subsample=0.1;, score=0.548 total time=   0.4s
[CV 1/3] END ........max_depth=5, subsample=0.3;, score=0.580 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.3;, score=0.555 total time=   0.5s
[CV 3/3] END ........max_depth=5, subsample=0.3;, score=0.523 total time=   0.5s
[CV 1/3] END ........max_depth=5, subsample=0.5;, score=0.584 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.5;, score=0.560 total time=   0.5s
[CV 3/3] END ........max_depth=5, subsample=0.5;, score=0.522 total time=   0.5s
[CV 1/3] END ........max_depth=5, subsample=0.7;, score=0.613 total time=   0.5s
[CV 2/3] END ........max_depth=5, subsample=0.7;, score=0.530 total time=   0.6s
[CV 3/3] END ........max_depth=5, subsample=0.7;