In [390]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from xgboost import XGBClassifier

In [391]:
DATA_DIR = Path("tree_data_training")
FEATURE_COLS = ['avg_height', 'avg_year',
       'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus',
       'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus',
       'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'apparent_temperature', 'precipitation', 'rain', 'snowfall',
       'snow_depth', 'weather_code', 'pressure_msl', 'surface_pressure',
       'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m',
       'soil_temperature_0_to_7cm', 'soil_temperature_7_to_28cm',
       'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm',
       'soil_moisture_0_to_7cm', 'soil_moisture_7_to_28cm',
       'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm'
       ]



TEST_PERCENTAGE = 0.2 # percentage of total
VALIDATION_PERCENTAGE = 0.25 # percentage of (1-TEST_PERCENTAGE)*total
ID_KEY = "Incident_ID"
LABEL_KEY = "Label"

SOIL_MOISTURE_COLUMNS = [
    'soil_moisture_0_to_7cm',
    'soil_moisture_7_to_28cm', 
    'soil_moisture_28_to_100cm', 
    'soil_moisture_100_to_255cm'
]

SOIL_TEMPERATURE_COLUMNS = [
    'soil_temperature_0_to_7cm',
    'soil_temperature_7_to_28cm',
    'soil_temperature_28_to_100cm',
    'soil_temperature_100_to_255cm',
]


## Merging separate sets

In [392]:
# Load in training and testing sets
positive_path = DATA_DIR / "positive_samples.csv"
negative_path = DATA_DIR / "negative_samples.csv"
positive_samples_df = pd.read_csv(positive_path, sep=",", encoding="utf-8")
negative_samples_df = pd.read_csv(negative_path, sep=",", encoding="utf-8")

In [393]:
# Make sure both df's have identifiable id
# Not really necessary but makes things easier
positive_samples_df[ID_KEY] = ["P"+str(id_) for id_ in positive_samples_df['Incident_ID']]
negative_samples_df[ID_KEY] = ["N"+str(id_) for id_ in range(len(negative_samples_df))]

In [394]:
# Assign labels
positive_samples_df[LABEL_KEY] = 1
negative_samples_df[LABEL_KEY] = 0

In [395]:
# Merge df's
pos_columns = positive_samples_df.columns
neg_columns = negative_samples_df.columns
common_cols = pos_columns.intersection(neg_columns)

positive_sub_df = positive_samples_df[common_cols]
negative_sub_df = negative_samples_df[common_cols]

tree_training_df = pd.concat([positive_sub_df, negative_sub_df], axis=0)

In [396]:
def average_across_cols(
    df,
    cols,
    new_col
):
    df[new_col] = df[cols].mean(axis=1)
    return df

In [397]:
tree_training_df = average_across_cols(tree_training_df, SOIL_MOISTURE_COLUMNS, "average_soil_moisture")
tree_training_df = average_across_cols(tree_training_df, SOIL_TEMPERATURE_COLUMNS, "average_soil_temperature")

In [398]:
[tree_training_df.drop(col, axis=1, inplace=True) for col in SOIL_MOISTURE_COLUMNS]
[tree_training_df.drop(col, axis=1, inplace=True) for col in SOIL_TEMPERATURE_COLUMNS]

[None, None, None, None]

In [399]:
# Replace NaN with -1
# tree_training_df.fillna(-1, inplace=True)

## Train - Validate - Test split

In [400]:
df = tree_training_df

In [401]:
# split train - test
train_ids, test_ids, train_labels, test_labels = train_test_split(df[ID_KEY], df[LABEL_KEY], test_size=TEST_PERCENTAGE, stratify=df[LABEL_KEY], random_state=42)
# train_ids, validation_ids = train_test_split(train_ids, test_size=VALIDATION_PERCENTAGE, stratify=train_labels, random_state=35)

In [402]:
train_set = df[df[ID_KEY].isin(train_ids)]
test_set = df[df[ID_KEY].isin(test_ids)]
# validation_set = df[df[ID_KEY].isin(validation_ids)]

In [403]:
feature_cols = [col for col in FEATURE_COLS if col not in SOIL_MOISTURE_COLUMNS and col not in SOIL_TEMPERATURE_COLUMNS]

print(feature_cols)

x_train = train_set[feature_cols]
y_train = train_set[LABEL_KEY]
x_test = test_set[feature_cols]
y_test = test_set[LABEL_KEY]
# x_validate = validation_set[FEATURE_COLS]
# y_validate = validation_set[LABEL_KEY]

['avg_height', 'avg_year', 'Fraxinus', 'Salix', 'Alnus', 'Quercus', 'Tilia', 'Acer', 'Populus', 'Betula', 'Prunus', 'Platanus', 'Malus', 'Robinia', 'Crataegus', 'Ulmus', 'Carpinus', 'Overig', 'Onbekend', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code', 'pressure_msl', 'surface_pressure', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m']


## Random forest

In [404]:
clf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=10, random_state=42, n_jobs=-1)

In [405]:
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)


In [406]:
rf_f1 = f1_score(y_true=y_test, y_pred=predictions)
print(rf_f1)

0.6684005201560467


### Optimization

#### grid opt

In [407]:
clf = clf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [408]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': np.arange(20, 240, 30),
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_split': [2, 22, 2],
    'min_samples_leaf': np.arange(1, 10, 1),
    'max_features': np.arange(0.2, 1.0, 0.2)
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=3, verbose=3)

In [409]:
# Fit the model to the data
grid_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.566 total time=   0.1s
[CV 2/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.542 total time=   0.1s
[CV 3/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.589 total time=   0.1s
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.566 total time=   0.1s
[CV 2/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.560 total time=   0.1s
[CV 3/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.580 total time=   0.1s
[CV 1/3] END max_depth=5, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=80;, score=0.569 total time=   0

## XGBoost

In [410]:
clf = XGBClassifier(verbosity=2)

clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
print(f1_score(y_pred=predictions, y_true=y_test))

0.6464646464646464
