# Random Forest baseline for tile-wise classification

In [None]:
import torch
import torchvision
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

In [None]:
# Load the pre-computed training feature tensor and expose it as a float32
# DataFrame with one named column per feature/label.
train_features = pd.DataFrame(
    data=torch.load('../data/processed/train_features.pt').numpy().astype(np.float32),
    columns=[
        "y", "x", "1y", "5y", "10y", "urban", "slope",
        "landuse", "soil", "current", "target",
    ],
)

In [None]:
feature_names = ["y", "x", "1y", "5y", "10y", "urban", "slope"]

# Log-transform the five non-coordinate features after shifting them by 10
# (presumably to keep the logarithm's argument positive -- confirm the value
# range of these columns).
shifted = train_features[["1y", "5y", "10y", "urban", "slope"]] + 10
train_features[["1y", "5y", "10y", "urban", "slope"]] = np.log(shifted)

# Standardise every model input with the training-set statistics. `mean` and
# `std` stay at module level because the validation cell reuses them.
# ddof=0 gives the population standard deviation, as np.std does.
mean = train_features[feature_names].mean(axis=0)
std = train_features[feature_names].std(axis=0, ddof=0)
train_features[feature_names] = train_features[feature_names].sub(mean).div(std)

In [None]:
# Replace the stored target with an aggregated one: an entry is positive when
# at least one pixel of the central input_px x input_px crop has the value 4
# in the last channel (presumably the target class code -- confirm).
input_px = 50
train_layers = torch.load("../data/processed/train_layers.pt")
train_layers = torchvision.transforms.functional.center_crop(train_layers, (input_px, input_px))
target_layers = train_layers[:, -1, :, :] == 4
del train_layers  # release the full layer stack as early as possible
target = torch.count_nonzero(target_layers, dim=(1, 2)) > 0
del target_layers

# The aggregated target is boolean, so the former replace(2, 0) / replace(4, 1)
# recoding could never match a value; it was dead code and has been removed.
# Converting explicitly to NumPy guarantees a plain boolean column.
train_features["target"] = target.numpy()

X_train = train_features[feature_names]
y_train = train_features["target"]

In [None]:
# Display summary statistics of every column of the training frame.
train_features.describe()

In [None]:
# Load the validation features into a float32 DataFrame with the same column
# layout as the training frame.
val_features = torch.load('../data/processed/val_features.pt')
val_features = pd.DataFrame(val_features.numpy().astype(np.float32),
                            columns=["y", "x", "1y", "5y", "10y", "urban", "slope",
                                     "landuse", "soil", "current", "target"])

# Apply the same log(x + 10) transform as for the training data, then
# standardise with the statistics computed on the training set (`mean`, `std`)
# so both splits share one scale.
val_features[["1y", "5y", "10y", "urban", "slope"]] = np.log(val_features[["1y", "5y", "10y", "urban", "slope"]] + 10)
val_features[feature_names] = (val_features[feature_names] - mean) / std

# Replace the stored target with the aggregated one: positive when at least one
# pixel of the central input_px x input_px crop has the value 4 in the last
# channel (presumably the target class code -- confirm).
val_layers = torch.load("../data/processed/val_layers.pt")
val_layers = torchvision.transforms.functional.center_crop(val_layers, (input_px, input_px))
target = torch.count_nonzero(val_layers[:, -1, :, :] == 4, dim=(1, 2)) > 0
del val_layers  # release the layer stack once the target is derived

# The aggregated target is boolean, so the former replace(2, 0) / replace(4, 1)
# recoding could never match a value; it was dead code and has been removed.
# Converting explicitly to NumPy guarantees a plain boolean column.
val_features["target"] = target.numpy()

X_val = val_features[feature_names]
y_val = val_features["target"]

In [None]:
# Full grid that can be searched:
# weight_target_class_arr = [0.5,0.6,0.7,0.8,0.9,0.99,0.999,0.9999,0.99999]
weight_target_class_arr = [0.5]

# Hyperparameter tuning: for every positive-class weight fit a forest, pick the
# decision threshold that maximises F1 on the validation set, and keep the
# model/threshold pair with the highest F1 for the following cells.
best_f1 = -np.inf
best_clf = None
best_threshold = None
for weight_target_class in weight_target_class_arr:
    class_weights = {0: 1 - weight_target_class,
                     1: weight_target_class}
    candidate = RandomForestClassifier(class_weight=class_weights, random_state=42)
    candidate.fit(X_train, y_train)

    probabilities = candidate.predict_proba(X_val)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_val, probabilities)
    # precision/recall carry one trailing entry without a threshold; drop it.
    # Where precision and recall are both 0 the F1 ratio is 0/0. Define it as
    # 0 there: a NaN would otherwise be selected by np.argmax.
    denominator = precision[:-1] + recall[:-1]
    f1 = np.divide(2 * precision[:-1] * recall[:-1], denominator,
                   out=np.zeros_like(denominator), where=denominator > 0)
    best_idx = np.argmax(f1)
    candidate_threshold = thresholds[best_idx]
    print("Weight target class: ", weight_target_class)
    print("Threshold: ", candidate_threshold)
    print("F1: ", f1[best_idx])
    print("Precision: ", precision[best_idx])
    print("Recall: ", recall[best_idx])

    # Remember the best configuration instead of whichever ran last.
    if f1[best_idx] > best_f1:
        best_f1 = f1[best_idx]
        best_clf = candidate
        best_threshold = candidate_threshold

# Names consumed by the evaluation and plotting cells.
clf = best_clf
threshold = best_threshold


In [None]:
# Binarise column 1 of the forest's validation probabilities with the tuned
# threshold, then report the standard classification metrics.
positive_scores = clf.predict_proba(X_val)[:, 1]
predictions = positive_scores >= threshold # 0.31
for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("F1: ", f1_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(label, metric(y_val, predictions))

In [None]:
# Confusion matrix of the thresholded validation predictions
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_val, y_pred=predictions)

In [None]:
# Fraction of validation rows that are predicted as positive.
np.count_nonzero(predictions) / predictions.size

In [None]:
# Plot precision, recall and F1 on the validation set as a function of the
# decision threshold.
probabilities = clf.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, probabilities)
# The final precision/recall entry has no matching threshold, hence [:-1].
p, r = precision[:-1], recall[:-1]
f1 = 2 * p * r / (p + r)
for curve, curve_label in ((p, "precision"), (r, "recall"), (f1, "f1")):
    plt.plot(thresholds, curve, label=curve_label)
plt.legend()
plt.show()

In [None]:
# Display the positive-class probabilities predicted for the validation rows.
probabilities

# Data statistics

In [None]:
# Pairwise correlation matrix of all columns of the training frame.
corr = train_features.corr()
plt.imshow(corr, cmap="RdYlGn")
plt.colorbar()
plt.show()

# Plot only the last row (the "target" column's correlations) without its
# self-correlation. The strip width and tick positions are derived from the
# data instead of a hard-coded 10, so the plot keeps working when columns are
# added or removed.
target_corr = corr.iloc[-1, :-1]
plt.imshow(target_corr.values.reshape(1, -1))
plt.xticks(np.arange(len(target_corr)), target_corr.index)
plt.colorbar()
plt.show()
corr

In [None]:
# Horizontal bar chart of the forest's feature importances, ordered by
# ascending importance.
importances = clf.feature_importances_
feature_names = ["y", "x", "1y", "5y", "10y", "urban", "slope"]
feature_names_sorted = list(feature_names)

# argsort yields the feature positions from least to most important.
indices = importances.argsort()
bar_positions = range(len(indices))
ranked_labels = [feature_names_sorted[i] for i in indices]

plt.barh(bar_positions, importances[indices], align='center')
plt.yticks(bar_positions, ranked_labels)
plt.xlabel('Relative Importance')
plt.show()