In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc  # Garbage collector interface


In [None]:

import rasterio
import pandas as pd


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score,
                             precision_score, recall_score, f1_score, roc_auc_score,
                             precision_recall_curve, roc_curve, auc)
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV)

from imblearn.ensemble import BalancedRandomForestClassifier
import joblib
from joblib import dump
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
from scipy.stats import skew

In [None]:
# Directory holding the training rasters (the feature GeoTIFFs plus the label raster).
# The default is the original machine-specific location; set the
# DEFORESTATION_FEATURES_DIR environment variable to run the notebook on another
# machine without editing this cell.
FEATURES_DIR = os.environ.get(
    "DEFORESTATION_FEATURES_DIR",
    r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\training",
)
# File name of the label (target) raster; it must not be loaded as a feature.
EXCLUDE_FILE = 'train_binary_deforestation_raster.tif'


In [None]:
def read_tiff_image(file_path):
    """Return band 1 of the raster stored at ``file_path``.

    The file is opened with ``rasterio.open`` and closed again when the
    ``with`` block exits; the value returned is whatever ``read(1)`` yields.
    """
    with rasterio.open(file_path) as dataset:
        first_band = dataset.read(1)
    return first_band

# Paths of the feature rasters, one per column of the feature matrix.
# - Only GeoTIFF files are kept, so stray non-raster files in the folder are not
#   handed to rasterio, and the label raster is excluded.
# - The list is sorted because os.listdir() returns entries in arbitrary order,
#   while the column order built from this list must be identical on every run.
# NOTE(review): any hard-coded feature label list must follow this sorted order.
feature_files = sorted(
    os.path.join(FEATURES_DIR, file_name)
    for file_name in os.listdir(FEATURES_DIR)
    if file_name != EXCLUDE_FILE
    and file_name.lower().endswith(('.tif', '.tiff'))
)

# Load band 1 of every feature raster, in the same order as feature_files
feature_data_arrays = list(map(read_tiff_image, feature_files))

In [None]:
# Collapse every band to one dimension (flatten() returns a copy), then release
# the original arrays since only the flattened copies are used from here on.
feature_data_flat = list(map(lambda band: band.flatten(), feature_data_arrays))
del feature_data_arrays



In [None]:
# Display the resolved feature raster paths. Their order is the column order of
# the feature matrix that is stacked from these rasters.
feature_files

In [None]:

# Path to the label (target) raster. It is built from EXCLUDE_FILE so that the file
# skipped when collecting features and the file read as the target cannot drift apart.
y_file = os.path.join(FEATURES_DIR, EXCLUDE_FILE)

In [None]:

# Sentinel that marks pixels without data.
# NOTE(review): assumes every feature raster and the label raster use -1 as their
# NoData value — confirm against the files.
no_data_value = -1

# Build the (n_pixels, n_features) matrix: one column per flattened feature raster.
X_flat = np.column_stack(feature_data_flat)

# The per-raster 1-D arrays were copied into X_flat and are no longer needed.
del feature_data_flat

# Read the label raster and flatten it so that it has one entry per row of X_flat.
# y_file is intentionally not deleted: it is only a short path string, and deleting
# it makes the label raster unreachable for any later cell.
y = read_tiff_image(y_file).flatten()

# True for rows of X_flat in which no feature equals the NoData value.
valid_rows_X = ~(X_flat == no_data_value).any(axis=1)

# True for pixels whose label is not the NoData value.
valid_rows_y = y != no_data_value

# A pixel is usable only when both its features and its label are valid.
valid_rows = valid_rows_X & valid_rows_y
del valid_rows_X
del valid_rows_y

# Keep only the usable pixels.
X_cleaned = X_flat[valid_rows]
y_cleaned = y[valid_rows]

# Free the large intermediates. valid_rows is intentionally kept: the prediction
# cell indexes the output raster with it to put the probabilities back on the grid.
del X_flat
del y


In [None]:
# Names of the columns of X_cleaned, in column order.
# NOTE(review): this order is assumed to match the order of feature_files — confirm.
feature_labels = ['CITIES', 'GRUPO', 'PORTS', 'PRECIPITATION', 'RIVER', 'ROAD', 'SOIL']

# Guard against a silent mislabelling when the folder content changes.
if X_cleaned.shape[1] != len(feature_labels):
    raise ValueError(
        f"X_cleaned has {X_cleaned.shape[1]} columns but {len(feature_labels)} labels were given"
    )

# The transform below writes float results back into X_cleaned. With an integer
# matrix that assignment would silently truncate the log1p() output, so promote
# to a floating dtype first (the data is cast to float32 after the split anyway).
if not np.issubdtype(X_cleaned.dtype, np.floating):
    X_cleaned = X_cleaned.astype(np.float32)

# log1p-transform the RIVER, CITIES and ROAD columns.
# NOTE: X_cleaned is modified in place, so re-running this cell applies the
# transform a second time.
for label in ('RIVER', 'CITIES', 'ROAD'):
    column = feature_labels.index(label)
    X_cleaned[:, column] = np.log1p(X_cleaned[:, column])

del feature_labels

In [None]:
# Stratified split, so both parts keep the class ratio of y_cleaned.
# NOTE(review): test_size=0.9 leaves only 10% of the pixels for training and the
# hyper-parameter search while 90% are held out — confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned, y_cleaned, test_size=0.9, random_state=42, stratify=y_cleaned
)
# X_cleaned is intentionally NOT deleted: the full-raster prediction cell calls
# best_model.predict_proba(X_cleaned), which raises NameError if it is removed here.
del y_cleaned


In [None]:
# Cast features to float32 and labels to int32
feature_dtype, label_dtype = np.float32, np.int32

X_train = X_train.astype(feature_dtype)
X_test = X_test.astype(feature_dtype)
y_train = y_train.astype(label_dtype)
y_test = y_test.astype(label_dtype)


In [None]:




# Earlier cells deleted several large arrays; force a collection pass now to free
# unused memory. The return value of gc.collect() is shown as the cell output.
gc.collect()


In [None]:
# Template estimator handed to the hyper-parameter search.
# NOTE(review): confirm that sampling_strategy='not majority' together with
# class_weight='balanced' is the intended way of handling the class imbalance.
brfc = BalancedRandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    sampling_strategy='not majority',
)

# Candidate values for each tuned hyper-parameter
param_grid = dict(
    n_estimators=[50, 100, 200],     # number of trees in the forest
    max_depth=[None, 5, 10, 20],     # maximum depth of a tree
    min_samples_split=[2, 5, 10],    # minimum samples required to split an internal node
    min_samples_leaf=[1, 2, 4],      # minimum samples required at a leaf node
    max_features=['sqrt'],           # features considered when looking for the best split
)

# Metrics recorded for every candidate; each key is also the scorer name
scoring = {metric: metric for metric in ('precision', 'recall', 'f1', 'roc_auc')}

# Stratified K-Fold yields train/test indices whose class frequencies stay close
# to those of the full data set, which is what an imbalanced problem needs.
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomised hyper-parameter search: 10 sampled settings, each evaluated with the
# stratified 5-fold splitter on every metric in `scoring`. The setting with the best
# mean F1 is refit and exposed as `best_estimator_`.
# (The unused GridSearchCV variant that was kept here as a string literal has been
# removed: as the last expression of the cell it was echoed as cell output.)
random_search = RandomizedSearchCV(
    estimator=brfc,
    param_distributions=param_grid,
    scoring=scoring,
    refit='f1',        # model selection criterion: maximise the F1 score
    cv=strat_kfold,
    n_jobs=1,
    verbose=0,
    n_iter=10,         # number of parameter settings that are sampled
    random_state=42,   # for reproducibility
)

In [None]:
# Run the randomised search on the training split.
# As the last expression of the cell, the fitted search object is displayed.
random_search.fit(X_train, y_train)

In [None]:
# Persist the whole fitted search object with joblib; the relative file name means
# it is written to the current working directory.
joblib.dump(random_search, 'random_search_results.pkl')


In [None]:
# List the names present on the search *instance* but not on the RandomizedSearchCV
# class itself, i.e. instance-level attributes such as constructor arguments and
# the results stored by fit().
class_level_names = set(dir(RandomizedSearchCV))  # computed once instead of per attribute
all_attributes_methods = dir(random_search)

specific_attributes_methods = [
    attribute for attribute in all_attributes_methods
    if attribute not in class_level_names
]

# The object inspected here is a RandomizedSearchCV, not a GridSearchCV.
print("Attributes and methods specific to the fitted RandomizedSearchCV:")
for attr in specific_attributes_methods:
    print(attr)

In [None]:
def is_fitted(estimator):
    """Report whether ``estimator`` exposes an ``estimators_`` attribute.

    Returns True when looking the attribute up succeeds and False when the
    lookup raises AttributeError.
    """
    return hasattr(estimator, "estimators_")

# Check the model that was actually trained. `brfc` is only the template passed to
# the search (the search fits clones of it), so checking `brfc` always printed False.
print(is_fitted(random_search.best_estimator_))

In [None]:
# NOTE(review): this only displays the bound `score` method; no score is computed.
# If a number is wanted, the method has to be called with data — confirm the intent.
random_search.score


In [None]:
# Pull the search outcome into plain variables for the reporting cells below
best_estimator = random_search.best_estimator_
best_params, best_score = random_search.best_params_, random_search.best_score_

# Per-candidate cross-validation results, raw and as a DataFrame
cv_results = random_search.cv_results_
cv_results_df = pd.DataFrame(cv_results)

# Scorer(s) used by the search and the time spent refitting the best model
scorer = random_search.scorer_
refit_time = random_search.refit_time_

In [None]:
# Print every collected search result as "<label> <value>", one per line
for label, value in (
    ("Best parameters:", best_params),
    ("Best cross-validation score:", best_score),
    ("Best estimator:", best_estimator),
    ("CV Results:", cv_results_df),
    ("Scorer function:", scorer),
    ("Refit time (seconds):", refit_time),
):
    print(label, value)

In [None]:
# Alias for the refit best estimator; the prediction cells below use this name.
best_model = random_search.best_estimator_


In [None]:
# Class predictions of the tuned model for the held-out (test) split
y_pred = best_model.predict(X_test)

In [None]:
# Score the held-out predictions: overall accuracy, support-weighted F1 and the
# per-class classification report (all three are reused by the report cell).
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1-score:", f1)
print("Classification report:\n", report)

In [None]:
# Plot the confusion matrix of the test split from the true and predicted labels
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

In [None]:
# Class predictions for the training split. The model was fitted on these rows,
# so metrics computed from them are useful for spotting over-fitting only.
y_pred_train = best_model.predict(X_train)

In [None]:
# Training-split diagnostics: confusion matrix and per-class report
train_cm, train_cr = (
    confusion_matrix(y_train, y_pred_train),
    classification_report(y_train, y_pred_train),
)

for heading, content in (
    ("Training confusion matrix:", train_cm),
    ("Training classification report:", train_cr),
):
    print(heading)
    print(content)

In [None]:
# Confusion matrix of the tuned model on the held-out split.
# `brfc` is only the template given to the search and is never fitted itself, so it
# cannot predict; the refit best estimator (`best_model`) must be used here.
disp = ConfusionMatrixDisplay.from_estimator(
    best_model,
    X_test,
    y_test,
    cmap=plt.cm.Blues,
)

title = disp.ax_.set_title("Confusion matrix")

print(title)
print(disp.confusion_matrix)

plt.show()

In [None]:
# Calculate feature importances and the standard deviation of those importances
# Importances of the refit model and their spread across the individual trees
importances = best_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in best_model.estimators_], axis=0)

# Take the feature names from the raster file names instead of a hand-written list.
# The feature matrix has one column per entry of feature_files, in that order, so
# these names are guaranteed to line up with the importances; a separately
# maintained list in a different order would label the bars incorrectly.
feature_names = [os.path.splitext(os.path.basename(path))[0] for path in feature_files]
if len(feature_names) != len(importances):
    raise ValueError(
        f"{len(feature_names)} feature files but {len(importances)} feature importances"
    )

# (name, importance, std) tuples, most important feature first
sorted_features = sorted(
    zip(feature_names, importances, std), key=lambda item: item[1], reverse=True
)

# Create a bar chart with the per-tree standard deviation as error bars
fig, ax = plt.subplots()
positions = list(range(len(sorted_features)))
ax.bar(
    positions,
    [item[1] for item in sorted_features],
    yerr=[item[2] for item in sorted_features],
)

# Tick positions must be fixed before the labels are assigned, otherwise the
# labels are attached to whatever default ticks the axes currently has.
ax.set_xticks(positions)
ax.set_xticklabels([item[0] for item in sorted_features], rotation=45, ha='right')

# Set the title and labels for the chart
ax.set_title('Feature Importances')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')

# Display the chart
plt.tight_layout()
plt.show()

In [None]:
# Second column of predict_proba for the test split.
# NOTE(review): presumably the probability of the positive (deforestation) class;
# this depends on the ordering in best_model.classes_ — confirm.
y_proba_curve = best_model.predict_proba(X_test)[:, 1]


In [None]:
# Sanity check: there should be one probability per test row
print("Shape of y_proba_curve:", y_proba_curve.shape)


In [None]:
def _show_curve(x_values, y_values, x_label, y_label):
    """Draw one curve labelled 'Random Forest' with the given axis labels and show it."""
    plt.plot(x_values, y_values, marker='.', label='Random Forest')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


# Precision-Recall curve and the area under it
precision, recall, _ = precision_recall_curve(y_test, y_proba_curve)
_show_curve(recall, precision, 'Recall', 'Precision')

print(f"Area under Precision-Recall curve: {auc(recall, precision)}")

# ROC curve and the area under it
fpr, tpr, _ = roc_curve(y_test, y_proba_curve)
_show_curve(fpr, tpr, 'False Positive Rate', 'True Positive Rate')

print(f"Area under ROC curve: {auc(fpr, tpr)}")


In [None]:
# Predict deforestation probabilities for every valid (non-NoData) pixel, i.e. for
# the training and test rows together.
# NOTE: X_cleaned must still be bound when this cell runs.
y_proba = best_model.predict_proba(X_cleaned)[:, 1]

In [None]:

# Put the predicted probabilities back onto the full raster grid; pixels that were
# dropped as NoData keep the NoData value.
# The grid size is read from the metadata of the first feature raster, so this cell
# does not depend on the raw band arrays or the label array, which earlier cells
# delete to save memory.
with rasterio.open(feature_files[0]) as src:
    raster_shape = (src.height, src.width)

prob_raster = np.full(raster_shape[0] * raster_shape[1], no_data_value, dtype=np.float32)
# valid_rows is the boolean mask (one entry per pixel) of the rows kept in X_cleaned
prob_raster[valid_rows] = y_proba
prob_raster = prob_raster.reshape(raster_shape)

In [None]:
# Sanity check: number of predicted probabilities (one per row of X_cleaned)
print(y_proba.shape)

In [None]:
# Persist each result object to its own joblib file in the working directory.
# Every artifact is saved independently: previously one object that failed to
# serialise aborted all remaining dumps, and the message did not say which one.
artifacts = {
    'best_params.pkl': best_params,
    'best_score.pkl': best_score,
    'best_model.pkl': best_model,
    'cv_results.pkl': cv_results,
    'cv_results_df.pkl': cv_results_df,
    'scorer.pkl': scorer,
    'refit_time.pkl': refit_time,
    'report.pkl': report,
}

failed_artifacts = []
for filename, artifact in artifacts.items():
    try:
        joblib.dump(artifact, filename)
    except Exception as e:
        # Best effort: report the failure and continue with the next artifact.
        failed_artifacts.append(filename)
        print(f"An error occurred while saving {filename}: {e}")

if failed_artifacts:
    print(f"Artifacts that could not be saved: {', '.join(failed_artifacts)}")

In [None]:
# Save the probability raster as a single-band float32 GeoTIFF.

# `output_folder` is not defined in any visible cell. Use an already defined value
# when there is one, otherwise fall back to an "output" folder in the working
# directory. NOTE(review): adjust the default to the intended location.
output_folder = globals().get("output_folder", "output")
os.makedirs(output_folder, exist_ok=True)

output_file = os.path.join(output_folder, "brfc-df-prediction-feature.tiff")

# Copy the georeferencing profile from the label raster. Its path is rebuilt from
# the configuration constants so this cell does not rely on a temporary variable
# of the data-loading cells.
label_raster_path = os.path.join(FEATURES_DIR, EXCLUDE_FILE)
with rasterio.open(label_raster_path) as src:
    profile = src.profile

# One float32 band; pixels without a prediction hold no_data_value, so declare it
# as the NoData value of the output file.
profile.update(dtype=rasterio.float32, count=1, nodata=no_data_value)

with rasterio.open(output_file, 'w', **profile) as dst:
    # prob_raster is already 2-D (rows, cols); write it as band 1
    dst.write(prob_raster, 1)

In [None]:
# Build the text report from the metrics computed above and write it to a Quarto file.
# Changes to the report text:
# - the tuned-parameter list names 'max_features' (which is in param_grid) instead
#   of 'bootstrap' (which was never tuned);
# - the confusion-matrix sentence is filled from train_cm instead of printing the
#   literal placeholders "[1,1]", "[2,2]", ...;
# - typos and stray ":," separators are fixed.
# NOTE(review): the confusion-matrix sentence assumes a binary problem, i.e. that
# train_cm is 2x2 with rows = true class and columns = predicted class — confirm.
# NOTE(review): the qualitative wording in "Summary" is static text and is not
# derived from the metrics.
model_report = f'''

Balanced Random Forest Classifier Model Report

# Summary

The Balanced Random Forest Classifier performed reasonably well on this task,
with an accuracy of  {accuracy} and an F1-score of {f1}.
However, there is room for improvement, particularly in the precision and recall for class 1.
Future work could explore different models, additional feature engineering, or further hyperparameter tuning to improve performance.

# Model Selection

We chose to use a Balanced Random Forest Classifier for this task.
This model is an ensemble method that combines the predictions of several base estimators
built with a given learning algorithm in order to improve generalizability and robustness over a single estimator.
It also handles imbalanced classes, which is a common problem in many machine learning tasks.

Hyperparameter Tuning
We used RandomizedSearchCV for hyperparameter tuning.
This method performs a random search on hyperparameters, which is more efficient than an exhaustive search like GridSearchCV.

The hyperparameters we tuned were:

'n_estimators': The number of trees in the forest.
'max_depth': The maximum depth of the tree.
'min_samples_split': The minimum number of samples required to split a node.
'min_samples_leaf': The minimum number of samples required at a leaf node.
'max_features': The number of features considered when looking for the best split.

{param_grid}

# Model Performance
The best parameters found by RandomizedSearchCV were:

Best parameters: {best_params}



With these parameters, the model achieved the following performance metrics:
Best cross-validation score: {best_score}
Best model: {best_estimator}
Scorer function: {scorer}
Refit time (seconds): {refit_time}
Accuracy: {accuracy}
F1-score: {f1}

# Testing Data

Classification report:

{report}

#  TRAINING DATA Classification Report-Confusion Matrix

Training confusion matrix:

{train_cm}

Training classification report:

{train_cr}


This indicates that the model correctly classified {train_cm[0, 0]} instances of class 0
and {train_cm[1, 1]} instances of class 1,

while misclassifying {train_cm[0, 1]} instances of class 0 and {train_cm[1, 0]} instances of class 1.

CV Results:
{cv_results_df}

'''
# Write the report to a Quarto markdown file. The encoding is explicit so the
# result does not depend on the platform's default code page.
with open('model_report.qmd', 'w', encoding='utf-8') as f:
    f.write(model_report)