# Inspect prediction results from a Random Forest classifier

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sn
import pickle
import glob
import matplotlib.pyplot as plt
from plotnine import *
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, recall_score
import read_settings

Set input and output directories

In [None]:
# Load the global settings and the Random Forest settings
global_settings = read_settings.check_global()
instrument = global_settings['input_data']['instrument']
rf_settings = read_settings.check_rf()
use_weights = rf_settings['use_weights']

# Input data live in a per-instrument directory
data_dir = os.path.join('data', instrument)

# Training output directories matching the weighting mode ('w' / 'nw') and
# the instrument, sorted by name
weight_tag = 'w' if use_weights else 'nw'
output_pattern = os.path.join('output', '_'.join(['rf', weight_tag, instrument, '*']))
output_dirs = sorted(glob.glob(output_pattern))
output_dirs

Choose output directory to inspect

In [None]:
# Inspect the last directory of the sorted list. Fail with an explicit message
# when no directory matched, instead of a bare IndexError.
if not output_dirs:
    raise FileNotFoundError(
        "No Random Forest output directory matched the search pattern under 'output'."
    )
output_dir = output_dirs[-1]

Glimpse at settings

In [None]:
# Load the settings stored in the selected output directory
settings_path = os.path.join(output_dir, 'settings.pickle')
with open(settings_path, 'rb') as fh:
    settings = pickle.load(fh)
settings

## Input data

In [None]:
# Dataset composition table, indexed by the 'classif_id' column
df_comp_path = os.path.join(output_dir, 'df_comp.csv')
df_comp = pd.read_csv(df_comp_path).set_index('classif_id')

In [None]:
# Stacked bar chart of the dataset composition, one bar per row of df_comp.
# DataFrame.plot creates its own figure when no `ax` is given, so no separate
# plt.figure() call is made here: it would only leave an empty extra figure.
ax = df_comp.plot.bar(stacked=True, figsize=(16, 8), fontsize=14)
ax.set_xlabel("Classes", fontsize=14)
ax.set_ylabel("Image number", fontsize=14)
ax.legend(loc="best")
ax.set_title("Dataset composition for RF", fontsize=16)
plt.show()

## Training

Read training file

In [None]:
# Load the pickled training results and wrap them in a DataFrame
train_results_path = os.path.join(output_dir, 'train_results.pickle')
with open(train_results_path, 'rb') as fh:
    train_results = pd.DataFrame(pickle.load(fh))

Plot gridsearch results

In [None]:
# Metric used to evaluate the grid search, as recorded in the run settings
eval_metric = settings['rf_settings']['grid_search']['eval_metric']

# Validation metric against max_features, coloured by n_estimators,
# with one panel per min_samples_leaf value
valid_metric_col = 'valid_' + eval_metric
gridsearch_plot = (
    ggplot(train_results)
    + geom_point(aes(x='max_features', y=valid_metric_col, colour='factor(n_estimators)'))
    + facet_wrap('~min_samples_leaf', labeller='label_both')
    + labs(colour='n_estimators', title='Gridsearch results')
)
gridsearch_plot

In [None]:
# Log loss is better when smaller; every other metric is better when larger
if eval_metric == 'log_loss':
    select_best = train_results.nsmallest
else:
    select_best = train_results.nlargest

# Keep the single best row and drop the validation score columns,
# leaving only the remaining (parameter) columns
best_row = select_best(1, 'valid_' + eval_metric).reset_index(drop=True)
score_columns = ['valid_accuracy', 'valid_balanced_accuracy', 'valid_log_loss']
best_params = best_row.drop(score_columns, axis=1).iloc[0].to_dict()
best_params

## Testing

Read test file

In [None]:
# Load the pickled test results
test_results_path = os.path.join(output_dir, 'test_results.pickle')
with open(test_results_path, 'rb') as fh:
    test_results = pickle.load(fh)

# Class lists (the `_g` suffix marks the values after regrouping classes)
classes = test_results.get('classes')
classes_g = test_results.get('classes_g')
eco_rev_classes = test_results.get('eco_rev_classes')
eco_rev_classes_g = test_results.get('eco_rev_classes_g')

# True and predicted labels
true_classes = test_results.get('true_classes')
predicted_classes = test_results.get('predicted_classes')
true_classes_g = test_results.get('true_classes_g')
predicted_classes_g = test_results.get('predicted_classes_g')

# Scores on the original classes
accuracy = test_results.get('accuracy')
balanced_accuracy = test_results.get('balanced_accuracy')
eco_rev_precision = test_results.get('eco_rev_precision')
eco_rev_recall = test_results.get('eco_rev_recall')

# Scores on the regrouped classes
accuracy_g = test_results.get('accuracy_g')
balanced_accuracy_g = test_results.get('balanced_accuracy_g')
eco_rev_precision_g = test_results.get('eco_rev_precision_g')
eco_rev_recall_g = test_results.get('eco_rev_recall_g')

### Accuracy, precision and recall scores

In [None]:
# Report the test scores, one per line
print(
    f'Accuracy score is {accuracy}',
    f'Balanced accuracy score is {balanced_accuracy}',
    f'Weighted ecologically relevant precision score is {eco_rev_precision}',
    f'Weighted ecologically relevant recall is {eco_rev_recall}',
    sep='\n',
)

### Confusion matrix

Plot a confusion matrix

In [None]:
# Confusion matrix normalised over the true labels (rows)
cm = confusion_matrix(true_classes, predicted_classes, normalize='true')

# Draw it as a greyscale image with a colour scale
plt.figure(figsize=(20, 20))
plt.imshow(cm, cmap='Greys')
plt.colorbar()

# One tick per class on both axes
# NOTE(review): assumes `classes` is in the same order as the matrix rows/columns - confirm
tick_marks = np.arange(len(classes))
plt.yticks(tick_marks, classes, fontsize=14)
plt.xticks(tick_marks, classes, rotation=90, fontsize=14)

# Axis labels and title
plt.xlabel('Predicted label', fontsize=14)
plt.ylabel('True label', fontsize=14)
plt.title("Confusion matrix for RF", fontsize=20)
plt.show()

### Classification report

Plot a classification report

In [None]:
# Classification report as a dictionary
report = classification_report(true_classes, predicted_classes, output_dict=True)

# Row labels for the heatmap: class names followed by the three summary entries
# NOTE(review): assumes `classes` is in the same order as the report rows - confirm
annot = [*map(str, classes), "accuracy", "macro avg", "weighted avg"]

# One row per report entry; the 'support' column is not plotted
df_report = pd.DataFrame(report).T.drop(columns='support')

# Heatmap of the scores on a fixed 0-1 colour scale
plt.figure(figsize=(8, 15))
sn.heatmap(df_report, annot=True, vmin=0, vmax=1.0, yticklabels=annot, cmap="Greys")
plt.title("Classification report for RF", fontsize=16)
plt.show()

### Accuracy, precision and recall scores after regrouping classes

In [None]:
# Report the test scores computed after regrouping classes, one per line
print(
    f'Grouped accuracy score is {accuracy_g}',
    f'Grouped balanced accuracy score is {balanced_accuracy_g}',
    f'Grouped weighted ecologically relevant precision score is {eco_rev_precision_g}',
    f'Grouped weighted ecologically relevant recall is {eco_rev_recall_g}',
    sep='\n',
)

### Confusion matrix

Plot a confusion matrix for the regrouped classes

In [None]:
# Create the confusion matrix for the regrouped classes,
# normalised over the true labels (rows)
cm_g = confusion_matrix(true_classes_g, predicted_classes_g, normalize='true')

# Plot it as a greyscale image with one tick per grouped class on both axes
# NOTE(review): assumes `classes_g` is in the same order as the matrix rows/columns - confirm
plt.figure(figsize=(20,20))
plt.imshow(cm_g, cmap='Greys')
plt.colorbar()
tick_marks = np.arange(len(classes_g))
plt.xticks(tick_marks, classes_g, rotation=90, fontsize=14)
plt.yticks(tick_marks, classes_g, fontsize=14)
plt.ylabel('True label', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
# This notebook inspects a Random Forest, so the title names RF (it previously said CNN)
plt.title("Confusion matrix for RF after grouping ecological classes", fontsize=20)
plt.show()

### Classification report

Plot a classification report for the regrouped classes

In [None]:
# Create the classification report for the regrouped classes
report = classification_report(true_classes_g, predicted_classes_g, output_dict=True)

# List annotations for figure: grouped class names followed by the summary entries
# NOTE(review): assumes `classes_g` is in the same order as the report rows - confirm
annot = [str(x) for x in classes_g]
annot.extend(("accuracy", "macro avg", "weighted avg"))

# Convert report to dataframe (one row per report entry, 'support' column dropped)
df_report = pd.DataFrame(report).transpose().drop('support', axis=1)

# Plot figure as a heatmap on a fixed 0-1 colour scale
plt.figure(figsize = (8,15))
sn.heatmap(df_report, annot=True, vmin=0, vmax=1.0,  yticklabels = annot, cmap="Greys")
# This notebook inspects a Random Forest, so the title names RF (it previously said CNN)
plt.title("Classification report for RF after grouping ecological classes", fontsize=16)
plt.show()