# Assess model performance - collate data using script

C:\Users\robot\code\ml\landsat\collate_validation_metrics.py

In [None]:
!conda info

In [None]:
import os
import pandas as pd
import math
import ast
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython.display import display, Markdown
%matplotlib inline

# Show floats in fixed-point form with 10 decimal places so pandas output
# never switches to scientific notation.
pd.options.display.float_format = lambda value: '%.10f' % value

In [None]:
# Root folder that holds one sub-folder per model run
directory_ = r'H:\biomass'

# Run name; when several assignments are listed, only the last one is used
model_run = "test_train_all04_05"
model_run = "model_all01_02_03_04_05_06_v2"

# Outputs of a run live in "<root>/model_<model_run>"
directory_path = os.path.join(directory_, f"model_{model_run}")

# Read the collated metrics table of that run
metrics_csv_path = os.path.join(directory_path, "total_metrics.csv")
combined_df = pd.read_csv(metrics_csv_path)

In [None]:
# # Directory path to search through
# directory_ = r'H:\biomass'
# model_run = "test_train_mlp"
# model_run = "test_train_all04_05"
# 
# directory_path = os.path.join(directory_, f"{model_run}")
# 
# combined_df = pd.read_csv(os.path.join(directory_path, "total_metrics.csv"))

## Split into groups based on status

In [None]:
# One frame per value of the "status" column
retrain_df = combined_df.loc[combined_df["status"].eq("retrain")]
retest_df = combined_df.loc[combined_df["status"].eq("retest")]

In [None]:
retest_df

# Join retrain and retest results together

In [None]:
# Outer-join the retrain rows (left) with the retest rows (right) on the
# columns that identify a model configuration.  Columns present in both frames
# but not in the key receive the pandas default suffixes "_x" (left) and
# "_y" (right).
df_merged = retrain_df.merge(
    retest_df,
    how='outer',
    on=['mdl', 'model', 'sel_num', 'fac', 'var', 'file', 'stats', 'data', 'stdev'],
)

In [None]:
# Rename the merge suffixes: a trailing "_x" (left frame = retrain) becomes
# "_retrain" and a trailing "_y" (right frame = retest) becomes "_retest".
# The patterns are anchored to the end of the name so a column that merely
# contains "_x" or "_y" somewhere in the middle is left untouched, and
# regex=True is passed explicitly because the default differs between pandas
# versions.
df_merged.columns = (
    df_merged.columns
    .str.replace(r'_x$', '_retrain', regex=True)
    .str.replace(r'_y$', '_retest', regex=True)
)
df_merged

In [None]:
def plot_best_model(df, model_run, model, var, data, std, display_limit):
    """Show the saved plots of the lowest-RMSE runs and export the sorted table.

    ``df`` is sorted by ``rmse_retest`` (ascending).  For each of the first
    ``display_limit`` rows the function prints the run folder, looks for
    ``*retest.JPG`` / ``*retrain.JPG`` images inside it and for a
    ``*retrain_distribution.JPG`` image two directory levels above it, shows
    whatever is found, and prints the retest/retrain metrics and the feature
    list.  Finally the sorted frame is written to a CSV file.

    NOTE: a later cell of this notebook defines ``plot_best_model`` again with
    a six-element return value; once that cell has run it replaces this
    version.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``var``, ``mdl``, ``stats``, ``stdev``, ``data``,
        ``fac``, ``sel_num``, ``rmse_retest`` and ``features_retest``.  The
        remaining ``*_retest`` / ``*_retrain`` metric columns are optional and
        reported as NaN when absent.
    model_run : str
        Run name; part of the image folder and of the CSV file name.
    model, var, data, std
        Only used to build the CSV file name.
    display_limit : int
        Maximum number of rows whose plots and metrics are shown.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the added columns ``file_path`` and
        ``apply_mdl``.  ``apply_mdl`` is filled for the displayed rows only
        and is NaN elsewhere.
    """
    df = df.sort_values(by=['rmse_retest'], ascending=[True])

    # Folder of each run relative to ".../AGB/".
    run_subdir = (
        df['var'].astype(str) + "/" +
        df['mdl'].astype(str) + "/" +
        df['stats'].astype(str) + "/std" +
        # cast to int so that e.g. 2.0 yields "std2"
        df['stdev'].astype(int).astype(str) + "/" +
        df['data'].astype(str) + "/" +
        df['fac'].astype(str) + "/sel_num_" +
        # only the first two characters of sel_num are used in the folder name
        df['sel_num'].astype(str).str[:2]
    )
    df["file_path"] = f"H:/biomass/model_{model_run}/AGB/" + run_subdir

    # Folder name with "/" flattened to "_", i.e. the part of the path after
    # "AGB/".  Built from the components instead of splitting the full path on
    # "AGB" so a run name that itself contains "AGB" cannot shift the split.
    apply_names = run_subdir.str.replace("/", "_", regex=False)

    # The column is written by POSITION inside the loop.  The frame has been
    # sorted (and is usually filtered), so the loop counter is not a valid
    # index label: df.at[i, ...] would hit the wrong row or append a new one.
    df["apply_mdl"] = np.full(len(df), np.nan, dtype=object)
    apply_col = df.columns.get_loc("apply_mdl")

    for i, row in enumerate(df.itertuples(index=False)):

        # Limit the number of displayed image sets to prevent memory overload
        if i >= display_limit:
            print("Display limit reached. Adjust the display limit to see more images.")
            break

        # Metrics of this row; a missing column is reported as NaN
        test_metrics = {
            'R² (retest)': getattr(row, 'r2_retest', np.nan),
            'MSE (retest)': getattr(row, 'mse_retest', np.nan),
            'RMSE (retest)': getattr(row, 'rmse_retest', np.nan),
            'MAE (retest)': getattr(row, 'mae_retest', np.nan),
            'MAPE (retest)': getattr(row, 'mape_retest', np.nan),
            'Bias (retest)': getattr(row, 'bias_retest', np.nan),
            'N (retest)': getattr(row, 'n_retest', np.nan)
        }

        retrain_metrics = {
            'R² (retrain)': getattr(row, 'r2_retrain', np.nan),
            'MSE (retrain)': getattr(row, 'mse_retrain', np.nan),
            'RMSE (retrain)': getattr(row, 'rmse_retrain', np.nan),
            'MAE (retrain)': getattr(row, 'mae_retrain', np.nan),
            'MAPE (retrain)': getattr(row, 'mape_retrain', np.nan),
            'Bias (retrain)': getattr(row, 'bias_retrain', np.nan),
            'N (retrain)': getattr(row, 'n_retrain', np.nan)
        }

        path = row.file_path
        print(f"Path: {path}")

        apply_name = apply_names.iloc[i]
        df.iloc[i, apply_col] = apply_name
        print(f"If using - create directory: {apply_name}")

        # All JPG files of the run folder, then the two plots of interest
        jpg_files = glob.glob(os.path.join(path, "*.JPG"))
        retest_files = [file for file in jpg_files if file.endswith("retest.JPG")]
        retrain_files = [file for file in jpg_files if file.endswith("retrain.JPG")]

        # Log file paths to check which files are being imported
        if retest_files:
            print(f" - Retest Image found: {retest_files[0]}")
        else:
            print("No Retest Image found.")
            print("ERROR --"*50)

        if retrain_files:
            print(f" - retrain Image found: {retrain_files[0]}")
        else:
            print("No retrain Image found.")
            print("ERROR --"*50)

        # The distribution plot is stored two directory levels above the run folder
        distribution_dir = os.path.abspath(os.path.join(path, "../.."))
        distribution_files = glob.glob(
            os.path.join(distribution_dir, "*retrain_distribution.JPG"))

        if distribution_files:
            fig, ax = plt.subplots(figsize=(10, 8), dpi=150)
            ax.imshow(mpimg.imread(distribution_files[0]))
            ax.set_title('retrain Distribution Plot', fontsize=16)
            ax.axis('off')
            plt.tight_layout()
            plt.show()
            # Release the figure; otherwise one figure per row stays open
            plt.close(fig)

        # Retest and retrain plots side by side; a missing one leaves a blank panel
        if retest_files or retrain_files:
            fig, axes = plt.subplots(1, 2, figsize=(12, 6), dpi=150)

            if retest_files:
                axes[0].imshow(mpimg.imread(retest_files[0]))
                axes[0].set_title('Retest Image', fontsize=16)
            axes[0].axis('off')

            if retrain_files:
                axes[1].imshow(mpimg.imread(retrain_files[0]))
                axes[1].set_title('retrain Image', fontsize=16)
            axes[1].axis('off')

            plt.tight_layout(pad=2.0)  # Add padding between images
            plt.show()
            plt.close(fig)

            print("-" * 100)
            print(f"Test Metrics vs retrain Metrics for row {i}:")
            # Both dictionaries list the metrics in the same order
            for (test_name, test_value), (retrain_name, retrain_value) in zip(
                    test_metrics.items(), retrain_metrics.items()):
                print(f"{test_name}: {test_value}  |  {retrain_name}: {retrain_value}")
            print("="*100)

        # Print the relevant features
        print(f"Features Used: {row.features_retest}")
        print("-" * 100)

    # Save the updated DataFrame, then report where it went
    export_path = r"C:\Users\robot\code\pipelines\apply_biomass\{0}_{1}_{2}_{3}_{4}_overall_best.csv".format(
        model_run, model, var, data, std)
    df.to_csv(export_path)
    print("exported to: ", export_path)

    return df

In [None]:
def plot_best_model(df, model_run, model, var, data, std, display_limit):
    """Show the saved plots and a metrics table for the lowest-RMSE runs.

    ``df`` is sorted by ``rmse_retest`` (ascending).  For each of the first
    ``display_limit`` rows the function prints the run folder, looks for
    ``*retest.JPG`` / ``*retrain.JPG`` images inside it and for a
    ``*retrain_distribution.JPG`` image two directory levels above it, shows
    whatever is found, displays a retest-vs-retrain metrics table and prints
    the feature list.

    Nothing is written to disk here; the caller is expected to save the
    returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``var``, ``mdl``, ``stats``, ``stdev``, ``data``,
        ``fac``, ``sel_num``, ``rmse_retest`` and ``features_retest``.  The
        remaining ``*_retest`` / ``*_retrain`` metric columns are optional and
        reported as NaN when absent.
    model_run : str
        Run name; part of the image folder path.
    model, var, data, std
        Labels that are passed through unchanged in the return value and used
        in the suggested export file name.
    display_limit : int
        Maximum number of rows whose plots and metrics are shown.

    Returns
    -------
    tuple
        ``(df, model_run, model, var, data, std)`` where ``df`` is the sorted
        copy of the input with the added columns ``file_path`` and
        ``apply_mdl``.  ``apply_mdl`` is filled for the displayed rows only
        and is NaN elsewhere.
    """
    # Imported once per call (not once per row).  Outside IPython the table is
    # simply printed.
    import pandas as pd
    try:
        from IPython.display import display
    except ImportError:
        display = print

    # Sort the DataFrame
    df = df.sort_values(by=['rmse_retest'], ascending=[True])

    # Folder of each run relative to ".../AGB/".
    run_subdir = (
        df['var'].astype(str) + "/" +
        df['mdl'].astype(str) + "/" +
        df['stats'].astype(str) + "/std" +
        # cast to int so that e.g. 2.0 yields "std2"
        df['stdev'].astype(int).astype(str) + "/" +
        df['data'].astype(str) + "/" +
        df['fac'].astype(str) + "/sel_num_" +
        # only the first two characters of sel_num are used in the folder name
        df['sel_num'].astype(str).str[:2]
    )
    df["file_path"] = f"H:/biomass/model_{model_run}/AGB/" + run_subdir

    # Folder name with "/" flattened to "_", i.e. the part of the path after
    # "AGB/".  Built from the components instead of splitting the full path on
    # "AGB" so a run name that itself contains "AGB" cannot shift the split.
    apply_names = run_subdir.str.replace("/", "_", regex=False)

    # The column is written by POSITION inside the loop.  The frame has been
    # sorted (and is usually filtered), so the loop counter is not a valid
    # index label: df.at[i, ...] would hit the wrong row or append a new one.
    df["apply_mdl"] = np.full(len(df), np.nan, dtype=object)
    apply_col = df.columns.get_loc("apply_mdl")

    for i, row in enumerate(df.itertuples(index=False)):

        # Limit the number of displayed image sets to prevent memory overload
        if i >= display_limit:
            print("Display limit reached. Adjust the display limit to see more images.")
            break

        # Metrics of this row; a missing column is reported as NaN
        test_metrics = {
            'R² (retest)': getattr(row, 'r2_retest', np.nan),
            'MSE (retest)': getattr(row, 'mse_retest', np.nan),
            'RMSE (retest)': getattr(row, 'rmse_retest', np.nan),
            'MAE (retest)': getattr(row, 'mae_retest', np.nan),
            'MAPE (retest)': getattr(row, 'mape_retest', np.nan),
            'Bias (retest)': getattr(row, 'bias_retest', np.nan),
            'N (retest)': getattr(row, 'n_retest', np.nan)
        }

        retrain_metrics = {
            'R² (retrain)': getattr(row, 'r2_retrain', np.nan),
            'MSE (retrain)': getattr(row, 'mse_retrain', np.nan),
            'RMSE (retrain)': getattr(row, 'rmse_retrain', np.nan),
            'MAE (retrain)': getattr(row, 'mae_retrain', np.nan),
            'MAPE (retrain)': getattr(row, 'mape_retrain', np.nan),
            'Bias (retrain)': getattr(row, 'bias_retrain', np.nan),
            'N (retrain)': getattr(row, 'n_retrain', np.nan)
        }

        path = row.file_path
        print(f"Path: {path}")

        apply_name = apply_names.iloc[i]
        df.iloc[i, apply_col] = apply_name
        print(f"If using - create directory: {apply_name}")

        # All JPG files of the run folder, then the two plots of interest
        jpg_files = glob.glob(os.path.join(path, "*.JPG"))
        retest_files = [file for file in jpg_files if file.endswith("retest.JPG")]
        retrain_files = [file for file in jpg_files if file.endswith("retrain.JPG")]

        # Log file paths to check which files are being imported
        if retest_files:
            print(f" - Retest Image found: {retest_files[0]}")
        else:
            print("No Retest Image found.")
            print("ERROR --" * 50)

        if retrain_files:
            print(f" - retrain Image found: {retrain_files[0]}")
        else:
            print("No retrain Image found.")
            print("ERROR --" * 50)

        # The distribution plot is stored two directory levels above the run folder
        distribution_dir = os.path.abspath(os.path.join(path, "../.."))
        distribution_files = glob.glob(
            os.path.join(distribution_dir, "*retrain_distribution.JPG"))

        if distribution_files:
            fig, ax = plt.subplots(figsize=(10, 8), dpi=150)
            ax.imshow(mpimg.imread(distribution_files[0]))
            ax.set_title('retrain Distribution Plot', fontsize=16)
            ax.axis('off')
            plt.tight_layout()
            plt.show()
            # Release the figure; otherwise one figure per row stays open
            plt.close(fig)

        # Retest and retrain plots side by side; a missing one leaves a blank panel
        if retest_files or retrain_files:
            fig, axes = plt.subplots(1, 2, figsize=(12, 6), dpi=150)
            panels = (
                (axes[0], retest_files, 'Retest Image'),
                (axes[1], retrain_files, 'retrain Image'),
            )
            for axis, files, title in panels:
                if files:
                    axis.imshow(mpimg.imread(files[0]))
                    axis.set_title(title, fontsize=16)
                axis.axis('off')

            plt.tight_layout(pad=2.0)  # Add padding between images
            plt.show()
            plt.close(fig)
            print("-" * 100)

        # One table row per metric: retest value next to the retrain value
        metrics_table = pd.DataFrame({
            'Metric': list(test_metrics.keys()),
            'Retest': list(test_metrics.values()),
            'Retrain': [retrain_metrics[name.replace('(retest)', '(retrain)')]
                        for name in test_metrics],
        })
        display(metrics_table)
        print("=" * 100)

        # Print the relevant features
        print(f"Features Used: {row.features_retest}")
        print("-" * 100)

    # This function does not write the CSV itself, so the message must not
    # claim that it did.
    print("not saved here - suggested export path: ",
          r"C:\Users\robot\code\pipelines\apply_biomass\{0}_{1}_{2}_{3}_{4}_overall_best.csv".format(
              model_run, model, var, data, std))

    return df, model_run, model, var, data, std

In [None]:
# Keep the rows whose retest RMSE is at most 7000 and whose retest R² is at
# least 0.6; adjust the two thresholds here if necessary.
# Disabled extra condition:
#   & (df_merged['rmse_test'] <= 12000.0) & (df_merged['r2_test'] >= 0.6)
df_low_rmse = df_merged.loc[
    df_merged['rmse_retest'].le(7000.0) & df_merged['r2_retest'].ge(0.6)
]

In [None]:
#'rmse_retest', 'r2_retest', 'rmse_retrain', 'r2_retrain'

In [None]:
print(list(df_low_rmse))

In [None]:
print(len(df_low_rmse))

In [None]:
df_low_rmse

In [None]:
# Work on a copy of the filtered frame
df_low_rmse_run = df_low_rmse.copy()

# Sorted unique values of the columns that describe a run
mdl_list, var_list, data_list, std_list = (
    sorted(df_low_rmse_run[column].unique())
    for column in ("mdl", "var", "data", "stdev")
)

# Active selection: every row of the 'all05_rs30' variable set
mdl_sel_df = df_low_rmse_run.loc[df_low_rmse_run["var"].eq('all05_rs30')]

# Alternative selections (disabled):
#mdl = "GBR"
#mdl_sel_df = df_low_rmse_run[(df_low_rmse_run["var"]=='all01_rs30') & (df_low_rmse_run["mdl"]=='KNN')]
#mdl_sel_df = df_low_rmse_run[(df_low_rmse_run["var"]=='all01_rs0')&(df_low_rmse_run["data"]=='all0')&(df_low_rmse_run["mdl"]=='RFR')]
#mdl_sel_df = df[(df["mdl"]==mdl) & (df["data"]!="all_data") & ((df["var"]=="ann02_rs0") | (df["var"]=="ann02_rs47")) ]
#df = mdl_sel_df.copy()
#mdl_sel_df = df[df["mdl"]==mdl]
#mdl_sel_df = df_low_rmse_run[df_low_rmse_run["data"]!= "all0"]
#mdl_sel_df = df_low_rmse_run
#mdl_sel_df.shape

print("mdl_list: ", mdl_list)
print(var_list)
print(data_list)

In [None]:
# Show the plots and metrics of the five lowest-RMSE rows of the selection.
# The six-element unpacking matches the second plot_best_model definition;
# model, var, data and std receive the "top" placeholders passed in here and
# are used by later cells to build output file names.
best_df, model_run, model, var, data, std = plot_best_model(mdl_sel_df, model_run, "top", "top", "top", "top", 5)

In [None]:
# Write the sorted selection next to the apply_biomass pipeline
overall_best_csv = r"C:\Users\robot\code\pipelines\apply_biomass\{0}_{1}_{2}_{3}_{4}_rs30_all05_overall_best.csv".format(
    model_run, model, var, data, std)
best_df.to_csv(overall_best_csv)

In [None]:
best_df

In [None]:
best_df

In [None]:
# Descriptive statistics of every numeric column
summary_stats = best_df.describe()

# Row holding the smallest retest RMSE
best_perform_model = best_df.loc[best_df['rmse_retest'].idxmin()]

# Report both, each under its own heading
print("Summary Statistics:", summary_stats, sep="\n")
print("\nModel with the Lowest RMSE:", best_perform_model, sep="\n")

In [None]:
# Normalise metrics
best_df['RMSE_norm'] = best_df['rmse_retest'] / best_df['rmse_retest'].max()  # Lower is better
best_df['MAE_norm'] = best_df['mae_retest'] / best_df['mae_retest'].max()    # Lower is better

# Min-max scaled R²: 1 for the highest R², 0 for the lowest (higher is better).
r2_max = best_df['r2_retest'].max()
r2_range = r2_max - best_df['r2_retest'].min()
if r2_range > 0:
    best_df['R2_norm'] = 1 - (r2_max - best_df['r2_retest']) / r2_range
else:
    # All R² values are equal (e.g. a single row): avoid 0/0 and treat every
    # row that has an R² as equally good; rows without an R² stay NaN.
    best_df['R2_norm'] = best_df['r2_retest'].where(best_df['r2_retest'].isna(), 1.0)

# Assign weights to each metric (optional)
weights = {'rmse_retest': 0.8, 'mae_retest': 0.1, 'r2_retest': 0.1}

# Combined score, lower is better.  RMSE_norm and MAE_norm are already
# "lower is better"; R2_norm is "higher is better", so it enters as
# (1 - R2_norm).  Adding R2_norm directly would penalise a higher R².
best_df['Score'] = (
    best_df['RMSE_norm'] * weights['rmse_retest'] +
    best_df['MAE_norm'] * weights['mae_retest'] +
    (1 - best_df['R2_norm']) * weights['r2_retest']
)

# Rank models based on the score (rank 1 = lowest score = best)
best_df['Rank'] = best_df['Score'].rank(ascending=True)

# Sort by rank
df_rank = best_df.sort_values(by='Rank')

# Output results
df_rank[['mdl', 'rmse_retest', 'mae_retest', 'r2_retest', 'Score', 'Rank', 'file_path', 'model', 'features_retest']]

In [None]:
# Keep, for every 'mdl', the single best-ranked row.  df_rank is sorted by
# Rank, so the first row of each 'mdl' is its best one.  Whole rows are kept
# with drop_duplicates: GroupBy.first() returns the first NON-NULL value of
# each column separately and can therefore combine values that come from
# different rows whenever the best row has a missing value.
mdl_first_columns = ['mdl'] + [name for name in df_rank.columns if name != 'mdl']
best_rank_per_model = (
    df_rank.dropna(subset=['mdl'])              # rows without a model label are ignored
    .drop_duplicates(subset='mdl', keep='first')
    [mdl_first_columns]                         # 'mdl' as first column
    .sort_values(by='mdl')
    .reset_index(drop=True)                     # index numbered in 'mdl' order
)

# Sort the models by Rank for clear display
best_rank_per_model = best_rank_per_model.sort_values(by='Rank')

# Output the results
print("Best Rank for Each Model:")
best_rank_per_model[['mdl', 'rmse_retest', 'mae_retest', 'r2_retest', 'Score', 'Rank', 'file_path', 'model', 'features_retest']]


In [None]:
# For every 'mdl' pick the row with the smallest 'rmse_retest', then renumber
# the rows from zero for a cleaner presentation.
lowest_rmse_df = (
    best_df.loc[best_df.groupby('mdl')['rmse_retest'].idxmin()]
    .reset_index(drop=True)
)

# Display the table with the lowest RMSE per model
print("Table of Lowest RMSE for Each Model:")
lowest_rmse_df

In [None]:
# Write the per-model best rows and echo the file name that was used
best_model_results_csv = r"C:\Users\robot\code\pipelines\apply_biomass\{0}_{1}_{2}_{3}_{4}_AGB_best_model_results_rs30_all05.csv".format(
    model_run, model, var, data, std)
best_rank_per_model.to_csv(best_model_results_csv)
#best_rank_per_model.to_csv(r"H:\biomass\model_test_train_all01_02_03_04_05_06_final\AGB_best_model_results_rs30_all05.csv", index=False)
print(best_model_results_csv)

In [None]:
# Mean and standard deviation of the retest metrics for every 'mdl'
grouped_df = (
    best_df
    .groupby(['mdl'])[['rmse_retest', 'r2_retest', 'mae_retest']]
    .agg(['mean', 'std'])
    .reset_index()
)
grouped_df


In [None]:
best_df