# Analyze the results of the prediction
1. Generate the metadataset by running all cells in `create-metadataset.ipynb`
2. Run `train.py`. Make sure to use a config with the postprocessing step `VisualizationBlock`
3. Run all cells
4. View the plots, explore the tabels in your IDE, and view the images with the dashboard using their `tile_id`

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
path = '../outputs/2024-01-17/14-33-08/results.csv'

In [12]:
# Load the processed metadataset
metadataset = pd.read_csv('../data/processed/metadata.csv', index_col=0)
metadataset

Unnamed: 0_level_0,cloud,land,missing_landsat,kelp,in_train
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JW725114,0.008294,0.142604,0.000000,0.000082,True
UX493605,0.004155,0.303135,0.000000,0.007404,True
OU500661,0.039673,0.254376,0.000000,0.000000,True
DC227980,0.009371,0.429110,0.000000,0.000000,True
SS602790,0.061763,0.837020,0.000000,0.000000,True
...,...,...,...,...,...
UT495238,0.297796,0.601306,0.296580,,False
GE987629,0.307053,0.125967,0.307020,,False
EN974536,0.348498,0.714710,0.348269,,False
KI806222,0.215600,0.396090,0.133959,,False


Load prediction results csv to analyze performance

In [13]:
# Load the prediction results
results = pd.read_csv(path, index_col=0)
results

Unnamed: 0_level_0,sum_targets,sum_preds,intersections,dice_coef
image_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AB417661,0,25348.595703,0.0,0.0
AD125151,0,-190639.875,0.0,-0.0
AH540191,0,26756.070312,0.0,0.0
AI270757,0,19280.482422,0.0,0.0
AA498489,345,30486.707031,54.042187,0.003506
AG705659,1149,45786.222656,216.192764,0.009212
AF191061,1632,40367.9375,364.410583,0.017353


Join the results with the metadata

In [14]:
results = results.join(metadataset)
results

Unnamed: 0_level_0,sum_targets,sum_preds,intersections,dice_coef,cloud,land,missing_landsat,kelp,in_train
image_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB417661,0,25348.595703,0.0,0.0,0.019763,0.0,0.0,0.0,True
AD125151,0,-190639.875,0.0,-0.0,0.097967,0.0,0.0,0.0,True
AH540191,0,26756.070312,0.0,0.0,0.006457,0.081616,0.000424,0.0,True
AI270757,0,19280.482422,0.0,0.0,0.025037,0.0,0.025037,0.0,True
AA498489,345,30486.707031,54.042187,0.003506,5.7e-05,0.279241,0.0,0.002816,True
AG705659,1149,45786.222656,216.192764,0.009212,0.015543,0.606122,0.0,0.00938,True
AF191061,1632,40367.9375,364.410583,0.017353,0.010245,0.366939,0.0,0.013322,True


# Compute error
Error is roughly "how much would our score increase if we had perfect predictions for this error"
Lower is better.

In [15]:
# compute error (difference between dice if prediction would've been perfect and actual dice)
total_intersection = results['intersection'].sum()
total_pred_target_sum = results['pred_sum'].sum() + results['target_sum'].sum()

perfect_dice_coef = (2*results['intersection']+total_intersection) / (2*results['intersection'] + total_pred_target_sum)
actual_dice_coef = (2*results['intersection']+total_intersection) / (results['pred_sum'] + results['target_sum'] + total_pred_target_sum)
results['error'] = perfect_dice_coef - actual_dice_coef
results

KeyError: 'intersection'

In [None]:
# Plot the correlation matrix with sns
corr = results.corr()
sns.heatmap(corr, annot=True)
plt.show()

# Plots against error

In [None]:
# Plot scatterplots of error
for col in ['dice_coef','kelp', 'land', 'cloud', 'missing_landsat']:
    sns.scatterplot(data=results, x=col, y='error')
    plt.show()

# Plots against dice coef

In [None]:
for col in ['error','kelp', 'land', 'cloud', 'missing_landsat']:
    sns.scatterplot(data=results, x=col, y='dice_coef')
    plt.show()

In [16]:
# There is a clear correlation between error and kelp. 
# Create a new column that is the error divided by kelp
# Then make the plots against error again
results['error_per_kelp'] = results['error'] / results['kelp']


# remove outliers with an error_per_kelp larger than n stds
n = 2
results_clean = results[results['kelp'] > 0]
mean = results_clean['error_per_kelp'].mean()
stds = results_clean['error_per_kelp'].std()
results_clean = results_clean[results_clean['error_per_kelp'] < mean + n*stds]


for col in ['dice_coef','kelp', 'land', 'cloud', 'missing_landsat']:
    
    sns.scatterplot(data=results_clean, x=col, y='error_per_kelp')
    plt.show()

KeyError: 'error'