In [1]:
import numpy as np
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon
from sklearn.metrics import recall_score, precision_score

import pandas as pd
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [37]:
# Dataset configuration: which dataset version is evaluated and where its metadata lives.
ds_version = "RTK"
data_path = os.path.join("..", "data", ds_version, "metadata")
annotations_file = "GT_RTK.csv"

# Ground-truth annotations; the image key is exposed as "Image" so it can be
# used as the join key against the prediction files.
annot = pd.read_csv(os.path.join(data_path, annotations_file))
annot = annot.rename(columns={"image_id": "Image"})

In [38]:
# These are sample paths and need to be replaced with the final actual paths.
# Each entry is a prediction CSV that is read relative to `data_path` and treated as one run.
paths=[
    "V1_0_merged_lmh-crop_s42_type_prediction-RTK_GT-20240820_162043.csv",
    "V1_0_merged_lmh-crop_s1024_type_prediction-RTK_GT-20240820_162155.csv",
    "V1_0_merged_lmh-crop_s3_type_prediction-RTK_GT-20240821_092534.csv",
    "V1_0_merged_lmh-crop_s57_type_prediction-RTK_GT-20240821_092741.csv",
    "V1_0_merged_lmh-crop_s1000_type_prediction-RTK_GT-20240821_092950.csv",
    ]

## Robustness: Entropy

In [39]:
# normalized entropy close to 0: (almost) all runs predict the same class for an image
# normalized entropy close to 1: the runs' predictions are spread evenly over all classes

### Variation 1: by category

In [54]:
# For every run keep one row per image: among the rows with Level == "type",
# the one with the highest "Prediction" (the top-1 predicted class).
per_run_top1 = []
for run, path in enumerate(paths):
    rtk = pd.read_csv(os.path.join(data_path, path), index_col=False)
    type_rows = rtk[rtk.Level == "type"]
    # idxmax yields index labels of `rtk`, so the winning rows are looked up there.
    idx = type_rows.groupby("Image")["Prediction"].idxmax()
    # .assign returns a new frame, so the run id is never written into a slice of `rtk`.
    per_run_top1.append(rtk.loc[idx].assign(run=run))

# Concatenate once instead of growing the frame inside the loop (which is quadratic
# and triggers pandas warnings about concatenating empty frames).
rtk_type_pred_allruns = pd.concat(per_run_top1, axis=0) if per_run_top1 else pd.DataFrame()

In [55]:
base = 2  # logarithm base used for all entropy / divergence computations
n = 3  # number of different classes
# Maximum entropy (uniform distribution over n classes), expressed in the same
# logarithm base as `base` so that H / hmax stays in [0, 1] if `base` is changed.
hmax = np.log(n) / np.log(base)

In [56]:
def calculate_entropy(group):
    """Return the normalized Shannon entropy of the ``Level_0`` labels in ``group``.

    The label frequencies are converted to an entropy using the module-level
    logarithm base ``base`` and divided by the module-level maximum ``hmax``.
    A result of 0 means every row of ``group`` carries the same label.
    """
    label_counts = group["Level_0"].value_counts().to_numpy()
    return entropy(label_counts, base=base) / hmax

In [57]:
# Normalized entropy of the predicted class across runs, one value per image.
# Only "Level_0" is selected for the applied function, so apply no longer operates
# on the grouping column (which pandas has deprecated and warns about).
entr = rtk_type_pred_allruns.groupby(["Image"])[["Level_0"]].apply(calculate_entropy)

  entr = rtk_type_pred_allruns.groupby(["Image"]).apply(lambda x: calculate_entropy(x))


In [58]:
# Number of images at each distinct normalized-entropy value
entr.sort_values().value_counts()

0.000000    4403
0.455486    1084
0.612602     443
0.864974     237
0.960230     130
Name: count, dtype: int64

In [59]:
# Mean normalized entropy over all images
entr.mean()

0.17388563394332615

In [60]:
# Attach the true surface type from the annotations so the entropy can be averaged
# per true type. A pre-existing "type_true" column is dropped first so this cell can
# be re-run: otherwise the second execution fails because both sides of the join
# carry that column.
rtk_type_pred_allruns = (
    rtk_type_pred_allruns
    .drop(columns="type_true", errors="ignore")
    .set_index("Image")
    .join(annot[["Image", "type_true"]].set_index("Image"), how="left")
    .reset_index()
)


In [61]:
# Per-image normalized entropy, indexed by (type_true, Image).
# Only "Level_0" is selected for the applied function, so apply no longer operates
# on the grouping columns (which pandas has deprecated and warns about).
entr = rtk_type_pred_allruns.groupby(["type_true", "Image"])[["Level_0"]].apply(calculate_entropy)

  entr = rtk_type_pred_allruns.groupby(["type_true", "Image"]).apply(lambda x: calculate_entropy(x))


In [66]:
# Average the per-image normalized entropy within each true surface type
entr.groupby(["type_true"]).mean()

type_true
asphalt    0.093779
paved      0.262018
unpaved    0.259873
dtype: float64

### Variation 2: by probability distribution

Jensen-Shannon distance: the square root of the Jensen-Shannon divergence, a symmetrized version of the Kullback-Leibler divergence; with base-2 logarithms it is bounded between 0 and 1.

In [80]:
# Load the "type"-level prediction rows of every run and attach the true surface type.
rtk_dsets = []
for path in paths:
    rtk = pd.read_csv(os.path.join(data_path, path), index_col=False)
    rtk_type_pred = (
        rtk.loc[rtk.Level == "type", ["Image", "Prediction", "Level_0", "is_in_validation"]]
        # Quick fix: make sure no values are 0, since the two-argument entropy
        # (KL divergence) is infinite wherever the reference distribution is 0.
        # .assign builds a new frame instead of writing into a filtered slice,
        # which avoids pandas' SettingWithCopyWarning.
        .assign(Prediction=lambda df: df["Prediction"].replace(0, 1e-10))
        .set_index("Image")
        .join(annot[["Image", "type_true"]].set_index("Image"), how="left")
        .reset_index()
    )
    rtk_dsets.append(rtk_type_pred)

In [81]:
# Pairwise comparison of the runs' prediction vectors (every unordered pair once).
# entropy(p, q) with two arguments is the Kullback-Leibler divergence of p from q;
# scipy normalizes each vector to sum to 1 before comparing.
# Both functions compare element by element, so every run is first put into the
# same (Image, Level_0) order; otherwise the result would depend on the row order
# of the individual CSV files.
aligned_preds = [
    dset.sort_values(["Image", "Level_0"], kind="stable")["Prediction"].to_numpy()
    for dset in rtk_dsets
]

ents = []
jsds = []
for i in range(len(aligned_preds)):
    for j in range(i + 1, len(aligned_preds)):
        ents.append(entropy(aligned_preds[i], aligned_preds[j], base=base))
        jsds.append(jensenshannon(aligned_preds[i], aligned_preds[j], base=base))

In [82]:
# Pairwise KL divergences, one entry per pair of runs
ents

[1.118153316833213,
 0.7791834595029071,
 0.40053026421305926,
 0.4083522511432169,
 0.8090901296400019,
 1.040079197523175,
 0.7946481704798619,
 0.7965666514117775,
 0.38143960580383374,
 0.6223286375307305]

In [83]:
# Mean KL divergence over all pairs of runs
np.mean(ents)

0.7150371684081778

In [84]:
# Mean Jensen-Shannon distance over all pairs of runs
np.mean(jsds)

0.3190167376631419

In [85]:
# TODO: merge true values to compute entropy for each type individually

In [86]:
# Pairwise KL divergence between runs, computed separately for the images of each
# true surface type.
types = ["asphalt", "paved", "unpaved"]

# entropy(p, q) compares element by element, so every run is first put into the same
# (Image, Level_0) order; the per-type filters below then select matching rows.
aligned_dsets = [
    dset.sort_values(["Image", "Level_0"], kind="stable") for dset in rtk_dsets
]

# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the kernel session.
ents = {surface_type: [] for surface_type in types}
for i in range(len(aligned_dsets)):
    run_i = aligned_dsets[i]
    for j in range(i + 1, len(aligned_dsets)):
        run_j = aligned_dsets[j]
        for surface_type in types:
            p = run_i.loc[run_i["type_true"] == surface_type, "Prediction"].to_numpy()
            q = run_j.loc[run_j["type_true"] == surface_type, "Prediction"].to_numpy()
            ents[surface_type].append(entropy(p, q, base=base))

In [87]:
# Pairwise KL divergences per true surface type, one entry per pair of runs
ents

{'asphalt': [0.8674320859034795,
  0.4926338121702718,
  0.4048923833158444,
  0.3860314579995721,
  0.09984003202756292,
  0.41792701251797604,
  0.10001992387168337,
  0.47979094413769724,
  0.1457458848987931,
  0.8276380630831062],
 'paved': [1.4793923328501384,
  1.5086516336574414,
  0.596693566749406,
  0.6995020160115184,
  1.9088699526996222,
  1.1273466158157075,
  1.3436254423575795,
  0.8664623739803728,
  0.466395890359656,
  0.6026657995574866],
 'unpaved': [1.287253733922836,
  0.6015914322606501,
  0.16045121064951245,
  0.12003969218251277,
  1.1962087734477058,
  2.407463813073801,
  1.792408613861935,
  1.4629602829743418,
  0.8386658281625718,
  0.16039286653074494]}

In [88]:
# Mean of the pairwise values per true surface type, rounded to two decimals
for surface, values in ents.items():
    print(f"{surface}: {np.mean(values).round(2)}")

asphalt: 0.42
paved: 1.06
unpaved: 1.0
