In [1]:
import numpy as np
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon
from sklearn.metrics import recall_score, precision_score

import pandas as pd
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [12]:
# Dataset version and the directory layout derived from it.
ds_version = "V1_0"
root_data_path = os.path.join("..", "data", ds_version)
data_path = os.path.join(root_data_path, "metadata")
annotations_file = "streetSurfaceVis_v1_0.csv"

# Mapping that collapses the annotated surface types into coarser classes.
# Values that are not listed here are left unchanged.
surface_type_map = {
    "asphalt": "asphalt",
    "concrete": "asphalt",
    "paving_stones": "paved",
    "sett": "paved",
}

# Load the annotations, rename the id/label columns to the names used in the
# rest of the notebook, and apply the mapping to the `type_true` column only
# (a frame-wide replace would also rewrite matching strings in any other column).
annot = (
    pd.read_csv(os.path.join(root_data_path, annotations_file))
    .rename(columns={"mapillary_image_id": "Image", "surface_type": "type_true"})
    .replace({"type_true": surface_type_map})
)

In [13]:
# NOTE: these are sample prediction files and need to be replaced with the final actual paths.
# One entry per run: (value after "_s" in the file name, timestamp suffix).
# NOTE(review): the "_s<number>" part presumably encodes the run's seed -- confirm.
run_files = [
    (42, "20240820_155547"),
    (1024, "20240820_160652"),
    (3, "20240821_083348"),
    (57, "20240821_083834"),
    (1000, "20240821_084337"),
]
paths = [
    f"RTK_complete_lmh-crop_s{run_tag}_type_prediction-V1_0_annotated-{stamp}.csv"
    for run_tag, stamp in run_files
]

## Robustness: Entropy

In [14]:
# normalized entropy close to 0: (almost) all runs predict the same class for an image
# normalized entropy close to 1: the runs' predictions are spread evenly across the classes

### Variation 1: by category

In [15]:
# For every run, keep one row per image: the type-level row with the highest
# `Prediction` value. Frames are collected in a list and concatenated once at
# the end instead of growing a DataFrame inside the loop.
top_pred_per_run = []
for i, path in enumerate(paths):
    rtk = pd.read_csv(os.path.join(data_path, path), index_col=False)
    # Only rows at the "type" level take part in the arg-max ...
    type_rows = rtk[rtk.Level == "type"]
    idx = type_rows.groupby("Image")["Prediction"].idxmax()
    # ... but the selected rows are taken from the full frame, so every column
    # of the prediction file is kept; `run` records which file a row came from.
    rtk_type_pred = rtk.loc[idx].assign(run=i)
    top_pred_per_run.append(rtk_type_pred)

# pd.concat raises on an empty list, so fall back to an empty frame when no paths are given.
rtk_type_pred_allruns = (
    pd.concat(top_pred_per_run, axis=0) if top_pred_per_run else pd.DataFrame()
)

In [16]:
# Logarithm base used for every entropy / divergence computation (2 -> bits).
base = 2
# Number of distinct classes a prediction can take.
n = 3
# Maximum entropy of a distribution over n classes (reached by the uniform
# distribution), expressed in the same logarithm base as `base`; used to
# normalize entropies. Deriving it from `base` keeps the normalization
# correct if `base` is ever changed.
hmax = np.log(n) / np.log(base)

In [17]:
def calculate_entropy(group):
    """Return the normalized entropy of the `Level_0` values in `group`.

    The occurrences of each distinct `Level_0` value are counted, their
    entropy is computed with the module-level logarithm `base`, and the
    result is divided by the module-level maximum entropy `hmax`.
    """
    class_counts = group["Level_0"].value_counts().to_numpy()
    return entropy(class_counts, base=base) / hmax

In [18]:
# Normalized entropy of the predicted class across runs, one value per image.
# Only `Level_0` is handed to the function, so `apply` no longer operates on the
# grouping column (which pandas deprecates and warns about).
entr = rtk_type_pred_allruns.groupby(["Image"])[["Level_0"]].apply(calculate_entropy)

  entr = rtk_type_pred_allruns.groupby(["Image"]).apply(lambda x: calculate_entropy(x))


In [19]:
# Frequency of each distinct per-image normalized entropy value.
entr.sort_values().value_counts()

0.000000    6063
0.455486    1205
0.612602     993
0.864974      46
0.960230      39
Name: count, dtype: int64

In [20]:
# Mean normalized entropy over all images.
entr.mean()

0.14790458620695487

In [21]:
# Attach the true (annotated) surface type to every per-run prediction row so
# the entropy can be averaged per true surface type. A `type_true` column left
# over from a previous execution is dropped first, so re-running this cell does
# not fail on an overlapping column name.
rtk_type_pred_allruns = (
    rtk_type_pred_allruns
    .drop(columns="type_true", errors="ignore")
    .merge(annot[["Image", "type_true"]], on="Image", how="left")
)


In [22]:
# Normalized entropy per image, additionally keyed by the image's true surface
# type. Only `Level_0` is handed to the function, so `apply` no longer operates
# on the grouping columns (which pandas deprecates and warns about).
entr = rtk_type_pred_allruns.groupby(["type_true", "Image"])[["Level_0"]].apply(calculate_entropy)

  entr = rtk_type_pred_allruns.groupby(["type_true", "Image"]).apply(lambda x: calculate_entropy(x))


In [23]:
# Average the per-image normalized entropy within each true surface type.
entr.groupby(level="type_true").mean()

type_true
asphalt    0.040606
paved      0.230320
unpaved    0.339204
dtype: float64

### Variation 2: by probability distribution

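Jensen-Shannon distance: the square root of the Jensen-Shannon divergence, which is a symmetric, smoothed version of the Kullback-Leibler divergence. With base-2 logarithms it is bounded between 0 and 1.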

In [24]:
# One frame per run with the type-level prediction rows and the true surface type.
rtk_dsets = []
for path in paths:
    rtk = pd.read_csv(os.path.join(data_path, path), index_col=False)
    rtk_type_pred = (
        rtk.loc[rtk.Level == "type", ["Image", "Prediction", "Level_0", "is_in_validation"]]
        # quick fix: make sure no values are exactly 0. `.assign` builds a new
        # frame instead of writing into a filtered slice of `rtk`.
        .assign(Prediction=lambda df: df["Prediction"].replace(0, 1e-10))
        .merge(annot[["Image", "type_true"]], on="Image", how="left")
        # The runs are later compared position by position, so give every run
        # the same deterministic row order instead of relying on file order.
        .sort_values(["Image", "Level_0"], kind="stable")
        .reset_index(drop=True)
    )
    rtk_dsets.append(rtk_type_pred)

In [25]:
# Compare every unordered pair of runs (i < j) exactly once.
# NOTE(review): both measures pair the two `Prediction` vectors element by
# element, so each run must list the same rows in the same order -- confirm.
ents = []  # two-argument entropy(p, q) per pair, i.e. the relative entropy (KL divergence)
jsds = []  # Jensen-Shannon distance per pair
n_runs = len(paths)
for i in range(n_runs):
    for j in range(i + 1, n_runs):
        pred_i = rtk_dsets[i].Prediction
        pred_j = rtk_dsets[j].Prediction
        ents.append(entropy(pred_i, pred_j, base=base))
        jsds.append(jensenshannon(pred_i, pred_j, base=base))

In [26]:
# One entropy(p, q) value per run pair (i < j), in the order the pairs were visited.
ents

[0.7080327967275319,
 0.4293237321899831,
 0.330842182611124,
 1.2347745214167893,
 0.5751466908204472,
 0.2635403217554511,
 0.7360743128562794,
 0.5937321065022212,
 0.6060428815695277,
 0.9346009372615344]

In [27]:
# Average of the pairwise entropy(p, q) values over all run pairs.
np.mean(ents)

0.641211048371089

In [28]:
# Average of the pairwise Jensen-Shannon distances over all run pairs.
np.mean(jsds)

0.2974374820554756

In [29]:
# Per surface type: `type_true` was joined onto every run's predictions above, so the entropy can be computed for each true type individually.

In [30]:
# Pairwise two-argument entropy(p, q) between runs, computed separately for the
# rows of each true surface type.
types = ["asphalt", "paved", "unpaved"]
# One list of pairwise values per surface type. The loop variable is called
# `surface_type` so the builtin `type` is not overwritten for later cells.
ents = {surface_type: [] for surface_type in types}
for i in range(len(paths)):
    rtk_type_pred_i = rtk_dsets[i]
    # Start at i + 1 so every unordered pair of runs is visited exactly once.
    for j in range(i + 1, len(paths)):
        rtk_type_pred_j = rtk_dsets[j]
        for surface_type in types:
            # Restrict both runs to the rows whose true type is `surface_type`.
            # NOTE(review): the two vectors are paired element by element, so
            # both runs must list these rows in the same order -- confirm.
            pred_i = rtk_type_pred_i.loc[rtk_type_pred_i["type_true"] == surface_type, "Prediction"]
            pred_j = rtk_type_pred_j.loc[rtk_type_pred_j["type_true"] == surface_type, "Prediction"]
            ents[surface_type].append(entropy(pred_i, pred_j, base=base))

In [31]:
# Per true surface type: one entropy(p, q) value for every run pair (i < j).
ents

{'asphalt': [0.311335727765581,
  0.1744855091605928,
  0.19329335858212524,
  0.4646387576683817,
  0.14373733054036744,
  0.11810601473595767,
  0.1559746114963259,
  0.15190669637192536,
  0.17935534048719565,
  0.3289270153112079],
 'paved': [0.4956412950070443,
  0.635036347267402,
  0.30515209298157786,
  1.7672459016995592,
  0.8443428230137626,
  0.2991018097511743,
  1.5792710902960594,
  0.9931023769275665,
  0.683214023766153,
  1.551763703255125],
 'unpaved': [3.2231715142879036,
  0.84880004484146,
  1.0352722042741935,
  2.8143346055074856,
  1.5616445729582322,
  0.7890549011381611,
  0.380122915900042,
  1.1716520630994096,
  2.2428077641025266,
  1.4831924652702673]}

In [32]:
# Mean of the pairwise values for each true surface type, rounded to two decimals.
for surface_label, pair_values in ents.items():
    print(f"{surface_label}: {np.mean(pair_values).round(2)}")

asphalt: 0.22
paved: 0.92
unpaved: 1.56
