# Description
This notebook will do the following:
1. Calculate metrics on the model predictions
2. Identify clusters that are classified badly based on metadata and features
3. Identify clusters that are classified badly based on unstructured data by using embeddings (+ Train models that assign certain properties to samples based on embeddings)
4. Try to infer rules that describe and select problematic samples/clusters

In [1]:
# Text file whose content is read further down and passed as `use_auth_token`
# when the pyannote embedding model is loaded.
ACCESS_TOKEN_FILE = "access_token.txt"

# Imports

In [2]:
from pathlib import Path
import pandas as pd
from jiwer import wer
from pyannote.audio import Model
from pyannote.audio import Inference
from sentence_transformers import SentenceTransformer
from renumics import spotlight
from renumics.spotlight import Audio, Embedding

In [3]:
# Strip surrounding whitespace: text files usually end with a newline, which would
# otherwise become part of the token handed to `Model.from_pretrained`.
access_token = Path(ACCESS_TOKEN_FILE).read_text().strip()

# Load the data

In [4]:
# Load the prediction table. Judging by the preview below, each row holds the
# reference `sentence`, the model `prediction`, speaker metadata and the `audio` path.
df = pd.read_json("predictions.json")

In [5]:
# Preview the loaded frame.
df

Unnamed: 0,sentence,up_votes,down_votes,age,gender,accent,locale,segment,variant,prediction,audio
0,These rights were not reinstated during Nazi r...,2,0,,,,en,,,These rates were not reinstated during Nazi r...,audios/f4312eaa-8a00-436f-b99d-89f09bde5739.wav
1,The album was recorded at The Forum in Los Ang...,4,0,fourties,male,"German English,Non native speaker",en,,,The album was recorded at the forum in Los An...,audios/4d9acf55-f0e1-44e0-8304-5132f89b3b90.wav
2,"A veteran of the United States Army, Lowenstei...",2,0,,,,en,,,"A veteran of the United States Army, Lawrence...",audios/b3b02b6c-1b2b-4c1e-ad10-c25a2da3c3c5.wav
3,The album features heavy involvement from the ...,2,1,teens,other,Australian English,en,,,The album features heavy involvement from the...,audios/a29ace65-41c3-4ddc-bcd1-bfcca34e0847.wav
4,Rogers served in the Kentucky and North Caroli...,2,0,,,,en,,,Roger served in the Kentucky North Carolina A...,audios/9c1ff557-1898-4713-ad14-4536855034e0.wav
...,...,...,...,...,...,...,...,...,...,...,...
19995,He was by now a regular on the starting fifteen.,2,0,thirties,male,England English,en,,,He was by now a regular on the starting 15.,audios/8ecff702-b635-42ca-80fe-c4de3255c418.wav
19996,The team's home ballpark is Prince George's St...,2,0,,,,en,,,The team's Home Ball Park is French Georgia S...,audios/11ad111d-7272-48eb-82fd-2f6d4b0a4331.wav
19997,"Even so, the surveys were showing a degraded s...",4,2,fourties,male,"German English,Non native speaker",en,,,"Even so, the surveys showing a degraded surfi...",audios/b025707d-2a2c-4ae1-a36a-6266c52364f6.wav
19998,"To date, however, it has no recorded national ...",2,1,thirties,male,United States English,en,,,"2Date, however, it has now recorded national ...",audios/d5fb80b7-c291-49f3-aae4-1e4908b2f1ca.wav


# Compute metrics

In [6]:
# Per-sample word error rate: reference transcript (`sentence`) vs. model output
# (`prediction`). Zipping the two columns avoids the per-row Series construction
# of `iterrows()` and yields the same values in the same order.
word_error_rates = [
    wer(reference, hypothesis)
    for reference, hypothesis in zip(df["sentence"], df["prediction"])
]

df["wer"] = word_error_rates

# Compute additional features
* Text length, ...

# Compute speaker and text embeddings

In [9]:
# Create speaker embedding for detecting speaker based biases
speaker_emb_model = Model.from_pretrained(
    "pyannote/embedding", use_auth_token=access_token
)
# window="whole": request a single embedding per audio file
# (see the pyannote.audio `Inference` documentation).
inference = Inference(speaker_emb_model, window="whole")
df["speaker_embedding"] = [inference(af).tolist() for af in df["audio"]]


# Create text embedding model for detecting text based biases
sentence_embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2"
)

# `encode` is documented for str / list[str]. It re-orders its input via positional
# indexing, which on a pandas Series is a *label* lookup and therefore only works
# for a default RangeIndex. Passing plain lists removes that dependency.
sentence_embeddings = sentence_embedding_model.encode(df["sentence"].tolist())
df["text_embedding_ann"] = [e.tolist() for e in sentence_embeddings]

sentence_embeddings = sentence_embedding_model.encode(df["prediction"].tolist())
df["text_embedding_pred"] = [e.tolist() for e in sentence_embeddings]

KeyboardInterrupt: 

In [7]:
# Either save or load embeddings
# df.to_json("predictions_embs.json")
df= pd.read_json("predictions_embs.json")

In [8]:
# Open the Spotlight viewer; the dtype mapping tells it how to treat the audio
# path column and the three embedding columns.
spotlight.show(
    df,
    dtype={
        "audio": Audio,
        "text_embedding_ann": Embedding,
        "text_embedding_pred": Embedding,
        "speaker_embedding": Embedding,
    },
)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:40885/'), HBox(children=(Button(description=…

# Map features to same value if similar in text or by clustering

To be implemented later. This would remove some false positives that stem from having different names for the same thing. Matching can be done via string matching or possibly via speaker embeddings, although the latter is a little unsafe.

# Find critical segments via features/metadata using fairlearn

In [8]:
from fairlearn.metrics import MetricFrame
import numpy as np

Mark groups with very few examples and groups on which the model performs poorly.

In [9]:
# Metadata columns that are checked group by group.
FEATURES_TO_CHECK = ["age", "gender", "accent"]
# A group is flagged as too small when its row count is below
# CNT_RATIO_THRESHOLD * (total rows / number of groups).
CNT_RATIO_THRESHOLD = 0.1
# A group is flagged as unfair when its WER exceeds the overall WER by more than this value.
METRIC_THRESHOLD = 0.1

In [10]:
def wer_metric(y_true, y_pred):
    """Return the mean of the per-sample word error rates.

    `y_true` and `y_pred` are iterated in parallel; each pair is scored with
    `wer` and the scores are averaged with `np.mean`.
    """
    per_sample_scores = []
    for reference, hypothesis in zip(y_true, y_pred):
        per_sample_scores.append(wer(reference, hypothesis))
    return np.mean(per_sample_scores)

In [11]:
# Flag rows that belong to (a) under-represented groups and (b) groups whose WER is
# clearly worse than the overall WER, looking at one metadata feature at a time.
df["issue_count_feature_univ"] = False
df["issue_fairness_feature_univ"] = False

for feature in FEATURES_TO_CHECK:
    mf = MetricFrame(
        metrics={"wer": wer_metric},
        y_true=df["sentence"],
        y_pred=df["prediction"],
        sensitive_features=df[[feature]],
    )
    # `overall` is a Series keyed by metric name. Look it up by label; positional
    # access (`[0]`) on a label-indexed Series is deprecated in recent pandas.
    overall_metric = mf.overall["wer"]
    overall_cnt = len(df)

    # Name the counts explicitly so the joined column is "count" on every pandas version.
    group_counts = df[feature].value_counts().rename("count")

    cnt_fairness_df = mf.by_group.join(group_counts)

    # Loop invariants: minimum acceptable group size and the rows whose own WER
    # exceeds the overall WER by more than METRIC_THRESHOLD.
    min_group_cnt = CNT_RATIO_THRESHOLD * (overall_cnt / len(cnt_fairness_df))
    high_wer_rows = (df["wer"] - overall_metric) > METRIC_THRESHOLD

    for idx, row in cnt_fairness_df.iterrows():
        in_group = df[feature] == idx
        if row["count"] < min_group_cnt:
            df.loc[in_group, "issue_count_feature_univ"] = True
        metric_diff = row["wer"] - overall_metric
        if metric_diff > METRIC_THRESHOLD:
            # Only the badly transcribed rows of a bad group are marked.
            df.loc[in_group & high_wer_rows, "issue_fairness_feature_univ"] = True

In [15]:
# Inspect the univariate issue flags in Spotlight; the dtype mapping declares the
# audio path column and the three embedding columns.
spotlight.show(
    df,
    dtype={
        "audio": Audio,
        "text_embedding_ann": Embedding,
        "text_embedding_pred": Embedding,
        "speaker_embedding": Embedding,
    },
)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:44651/'), HBox(children=(Button(description=…

# Find interaction effect based unfairness

This may be implemented later. However, the multi-feature unfairness analysis below already covers it to some extent, although in a less interpretable way.

# Find multi feature unfairness

ToDo: Support Ordinal features such as age?

ToDo: Probably offer something similar with outlier detection in order to not rely on model predictions only

In [12]:
import numpy as np
from hnne import HNNE
from sklearn.preprocessing import OneHotEncoder

In [13]:
# One-hot encode every categorical metadata feature, store the encoding per row in
# a `<feature>_oh` column, and project the concatenated encodings with h-NNE.
one_hot_encoded_features = []
encoded_blocks = []

for feature in FEATURES_TO_CHECK:
    feature_dtype = df[feature].dtype.name
    if feature_dtype not in ["category", "string", "object"]:
        # Non-categorical features are skipped.
        continue

    # (n_samples, 1) array as expected by OneHotEncoder; going through the
    # single-column frame also works for extension dtypes (category / string).
    feature_data = df[[feature]].to_numpy()
    oh_feature_data = OneHotEncoder(sparse_output=False).fit_transform(feature_data)

    oh_feature = f"{feature}_oh"
    df[oh_feature] = [f.tolist() for f in oh_feature_data]
    one_hot_encoded_features.append(oh_feature)

    if oh_feature_data.shape[1] > 200:
        print("Warning: Large one hot encoding")
    encoded_blocks.append(oh_feature_data)

if not encoded_blocks:
    # Previously this case surfaced as an AttributeError on `None.shape`.
    raise ValueError(
        f"None of the features {FEATURES_TO_CHECK} is categorical; nothing to encode."
    )

# Column-wise concatenation, in FEATURES_TO_CHECK order.
data = np.concatenate(encoded_blocks, axis=1)

print(f"Reducing data of shape {data.shape}")
hnne = HNNE()
projection = hnne.fit_transform(data)

Reducing data of shape (20000, 120)


In [15]:
# Store each sample's projected coordinates as a plain Python list.
df["feature_projection"] = np.asarray(projection).tolist()

In [16]:
# Pull the cluster assignments and the per-level sizes out of the fitted h-NNE model.
feature_hierarchy = hnne.hierarchy_parameters
partitions = feature_hierarchy.partitions
partition_sizes = feature_hierarchy.partition_sizes

In [17]:
# One `clustering_<level>` column per column of `partitions`.
for p_idx, level_labels in enumerate(partitions.T):
    df[f"clustering_{p_idx}"] = level_labels

In [18]:
# ToDo: Basically same as for univariate. maybe generalize the selection rules.
df["issue_count_feature_muv"] = False
df["issue_fairness_feature_muv"] = False

for p_idx in range(partitions.shape[1]):
    feature = f"clustering_{p_idx}"
    mf = MetricFrame(metrics={"wer": wer_metric}, y_true=df["sentence"], y_pred=df["prediction"], sensitive_features=df[feature])
    overall_metric = mf.overall[0]
    overall_cnt = len(df)
    
    group_counts = df[feature].value_counts()
    
    cnt_fairness_df = mf.by_group.join(group_counts)
    
    for idx, row in cnt_fairness_df.iterrows():
        if row["count"] < (CNT_RATIO_THRESHOLD * (overall_cnt / len(cnt_fairness_df))):
            df.loc[df[feature] == idx, "issue_count_feature_muv"] = True
        metric_diff = (row["wer"] - mf.overall)[0]
        if metric_diff > METRIC_THRESHOLD:
            df.loc[(df[feature] == idx) & ((df["wer"] - overall_metric) > METRIC_THRESHOLD) , "issue_fairness_feature_muv"] = True

In [18]:
# Column types for the viewer: the audio path, the raw embeddings, the metadata
# projection and every one-hot column created earlier.
dtype_dict = {
    "audio": Audio,
    "text_embedding_ann": Embedding,
    "text_embedding_pred": Embedding,
    "speaker_embedding": Embedding,
    "feature_projection": Embedding,
}
dtype_dict.update(dict.fromkeys(one_hot_encoded_features, Embedding))
spotlight.show(df, dtype=dtype_dict)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:40963/'), HBox(children=(Button(description=…

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [37]:
# Preview the frame including the computed columns.
df

Unnamed: 0,sentence,up_votes,down_votes,age,gender,accent,locale,segment,variant,prediction,audio,wer,speaker_embedding,text_embedding_ann,text_embedding_pred,feature_projection
0,These rights were not reinstated during Nazi r...,2,0,,,,en,,,These rates were not reinstated during Nazi r...,audios/f4312eaa-8a00-436f-b99d-89f09bde5739.wav,0.250000,"[0.48153656720000004, 51.8129119873, 5.9365177...","[-0.012564905000000001, 0.07866118100000001, -...","[0.045725155600000005, 0.0401938185, -0.009251...","[4.0436070388160275, -0.6697940767497417]"
1,The album was recorded at The Forum in Los Ang...,4,0,fourties,male,"German English,Non native speaker",en,,,The album was recorded at the forum in Los An...,audios/4d9acf55-f0e1-44e0-8304-5132f89b3b90.wav,0.200000,"[-45.7863960266, 27.8419494629, 24.8209590912,...","[0.043684762, -0.061664264600000004, -0.031076...","[0.043684724700000004, -0.0616642721, -0.03107...","[-2.0917964089262697, -3.120701493348443]"
2,"A veteran of the United States Army, Lowenstei...",2,0,,,,en,,,"A veteran of the United States Army, Lawrence...",audios/b3b02b6c-1b2b-4c1e-ad10-c25a2da3c3c5.wav,0.214286,"[27.2776126862, 59.8899421692, 46.0985794067, ...","[0.0312165152, 0.0511640087, 0.007571328400000...","[-0.0033591513, 0.0733282343, -0.0340879634000...","[4.0436070388160275, -0.6697940767497439]"
3,The album features heavy involvement from the ...,2,1,teens,other,Australian English,en,,,The album features heavy involvement from the...,audios/a29ace65-41c3-4ddc-bcd1-bfcca34e0847.wav,0.400000,"[97.3435974121, -15.7048883438, 17.7362995148,...","[0.0134361545, -0.0262203366, -0.0016528621, 0...","[0.0047452236, -0.0391150378, -0.0390664600000...","[1.5350056288722154, 0.9490219857053079]"
4,Rogers served in the Kentucky and North Caroli...,2,0,,,,en,,,Roger served in the Kentucky North Carolina A...,audios/9c1ff557-1898-4713-ad14-4536855034e0.wav,0.181818,"[12.9974803925, 6.7657399178, 11.4522953033, 2...","[-0.0807927251, 0.024178849500000002, -0.01822...","[-0.10895261910000001, 0.0415730923, 0.0069424...","[4.0436070388160275, -0.669794076749741]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,He was by now a regular on the starting fifteen.,2,0,thirties,male,England English,en,,,He was by now a regular on the starting 15.,audios/8ecff702-b635-42ca-80fe-c4de3255c418.wav,0.100000,"[-17.9059429169, -0.9018530250000001, -8.17157...","[0.0664717183, 0.0155410888, -0.0084157716, -0...","[0.0297332369, -0.0002950175, -0.0190396663, -...","[-1.65164614654853, -0.2811413119269855]"
19996,The team's home ballpark is Prince George's St...,2,0,,,,en,,,The team's Home Ball Park is French Georgia S...,audios/11ad111d-7272-48eb-82fd-2f6d4b0a4331.wav,0.625000,"[23.8514080048, 17.4905891418, 33.3259124756, ...","[0.0723667592, 0.0241533518, 0.0584359132, -0....","[-0.0026451834000000003, 0.0213504396, 0.00195...","[4.0436070388160275, -0.6697940767497415]"
19997,"Even so, the surveys were showing a degraded s...",4,2,fourties,male,"German English,Non native speaker",en,,,"Even so, the surveys showing a degraded surfi...",audios/b025707d-2a2c-4ae1-a36a-6266c52364f6.wav,0.153846,"[-30.3821411133, 24.6229801178, 10.1435604095,...","[-0.013233019, -0.0332972743, 0.100652054, 0.0...","[-0.0102967089, -0.0505346283, 0.097537227, 0....","[-2.0917964089262733, -3.1207014933484363]"
19998,"To date, however, it has no recorded national ...",2,1,thirties,male,United States English,en,,,"2Date, however, it has now recorded national ...",audios/d5fb80b7-c291-49f3-aae4-1e4908b2f1ca.wav,0.300000,"[46.5772247314, 3.6495623589, 7.2550296783, 35...","[-0.0062213158, -0.016576109500000002, -0.0802...","[-0.0190290306, -0.033360444, -0.02469551, -0....","[-1.693252485180613, -0.217159516935523]"


# Find hidden stratification based unfairness

In [19]:
import numpy as np

In [20]:
# List the columns available at this point of the analysis.
df.columns

Index(['sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent',
       'locale', 'segment', 'variant', 'prediction', 'audio', 'wer',
       'speaker_embedding', 'text_embedding_ann', 'text_embedding_pred',
       'issue_count_feature_univ', 'issue_fairness_feature_univ', 'age_oh',
       'gender_oh', 'accent_oh', 'feature_projection', 'clustering_0',
       'clustering_1', 'clustering_2', 'issue_count_feature_muv',
       'issue_fairness_feature_muv'],
      dtype='object')

In [21]:
# Stack the per-row embedding lists into 2-D arrays (one row per sample).
speaker_embedding_data, text_embedding_data = (
    np.vstack(df[column]) for column in ("speaker_embedding", "text_embedding_ann")
)

In [22]:
# Fit a separate h-NNE model on the stacked speaker embeddings; the fitted model is
# kept because its hierarchy parameters are read in the following cells.
print(f"Reducing data of shape {speaker_embedding_data.shape}")
speaker_hnne = HNNE()
speaker_projection = speaker_hnne.fit_transform(speaker_embedding_data)

Reducing data of shape (20000, 512)


In [24]:
# Store each sample's projected speaker coordinates as a plain Python list.
df["speaker_projection"] = np.asarray(speaker_projection).tolist()

In [25]:
# Pull the cluster assignments and the per-level sizes out of the fitted speaker model.
speaker_hierarchy = speaker_hnne.hierarchy_parameters
speaker_partitions = speaker_hierarchy.partitions
speaker_partition_sizes = speaker_hierarchy.partition_sizes

In [26]:
# One `speaker_clustering_<level>` column per column of `speaker_partitions`.
for p_idx, level_labels in enumerate(speaker_partitions.T):
    df[f"speaker_clustering_{p_idx}"] = level_labels

In [27]:
# ToDo: Same again here. Definitely generalize this
df["issue_count_speaker_emb"] = False
df["issue_fairness_speaker_emb"] = False

for p_idx in range(speaker_partitions.shape[1]):
    feature = f"speaker_clustering_{p_idx}"
    mf = MetricFrame(metrics={"wer": wer_metric}, y_true=df["sentence"], y_pred=df["prediction"], sensitive_features=df[feature])
    overall_metric = mf.overall[0]
    overall_cnt = len(df)
    
    group_counts = df[feature].value_counts()
    
    cnt_fairness_df = mf.by_group.join(group_counts)
    
    for idx, row in cnt_fairness_df.iterrows():
        if row["count"] < (CNT_RATIO_THRESHOLD * (overall_cnt / len(cnt_fairness_df))):
            df.loc[df[feature] == idx, "issue_count_speaker_emb"] = True
        metric_diff = (row["wer"] - mf.overall)[0]
        if metric_diff > METRIC_THRESHOLD:
            df.loc[(df[feature] == idx) & ((df["wer"] - overall_metric) > METRIC_THRESHOLD) , "issue_fairness_speaker_emb"] = True

In [28]:
# Inspect the speaker-cluster flags. The freshly computed `speaker_projection` and
# the one-hot columns are declared as embeddings too, so this viewer is consistent
# with the one opened after the metadata clustering.
spotlight.show(
    df,
    dtype={
        "audio": Audio,
        "text_embedding_ann": Embedding,
        "text_embedding_pred": Embedding,
        "speaker_embedding": Embedding,
        "feature_projection": Embedding,
        "speaker_projection": Embedding,
        **dict.fromkeys(one_hot_encoded_features, Embedding),
    },
)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:34575/'), HBox(children=(Button(description=…

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [28]:
# Fit a separate h-NNE model on the stacked reference-text embeddings; the fitted
# model is kept because its hierarchy parameters are read in the following cells.
print(f"Reducing data of shape {text_embedding_data.shape}")
text_hnne = HNNE()
text_projection = text_hnne.fit_transform(text_embedding_data)

Reducing data of shape (20000, 384)


In [29]:
# Store each sample's projected text coordinates as a plain Python list.
df["text_projection"] = np.asarray(text_projection).tolist()

In [30]:
# Pull the cluster assignments and the per-level sizes out of the fitted text model.
text_hierarchy = text_hnne.hierarchy_parameters
text_partitions = text_hierarchy.partitions
text_partition_sizes = text_hierarchy.partition_sizes

In [31]:
# One `text_clustering_<level>` column per column of `text_partitions`.
for p_idx, level_labels in enumerate(text_partitions.T):
    df[f"text_clustering_{p_idx}"] = level_labels

In [32]:
# ToDo: Same again here. Definitely generalize this
# Idea for hierarchy merging. Go down from coarsest clusters. If going more finegranular makes the metric significantly worse assign to finer hierarchy level.
df["issue_count_text_emb"] = False
df["issue_fairness_text_emb"] = False
df["cur_hierarchy"] = -1
df["cur_cluster"] = -1

for p_num in range(text_partitions.shape[1]):
    p_idx = text_partitions.shape[1] - p_num - 1
    feature = f"text_clustering_{p_idx}"
    mf = MetricFrame(metrics={"wer": wer_metric}, y_true=df["sentence"], y_pred=df["prediction"], sensitive_features=df[feature])
    print(mf.by_group)
    overall_metric = mf.overall[0]
    overall_cnt = len(df)
    
    group_counts = df[feature].value_counts()
    
    cnt_fairness_df = mf.by_group.join(group_counts)
    
    for idx, row in cnt_fairness_df.iterrows():
        if row["count"] < (CNT_RATIO_THRESHOLD * (overall_cnt / len(cnt_fairness_df))):
            df.loc[df[feature] == idx, "issue_count_text_emb"] = True
        metric_diff = (row["wer"] - mf.overall)[0]
        if metric_diff > METRIC_THRESHOLD:
            df.loc[(df[feature] == idx) & ((df["wer"] - overall_metric) > METRIC_THRESHOLD) , "issue_fairness_text_emb"] = True

                        wer
text_clustering_4          
0                  0.226748
1                  0.281987
2                  0.279186
                        wer
text_clustering_3          
0                  0.205555
1                  0.280525
2                  0.292410
3                  0.244693
4                  0.278659
5                  0.230866
6                  0.291258
7                  0.243022
8                  0.365301
9                  0.274099
10                 0.264149
11                 0.299902
12                 0.272566
13                 0.226087
14                 0.279822
15                 0.280920
                        wer
text_clustering_2          
0                  0.184015
1                  0.288433
2                  0.304552
3                  0.259441
4                  0.249907
...                     ...
67                 0.339443
68                 0.187876
69                 0.241693
70                 0.251045
71                 0

ToDo: Treat the hierarchy levels of the clustering in some way. As it stands, flagging every level independently doesn't make much sense.

In [33]:
# Column types for the final viewer. The speaker and text projections computed in
# this section are declared as embeddings alongside the metadata projection, so all
# three projections are treated the same way.
dtype_dict = {
    "audio": Audio,
    "text_embedding_ann": Embedding,
    "text_embedding_pred": Embedding,
    "speaker_embedding": Embedding,
    "feature_projection": Embedding,
    "speaker_projection": Embedding,
    "text_projection": Embedding,
}
# Every one-hot column created earlier is shown as an embedding as well.
for f in one_hot_encoded_features:
    dtype_dict[f] = Embedding
spotlight.show(df, dtype=dtype_dict)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:32927/'), HBox(children=(Button(description=…

# Find the most probable cause for model problem based on simple detectors (classification)