In [None]:
import pandas as pd

In [None]:
#mounting Google Drive at /content/gdrive so the predictions CSV read below is reachable
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#loading the final NER model results from the mounted Drive folder
PREDICTIONS_CSV = '/content/gdrive/My Drive/sample_model_predictions.csv'
df = pd.read_csv(PREDICTIONS_CSV)

In [None]:
import warnings
# NOTE(review): blanket suppression is kept so the output of later cells is
# unchanged, but it also hides pandas chained-assignment warnings; consider
# narrowing it to specific warning categories.
warnings.filterwarnings('ignore')


def _parse_tag_list(raw):
    """Convert a stringified tag list such as "['O', 'B', 'I']" into ['O', 'B', 'I'].

    Values that are already lists are copied and returned as-is, so this cell
    can be re-run without failing on previously parsed columns.
    """
    if isinstance(raw, list):
        return list(raw)
    # strip quotes, brackets and spaces, then split on the remaining commas
    for char in ("'", "]", "[", " "):
        raw = raw.replace(char, "")
    return raw.split(',')


#formatting labels
# Whole-column assignment is used instead of df[col][i] = ...: the chained form
# writes through a temporary Series and is not guaranteed to update df (it is a
# silent no-op when pandas copy-on-write is enabled), and indexing by
# range(len(df)) only works when the index is exactly 0..n-1.
#removing None labels (corresponding to padding and special tokens)
df['bio_labels'] = df['bio_labels'].apply(
    lambda raw: [tag for tag in _parse_tag_list(raw) if tag != 'None']
)

#formatting predictions
df['predictions'] = df['predictions'].apply(_parse_tag_list)

### Output Categories
* **Correct (COR):** the predicted bias spans match the labeled spans
* **Partial (PAR):** at least one predicted bias span overlaps with the labeled spans
* **Spurious (SPU):** the prediction contains a biased span, the label does not
* **Missing (MIS):** the prediction does not contain a biased span, the label does
* **Incorrect (INC):** the prediction and the label both contain bias spans, but none of the predicted spans overlap with the labeled spans

In [None]:
#categorizing predictions vs. labels using MUC-5 system
def muc_label(table):
    """Tag every row with a MUC-5 style outcome category.

    Each row's 'bio_labels' (gold tags) is compared with its 'predictions':

    * 'COR' - the two tag sequences are equal
    * 'MIS' - the gold tags contain a 'B' but the predictions do not
    * 'SPU' - the predictions contain a 'B' but the gold tags do not
    * 'PAR' - at some position the (gold, predicted) tags are
              ('B', 'B'), ('B', 'I') or ('I', 'B')
    * 'INC' - none of the above

    Rows are paired positionally, so the frame may carry any index (for
    example a filtered subset). Tag pairs are compared only up to the length
    of the shorter sequence.

    Parameters
    ----------
    table : pandas.DataFrame
        Must have 'bio_labels' and 'predictions' columns holding sequences of
        tag strings.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object, modified in place with a new
        'error type' column.
    """
    # (gold, predicted) tag pairs that count as a span overlap
    overlap_pairs = {('B', 'B'), ('B', 'I'), ('I', 'B')}

    muc = []
    for gold, pred in zip(table['bio_labels'], table['predictions']):
        if gold == pred:
            muc.append('COR')
        elif 'B' in gold and 'B' not in pred:
            muc.append('MIS')
        elif 'B' not in gold and 'B' in pred:
            muc.append('SPU')
        elif any(pair in overlap_pairs for pair in zip(gold, pred)):
            muc.append('PAR')
        else:
            muc.append('INC')
    #creating column for error types
    table['error type'] = muc
    return table

In [None]:
#adds the 'error type' column to df in place and displays the annotated frame
muc_label(df)

Unnamed: 0.1,Unnamed: 0,tokens,bio_labels,labels,input_ids,attention_mask,BIAS,BIASED_LIST,predictions,bin_cls_pred_lbl,...,Unnamed: 14,Diff,B,I,O,Unnamed: 19,B.1,I.1,O.1,error type
0,4795,"['<s>', 'Ä Hey', 'Ä Just', 'Ä wanted', 'Ä to',...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 11468, 1801, 770, 7, 224, 2446, 13, 5, 211...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,52,1,0,0,52,,12206,25605,161283,COR
1,4796,"['<s>', 'Ä Im', 'Ä not', 'Ä posting', 'Ä their...","[O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 5902, 45, 6016, 49, 37, 393, 1286, 6461, 5...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,['give him hell'],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,19,1,0,0,19,,6.1%,12.9%,81.0%,MIS
2,4797,"['<s>', 'Ä How', 'Ä quickly', 'Ä we', 'Ä forge...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1336, 1335, 52, 4309, 84, 375, 479, 1336, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,162,1,3,8,151,,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",,,SPU
3,4798,"['<s>', 'Ä Look', 'Ä forward', 'Ä to', 'Ä a', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 6893, 556, 7, 10, 3620, 715, 784, 3863, 21...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['foolish Liberal tool', 'force up Toronto hou...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,55,1,8,13,34,,,,,PAR
4,4799,"['<s>', 'Ä As', 'Ä you', 'Ä know', 'Ä since', ...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 287, 47, 216, 187, 47, 486, 162, 10, 29989...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['troll', 'absurd', 'baseless attacks', 'boot ...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ...",0,...,128,1,12,13,103,,,,,PAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2392,7187,"['<s>', 'Ä Crazy', 'Ä crazy', 'Ä work', 'in', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 24605, 5373, 173, 179, 3422, 27785, 27785,...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,[],"[O, B, I, I, O, O, O, B, I, I, B, I, I, O, B, ...",0,...,31,1,4,9,18,,,,,SPU
2393,7188,"['<s>', 'Ä Hey', 'Ä I', 'Ä posted', 'Ä my', 'Ä...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 11468, 38, 1278, 127, 856, 4097, 8112, 376...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['hated', 'slut', 'Eeew', 'screw it', 'poor re...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,236,1,10,7,219,,,,,PAR
2394,7189,"['<s>', 'Ä N', 'onsense', 'Ä And', 'Ä this', '...","[O, O, O, O, O, O, O, O, O, O, O, B, I, I, I, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, ...","[0, 234, 34040, 178, 42, 16, 567, 31, 65, 9, 5...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,"['nonsense', 'sockpuppets', 'Bahai administrat...","[O, O, O, O, O, O, O, O, O, O, O, B, I, I, I, ...",0,...,20,1,1,3,16,,,,,PAR
2395,7190,"['<s>', 'Ä There', 'Ä is', 'Ä ON', 'GO', 'ING'...","[O, O, O, O, O, B, I, I, I, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, ...","[0, 345, 16, 5121, 14740, 1862, 11902, 30, 312...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['bullying by Mayor Caldwell', 'shut down diss...","[O, O, B, I, I, B, O, O, O, O, O, O, O, O, O, ...",0,...,207,1,7,17,183,,,,,PAR


Unnamed: 0.1,Unnamed: 0,tokens,bio_labels,labels,input_ids,attention_mask,BIAS,BIASED_LIST,predictions,bin_cls_pred_lbl,...,Unnamed: 14,Diff,B,I,O,Unnamed: 19,B.1,I.1,O.1,error type
0,4795,"['<s>', 'Ä Hey', 'Ä Just', 'Ä wanted', 'Ä to',...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 11468, 1801, 770, 7, 224, 2446, 13, 5, 211...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,52,1,0,0,52,,12206,25605,161283,COR
1,4796,"['<s>', 'Ä Im', 'Ä not', 'Ä posting', 'Ä their...","[O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 5902, 45, 6016, 49, 37, 393, 1286, 6461, 5...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,['give him hell'],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,19,1,0,0,19,,6.1%,12.9%,81.0%,MIS
2,4797,"['<s>', 'Ä How', 'Ä quickly', 'Ä we', 'Ä forge...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1336, 1335, 52, 4309, 84, 375, 479, 1336, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,162,1,3,8,151,,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",,,SPU
3,4798,"['<s>', 'Ä Look', 'Ä forward', 'Ä to', 'Ä a', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 6893, 556, 7, 10, 3620, 715, 784, 3863, 21...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['foolish Liberal tool', 'force up Toronto hou...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,55,1,8,13,34,,,,,PAR
4,4799,"['<s>', 'Ä As', 'Ä you', 'Ä know', 'Ä since', ...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 287, 47, 216, 187, 47, 486, 162, 10, 29989...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['troll', 'absurd', 'baseless attacks', 'boot ...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ...",0,...,128,1,12,13,103,,,,,PAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2392,7187,"['<s>', 'Ä Crazy', 'Ä crazy', 'Ä work', 'in', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 24605, 5373, 173, 179, 3422, 27785, 27785,...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,[],"[O, B, I, I, O, O, O, B, I, I, B, I, I, O, B, ...",0,...,31,1,4,9,18,,,,,SPU
2393,7188,"['<s>', 'Ä Hey', 'Ä I', 'Ä posted', 'Ä my', 'Ä...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 11468, 38, 1278, 127, 856, 4097, 8112, 376...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['hated', 'slut', 'Eeew', 'screw it', 'poor re...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,...,236,1,10,7,219,,,,,PAR
2394,7189,"['<s>', 'Ä N', 'onsense', 'Ä And', 'Ä this', '...","[O, O, O, O, O, O, O, O, O, O, O, B, I, I, I, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, ...","[0, 234, 34040, 178, 42, 16, 567, 31, 65, 9, 5...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",No,"['nonsense', 'sockpuppets', 'Bahai administrat...","[O, O, O, O, O, O, O, O, O, O, O, B, I, I, I, ...",0,...,20,1,1,3,16,,,,,PAR
2395,7190,"['<s>', 'Ä There', 'Ä is', 'Ä ON', 'GO', 'ING'...","[O, O, O, O, O, B, I, I, I, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, ...","[0, 345, 16, 5121, 14740, 1862, 11902, 30, 312...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Yes,"['bullying by Mayor Caldwell', 'shut down diss...","[O, O, B, I, I, B, O, O, O, O, O, O, O, O, O, ...",0,...,207,1,7,17,183,,,,,PAR


In [None]:
#function to generate dataframe of error counts by type
def get_error_counts(table):
  """Count how often each value of the 'error type' column occurs.

  Returns a DataFrame with one row per error type and two columns,
  'error type' and 'count', ordered from most to least frequent.
  """
  tallies = table['error type'].value_counts()
  tallies = tallies.rename_axis('error type')
  return tallies.reset_index(name='count')

In [None]:
#error counts for full test set (one row per MUC error type); the bare name below displays the table
full_error_counts = get_error_counts(df)
full_error_counts

Unnamed: 0,error type,count
0,PAR,1161
1,COR,758
2,SPU,408
3,MIS,57
4,INC,13


Unnamed: 0,error type,count
0,PAR,1161
1,COR,758
2,SPU,408
3,MIS,57
4,INC,13


### Metrics

* Error Rate = (INC + PAR/2 + SPU + MIS)/(COR + PAR + INC + SPU + MIS)
* Overgeneration = SPU/(COR + PAR + INC + SPU)
* Undergeneration = MIS/(COR + PAR + INC + MIS)
* Accuracy = (COR + PAR/2)/(COR + PAR + INC + SPU + MIS)

In [None]:
#function to generate dataframe of evaluation metrics
def get_ner_metrics(table):
  """Compute MUC-style evaluation metrics from a table of error-type counts.

  Parameters
  ----------
  table : pandas.DataFrame
      Has an 'error type' column (values among 'COR', 'PAR', 'INC', 'SPU',
      'MIS') and a 'count' column. Categories that are absent are treated
      as a count of 0.

  Returns
  -------
  pandas.DataFrame
      A single column (labelled 0) indexed by metric name:

      * error rate       = (INC + PAR/2 + SPU + MIS) / (COR + PAR + INC + SPU + MIS)
      * overgeneration   = SPU / (COR + PAR + INC + SPU)
      * undergeneration  = MIS / (COR + PAR + INC + MIS)
      * derived accuracy = (COR + PAR/2) / (COR + PAR + INC + SPU + MIS)

      A metric whose denominator is 0 is reported as NaN.
  """
  # Look counts up by name with a default of 0 so a missing category does not
  # raise (int() on an empty Series fails).
  counts = dict(zip(table['error type'], table['count']))
  cor, par, inc, spu, mis = (
      int(counts.get(kind, 0)) for kind in ('COR', 'PAR', 'INC', 'SPU', 'MIS')
  )

  total = cor + par + inc + spu + mis
  actual = cor + par + inc + spu      # everything the model produced
  possible = cor + par + inc + mis    # everything present in the labels

  def _ratio(numerator, denominator):
    # avoid ZeroDivisionError on empty or degenerate count tables
    return numerator / denominator if denominator else float('nan')

  metric_names = ['error rate', 'overgeneration', 'undergeneration', 'derived accuracy']
  metric_values = [_ratio(inc + par/2 + spu + mis, total),
                   _ratio(spu, actual),
                   # denominator uses MIS, not SPU, per the Metrics definition
                   _ratio(mis, possible),
                   _ratio(cor + par/2, total)]
  return pd.DataFrame(metric_values, index=metric_names)

In [None]:
#eval metrics (error rate, over-/undergeneration, derived accuracy) for full test set
get_ner_metrics(full_error_counts)

Unnamed: 0,0
error rate,0.441594
overgeneration,0.174359
undergeneration,0.024359
derived accuracy,0.558406


Unnamed: 0,0
error rate,0.441594
overgeneration,0.174359
undergeneration,0.024359
derived accuracy,0.558406


### Subgroup Analysis

In [None]:
#creating columns for input length to categorize long (51-512 tokens)
# vs. short (0-50 tokens) inputs
SHORT_INPUT_MAX_TOKENS = 50  # inputs with more tokens than this are 'long'

# Whole-column assignment is used instead of df[col][i] = ...: the chained form
# writes through a temporary Series and is not guaranteed to update df, and
# indexing by range(len(df)) only works when the index is exactly 0..n-1.
df['length'] = df['bio_labels'].apply(len)
df['input_category'] = df['length'].apply(
    lambda n_tokens: 'short' if n_tokens <= SHORT_INPUT_MAX_TOKENS else 'long'
)


In [None]:
#checking balance in test set: number of rows per input_category value
df['input_category'].value_counts()

Unnamed: 0_level_0,count
input_category,Unnamed: 1_level_1
long,1350
short,1047


Unnamed: 0_level_0,count
input_category,Unnamed: 1_level_1
long,1350
short,1047


In [None]:
#splitting df into one frame per input_category value (rows keep their original index)
is_short = df['input_category'] == 'short'
is_long = df['input_category'] == 'long'
test_short = df[is_short]
test_long = df[is_long]

In [None]:
#error counts for short inputs only; the bare name below displays the table
short_error_counts = get_error_counts(test_short)
short_error_counts

Unnamed: 0,error type,count
0,COR,498
1,PAR,317
2,SPU,194
3,MIS,34
4,INC,4


Unnamed: 0,error type,count
0,COR,498
1,PAR,317
2,SPU,194
3,MIS,34
4,INC,4


In [None]:
#eval metrics for short inputs, computed from short_error_counts
get_ner_metrics(short_error_counts)

Unnamed: 0,0
error rate,0.37297
overgeneration,0.19151
undergeneration,0.033564
derived accuracy,0.62703


Unnamed: 0,0
error rate,0.37297
overgeneration,0.19151
undergeneration,0.033564
derived accuracy,0.62703


In [None]:
#error counts for long inputs only; the bare name below displays the table
long_error_counts = get_error_counts(test_long)
long_error_counts

Unnamed: 0,error type,count
0,PAR,844
1,COR,260
2,SPU,214
3,MIS,23
4,INC,9


Unnamed: 0,error type,count
0,PAR,844
1,COR,260
2,SPU,214
3,MIS,23
4,INC,9


In [None]:
#eval metrics for long inputs, computed from long_error_counts
get_ner_metrics(long_error_counts)

Unnamed: 0,0
error rate,0.494815
overgeneration,0.161266
undergeneration,0.017332
derived accuracy,0.505185


Unnamed: 0,0
error rate,0.494815
overgeneration,0.161266
undergeneration,0.017332
derived accuracy,0.505185
