In [1]:
from tqdm import tqdm
from copy import deepcopy
import pandas as pd
import numpy as np
import random
import joblib
import ast
import gc

In [2]:
y_preds = joblib.load('y_preds')
y_true = joblib.load('y_true')

In [3]:
for p, t in zip(y_preds, y_true):
    if len(p)!=len(t):
        print("Here")

In [4]:
from sklearn.metrics import f1_score, accuracy_score

In [5]:
labels = ['OTHER_PERSON', 'GPE', 'PROVISION', 'WITNESS', 'O', 'RESPONDENT', 'DATE', 'COURT', 'CASE_NUMBER', 'JUDGE', 'STATUTE', 'PETITIONER', 'ORG', 'PRECEDENT']

In [6]:
f1_score(y_true[0], y_preds[0], average='macro')

0.6481720430107527

In [7]:
f1_score(y_true[0], y_preds[0], average=None)

array([0.8       , 1.        , 0.77419355, 0.        , 0.66666667])

In [8]:
def create_label_lookups(true):
    all_l2i = []
    all_i2l = []
    for labels in true:
        l2i = {}
        i2l = {}
        for i, label in enumerate(set(labels)):
            l2i[label] = i
            i2l[i] = label
        all_l2i.append(l2i)
        all_i2l.append(i2l)
    
    return all_l2i, all_i2l

In [9]:
all_label2index, all_index2label = create_label_lookups(y_true)

In [10]:
all_index2label[0]

{0: 'O', 1: 'OTHER_PERSON', 2: 'PRECEDENT', 3: 'CASE_NUMBER', 4: 'DATE'}

In [13]:
all_label2index[0]

{'DATE': 0, 'O': 1, 'OTHER_PERSON': 2, 'PRECEDENT': 3, 'CASE_NUMBER': 4}

In [14]:
trues = []
preds = []
for true, pred, l2i in zip(y_true, y_preds, all_label2index):
    p = []
    t = []
    for i in range(len(true)):
        t.append(l2i[true[i]])
        p.append(l2i[pred[i]] if pred[i] in l2i else l2i['O'])
    trues.append(t)
    preds.append(p)


In [15]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [51]:
def find_scores_per_class(y_test, y_preds, i2l, labels=labels, metrics='f1'):
  all_scores = {l: [] for l in labels}
  for true, pred, idx2label in zip(y_test, y_preds, i2l):
    if metrics=='f1':
      scores = f1_score(true, pred, average=None)
    if metrics=='precision':
      scores = precision_score(true, pred, average=None, zero_division=0.0)
    elif metrics=='recall':
      scores = recall_score(true, pred, average=None, zero_division=0.0)
    for i in range(len(scores)):
      lab = idx2label[i]
      all_scores[lab].append(scores[i])

  final_scores = {l: [] for l in labels}
  for k, v in all_scores.items():
    final_scores[k] = np.mean(v)

  return pd.DataFrame(final_scores, index=[f'Average {metrics} scores'], 
                      columns=['COURT', 'JUDGE', 'WITNESS', 'STATUTE', 'PETITIONER', 'DATE', 'OTHER_PERSON', 'PRECEDENT', 'O', 'RESPONDENT', 'GPE', 'CASE_NUMBER', 'PROVISION', 'ORG'])

In [52]:
len(preds)

401

In [53]:
precision_scores = find_scores_per_class(trues, preds, all_index2label, metrics='precision').T
precision_scores

Unnamed: 0,Average precision scores
COURT,0.427042
JUDGE,0.589311
WITNESS,0.358063
STATUTE,0.208337
PETITIONER,0.360788
DATE,0.670182
OTHER_PERSON,0.209916
PRECEDENT,0.361695
O,0.839365
RESPONDENT,0.362381


In [54]:
f1_scores = find_scores_per_class(trues, preds, all_index2label).T
f1_scores

Unnamed: 0,Average f1 scores
COURT,0.450952
JUDGE,0.644912
WITNESS,0.448511
STATUTE,0.207847
PETITIONER,0.387229
DATE,0.662891
OTHER_PERSON,0.225324
PRECEDENT,0.361472
O,0.848185
RESPONDENT,0.402479


In [55]:
recall_scores = find_scores_per_class(trues, preds, all_index2label, metrics='recall').T
recall_scores

Unnamed: 0,Average recall scores
COURT,0.526055
JUDGE,0.809091
WITNESS,0.725775
STATUTE,0.237758
PETITIONER,0.484848
DATE,0.688949
OTHER_PERSON,0.282037
PRECEDENT,0.411625
O,0.885816
RESPONDENT,0.533333


In [50]:
import pandas as pd

 
exel = pd.ExcelWriter('recall.xlsx')
recall_scores.to_excel(exel)
 
exel.close()

In [209]:
final_f1_score = []
final_acc = []
for y_true, y_pred in zip(trues, preds):
    final_f1_score.append(f1_score(y_true, y_pred, average='macro'))
    final_acc.append(accuracy_score(y_true, y_pred))

final_f1_score = np.mean(final_f1_score)
final_acc = np.mean(final_acc)

In [210]:
final_f1_score, final_acc

(0.4886614331846166, 0.7594979546308163)