In [1]:
from tqdm import tqdm
from copy import deepcopy
import pandas as pd
import numpy as np
import random
import joblib
import ast
import gc

In [2]:
# Load the saved model predictions and the matching ground truth.
# Both paths are relative to the notebook's working directory; the cells below
# zip the two objects together and compare the lengths of their elements, so
# each is presumably a collection of per-sequence label lists.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
y_preds = joblib.load('predictions')
y_true = joblib.load('truths')

In [3]:
# Sanity check: predictions and ground truth must line up one-to-one, both in
# the number of sequences and in the number of labels inside each sequence.
# zip() stops at the shorter input, so the sequence counts are compared first;
# otherwise surplus sequences on either side would go unnoticed.
if len(y_preds) != len(y_true):
    print(f"Sequence count mismatch: {len(y_preds)} predictions vs {len(y_true)} truths")
for seq_idx, (p, t) in enumerate(zip(y_preds, y_true)):
    if len(p) != len(t):
        # Report which sequence is off and by how much, so it can be inspected.
        print(f"Length mismatch in sequence {seq_idx}: {len(p)} predicted vs {len(t)} true labels")

In [4]:
from sklearn.metrics import f1_score, accuracy_score

In [5]:
# Label names used as the default `labels` argument of find_scores_per_class
# (one result column per name).
labels = ['OTHER_PERSON', 'GPE', 'PROVISION', 'WITNESS', 'O', 'RESPONDENT', 'DATE', 'COURT', 'CASE_NUMBER', 'JUDGE', 'STATUTE', 'PETITIONER', 'ORG', 'PRECEDENT']

In [6]:
# Spot check: macro-averaged F1 for the first sequence only.
f1_score(y_true[0], y_preds[0], average='macro')

0.5281818181818182

In [7]:
# Same first sequence, but with average=None so one F1 value is returned per class.
f1_score(y_true[0], y_preds[0], average=None)

array([0.8       , 1.        , 0.84090909, 0.        , 0.        ])

In [8]:
def create_label_lookups(true):
    """Build label<->index lookup tables, one pair per sequence.

    Every sequence gets its own compact index space ``0..k-1`` covering only
    the distinct labels that occur in that sequence. Indices are assigned in
    order of first occurrence, so the result is identical on every run
    (iterating a ``set`` of strings is not: its order changes between
    interpreter sessions because of hash randomisation).

    Args:
        true: Iterable of label sequences; each label must be hashable.

    Returns:
        A tuple ``(all_l2i, all_i2l)`` of two lists aligned with ``true``:
        ``all_l2i[n]`` maps label -> index and ``all_i2l[n]`` maps
        index -> label for the n-th sequence. An empty sequence yields two
        empty dicts.
    """
    all_l2i = []
    all_i2l = []
    for seq_labels in true:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        i2l = dict(enumerate(dict.fromkeys(seq_labels)))
        l2i = {label: i for i, label in i2l.items()}
        all_l2i.append(l2i)
        all_i2l.append(i2l)

    return all_l2i, all_i2l

In [9]:
# Build the per-sequence lookups from the ground truth only, so a predicted
# label that never occurs in a sequence's truth has no index in that sequence.
all_label2index, all_index2label = create_label_lookups(y_true)

In [11]:
# Inspect both lookup tables of the first sequence.
all_index2label[0], all_label2index[0]

({0: 'CASE_NUMBER', 1: 'O', 2: 'PRECEDENT', 3: 'DATE', 4: 'OTHER_PERSON'},
 {'CASE_NUMBER': 0, 'O': 1, 'PRECEDENT': 2, 'DATE': 3, 'OTHER_PERSON': 4})

In [12]:
# Encode every sequence as integer class ids, using that sequence's own
# label -> index lookup.
trues, preds = [], []
for gold_seq, pred_seq, lookup in zip(y_true, y_preds, all_label2index):
    gold_ids, pred_ids = [], []
    for pos in range(len(gold_seq)):
        gold_ids.append(lookup[gold_seq[pos]])
        guess = pred_seq[pos]
        if guess in lookup:
            pred_ids.append(lookup[guess])
        else:
            # The lookup only knows labels present in this sequence's ground
            # truth; any other predicted label is scored as 'O'. This raises
            # KeyError if the sequence's ground truth contains no 'O'.
            pred_ids.append(lookup['O'])
    trues.append(gold_ids)
    preds.append(pred_ids)

In [13]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [14]:
def find_scores_per_class(y_test, y_preds, i2l, labels=labels, metrics='f1'):
    """Average per-class F1, precision and recall over many sequences.

    Each sequence is scored on its own with scikit-learn (``average=None``).
    The per-class scores are grouped by label name and then averaged over the
    sequences in which that class appeared (in the truth or the prediction).

    Args:
        y_test: Iterable of ground-truth sequences of integer class ids.
        y_preds: Iterable of predicted sequences, aligned with ``y_test``.
        i2l: Iterable of per-sequence ``{class id: label name}`` mappings,
            aligned with ``y_test``.
        labels: Label names to collect scores for. Defaults to the
            module-level ``labels`` list.
        metrics: Unused; kept so existing calls keep working.

    Returns:
        pandas.DataFrame with the rows 'Average f1 scores',
        'Average precision scores' and 'Average recall scores' and one column
        per label. The 14 standard labels always appear, in a fixed order;
        any additional names passed in ``labels`` are appended after them.
        A label that received no score is NaN.

    Raises:
        KeyError: If a class id found in a sequence is missing from that
            sequence's mapping, or maps to a name that is not in ``labels``.
    """
    # Fixed column order of the report for the standard label set.
    standard_order = ['COURT', 'JUDGE', 'WITNESS', 'STATUTE', 'PETITIONER', 'DATE', 'OTHER_PERSON',
                      'PRECEDENT', 'O', 'RESPONDENT', 'GPE', 'CASE_NUMBER', 'PROVISION', 'ORG']

    labels = list(dict.fromkeys(labels))  # materialise once, drop duplicates
    fscores = {l: [] for l in labels}
    pscores = {l: [] for l in labels}
    rscores = {l: [] for l in labels}

    for true, pred, idx2label in zip(y_test, y_preds, i2l):
        if len(true) == 0:
            continue  # nothing to score in an empty sequence

        # scikit-learn returns one score per class id present in truth or
        # prediction, in sorted order. Passing that id list explicitly ties
        # every score to its id instead of assuming "position i == id i".
        class_ids = np.unique(np.concatenate([np.asarray(true), np.asarray(pred)])).tolist()
        # zero_division=0.0 on all three metrics: same values as the default
        # for F1, but without the undefined-metric warnings.
        f_scores = f1_score(true, pred, labels=class_ids, average=None, zero_division=0.0)
        p_scores = precision_score(true, pred, labels=class_ids, average=None, zero_division=0.0)
        r_scores = recall_score(true, pred, labels=class_ids, average=None, zero_division=0.0)

        for class_id, f, p, r in zip(class_ids, f_scores, p_scores, r_scores):
            lab = idx2label[class_id]
            fscores[lab].append(f)
            pscores[lab].append(p)
            rscores[lab].append(r)

    def _mean(values):
        # np.mean([]) also yields NaN but emits a RuntimeWarning; be explicit.
        return float(np.mean(values)) if values else float('nan')

    final_scores = {l: [_mean(fscores[l]), _mean(pscores[l]), _mean(rscores[l])] for l in labels}

    # Keep the standard columns in their fixed order and append any extra
    # caller-supplied labels so they are not silently dropped.
    columns = standard_order + [l for l in labels if l not in standard_order]

    return pd.DataFrame(final_scores,
                        index=['Average f1 scores', 'Average precision scores', 'Average recall scores'],
                        columns=columns)

In [15]:
# Score the integer-encoded sequences, then transpose so that each label is a
# row and the three averaged metrics are the columns.
all_scores = find_scores_per_class(trues, preds, all_index2label).T
all_scores

Unnamed: 0,Average f1 scores,Average precision scores,Average recall scores
COURT,0.67577,0.664222,0.741199
JUDGE,0.417722,0.380936,0.518519
WITNESS,0.112838,0.095546,0.178295
STATUTE,0.431894,0.414364,0.570165
PETITIONER,0.326299,0.321429,0.395982
DATE,0.812099,0.82597,0.844376
OTHER_PERSON,0.377405,0.347924,0.475284
PRECEDENT,0.239919,0.284459,0.229718
O,0.847897,0.821153,0.898467
RESPONDENT,0.445926,0.427302,0.52


In [16]:
import pandas as pd

 
# Export the score table to an Excel workbook in the working directory.
# The context manager saves and closes the workbook even if to_excel() raises,
# so a failed export cannot leave the file handle open.
with pd.ExcelWriter('outputs.xlsx') as exel:
    all_scores.to_excel(exel)