In [None]:
%pip install transformers 
%pip install https://github.com/sadrasabouri/plda/tarball/master
!wget https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/train.json
!wget https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/dev.json

Install the required packages and download the training and validation (dev) data files.

In [None]:
# Imports of necessary packages
import json
from transformers import AutoTokenizer, AutoModel, AutoConfig
import numpy as np
import torch
from torch.nn.functional import normalize
import plda
from sklearn.metrics import f1_score

# Module-level PLDA classifier instance; later cells fit it with fit_model() and
# query it with predict().
PLDA_classifier = plda.Classifier()

This block of code imports the necessary packages, including transformers, NumPy, PyTorch, plda, and the F1 metric from scikit-learn.
It also initializes the PLDA (Probabilistic Linear Discriminant Analysis) classifier.

In [None]:
# Load the BM25 lexical-model query encoder and its matching tokenizer.
# Both come from the same checkpoint, so the name is spelled out only once.
MODEL_NAME = 'facebook/spar-wiki-bm25-lexmodel-query-encoder'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
query_encoder = AutoModel.from_pretrained(MODEL_NAME)

The tokenizer and query-encoder model of the SPAR BM25 lexical model (`facebook/spar-wiki-bm25-lexmodel-query-encoder`) are initialized.

In [None]:
def clean_text(text: str) -> str:
    """Return ``text`` with leading and trailing whitespace removed."""
    stripped = text.strip()
    return stripped

def generate_word_embedding(query):
    """Encode ``query`` with the module-level ``tokenizer`` and ``query_encoder``.

    Args:
        query: Text to embed. It is passed straight to ``tokenizer`` with
            truncation enabled and PyTorch tensors requested.

    Returns:
        The encoder's last-layer hidden state at sequence position 0 for each
        input row (the ``[:, 0, :]`` slice of ``last_hidden_state``).
    """
    encoded_inputs = tokenizer(query, truncation=True, return_tensors='pt')

    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every sentence that gets embedded.
    with torch.no_grad():
        model_output = query_encoder(**encoded_inputs)

    # Compute embeddings: take the last-layer hidden state of the [CLS] token
    return model_output.last_hidden_state[:, 0, :]

def predict(query, labels_dev):
    """Predict the label of a single piece of text with the fitted PLDA classifier.

    Args:
        query: Text to classify; it is embedded with ``generate_word_embedding``.
        labels_dev: Indexable collection of label names, indexed by the class
            index that ``PLDA_classifier.predict`` returns.

    Returns:
        The entry of ``labels_dev`` selected by the predicted class index.
    """
    # Embed, drop the autograd link, and flatten away singleton dimensions.
    embedding = np.squeeze(generate_word_embedding(query).detach().numpy())
    # The classifier also returns log-probabilities, which are not used here.
    class_index, _log_probabilities = PLDA_classifier.predict(embedding)
    return labels_dev[class_index]

1. **clean_text()**: Function to clean text by stripping leading and trailing whitespace.
2. **generate_word_embedding()**: Function to generate an embedding for English text.
3. **predict()**: Function that takes a dev-set sentence, generates its embedding, feeds that embedding to the trained classifier, and returns the predicted label.


In [None]:
# Load both splits. Context managers close the file handles deterministically,
# and the encoding is pinned to UTF-8 instead of relying on the platform default.
with open('train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open('dev.json', encoding='utf-8') as dev_file:
    dev = json.load(dev_file)

# Parallel lists: data[k] is the cleaned sentence text, labels_all[k] its label.
labels_all = []
data = []

# Each document holds annotations, and each annotation holds a list of results;
# every result contributes one sentence and the first of its labels.
for document in train:
    for annotation in document['annotations']:
        for result in annotation['result']:
            data.append(clean_text(result['value']['text']))
            labels_all.append(result['value']['labels'][0])

# Number of training sentences that the embedding loop will process.
TRAIN_SIZE = len(labels_all)

Train and dev data are loaded. The English text of each training sentence and its corresponding label are stored in lists.

In [None]:
# Distinct label names; a label's position in this list is its integer class index.
# NOTE: the ordering comes from set iteration, so any code that maps predicted
# indices back to names must use exactly the same ordering.
labels = list(set(labels_all))
label_to_index = {name: position for position, name in enumerate(labels)}

X = []
y = []
# TRAIN_SIZE caps how many sentences are embedded.
training_pairs = zip(data[:TRAIN_SIZE], labels_all[:TRAIN_SIZE])
for progress, (sentence, label) in enumerate(training_pairs, start=1):
    # One embedding per sentence, flattened to drop singleton dimensions.
    embedding = np.squeeze(generate_word_embedding(sentence).detach().numpy())
    X.append(embedding)
    y.append(label_to_index[label])
    # Report progress every 500 sentences.
    if progress % 500 == 0:
        print('Progress Percent = {}%'.format(100 * progress / TRAIN_SIZE))

X = np.array(X)
y = np.array(y)
print(X.shape, y.shape)


This piece of code uses the training data to create an embedding for each sentence and stores the embeddings and their label indices in NumPy arrays.

In [None]:
# Fit the PLDA classifier on the training embeddings (X) and their integer class indices (y)
PLDA_classifier.fit_model(np.array(X), np.array(y))

The classifier initialized earlier is trained on the training embeddings and their corresponding labels.

In [None]:
# Parallel lists: data_dev[k] is the cleaned sentence text, labels_all_dev[k] its gold label.
labels_all_dev = []
data_dev = []

# Same traversal as for the training split: documents -> annotations -> results.
for document in dev:
    for annotation in document['annotations']:
        for result in annotation['result']:
            data_dev.append(clean_text(result['value']['text']))
            labels_all_dev.append(result['value']['labels'][0])

# Reuse the exact label list whose positions were used as class indices during
# training. Rebuilding it from a set would only match if the set happened to
# iterate in the same order, and a mismatch would silently mislabel predictions.
labels_dev = labels

# Predict one label name per dev sentence.
preds_dev = [predict(query, labels_dev) for query in data_dev]

After the model is trained, the validation (dev) sentences and their labels are collected into lists. The fitted model is then used to predict a label for each dev sentence.

preds_dev

In [None]:
# Macro-averaged F1 of the dev predictions against the gold labels
f1score = f1_score(y_true=labels_all_dev, y_pred=preds_dev, average="macro")
print(f1score)

After getting the predictions, the macro-averaged F1 score of the predicted output is calculated.

In [None]:
# Fraction of dev sentences whose predicted label equals the gold label.
# NOTE: the variable is called `precision`, but this exact-match ratio is
# what is usually called accuracy.
matches = sum(1 for gold, predicted in zip(labels_all_dev, preds_dev) if gold == predicted)
precision = matches / len(labels_all_dev)
print(precision)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix
# Fixed display order of the classes; used for the matrix AND the axis names.
class_names = ['PREAMBLE', 'NONE', 'FAC', 'ARG_RESPONDENT', 'RLC', 'ARG_PETITIONER', 'ANALYSIS',
               'PRE_RELIED', 'RATIO', 'RPC', 'ISSUE', 'STA', 'PRE_NOT_RELIED']

# Pass the ordering explicitly: without `labels=`, confusion_matrix sorts the
# class names, so its rows/columns would not line up with the names above
# (and the shape would differ if some class never occurs in the dev data).
cm = confusion_matrix(labels_all_dev, preds_dev, labels=class_names)

# Creating a dataframe for a array-formatted Confusion matrix,so it will be easy for plotting.
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

#Plotting the confusion matrix
plt.figure(figsize=(15,10))
# fmt='d' prints the integer counts in full instead of the default '.2g' rounding.
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Confusion Matrix')
# Rows of the confusion matrix are the true labels, columns the predictions.
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.savefig("matrix_plda.png")

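Precision is calculated (as the fraction of dev sentences whose predicted label exactly matches the gold label), and the confusion matrix is plotted as a heatmap and saved to `matrix_plda.png`.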