In [None]:
#FINE TUNING
#1. Model -> deepset/bert-large-uncased-whole-word-masking-squad2
#2. Dataset -> Legal Dataset
#3. Batch size -> 16

#Steps
#1. Loading the Dataset
#2. Processing the Data
#3. Fine Tuning
#4. Evaluation

In [None]:
#Mount Google Drive at /content/drive so the notebook can write to it later
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Split into train, test and val : 70, 15, 15
import json
import random

# Fixed seed so that re-running the notebook reproduces exactly the same split
SPLIT_SEED = 42

# Load the full list of QA examples from the source JSON file
with open('/content/trial.json', 'r') as f:
    data = json.load(f)

# Shuffle with a dedicated, seeded generator (the global random state is left untouched)
random.Random(SPLIT_SEED).shuffle(data)

# Split boundaries: first 70% -> train, next 15% -> test, remainder -> validation
split_index = int(0.7 * len(data))
split_index2 = int(0.15 * len(data)) + split_index
train_data = data[:split_index]
test_data = data[split_index:split_index2]
val_data = data[split_index2:]

# Save every split under /content, which is where the dataset file paths
# configured further down in this notebook point to
for path, split in (
    ('/content/train.json', train_data),
    ('/content/val.json', val_data),
    ('/content/test.json', test_data),
):
    with open(path, 'w') as f:
        json.dump(split, f, indent=4)

In [None]:
!pip install datasets
!pip install transformers
!pip install -U accelerate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

**Fine Tuning Parameters**

In [None]:
#Pretrained Model Name (checkpoint identifier used to load both the tokenizer and the QA model)
model_name = "deepset/bert-large-uncased-whole-word-masking-squad2"

#Dataset Files (JSON files for the train, validation and test splits)
training_file = '/content/train.json'
validation_file = '/content/val.json'
test_file = '/content/test.json'

#Batch_size for training (also passed as the per-device evaluation batch size)
batch_size = 16

## **Loading Dataset**

In [None]:
#Loading the Dataset
from datasets import load_dataset

#Read the three JSON split files into a single dataset object keyed by split name
dataset = load_dataset(
    'json',
    data_files={
        'train': training_file,
        'validation': validation_file,
        'test': test_file,
    },
)

#Show the splits together with their features and row counts
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'answer', 'question', 'context'],
        num_rows: 385
    })
    validation: Dataset({
        features: ['id', 'answer', 'question', 'context'],
        num_rows: 84
    })
    test: Dataset({
        features: ['id', 'answer', 'question', 'context'],
        num_rows: 82
    })
})


## **Data Processing**

In [None]:
#Processing the Data
import transformers
from transformers import AutoTokenizer

#Fetching the tokenizer for the ML model. A fast tokenizer is mandatory here:
#the preprocessing functions request offset mappings from it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast), (
    "A fast tokenizer (PreTrainedTokenizerFast) is required because the "
    "preprocessing relies on offset mappings"
)

#Function to prepare the train examples for training
def prepare_train_samples(examples):
    """Tokenize a batch of QA examples and label each sample with answer token positions.

    A long context is cut into several overlapping samples. Every sample is
    labelled with the token indexes of the answer start and end, or with the
    index of the CLS token when the answer does not lie fully inside the
    context part of that sample.

    Args:
        examples: batch (mapping of column name to list) with the columns
            "question", "context" and "answer". Each answer is read through
            its "answer_start" character offset and its "text".

    Returns:
        The tokenizer output, without "overflow_to_sample_mapping" and
        "offset_mapping", extended with "start_positions" and "end_positions"
        (one entry per sample).
    """
    max_length = 512    #max length of the input(question+context)
    doc_stride = 128    #length of overlap between consecutive samples of the same example

    #Tokenizing with truncation and padding, but keeping the overflows using a stride.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    #One example might give us several samples if it has a long context
    #a mapping indicating the map from the sample to its corresponding example
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    #offset mappings give us a map from token to character position spans in the original context
    offset_mapping = tokenized_examples.pop("offset_mapping")

    #labeling of the samples with the start and end token positions of the answer
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]

        #the impossible answers are marked with the index of the CLS token.
        cls_index = input_ids.index(tokenizer.cls_token_id)

        #Grab the sequence corresponding to that sample (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        #One example can give several samples
        #fetching the original example number for the sample
        example_index = sample_mapping[i]
        answer = examples["answer"][example_index]

        #Start/end character indexes for the answer in the example
        start_char_index = answer["answer_start"]
        end_char_index = len(answer["text"]) + start_char_index

        #First token of the context inside the current sample
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        #Last token of the context inside the current sample
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        #Detect if the answer is out of the span and label the sample with the CLS index
        if not (offsets[context_start][0] <= start_char_index and offsets[context_end][1] >= end_char_index):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        #Move forward to the last context token that starts at or before the answer start.
        #The scan is limited to the context tokens: the tokens following the context in
        #the test fixture (separator / padding) carry the offset (0, 0), which would
        #otherwise keep the scan running into the padding when the answer begins at
        #the last context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char_index:
            token_start_index += 1
        tokenized_examples["start_positions"].append(token_start_index - 1)

        #Move backward to the first context token that ends at or after the answer end,
        #again without leaving the context tokens.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char_index:
            token_end_index -= 1
        tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/540 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
#Tokenize and label every split in batches; the original columns are dropped from the result
tokenized_dataset = dataset.map(
    prepare_train_samples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

## **Fine Tuning**

In [None]:
#Fine Tuning
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

#Load the pre-trained question-answering checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

#Freeze the base model (the non task-specific layers) so its parameters are not updated
model.base_model.requires_grad_(False)

#Training configuration: output folder, evaluation schedule, optimisation and logging settings
args = TrainingArguments(
    output_dir="trial",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-large-uncased-whole-word-masking-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
#Fetching the Data Collator to batch the processed examples
#(the library default collator is used as-is and later handed to the Trainer)
from transformers import default_data_collator
data_collator = default_data_collator

In [None]:
#Build the Trainer from the model, the training arguments, the tokenized splits,
#the collator and the tokenizer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

#Run the fine-tuning (evaluation reports the loss only)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,1.2144,1.104866
2,1.3195,1.095868
3,1.2899,1.092889


TrainOutput(global_step=75, training_loss=1.3274059104919433, metrics={'train_runtime': 147.8094, 'train_samples_per_second': 7.976, 'train_steps_per_second': 0.507, 'total_flos': 1094945543718912.0, 'train_loss': 1.3274059104919433, 'epoch': 3.0})

In [None]:
#Save the fine-tuned model and the tokenizer files into the same Google Drive folder
model_path = '/content/drive/MyDrive/LegalQA_demo'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/LegalQA_demo/tokenizer_config.json',
 '/content/drive/MyDrive/LegalQA_demo/special_tokens_map.json',
 '/content/drive/MyDrive/LegalQA_demo/vocab.txt',
 '/content/drive/MyDrive/LegalQA_demo/added_tokens.json',
 '/content/drive/MyDrive/LegalQA_demo/tokenizer.json')

## **Evaluation**

In [None]:
#Evaluation
#Function to prepare the validation examples for evaluation
def prepare_test_samples(examples):
    """Tokenize a batch of QA examples for prediction and keep what post-processing needs.

    A long context is cut into several overlapping samples (same maximum length
    and stride as used for the training samples). For every sample the id of the
    originating example is recorded, and the offsets of all tokens that are not
    part of the context are replaced by None.

    Args:
        examples: batch (mapping of column name to list) with the columns
            "question", "context" and "id".

    Returns:
        The tokenizer output, without "overflow_to_sample_mapping", extended
        with "example_id" and with the filtered "offset_mapping".
    """
    max_length = 512    #max length of input(question + context)
    doc_stride = 128    #length of overlap between consecutive samples of the same example

    #Tokenizing with truncation and padding, but keeping the overflows using a stride
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,    #overlap consecutive samples so an answer at a cut is not split
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    #One example might give us several samples if it has a long context
    #a mapping indicating the map from the sample to its corresponding example
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    #recording the example_id that gave us this sample
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        #Grab the sequence corresponding to that sample (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        #One example can give several samples
        #fetching the original example number for the sample
        sample_index = sample_mapping[i]
        #appending the example_id for fetching the reference answer for evaluation
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        #the offset_mapping of tokens that are not part of the context is set to None
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == 1 else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [None]:
#Inspect the raw test split (features and number of rows)
dataset['test']

Dataset({
    features: ['id', 'answer', 'question', 'context'],
    num_rows: 82
})

In [None]:
#Preparing the test input: tokenize the test split and drop its original columns
test_samples = dataset["test"].map(
    prepare_test_samples,
    batched=True,
    remove_columns=dataset["test"].column_names
)

#Getting the raw predictions on the test set
raw_predictions = trainer.predict(test_samples)

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

In [None]:
from tqdm.auto import tqdm
import numpy as np
import collections

#Function to find the best possible answers using the raw predictions
def postprocess_qa_predictions(examples, samples, raw_predictions, best_size = 20, max_answer_length = 100):
    """Turn raw start/end logits into one answer text per example.

    Args:
        examples: the original examples; indexing with "id" gives all ids and
            iterating yields one record (with "id" and "context") per example.
        samples: the tokenized samples; each provides "example_id" and an
            "offset_mapping" whose non-context entries are None.
        raw_predictions: pair (start logits, end logits), indexed by sample.
        best_size: number of top start and top end logits that are combined.
        max_answer_length: maximum answer length, counted in tokens.

    Returns:
        An OrderedDict mapping every example id to the text of its highest
        scoring valid answer, or to "" when no valid answer exists.
    """
    all_start_logits, all_end_logits = raw_predictions

    #Group the positions of the samples by the position of the example they were cut from
    index_of_example = {example_id: position for position, example_id in enumerate(examples["id"])}
    samples_per_example = collections.defaultdict(list)
    for sample_position, sample in enumerate(samples):
        samples_per_example[index_of_example[sample["example_id"]]].append(sample_position)

    #final predictions, in the order of the examples
    predictions = collections.OrderedDict()

    for example_index, example in enumerate(examples):
        context = example["context"]
        candidates = []    #every valid answer found in any sample of this example

        for sample_index in samples_per_example[example_index]:
            start_logits = all_start_logits[sample_index]
            end_logits = all_end_logits[sample_index]

            #token index -> character span in the context (None outside of the context)
            offset_mapping = samples[sample_index]["offset_mapping"]
            token_count = len(offset_mapping)

            #indexes of the 'best_size' highest logits, best first
            top_starts = np.argsort(start_logits)[::-1][:best_size].tolist()
            top_ends = np.argsort(end_logits)[::-1][:best_size].tolist()

            for start_index in top_starts:
                #a start outside of the context can never give a valid answer
                if start_index >= token_count or offset_mapping[start_index] is None:
                    continue

                for end_index in top_ends:
                    #same restriction for the end
                    if end_index >= token_count or offset_mapping[end_index] is None:
                        continue

                    #the end must not precede the start and the span must not be too long
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    #score of a span = start logit + end logit
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    candidates.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        #Keep the best scoring candidate; an example without any candidate gets an empty answer
        if candidates:
            candidates.sort(key=lambda candidate: candidate["score"], reverse=True)
            best_text = candidates[0]["text"]
        else:
            best_text = ""

        predictions[example["id"]] = best_text
    return predictions

In [None]:
# #TRial
# test = {
#         "context": " procedure in case of commission of offence by child and determination of age by special court.  \u2014 where any offence under this act is committed by a child such child shall be dealt with under the provisions of the juvenile justice act 2000 .  if any question arises in any proceeding before the special court whether a person is a child or not such question shall be determined by the special court after satisfying itself about the age of such person and it shall record in writing its reasons for such determination. ",
#         "question": "When was the juvenile justice act passed?",
#         "answer": {
#             "text": "2000",
#             "answer_start": 236,
#             "answer_end": 240
#         },
#         "id": 67
#     }

In [None]:
# import json
# from datasets import load_dataset

# # Save your test data to a JSON file
# with open('try_test.json', 'w') as f:
#     json.dump(test, f)

# # Load the dataset from the JSON file
# dataset = load_dataset('json', data_files='try_test.json')

# # Now you can access your dataset
# print(dataset)

In [None]:
# #preparing the validation input
# test_samples = dataset["train"].map(
#     prepare_test_samples,
#     batched=True
# )

# #Getting the raw predictions on the test set
# raw_predictions = trainer.predict(test_samples)

In [None]:
# #Getting the column names of the test_samples
# test_samples.set_format(type=test_samples.format["type"], columns=list(test_samples.features.keys()))

# #finding the final predictions (the best possible answers)
# final_predictions = postprocess_qa_predictions(dataset["train"], test_samples, raw_predictions.predictions)

In [None]:
# final_predictions

In [None]:
#Keep the current format type but expose every column of test_samples, so the
#post-processing can read example_id and offset_mapping from each sample
test_samples.set_format(type=test_samples.format["type"], columns=list(test_samples.features.keys()))

#finding the final predictions (the best possible answers)
final_predictions = postprocess_qa_predictions(dataset["test"], test_samples, raw_predictions.predictions)

In [None]:
#Number of example ids that received a prediction
len(final_predictions)

82

In [None]:
# NOTE(review): `references` is first assigned in the cell that follows this one,
# so on a fresh kernel (Restart & Run All) this cell looks like it would run
# before the name exists — confirm the intended cell order.
len(references)

82

In [None]:
#Constructing the predicted and reference answer sets for evaluation
#(both follow the order of the examples in the test split)
predictions = tuple(final_predictions.values())

#The annotated answer text is used as the reference. It is the same field the
#training labels are built from, and it avoids the broken fragments that slicing
#the context with answer_start/answer_end yields when those offsets are off.
references = tuple(ex["answer"]["text"] for ex in dataset['test'])

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.6.1


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained sentence transformer model.
# It gets its own name so that the fine-tuned QA `model` is not overwritten.
similarity_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Prediction i is scored against reference i, so both sets must line up
assert len(predictions) == len(references), "predictions and references must be paired"

# Compute embeddings for ground truth answers and predicted answers
ground_truth_embeddings = similarity_model.encode(list(references), convert_to_tensor=True)
predicted_embeddings = similarity_model.encode(list(predictions), convert_to_tensor=True)

# Move tensors to CPU
ground_truth_embeddings = ground_truth_embeddings.cpu().detach().numpy()
predicted_embeddings = predicted_embeddings.cpu().detach().numpy()

# Cosine similarity matrix: row i = predicted answer i, column j = ground truth answer j
cos_similarities = cosine_similarity(predicted_embeddings, ground_truth_embeddings)

# Only the diagonal pairs a prediction with its OWN reference. Taking the row-wise
# maximum instead would credit a wrong prediction with its similarity to some other
# example's reference and inflate the score.
average_semantic_similarity = cos_similarities.diagonal().mean()

print("Average Semantic Similarity:", average_semantic_similarity)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Average Semantic Similarity: 0.9446449


In [None]:
#Convert both answer collections to lists
predictions, references = list(predictions), list(references)

In [None]:
#Print every predicted answer next to its reference answer
for i, prediction in enumerate(predictions):
  print(prediction, " | ", references[i])

section 62  |  section 62 or sub-section of section 63
four months  |  six months
six months  |  six months
two years or upwards  |  for life or rigorous imprisonment for a term of two years
section 53  |  68e
within sixty days  |  sixty days
section 42  |  section 42
68p  |  68p
sub -section or sub -section of section 8 or section 58b or sub -section of section 60  |  section 8 or section 58b or sub -section of section 60
section 15 to section 25  section 28 section 29 or section 30  |  section 15 to section 25  section 28 section 29 or section 30
section 14  |  section 14
68s  |  68s
section 8  |  section 8
ten years but which may extend to twenty years  |  hich may ext
section100  |  section 42
section 37 or section 38  |  section 37 or section 38
three years  |  three years
two years  |  two years
section 161  |  section 161
one month  |  one month
section 3  |  section 3
when they enact or revise their laws  |  inter alia
43a  |  43a
1958  |  1958
2000  |  2000
section 19  |  sect

In [None]:
from nltk.translate.bleu_score import sentence_bleu
#One weight vector per n-gram order (1 to 4): each puts the full weight on a single order
ngram_weights = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]

#Running sums of the sentence-level scores, one slot per n-gram order
bleu_totals = [0, 0, 0, 0]

for i in range(len(references)):
  #whitespace tokens of the reference (wrapped in a list of references) and of the prediction
  reference_tokens = [references[i].split()]
  hypothesis_tokens = predictions[i].split()
  for order, weights in enumerate(ngram_weights):
    bleu_totals[order] += sentence_bleu(reference_tokens, hypothesis_tokens, weights=weights)

gram1, gram2, gram3, gram4 = bleu_totals

#Average score over all references, printed for the 1-gram up to the 4-gram
for total in bleu_totals:
  print(total / len(references))

0.7814370012872458
0.5823704746337938
0.10994412599947843
0.1055306532933228


In [None]:
# EM = 0.72