## TUSHAR NAGPAL -MT2022125
## MANDATE 4 : Final Submission
### This file contains code to fine-tune BERT on IPC_Dataset created using Haystack-annotation tool as mentioned in Mandate 2

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ipc-context/IPC_dataset.json
/kaggle/input/ipc-context/file.txt
/kaggle/input/ipc-context/ipc_haystack.json


In [2]:
pip install -U nltk

[0mNote: you may need to restart the kernel to use updated packages.


### Importing Libraries

In [3]:
import json
from pathlib import Path
import nltk
import urllib.request
import urllib
!nltk.download('reuters')

/bin/bash: -c: line 0: syntax error near unexpected token `'reuters''
/bin/bash: -c: line 0: `nltk.download('reuters')'


In [4]:
!pip install transformers

[0m

### This code is to read the json dataset and convert it in train-test split

In [5]:
def read_ipc(data_dict, split):
    contexts = []
    questions = []
    answers = []
    train_contexts = []
    train_questions = []
    train_answers = []
    val_contexts = []
    val_questions = []
    val_answers = []
    
    for x,group in enumerate(data_dict['data']):
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    if answer['answer_start']!=-1:
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)
    
    train_end=int(len(contexts)*split)
    
    for x in range(train_end):
        train_contexts.append(contexts[x])
        train_questions.append(questions[x])
        train_answers.append(answers[x])
    print(len(train_answers))
    
    for x in range(train_end,len(contexts)):
        val_contexts.append(contexts[x])
        val_questions.append(questions[x])
        val_answers.append(answers[x])
        
    print(len(val_answers))   
    return train_contexts, train_questions, train_answers,val_contexts, val_questions, val_answers

### Loading the dataset for IPC which I have created using pdf available at https://www.indiacode.nic.in/
#### Dataset is made by separating each Indian Penal Code Section.

In [6]:
f = open('/kaggle/input/ipc-context/ipc_haystack.json')
data = json.load(f)

In [7]:
contexts = pd.json_normalize(data)
con = data['data'][0]['paragraphs'][0]['context']
con

cont = []
for d in data['data']:
    for p in d['paragraphs']:
        cont.append(p['context'])
cont
contDF = pd.DataFrame(cont)

In [8]:
contDF.rename(columns = {0: 'description'})

Unnamed: 0,description
0,Title and extent of operation of the Code
1,This Act shall be called the Indian Penal Code...
2,Punishment of offences committed within India
3,Every person shall be liable to punishment und...
4,"Punishment of offences committed beyond, but w..."
...,...
1015,"Whoever, intending to insult the modesty of an..."
1016,Misconduct in public by a drunken person
1017,"Whoever, in a state of intoxication, appears i..."
1018,Punishment for attempting to commit offences p...


In [9]:
train_contexts, train_questions, train_answers,val_contexts, val_questions, val_answers = read_ipc(data,0.8)

294
74


In [10]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [11]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [12]:
print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [13]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    print(len(answers))
    for i in range(len(answers)):
        try:
            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
            if end_positions[-1] is None:
                end_positions[-1] = tokenizer.model_max_length
        except:
            print(answers[i],i)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

294
74


In [14]:
import torch

class IpcDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = IpcDataset(train_encodings)
val_dataset = IpcDataset(val_encodings)

In [15]:
from transformers import DistilBertForQuestionAnswering
model_db = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# from transformers import BertForQuestionAnswering
# model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

### Training Model:

In [16]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_db.to(device)
model_db.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model_db.parameters(), lr=5e-5)

for epoch in range(10):
    for x,batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model_db(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        if x == 10:
            print("batch: "+str(x)+" loss: "+str(loss))
model_db.eval()



batch: 10 loss: tensor(2.7969, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(1.7382, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(1.3361, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.9043, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.5150, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.6062, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.3214, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.3535, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.2393, device='cuda:0', grad_fn=<DivBackward0>)
batch: 10 loss: tensor(0.4138, device='cuda:0', grad_fn=<DivBackward0>)


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

### This question will be provided by user,
### and paragraph will be predicted by model trained in file "Predicting the context.ipynb"

In [17]:
question = "punishment if woman dies within seven years of marriage"
# paragraph="Whoever makes any preparation for committing dacoity, shall be punished with rigorous imprisonment for a term which may extend to ten years, and shall also be liable to fine."
# val_contexts[2]
paragraph = "Dowry death.(1) Where the death of a woman is caused by any burns or bodily injury or occurs otherwise than under normal circumstances within seven years of her marriage and it is shown that soon before her death she was subjected to cruelty or harassment by her husband or any relative of her husband for, or in connection with, any demand for dowry, such death shall be called dowry death, and such husband or relative shall be deemed to have caused her death. Explanation.For the purposes of this sub-section, dowry shall have the same meaning as in section 2 of the Dowry Prohibition Act, 1961 (28 of 1961). (2) Whoever commits dowry death shall be punished with imprisonment for a term which shall not be less than seven years but which may extend to imprisonment for life.]"
# paragraph = "Word, gesture or act intended to insult the modesty of a woman.—Whoever, intending to insult the modesty of any woman, utters any words, makes any sound or gesture, or exhibits any object, intending that such word or sound shall be heard, or that such gesture or object shall be seen, by such woman, or intrudes upon the privacy of such woman, 1[shall be punished with simple imprisonment for a term which may extend to three years, and also with fine]."
# ans=val_answers[2]
# val_answers[2]
print(question)
print(paragraph)
# print(ans)

punishment if woman dies within seven years of marriage
Dowry death.(1) Where the death of a woman is caused by any burns or bodily injury or occurs otherwise than under normal circumstances within seven years of her marriage and it is shown that soon before her death she was subjected to cruelty or harassment by her husband or any relative of her husband for, or in connection with, any demand for dowry, such death shall be called dowry death, and such husband or relative shall be deemed to have caused her death. Explanation.For the purposes of this sub-section, dowry shall have the same meaning as in section 2 of the Dowry Prohibition Act, 1961 (28 of 1961). (2) Whoever commits dowry death shall be punished with imprisonment for a term which shall not be less than seven years but which may extend to imprisonment for life.]


In [18]:
encoding = tokenizer.encode_plus(text=question, text_pair = paragraph)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
tokens = tokenizer.convert_ids_to_tokens(input_ids) #input tokens
print(tokens)

['[CLS]', 'punishment', 'if', 'woman', 'dies', 'within', 'seven', 'years', 'of', 'marriage', '[SEP]', 'dowry', 'death', '.', '(', '1', ')', 'where', 'the', 'death', 'of', 'a', 'woman', 'is', 'caused', 'by', 'any', 'burns', 'or', 'bodily', 'injury', 'or', 'occurs', 'otherwise', 'than', 'under', 'normal', 'circumstances', 'within', 'seven', 'years', 'of', 'her', 'marriage', 'and', 'it', 'is', 'shown', 'that', 'soon', 'before', 'her', 'death', 'she', 'was', 'subjected', 'to', 'cruelty', 'or', 'harassment', 'by', 'her', 'husband', 'or', 'any', 'relative', 'of', 'her', 'husband', 'for', ',', 'or', 'in', 'connection', 'with', ',', 'any', 'demand', 'for', 'dowry', ',', 'such', 'death', 'shall', 'be', 'called', 'dowry', 'death', ',', 'and', 'such', 'husband', 'or', 'relative', 'shall', 'be', 'deemed', 'to', 'have', 'caused', 'her', 'death', '.', 'explanation', '.', 'for', 'the', 'purposes', 'of', 'this', 'sub', '-', 'section', ',', 'dowry', 'shall', 'have', 'the', 'same', 'meaning', 'as', 'in'

In [19]:
ip=torch.tensor([input_ids]).to(device)
attention=torch.tensor([attention_mask]).to(device)

### Prediction:

In [20]:
output = model_db(ip, attention)
start_scores = output.start_logits
end_scores = output.end_logits

In [21]:
max_startscore = torch.argmax(start_scores)
max_endscore = torch.argmax(end_scores)
ans_tokens = input_ids[max_startscore: max_endscore + 1]
# answer = ' '.join(tokens[start_index:end_index+1])
print(ans_tokens)
answer = ' '.join(tokens[max_startscore:max_endscore+1])
print("QUESTION: "+str(question))
print("")
print("ANSWER: "+str(answer))
print("")
print("CONTEXT: "+str(paragraph))

[10219, 2005, 1037, 2744, 2029, 4618, 2025, 2022, 2625, 2084, 2698, 2086, 2021, 2029, 2089, 7949, 2000, 10219, 2005, 2166, 1012]
QUESTION: punishment if woman dies within seven years of marriage

ANSWER: imprisonment for a term which shall not be less than seven years but which may extend to imprisonment for life .

CONTEXT: Dowry death.(1) Where the death of a woman is caused by any burns or bodily injury or occurs otherwise than under normal circumstances within seven years of her marriage and it is shown that soon before her death she was subjected to cruelty or harassment by her husband or any relative of her husband for, or in connection with, any demand for dowry, such death shall be called dowry death, and such husband or relative shall be deemed to have caused her death. Explanation.For the purposes of this sub-section, dowry shall have the same meaning as in section 2 of the Dowry Prohibition Act, 1961 (28 of 1961). (2) Whoever commits dowry death shall be punished with impris

### Since, this model will just provide us answer of any question that user will ask related to IPC: Accuracy is not measure because the answers will be general.