#Imports


In [1]:
!pip install git+https://github.com/csebuetnlp/normalizer
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-2rq6o3b0
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-2rq6o3b0
  Resolved https://github.com/csebuetnlp/normalizer to commit d80c3c484e1b80268f2b2dfaf7557fe65e34f321
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji==1.4.2 (from normalizer==0.0.1)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy==6.0.3 (from normalizer==0.0.1)
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?2

In [2]:
from normalizer import normalize
import torch
from transformers import ElectraTokenizer, ElectraForPreTraining, ElectraForTokenClassification, AdamW
from transformers import pipeline, AutoTokenizer, AutoModelForPreTraining , BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle

In [3]:
from tqdm import tqdm

#Gazeteer import

In [4]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

class Trie:
    def __init__(self):
        self.root = TrieNode()
        self.entity_tags = ["PER", "LOC", "CW", "CORP", "GRP", "PROD"]
        self.tag_encoding = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4,
                             "B-CW": 5, "I-CW": 6, "B-CORP": 7, "I-CORP": 8, 
                             "B-GRP": 9, "I-GRP": 10, "B-PROD": 11, "I-PROD": 12}

    def insert(self, word, entity_type):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        node.entity_type = entity_type

    def search(self, sentence):
        encoding = [0] * len(sentence)
        for i in range(len(sentence)):
            node = self.root
            for j in range(i, len(sentence)):
                char = sentence[j]
                if char not in node.children:
                    break
                node = node.children[char]
                if node.is_end_of_word:
                    entity_type = node.entity_type
                    entity_length = j - i + 1
                    start_index = i
                    end_index = j
                    for k in range(start_index, end_index + 1):
                        if k == start_index:
                            encoding[k] = self.tag_encoding["B-" + entity_type]
                        else:
                            encoding[k] = self.tag_encoding["I-" + entity_type]
                    break
        one_hot_encoding = [[0] * 13 for i in range(len(encoding))]
        for i, tag in enumerate(encoding):
            one_hot_encoding[i][tag] = 1
        return one_hot_encoding

import numpy as np

def group_encodings_by_word(encoding, sentence):
    # Create an empty list to store the word encodings
    word_encodings = []
    
    # Create an empty list to store the current word encoding
    current_word_encoding = []
    
    # Create an empty string to store the current word
    current_word = ""
    
    # Iterate over each character encoding and character in the input encoding list and sentence, respectively
    for char_encoding, char in zip(encoding, sentence):
        # If the current character is a whitespace character, finish the current word and add its first character encoding to the word encodings list
        if char == " ":
            if len(current_word_encoding) > 0:
                word_encodings.append(np.array(current_word_encoding[0]))
                current_word_encoding = []
            current_word = ""
        # If the current character is part of a word, append the character encoding to the current word encoding and the character to the current word
        else:
            current_word_encoding.append(char_encoding)
            current_word += char
    
    # Add the last word encoding to the word encodings list, if it exists
    if len(current_word_encoding) > 0:
        word_encodings.append(np.array(current_word_encoding[0]))
    
    # Return the word encodings as a NumPy array
    return torch.from_numpy(np.array(word_encodings)).type(torch.float32)



def save_trie(trie, filename):
    with open(filename, "wb") as f:
        pickle.dump(trie, f)

def load_trie(filename):
    with open(filename, "rb") as f:
        trie = pickle.load(f)
    return trie


In [5]:
def Gtoken(text):
  inputs = tokenizer.encode_plus(text, return_tensors='pt')

  encoded_dict = tokenizer.encode_plus(
                  text,       # Sentence to encode.
                  add_special_tokens = False, # Add '[CLS]' and '[SEP]'
                  max_length = 314,           # Pad & truncate all sentences.
                  padding = 'max_length',
                  return_attention_mask = True,   # Construct attn. masks.
                  return_tensors = 'pt',
                  truncation=False)
  input_ids = encoded_dict['input_ids']
  tokenized = tokenizer.convert_ids_to_tokens([i.item() for i in input_ids.squeeze() if i > 1])
  return "< "+" ".join(tokenized)+" >"

In [9]:
%cd /content/drive/MyDrive/Bracu/THESIS/Trie DS/

/content/drive/MyDrive/Bracu/THESIS/Trie DS


In [10]:
trie = load_trie('Trie.bin')

In [11]:
model_name = '/content/drive/MyDrive/Thesis/BERTOUTPUT/checkpoint-11000/'
tokenizer = ElectraTokenizer.from_pretrained(model_name)

In [12]:
def pad_tensor(tensor):
    current_size = tensor.size(0)
    if current_size >= 64:
        return tensor[:64, :]  # if the tensor is larger than (64, 13), truncate it
    
    padded_tensor = torch.zeros((64, 13))
    padded_tensor[:current_size, :] = tensor  # copy the input tensor to the padded tensor
    return padded_tensor



In [13]:
def Gazetteer(sentence, pad = True):
  sentence = Gtoken(normalize(sentence))
  encoding = trie.search(sentence)
  tensor = group_encodings_by_word(encoding,sentence)
  if pad:
    current_size = tensor.size(0)
    if current_size >= 64:
        return tensor[:64, :]  # if the tensor is larger than (64, 13), truncate it
    
    padded_tensor = torch.zeros((64, 13))
    padded_tensor[:current_size, :] = tensor  # copy the input tensor to the padded tensor
    return padded_tensor
  else: 
    return tensor

In [32]:
Gazetteer("২০১৮ এর সেরা বর্ণানুক্রমিকভাবে তালিকাভুক্ত র‍্যাঙ্ক করা হয়নি এনপিআর").size()

torch.Size([64, 13])

#Dataset Load

In [15]:
%cd /content/drive/MyDrive/Bracu/THESIS/DatasetSir

/content/drive/MyDrive/Bracu/THESIS/DatasetSir


In [16]:
# Load the NER dataset
df_train = pd.read_csv('trainData2022PP2.csv')
df_val = pd.read_csv('devData2022PP2.csv')
df_test = pd.read_csv('testData2022PP2.csv')

In [17]:
d = {'O': 0, 'B-CORP': 1, 'I-CORP': 2, 'B-CW': 3, 'I-CW': 4, 'B-GRP': 5, 'I-GRP': 6, 'B-LOC': 7, 'I-LOC': 8, 'B-PER': 9, 'I-PER': 10, 'B-PROD': 11, 'I-PROD': 12}
def label_encoder(x):
    x = eval(x)
    y = []
    for i in x:
      y.append(d[i])
    return y

def text_normalizer(x):
    return normalize(x)

df_train['Word'] = df_train['Word'].apply(lambda x: text_normalizer(x))
df_test['Word'] = df_test['Word'].apply(lambda x: text_normalizer(x))
df_val['Word'] = df_val['Word'].apply(lambda x: text_normalizer(x))
df_train['Tag'] = df_train['Tag'].apply(lambda x: label_encoder(x))
df_test['Tag'] = df_test['Tag'].apply(lambda x: label_encoder(x))
df_val['Tag'] = df_val['Tag'].apply(lambda x: label_encoder(x))
df_train.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,২০১৮ এর সেরা বর্ণানুক্রমিকভাবে তালিকাভুক্ত র‍্...,"[0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,Sentence: 10,সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।,"[0, 0, 0, 0, 3, 4, 0]"
2,Sentence: 100,করে বাদ্যযন্ত্রের থিম এবং চলচ্চিত্রের জন্য প্র...,"[5, 0, 0, 0, 0, 0, 0, 0]"
3,Sentence: 1000,প্রতিষ্ঠান ২২১১ ইঙ্গিত করে যে আউগুস্তুস পৃথক ক...,"[0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,শো স্টপারে তাদের বেকড আলাস্কা করতে সাড়ে চার ঘ...,"[0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0]"


In [18]:
def one_hot_encode_list(lst, pad = True):
    """
    One-hot encodes each element in a Python list and returns a tensor containing the one-hot encodings.
    
    Args:
    - lst: A Python list
    
    Returns:
    - A PyTorch tensor with shape (len(lst), 13), where each row represents the one-hot encoding of an element in the input list.
    """
    one_hot_tensors = torch.zeros(len(lst), 13)
    for i, elem in enumerate(lst):
        one_hot = torch.zeros(13)
        one_hot[elem] = 1
        one_hot_tensors[i] = one_hot
    if pad:
      tensor = one_hot_tensors
      current_size = tensor.size(0)
      if current_size >= 64:
          return tensor[:64, :]  # if the tensor is larger than (64, 13), truncate it
      
      padded_tensor = torch.zeros((64, 13))
      padded_tensor[:current_size, :] = tensor  # copy the input tensor to the padded tensor
      return padded_tensor


one_hot_encode_list([0, 0, 0, 0, 3, 4, 0]).size()

torch.Size([64, 13])

In [19]:
def tokenized_df(text):
    inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')
    input_ids = inputs['input_ids']
    return input_ids

def tokenized_df1(text):
    inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')
    attention_mask = inputs['attention_mask']
    return attention_mask
def tokenized_df2(text):
    gazetteer = Gazetteer(text).unsqueeze(0)
    return gazetteer

In [20]:
df_val['input_ids'] =  df_val['Word'].apply(lambda x: tokenized_df(x))
df_val['attention_mask'] =  df_val['Word'].apply(lambda x: tokenized_df1(x))
df_val['gazetteer'] =  df_val['Word'].apply(lambda x: tokenized_df2(x))
df_train['input_ids'] = df_train['Word'].apply(lambda x: tokenized_df(x))
df_train['attention_mask'] = df_train['Word'].apply(lambda x: tokenized_df1(x))
df_train['gazetteer'] = df_train['Word'].apply(lambda x: tokenized_df2(x))

In [21]:
df_val['Tag'] =  df_val['Tag'].apply(lambda x: one_hot_encode_list(x))
df_train['Tag'] =  df_train['Tag'].apply(lambda x: one_hot_encode_list(x))

In [22]:
df_train.head()

Unnamed: 0,Sentence #,Word,Tag,input_ids,attention_mask,gazetteer
0,Sentence: 1,২০১৮ এর সেরা বর্ণানুক্রমিকভাবে তালিকাভুক্ত র‍্...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...","[[tensor(2), tensor(7349), tensor(919), tensor...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[[[tensor(1.), tensor(0.), tensor(0.), tensor(..."
1,Sentence: 10,সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...","[[tensor(2), tensor(13509), tensor(20928), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[[[tensor(1.), tensor(0.), tensor(0.), tensor(..."
2,Sentence: 100,করে বাদ্যযন্ত্রের থিম এবং চলচ্চিত্রের জন্য প্র...,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(2), tensor(792), tensor(31661), tenso...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[[[tensor(1.), tensor(0.), tensor(0.), tensor(..."
3,Sentence: 1000,প্রতিষ্ঠান ২২১১ ইঙ্গিত করে যে আউগুস্তুস পৃথক ক...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...","[[tensor(2), tensor(2857), tensor(3312), tenso...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[[[tensor(1.), tensor(0.), tensor(0.), tensor(..."
4,Sentence: 10000,শো স্টপারে তাদের বেকড আলাস্কা করতে সাড়ে চার ঘ...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...","[[tensor(2), tensor(3057), tensor(22353), tens...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[[[tensor(1.), tensor(0.), tensor(0.), tensor(..."


#Model

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [24]:
bert = ElectraForTokenClassification.from_pretrained(model_name, output_hidden_states=True)

In [None]:
text1 = "সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।"
text2 = "সিনেমায় গানটির বৈশিষ্ট্য"
inputs1 = tokenizer(text1, max_length = 64, padding = 'max_length', return_tensors='pt')
inputs2 = tokenizer(text1, max_length = 64, padding = 'max_length', return_tensors='pt')

In [81]:
text = ["সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।", "সিনেমায় গানটির বৈশিষ্ট্য"]
inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')

In [82]:
inputs['input_ids'].size()

torch.Size([2, 64])

In [83]:
x = bert(inputs['input_ids'], inputs['attention_mask'])

In [84]:
x[0].size()

torch.Size([2, 64, 13])

In [90]:
hidden_states = x.hidden_states
layer6 = hidden_states[6]
layer12 = hidden_states[12]
layer18 = hidden_states[18]


torch.mean(torch.stack([layer6, layer12, layer18]), dim=0).size()

torch.Size([2, 64, 1024])

In [None]:
x

In [93]:
class NERClassification(nn.Module):
  def __init__(self, n_classes):
    super(NERClassification, self).__init__()
    self.bert = ElectraForTokenClassification.from_pretrained(model_name, output_hidden_states=True)
    self.drop = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.gOut = nn.Linear(n_classes,n_classes)
    self.final_out = nn.Linear(2*n_classes, n_classes)
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, input_ids, attention_mask, gazetteer):
    pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    hidden_states = pooled_output.hidden_states

    # print(hidden_states)
    
    layer6 = hidden_states[6]
    layer12 = hidden_states[12]
    layer18 = hidden_states[18]


    averaged_hidden_states = torch.mean(torch.stack([layer6, layer12, layer18]), dim=0)

    # print(pooled_output)

    output = self.drop(averaged_hidden_states)
    output = self.out(output)
    goutput = self.gOut(gazetteer)
    final_output = torch.concat((output,goutput), dim = 2)
    final_output = self.final_out(final_output)
    final_output = self.softmax(final_output)
    return final_output

In [94]:
model = NERClassification(n_classes=13)

In [92]:
from torchsummary import summary
summary(model, [(2, 64), (2, 64), (2,64,13)])

TypeError: ignored

In [105]:
text = "সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।"
inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')


In [None]:
inputs['input_ids']

tensor([[    2, 13509, 20928,  7887,  1401,  2772,  5078,   205,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [    2, 13509, 20928,  7887,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0

In [None]:
inputs['attention_mask'].size()

torch.Size([1, 64])

In [107]:
Gazetteer(text)[0][0]

tensor(1.)

In [110]:
torch.full((64, 13),-100.)[0][0]

tensor(-100.)

In [52]:
inputs['input_ids'].to(device).size()

torch.Size([1, 64])

In [53]:
inputs['attention_mask'].to(device).size()

torch.Size([1, 64])

In [95]:


text = "সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।"
inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')

input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)
gazetteer = Gazetteer(text).unsqueeze(0).to(device)
# Run the model
outputs = model(input_ids=input_ids, attention_mask=attention_mask, gazetteer = gazetteer)

print(outputs.size())

# Get the predicted label
x, predicted_label = torch.max(outputs, dim=2)

predicted_label

# Print the predicted label
# print(predicted_label.item())

torch.Size([1, 64, 13])


tensor([[ 9,  9,  6,  9,  0,  9,  3,  9,  9,  2,  5, 11,  8,  6,  0, 10,  5,  1,
          5,  4, 10,  4,  0,  2, 12,  0,  2,  4,  2,  4,  4,  0, 10,  4,  3,  2,
         11,  5,  8,  6, 10,  5,  1,  7,  6,  2,  1,  8,  0, 11,  0,  1, 10,  2,
          4, 11,  1,  0, 10, 10, 11,  7,  8,  8]])

In [103]:
torch.tensor([[[]]])

tensor([], size=(1, 1, 0))

In [96]:
#Multiple

text = ["সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।", "সিনেমায় গানটির বৈশিষ্ট্য"]
inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')

input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)
gazetteer1 = Gazetteer(text[0]).unsqueeze(0).to(device)
gazetteer2 =  Gazetteer(text[1]).unsqueeze(0).to(device)
gazetteer = torch.cat((gazetteer1,gazetteer2),0)
# Run the model
outputs = model(input_ids=input_ids, attention_mask=attention_mask, gazetteer = gazetteer)

print(outputs.size())

# Get the predicted label
x, predicted_label = torch.max(outputs, dim=2)

predicted_label

# Print the predicted label
# print(predicted_label.item())

torch.Size([2, 64, 13])


tensor([[ 9,  9, 12,  6,  0,  9,  3,  9,  6,  4,  8,  8,  8,  9,  0, 10,  5, 12,
         11,  6,  4,  5,  0,  3,  5,  1,  7,  5,  2,  7,  7,  0, 10,  5,  1, 11,
          8,  5,  7,  1,  5,  5,  0, 11,  8, 10,  6,  5,  0,  0,  7, 10,  7,  1,
          4,  4,  1,  0,  8, 10,  9,  1, 11, 10],
        [ 9,  9,  4,  9,  6,  4,  0,  0,  9,  2,  2,  8,  5,  6,  2,  0,  2,  2,
          5, 12,  8,  8,  8,  1,  3,  1,  4, 12,  0, 10,  4,  7,  0,  4,  0,  5,
         10, 11,  7,  1,  1, 11,  0,  0,  7,  1,  8,  1,  8,  1,  7,  7, 12,  8,
          8, 10,  8,  8,  7,  7,  2,  7,  5, 10]])

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()


NERClassification(
  (bert): ElectraForTokenClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(32000, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-23): 24 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linea

In [None]:
df_train['input_ids'][0]

tensor([[    2,  7349,   919,  3577, 26501, 11126, 12900,  5319, 15785, 16053,
           913,  1702,  2594,  1353,  3173,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])

In [None]:
df_train.shape[0]

15301

In [None]:
from torch.nn import BCELoss

loss1 = BCELoss()
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for j in tqdm(range(df_train.shape[0])):
        optim.zero_grad()
        i,a,g, label = df_train['input_ids'][j].to(device) , df_train['attention_mask'][j].to(device) ,df_train['gazetteer'][j].to(device) ,df_train['Tag'][j].unsqueeze(0).to(device)
        outputs = model(input_ids = i, attention_mask=a, gazetteer = g)
        loss = loss1(outputs,label)
        if j % 100 == 0:
          print(loss)
        loss.backward()
        optim.step()

model.eval()

  0%|          | 1/15301 [00:00<34:25,  7.41it/s]

tensor(0.0388, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  1%|          | 102/15301 [00:14<33:23,  7.59it/s]

tensor(0.0334, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  1%|▏         | 202/15301 [00:31<37:25,  6.73it/s]

tensor(0.0407, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  2%|▏         | 301/15301 [00:45<40:16,  6.21it/s]

tensor(0.0347, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  3%|▎         | 401/15301 [00:58<37:47,  6.57it/s]

tensor(0.0953, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  3%|▎         | 501/15301 [01:12<36:45,  6.71it/s]

tensor(0.0211, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  4%|▍         | 601/15301 [01:25<35:29,  6.90it/s]

tensor(0.0730, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  5%|▍         | 702/15301 [01:40<31:48,  7.65it/s]

tensor(0.0384, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  5%|▌         | 801/15301 [01:56<33:48,  7.15it/s]

tensor(0.0612, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  6%|▌         | 902/15301 [02:10<31:54,  7.52it/s]

tensor(0.1020, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  7%|▋         | 1002/15301 [02:23<31:10,  7.64it/s]

tensor(0.0309, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  7%|▋         | 1102/15301 [02:37<30:41,  7.71it/s]

tensor(0.0977, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  8%|▊         | 1202/15301 [02:51<30:46,  7.64it/s]

tensor(0.0254, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  9%|▊         | 1302/15301 [03:05<30:27,  7.66it/s]

tensor(0.0872, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


  9%|▉         | 1402/15301 [03:19<30:20,  7.64it/s]

tensor(0.0385, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


 10%|▉         | 1502/15301 [03:33<30:18,  7.59it/s]

tensor(0.0433, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


 10%|█         | 1602/15301 [03:46<30:54,  7.39it/s]

tensor(0.0608, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


 11%|█         | 1701/15301 [04:00<35:51,  6.32it/s]

tensor(0.0454, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


 12%|█▏        | 1801/15301 [04:14<33:22,  6.74it/s]

tensor(0.0317, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


 12%|█▏        | 1888/15301 [04:26<28:56,  7.72it/s]

In [None]:
class NERDataset(Dataset):
  def __init__(self,df):
    self.input_ids = df['input_ids']
    self.attention_mask = df['attention_mask']
    self.gazetteer = df['gazetteer']
    self.label = df['Tag']
    self.n_samples = df.shape[0]

  def __getitem__(self, idx):
    return self.input_ids[idx].to(device),self.attention_mask[idx].to(device),self.gazetteer[idx].to(device),self.label[idx].to(device)
  
  def __len__(self):
    return self.n_samples
    

In [None]:
train_dataset = NERDataset(df_train)

In [None]:
train_dataset.n_samples

15301

In [None]:
train_loader = DataLoader(dataset = train_dataset, batch_size=1, shuffle=True)

In [None]:
from torch.nn import BCELoss

loss = BCELoss()
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        i,a,g, label = batch
        outputs = model(input_ids = i, attention_mask=a, gazetteer = g)
        loss = loss(outputs,label)
        loss.backward()
        optim.step()

model.eval()

In [None]:
from torch.nn import CrossEntropyLoss

loss1 = CrossEntropyLoss()
optim = AdamW(model.parameters(), lr=5e-5)

batch = 32
for i in tqdm(range(df_train.shape[0]//batch)):
  text = []
  g = None
  for t in tqdm(range(batch*i, (batch*i)+batch)):
    optim.zero_grad()
    df_train['Tag'][j].unsqueeze(0).to(device)
    text.append(df_train['Word'][t])
    if t == batch*i:
      g = Gazetteer(df_train['Word'][t]).unsqueeze(0).to(device)
    else:
      g = torch.cat((g,Gazetteer(df_train['Word'][t]).unsqueeze(0).to(device)),0)



  inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')
  inp = inputs['input_ids'].to(device)
  a = inputs['attention_mask'].to(device)
  label = df_train['Tag'][j].unsqueeze(0).to(device)
  outputs = model(input_ids = inp, attention_mask=a, gazetteer = g)
  loss = loss1(outputs,label)
  if j % 100 == 0:
    print(loss)
  loss.backward()
  optim.step()

model.eval()



In [None]:
from torch.nn import CrossEntropyLoss

loss1 = CrossEntropyLoss()
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for j in tqdm(range(df_train.shape[0]//32)):
        optim.zero_grad()
        
        i,a,g, label = df_train['input_ids'][j].to(device) , df_train['attention_mask'][j].to(device) ,df_train['gazetteer'][j].to(device) ,df_train['Tag'][j].unsqueeze(0).to(device)
        outputs = model(input_ids = i, attention_mask=a, gazetteer = g)
        loss = loss1(outputs,label)
        if j % 100 == 0:
          print(loss)
        loss.backward()
        optim.step()

model.eval()

ValueError: ignored