#Imports


In [1]:
!pip install git+https://github.com/csebuetnlp/normalizer
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-89r3t068
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-89r3t068
  Resolved https://github.com/csebuetnlp/normalizer to commit d80c3c484e1b80268f2b2dfaf7557fe65e34f321
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:

from normalizer import normalize
import torch
from transformers import ElectraTokenizer, ElectraForPreTraining, ElectraForTokenClassification, AdamW
from transformers import pipeline, AutoTokenizer, AutoModelForPreTraining , BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle

In [3]:
from tqdm import tqdm

#Gazeteer import

In [4]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

class Trie:
    def __init__(self):
        self.root = TrieNode()
        self.entity_tags = ["PER", "LOC", "CW", "CORP", "GRP", "PROD"]
        self.tag_encoding = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4,
                             "B-CW": 5, "I-CW": 6, "B-CORP": 7, "I-CORP": 8, 
                             "B-GRP": 9, "I-GRP": 10, "B-PROD": 11, "I-PROD": 12}

    def insert(self, word, entity_type):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        node.entity_type = entity_type

    def search(self, sentence):
        encoding = [0] * len(sentence)
        for i in range(len(sentence)):
            node = self.root
            for j in range(i, len(sentence)):
                char = sentence[j]
                if char not in node.children:
                    break
                node = node.children[char]
                if node.is_end_of_word:
                    entity_type = node.entity_type
                    entity_length = j - i + 1
                    start_index = i
                    end_index = j
                    for k in range(start_index, end_index + 1):
                        if k == start_index:
                            encoding[k] = self.tag_encoding["B-" + entity_type]
                        else:
                            encoding[k] = self.tag_encoding["I-" + entity_type]
                    break
        one_hot_encoding = [[0] * 13 for i in range(len(encoding))]
        for i, tag in enumerate(encoding):
            one_hot_encoding[i][tag] = 1
        return one_hot_encoding

import numpy as np

def group_encodings_by_word(encoding, sentence):
    # Create an empty list to store the word encodings
    word_encodings = []
    
    # Create an empty list to store the current word encoding
    current_word_encoding = []
    
    # Create an empty string to store the current word
    current_word = ""
    
    # Iterate over each character encoding and character in the input encoding list and sentence, respectively
    for char_encoding, char in zip(encoding, sentence):
        # If the current character is a whitespace character, finish the current word and add its first character encoding to the word encodings list
        if char == " ":
            if len(current_word_encoding) > 0:
                word_encodings.append(np.array(current_word_encoding[0]))
                current_word_encoding = []
            current_word = ""
        # If the current character is part of a word, append the character encoding to the current word encoding and the character to the current word
        else:
            current_word_encoding.append(char_encoding)
            current_word += char
    
    # Add the last word encoding to the word encodings list, if it exists
    if len(current_word_encoding) > 0:
        word_encodings.append(np.array(current_word_encoding[0]))
    
    # Return the word encodings as a NumPy array
    return torch.from_numpy(np.array(word_encodings)).type(torch.float32)



def save_trie(trie, filename):
    with open(filename, "wb") as f:
        pickle.dump(trie, f)

def load_trie(filename):
    with open(filename, "rb") as f:
        trie = pickle.load(f)
    return trie


In [5]:

def Gtoken(text):
  inputs = tokenizer.encode_plus(text, return_tensors='pt')

  encoded_dict = tokenizer.encode_plus(
                  text,       # Sentence to encode.
                  add_special_tokens = False, # Add '[CLS]' and '[SEP]'
                  max_length = 314,           # Pad & truncate all sentences.
                  padding = 'max_length',
                  return_attention_mask = True,   # Construct attn. masks.
                  return_tensors = 'pt',
                  truncation=False)
  input_ids = encoded_dict['input_ids']
  tokenized = tokenizer.convert_ids_to_tokens([i.item() for i in input_ids.squeeze() if i > 1])
  return "< "+" ".join(tokenized)+" >"

In [6]:
# tree = Trie()
# entity_tags = ["PER", "LOC", "CW", "CORP", "GRP", "PROD"]
# for entity in entity_tags:
#     with open('{}.txt'.format(entity), 'r', encoding = 'utf-8') as file:
#         print(entity)
#         for line in file:
#             tree.insert(Gtoken(normalize(line.strip())), entity)

# save_trie(tree, "Trie.bin")


In [8]:
%cd /content/drive/MyDrive/Bracu/THESIS/Trie DS/

/content/drive/MyDrive/Bracu/THESIS/Trie DS


In [10]:
model_name = '/content/drive/MyDrive/Thesis/BERTOUTPUT/checkpoint-11000/'
tokenizer = ElectraTokenizer.from_pretrained(model_name)

In [11]:
trie = load_trie('Trie final.bin')
sentence = Gtoken(normalize("লিওনিদ ক্যান্টোরোভিচ টোকিও বিশ্ববিদ্যালয় এ"))
encoding = trie.search(sentence)
# for i, word in enumerate(sentence):
#     # print(encoding[i],word)

word_encodings = group_encodings_by_word(encoding,sentence)

# Print the word encodings
for word, word_encoding in zip(sentence.split(), word_encodings):
    print(word, word_encoding)

print(word_encodings.shape)

< tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
লিও tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
##নি tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
##দ tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
ক্যান্ট tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
##োর tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
##োভিচ tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
টোকিও tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
বিশ্ববিদ্যালয় tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
এ tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
> tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
torch.Size([11, 13])


In [12]:
def pad_tensor(tensor):
    current_size = tensor.size(0)
    if current_size >= 64:
        return tensor[:64, :]  # if the tensor is larger than (64, 13), truncate it
    
    padded_tensor = torch.full((64, 13),0.)
    padded_tensor[:current_size, :] = tensor  # copy the input tensor to the padded tensor
    return padded_tensor



In [13]:
def Gazetteer(sentence, pad = True):
  sentence = Gtoken(normalize(sentence))
  encoding = trie.search(sentence)
  tensor = group_encodings_by_word(encoding,sentence)
  if pad:
    current_size = tensor.size(0)
    if current_size >= 64:
        return tensor[:64, :]  # if the tensor is larger than (64, 13), truncate it
    
    padded_tensor = torch.full((64, 13),0.)
    padded_tensor[:current_size, :] = tensor  # copy the input tensor to the padded tensor
    return padded_tensor
  else: 
    return tensor

In [None]:
trie.search("ব্লেকলকের চিত্রকর্মগুলি ২০২০ চলচ্চিত্র আমি বিষয় শেষ করার চিন্তা করছি এর একটি মূল প্লট পয়েন্ট")

In [15]:
Gazetteer("ব্লেকলকের চিত্রকর্মগুলি ২০২০ চলচ্চিত্র আমি বিষয় শেষ করার চিন্তা করছি এর একটি মূল প্লট পয়েন্ট")

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        

#Dataset Load

In [16]:
%cd /content/drive/MyDrive/Bracu/THESIS/DatasetSir

/content/drive/MyDrive/Bracu/THESIS/DatasetSir


In [17]:
# Load the NER dataset
df_train = pd.read_csv('trainData2022PP2.csv')
df_val = pd.read_csv('devData2022PP2.csv')
df_test = pd.read_csv('testData2022PP2.csv')

In [18]:
d = {'O': 0, 'B-CORP': 1, 'I-CORP': 2, 'B-CW': 3, 'I-CW': 4, 'B-GRP': 5, 'I-GRP': 6, 'B-LOC': 7, 'I-LOC': 8, 'B-PER': 9, 'I-PER': 10, 'B-PROD': 11, 'I-PROD': 12}
def label_encoder(x):
    x = eval(x)
    y = []
    for i in x:
      y.append(d[i])
    return y

def text_normalizer(x):
    return normalize(x)

df_train['Word'] = df_train['Word'].apply(lambda x: text_normalizer(x))
df_test['Word'] = df_test['Word'].apply(lambda x: text_normalizer(x))
df_val['Word'] = df_val['Word'].apply(lambda x: text_normalizer(x))
df_train['Tag'] = df_train['Tag'].apply(lambda x: label_encoder(x))
df_test['Tag'] = df_test['Tag'].apply(lambda x: label_encoder(x))
df_val['Tag'] = df_val['Tag'].apply(lambda x: label_encoder(x))
df_train.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,২০১৮ এর সেরা বর্ণানুক্রমিকভাবে তালিকাভুক্ত র‍্...,"[0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,Sentence: 10,সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।,"[0, 0, 0, 0, 3, 4, 0]"
2,Sentence: 100,করে বাদ্যযন্ত্রের থিম এবং চলচ্চিত্রের জন্য প্র...,"[5, 0, 0, 0, 0, 0, 0, 0]"
3,Sentence: 1000,প্রতিষ্ঠান ২২১১ ইঙ্গিত করে যে আউগুস্তুস পৃথক ক...,"[0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,শো স্টপারে তাদের বেকড আলাস্কা করতে সাড়ে চার ঘ...,"[0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0]"


In [19]:
def one_hot_encode_list(lst, pad = True):

    one_hot_tensors = torch.zeros(len(lst), 13)
    for i, elem in enumerate(lst):
        one_hot = torch.zeros(13)
        # print(type(elem), elem)
        one_hot[elem] = 1
        
        one_hot_tensors[i] = one_hot
    if pad:
      tensor = one_hot_tensors
      current_size = tensor.size(0)
      if current_size >= 64:
          return tensor[:64, :]  # if the tensor is larger than (64, 13), truncate it
      
      padded_tensor = torch.full((64, 13),0.)
      padded_tensor[:current_size, :] = tensor  # copy the input tensor to the padded tensor
      return padded_tensor


one_hot_encode_list([0, 0, 0, 0, 3, 4, 0]).size()

torch.Size([64, 13])

In [20]:
df_val.tail()

Unnamed: 0,Sentence #,Word,Tag
796,Sentence: 95,১৬ থেকে ২৫ সেন্টিমিটার সারির মধ্যে একটি দূরত্ব...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11,..."
797,Sentence: 96,এই পাথরগুলির প্রায়ই তাদের কোণগুলি কাটা থাকে এ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0,..."
798,Sentence: 97,এই সময়কালে তিনি স্বপ্নও দেখেছিলেন যে নিরো ক্ল...,"[0, 0, 0, 0, 0, 0, 9, 10, 10, 0, 0, 0, 0, 0, 0..."
799,Sentence: 98,১২ মাস পর্যন্ত বয়স হওয়ায় ছিদ্রটি জলপাই তেল ...,"[0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0, 0]"
800,Sentence: 99,প্রতিযোগিতামূলক অপারেটিং সিস্টেমের তুলনায় সফট...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [21]:
def align_label(df):
  for row in tqdm(range(df.shape[0])):
    text = df['Word'][row]
    Tag = df['Tag'][row]
    inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')
    tokenized = tokenizer.convert_ids_to_tokens([i.item() for i in inputs['input_ids'].squeeze() if i > 1])
    i = 0
    j = 0
    new_tag = []
    prev_tag = 0
    while i < len(tokenized):
      if tokenized[i].startswith('['): 
        new_tag.append(0)
      elif tokenized[i].startswith('#'):
        if prev_tag % 2 != 0:
          new_tag.append(prev_tag+1)
        else:
          new_tag.append(prev_tag)
      else:
        if tokenized[i] == '.' or tokenized[i] == '′':
          new_tag.append(Tag[j])
          prev_tag = Tag[j]
        else:
          new_tag.append(Tag[j])
          prev_tag = Tag[j]
          j+=1
      i+=1
    df['Tag'][row] = new_tag
  return df


In [22]:
df_val = align_label(df_val)


100%|██████████| 801/801 [00:00<00:00, 863.88it/s]


In [23]:
df_train = align_label(df_train)

100%|██████████| 15301/15301 [00:21<00:00, 726.72it/s]


In [24]:
df_test = align_label(df_test)

100%|██████████| 133114/133114 [02:31<00:00, 876.66it/s]


In [25]:
df_train.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,২০১৮ এর সেরা বর্ণানুক্রমিকভাবে তালিকাভুক্ত র‍্...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0]"
1,Sentence: 10,সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।,"[0, 0, 0, 0, 0, 3, 4, 0, 0]"
2,Sentence: 100,করে বাদ্যযন্ত্রের থিম এবং চলচ্চিত্রের জন্য প্র...,"[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Sentence: 1000,প্রতিষ্ঠান ২২১১ ইঙ্গিত করে যে আউগুস্তুস পৃথক ক...,"[0, 0, 0, 0, 0, 0, 0, 9, 10, 10, 10, 0, 0, 0, ..."
4,Sentence: 10000,শো স্টপারে তাদের বেকড আলাস্কা করতে সাড়ে চার ঘ...,"[0, 0, 0, 0, 0, 11, 12, 12, 12, 0, 0, 0, 0, 0,..."


In [26]:
text = "ট্র্যাক ৭ থেকে একদা... ১৯৭৭ ক্যাসাব্লাঙ্কা"
inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')
tokenized = tokenizer.convert_ids_to_tokens([i.item() for i in inputs['input_ids'].squeeze() if i > 1])
print(tokenized)
Tag = [0, 0, 0, 3, 0, 0]
i = 0
j = 0
new_tag = []
prev_tag = 0
while i < len(tokenized):
  if tokenized[i].startswith('['): 
    new_tag.append(0)
  elif tokenized[i].startswith('#'):
    if prev_tag % 2 != 0:
      new_tag.append(prev_tag+1)
    else:
      new_tag.append(prev_tag)
  else:
    if tokenized[i] == '.':
      new_tag.append(Tag[j])
      prev_tag = Tag[j]
    else:
      new_tag.append(Tag[j])
      prev_tag = Tag[j]
      j+=1
  
  i+=1
print(new_tag)

['[CLS]', 'ট্র্যাক', '৭', 'থেকে', 'একদা', '.', '.', '.', '১৯৭৭', 'ক্যাস', '##াব', '##্লা', '##ঙ্কা', '[SEP]']
[0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [27]:
d

{'O': 0,
 'B-CORP': 1,
 'I-CORP': 2,
 'B-CW': 3,
 'I-CW': 4,
 'B-GRP': 5,
 'I-GRP': 6,
 'B-LOC': 7,
 'I-LOC': 8,
 'B-PER': 9,
 'I-PER': 10,
 'B-PROD': 11,
 'I-PROD': 12}

In [28]:
df_val['Tag'][1]

[0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [29]:
df_val['Tag'] =  df_val['Tag'].apply(lambda x: one_hot_encode_list(x))
df_train['Tag'] =  df_train['Tag'].apply(lambda x: one_hot_encode_list(x))
df_test['Tag'] =  df_test['Tag'].apply(lambda x: one_hot_encode_list(x))

In [30]:
df_train.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,২০১৮ এর সেরা বর্ণানুক্রমিকভাবে তালিকাভুক্ত র‍্...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0..."
1,Sentence: 10,সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0..."
2,Sentence: 100,করে বাদ্যযন্ত্রের থিম এবং চলচ্চিত্রের জন্য প্র...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0..."
3,Sentence: 1000,প্রতিষ্ঠান ২২১১ ইঙ্গিত করে যে আউগুস্তুস পৃথক ক...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0..."
4,Sentence: 10000,শো স্টপারে তাদের বেকড আলাস্কা করতে সাড়ে চার ঘ...,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0..."


#Model

In [31]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
# bert = ElectraForTokenClassification.from_pretrained(model_name, output_hidden_states=True)

In [None]:
# text1 = "সিনেমায় গানটির বৈশিষ্ট্য রয়েছে রাস্তা যাত্রা ।"
# text2 = "সিনেমায় গানটির বৈশিষ্ট্য"
# inputs1 = tokenizer(text1, max_length = 64, padding = 'max_length', return_tensors='pt')
# inputs2 = tokenizer(text1, max_length = 64, padding = 'max_length', return_tensors='pt')
# i

In [32]:
class NERClassification(nn.Module):
  def __init__(self, n_classes):
    super(NERClassification, self).__init__()
    self.bert = ElectraForTokenClassification.from_pretrained(model_name, output_hidden_states=True)
    self.drop = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.gOut = nn.Linear(n_classes,n_classes)
    self.final_out = nn.Linear(2*n_classes, n_classes)
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, input_ids, attention_mask, gazetteer):
    pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    hidden_states = pooled_output.hidden_states

    # print(hidden_states)
    
    layer6 = hidden_states[6]
    layer12 = hidden_states[12]
    layer18 = hidden_states[18]


    averaged_hidden_states = torch.mean(torch.stack([layer6, layer12, layer18]), dim=0)

    # print(pooled_output)

    output = self.drop(averaged_hidden_states)
    output = self.out(output)
    goutput = self.gOut(gazetteer)
    final_output = torch.concat((output,goutput), dim = 2)
    final_output = self.final_out(final_output)
    final_output = self.softmax(final_output)
    return final_output

In [33]:
model = NERClassification(n_classes=13)

In [34]:


text = "১৯৬০ শীতকালীন অলিম্পিকে স্পেন কোর্স ক্যাটালগ,"
inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')

input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)
gazetteer = Gazetteer(text).unsqueeze(0).to(device)
# Run the model
outputs = model(input_ids=input_ids, attention_mask=attention_mask, gazetteer = gazetteer)

print(outputs.size())

# Get the predicted label
x, predicted_label = torch.max(outputs, dim=2)

predicted_label

# Print the predicted label
# print(predicted_label.item())

RuntimeError: ignored

In [35]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()


NERClassification(
  (bert): ElectraForTokenClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(32000, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-23): 24 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linea

In [36]:
df_train.shape[0]

15301

In [None]:
# from torch.nn import BCELoss

# loss1 = BCELoss()
# optim = AdamW(model.parameters(), lr=5e-5)

# for epoch in range(3):
#     for j in tqdm(range(df_train.shape[0])):
#         optim.zero_grad()
#         i,a,g, label = df_train['input_ids'][j].to(device) , df_train['attention_mask'][j].to(device) ,df_train['gazetteer'][j].to(device) ,df_train['Tag'][j].unsqueeze(0).to(device)
#         outputs = model(input_ids = i, attention_mask=a, gazetteer = g)
#         loss = loss1(outputs,label)
#         if j % 100 == 0:
#           print(loss)
#         loss.backward()
#         optim.step()

# model.eval()

In [None]:
# class NERDataset(Dataset):
#   def __init__(self,df):
#     self.input_ids = df['input_ids']
#     self.attention_mask = df['attention_mask']
#     self.gazetteer = df['gazetteer']
#     self.label = df['Tag']
#     self.n_samples = df.shape[0]

#   def __getitem__(self, idx):
#     return self.input_ids[idx].to(device),self.attention_mask[idx].to(device),self.gazetteer[idx].to(device),self.label[idx].to(device)
  
#   def __len__(self):
#     return self.n_samples
    

In [None]:
# train_dataset = NERDataset(df_train)

In [None]:
# train_dataset.n_samples

15301

In [None]:
# train_loader = DataLoader(dataset = train_dataset, batch_size=1, shuffle=True)

In [None]:
# from torch.nn import BCELoss

# loss = BCELoss()
# optim = AdamW(model.parameters(), lr=5e-5)

# for epoch in range(3):
#     for batch in train_loader:
#         optim.zero_grad()
#         i,a,g, label = batch
#         outputs = model(input_ids = i, attention_mask=a, gazetteer = g)
#         loss = loss(outputs,label)
#         loss.backward()
#         optim.step()

# model.eval()

In [37]:
from torch.nn import CrossEntropyLoss

loss1 = CrossEntropyLoss()
optim = AdamW(model.parameters(), lr=5e-5)

batch = 32
for epoch in tqdm(range(10)):
  for i in range((df_train.shape[0]//batch)-1):
    text = []
    g = None
    label = None
    for t in range(batch*i, (batch*i)+batch):
      optim.zero_grad()
      text.append(df_train['Word'][t])
      if t == batch*i:
        g = Gazetteer(df_train['Word'][t]).unsqueeze(0).to(device)
        label = df_train['Tag'][t].unsqueeze(0).to(device)
      else:
        g = torch.cat((g,Gazetteer(df_train['Word'][t]).unsqueeze(0).to(device)),0)
        label = torch.cat((label,df_train['Tag'][t].unsqueeze(0).to(device)),0)



    inputs = tokenizer(text, max_length = 64, padding = 'max_length', return_tensors='pt')
    inp = inputs['input_ids'].to(device)
    a = inputs['attention_mask'].to(device)
    outputs = model(input_ids = inp, attention_mask=a, gazetteer = g)
    loss = loss1(outputs,label)
    if i % 100 == 0:
      print(loss)
    loss.backward()
    optim.step()
  if epoch+1 % 3 == 0:
    with open('model e{}.pkl'.format(epoch+1), 'wb') as f:
      pickle.dump(model, f)
model.eval()



  0%|          | 0/10 [00:00<?, ?it/s]

tensor(5.8297, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.3045, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.9094, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8796, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2426, device='cuda:0', grad_fn=<DivBackward1>)


 10%|█         | 1/10 [08:08<1:13:20, 488.98s/it]

tensor(5.6469, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2236, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8893, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8722, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2429, device='cuda:0', grad_fn=<DivBackward1>)


 20%|██        | 2/10 [16:18<1:05:15, 489.46s/it]

tensor(5.6421, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2275, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8890, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8786, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2415, device='cuda:0', grad_fn=<DivBackward1>)


 30%|███       | 3/10 [24:28<57:08, 489.79s/it]  

tensor(5.6417, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2236, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8881, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8718, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2424, device='cuda:0', grad_fn=<DivBackward1>)


 40%|████      | 4/10 [32:40<49:03, 490.54s/it]

tensor(5.6364, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2228, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8888, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8709, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2402, device='cuda:0', grad_fn=<DivBackward1>)


 50%|█████     | 5/10 [40:49<40:49, 489.98s/it]

tensor(5.6380, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2229, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8865, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8702, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2473, device='cuda:0', grad_fn=<DivBackward1>)


 60%|██████    | 6/10 [48:58<32:37, 489.48s/it]

tensor(5.6364, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2240, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8888, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8725, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2398, device='cuda:0', grad_fn=<DivBackward1>)


 70%|███████   | 7/10 [57:06<24:27, 489.24s/it]

tensor(5.6362, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2227, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8914, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8787, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2426, device='cuda:0', grad_fn=<DivBackward1>)


 80%|████████  | 8/10 [1:05:15<16:18, 489.06s/it]

tensor(5.6455, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2243, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8965, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8698, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2398, device='cuda:0', grad_fn=<DivBackward1>)


 90%|█████████ | 9/10 [1:13:24<08:08, 488.97s/it]

tensor(5.6385, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2227, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.8901, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.8679, device='cuda:0', grad_fn=<DivBackward1>)
tensor(5.2396, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 10/10 [1:21:33<00:00, 489.31s/it]


NERClassification(
  (bert): ElectraForTokenClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(32000, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-23): 24 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linea

In [None]:
%cd /content

/content


In [76]:
import pickle
with open('model10.pkl', 'wb') as f:
    pickle.dump(model, f)

In [75]:
 'model e{}.pkl'.format(9+1)

'model e10.pkl'