In [1]:
import json
import torch
import torch.nn as nn
import numpy as np  
import datasets
from datasets import Dataset as Datasets
from datasets import DatasetDict
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer,AutoTokenizer, DataCollatorForTokenClassification, logging

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
if torch.cuda.is_available():
    device=torch.device("cuda:0")
    print("Running on the GPU")
    torch.cuda.empty_cache()

else:
    device=torch.device("cpu")
    print("Running on the CPU")


Running on the GPU


In [3]:
# This are some functions to help in loading and saving json file

def save_json(data,name):
    with open(name+'.json', 'w',encoding='utf8') as f:
        json.dump(data, f,ensure_ascii=False)
        
def save_jsonl(data,name):
    with open(name+'.jsonl', 'w') as f:
        for entry in data:
            json.dump(entry, f)
            outfile.write('\n')

def load_json(path):
    f = open (path, "r")
    data = json.loads(f.read())
    f.close()
    return data

def load_jsonl(path):
    f = open(path,"r").readlines()
    data=[]
    for i in f:
        d = json.loads(i)
        data.append(d)
    return data

# Data Preparation

The raw datasets are processed according to the need. The notebook for this is shared in another notebook. Let's load the data and make a dictionary of whole dataset. 

In [4]:
# Replace the derectory according to yours
train_data = load_json("path/to/train.json")
valid_data = load_json("path/to/valid.json")
test_data = load_json("path/to/test.json")

all_data_dict = DatasetDict({"train": Datasets.from_dict(train_data),"valid":Datasets.from_dict(valid_data),"test":Datasets.from_dict(test_data)})

The dataset is prepared in such way that each spaces separated parts in a sentence will have a label. But during tokenization those parts may get splitted, so, we will have to readjust the labels according to the tokenizer with some other basic operations during preprocessing of the dataset.

In [5]:
def preprocess_function(examples):
    
    dic = {}
    dic["input_ids"] = []
    dic['token_type_ids'] = []
    dic['attention_mask'] = []
    dic["labels"] = []
     
    for i in range(len(examples["sentences"])):
        sen_li = examples["sentences"][i].split(" ")
        tok = tokenizer(sen_li, max_length=20, truncation=True)

        dic_input_ids = [2]
        dic_token_type_ids = [0]
        dic_attention_mask = [1]
        dic_labels = [0]

        input_ids =tok["input_ids"]
        token_type_ids =tok['token_type_ids']
        attention_mask=tok['attention_mask']
        label=examples['label'][i]

        for t in range(len(sen_li)):
            input_id_len = len(input_ids[t])
            dic_input_ids.extend(input_ids[t][1:input_id_len-1])
            dic_token_type_ids.extend(token_type_ids[t][1:input_id_len-1])
            dic_attention_mask.extend(attention_mask[t][1:input_id_len-1])
            dic_labels.extend([label[t]]*(input_id_len-2))
    
        
        dic["input_ids"].append(dic_input_ids+[3]+[0]*(256-len(dic_input_ids)-1))
        dic['token_type_ids'].append(dic_token_type_ids+[0]*(256-len(dic_token_type_ids)))
        dic['attention_mask'].append(dic_attention_mask+[1]+[0]*(256-len(dic_attention_mask)-1))
        dic['labels'].append(dic_labels+[0]*(256-len(dic_labels)))
    
    return dic


In [6]:
# Let's load the tokenizer first
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")

# Creat the dataset according to the preprocessing
all_dataset = all_data_dict.map(preprocess_function, batched=True)
all_dataset_back = all_dataset.copy() # Keeping a copy
all_dataset=all_dataset.remove_columns(['sentences','entity','label']) #Dropping unnecessary columns for training

# Preparing the collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Modifying Trainer

After preparing the data the next task is to changing the trainer according to our need. In our case, we are doning Bengali NER only for Persons. As it is a binary classinfication we will have to change the loss calculation process in the trainer. Also, token's are imbalanced here. So, we will use a F1 score a our metric. For this purpose we will write another function here.

In [7]:

class MyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
        self.my_loss = nn.BCELoss(reduction = 'none')
        self.sigmoid = nn.Sigmoid()
        
    def compute_loss(self, model, inputs,return_outputs=False):

        outputs = model(**inputs)
        outputs.logits = outputs.logits[:,:,0]
        att_mask = inputs.attention_mask.type(torch.float32) # attention mask will be used to exclude redundent portion from loss calculation
        
        loss = torch.sum(self.my_loss(self.sigmoid(outputs.logits),inputs.labels.type(torch.float32))*att_mask)/torch.sum(att_mask)
        
        return (loss, outputs) if return_outputs else loss
    

def compute_metrics(eval_pred):
    predictions, label_ids, inputs = eval_pred
    pred_mask = (inputs!=2)*(inputs!=3)*(inputs!=0)*1 # This mask will be used to exclude redundent parts from calculation
    predictions = (1/(1 + np.exp(-predictions))>0.5)*1

    TP = np.sum((label_ids==predictions)*label_ids*pred_mask)
    FP = np.sum((label_ids!=predictions)*predictions*pred_mask)
    FN = np.sum((label_ids!=predictions)*label_ids*pred_mask)
    
    F1 = TP/(TP+0.5*(FP+FN)+1e-10)
    
    result = {}
    result["F1"]=F1
    
    return {k: round(v, 4) for k, v in result.items()}

# Training

After completing all the basic needs for training we can now start our training.

In [8]:
import wandb
wandb.login(key='your wandb log in key') # logging in to wandb

# Loading model
model = AutoModelForTokenClassification.from_pretrained("csebuetnlp/banglabert", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    metric_for_best_model="F1",
    num_train_epochs=3,
    weight_decay=0.01,
    include_inputs_for_metrics =True,
    seed = 0
)

trainer = MyTrainer(
    model=model.to(device),
    args=training_args,
    train_dataset=all_dataset["train"],
    eval_dataset=all_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the model checkpoint at csebuetnlp/banglabert were not used when initializing ElectraForTokenClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You shoul

In [9]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msknahin[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.0719,0.03567,0.8923
2,0.0344,0.043089,0.8982
3,0.0143,0.04424,0.8992


TrainOutput(global_step=2178, training_loss=0.033961700123313156, metrics={'train_runtime': 947.9656, 'train_samples_per_second': 36.748, 'train_steps_per_second': 2.298, 'total_flos': 4551266908827648.0, 'train_loss': 0.033961700123313156, 'epoch': 3.0})

# Inference

In [10]:
out = trainer.predict(all_dataset["test"])
predictions, label_ids ,metric_= out
print(metric_)

{'test_loss': 0.04884174093604088, 'test_F1': 0.8852, 'test_runtime': 8.3207, 'test_samples_per_second': 111.889, 'test_steps_per_second': 7.091}


In [11]:
all_sentences = all_dataset_back["test"]["sentences"]
all_inputs = np.array(all_dataset_back["test"]["input_ids"])
all_labels = np.array(all_dataset_back["test"]["labels"])
all_masks = (all_inputs!=0)*(all_inputs!=2)*(all_inputs!=3)*1
predictions = (1/(1 + np.exp(-predictions))>0.5)*all_masks

In [12]:
# The code bellow will generate a dataframe of predictions and groundtruths. 
# It is tried to keep the names separated with a comma if there are two or more names in a sentence

import pandas as pd

all_sen = []
all_name = []
all_name_leb = []

for i in range(len(predictions)):
    sen = all_sentences[i]
    name_tok_idx = np.where(predictions[i]==1)[0]
    name_tok0 = all_inputs[i][name_tok_idx]
    if len(name_tok_idx)!=0:
        name_tok = [name_tok0[0]]
        for k in range(len(name_tok_idx)-1):
            if name_tok_idx[k+1]-name_tok_idx[k]!=1:
                name_tok.extend([16])
            else:
                name_tok.extend([name_tok0[k+1]])
    else:
        name_tok = []
    name = tokenizer.decode(name_tok,skip_special_tokens=True)
    
    name_leb_tok_idx = np.where(all_labels[i]==1)[0]
    name_leb_tok0 = all_inputs[i][name_leb_tok_idx]
    if len(name_leb_tok_idx)!=0:
        name_leb_tok = [name_leb_tok0[0]]
        for k in range(len(name_leb_tok_idx)-1):
            if name_leb_tok_idx[k+1]-name_leb_tok_idx[k]!=1:
                name_leb_tok.extend([16])
            else:
                name_leb_tok.extend([name_leb_tok0[k+1]])
    else:
        name_leb_tok = []
        
    name_leb = tokenizer.decode(name_leb_tok,skip_special_tokens=True)
    
    all_sen.append(sen)
    all_name.append(name)
    all_name_leb.append(name_leb)

df = pd.DataFrame({"sentence":all_sen,"pred_name":all_name,"gt_name":all_name_leb})

In [13]:
df[0:100]

Unnamed: 0,sentence,pred_name,gt_name
0,সিলেটের উত্তর সার্কেলের সহকারী পুলিশ সুপার ( এ...,"মহিউদ্দিন সুহেল,. শফিকুর রহমান খান, আমীনসহ","মহিউদ্দিন সুহেল,. শফিকুর রহমান খান, আমীনসহ"
1,এরকম একজন হারিয়ে যাওয়া মনীষী কিশোরগঞ্জের আনন্দ...,আনন্দমোহন বসু,আনন্দমোহন বসু
2,একতরফা এই নির্বাচন করতে না পারলে শক্তির খেলায় ...,শমসের মবিন চৌধুরী,শমসের মবিন চৌধুরী
3,ভারতের কূটনীতিক দেবযানী খোবরাগাড়েকে জাতিসংঘে ব...,দেবযানী,দেবযানী
4,বিপ্রবেলঘরিয়া ইউনিয়ন আওয়ামী লীগের সভাপতি হে...,"হেলাল উদ্দীন, হোসেন","হেলাল উদ্দীন, হোসেন"
...,...,...,...
95,চট্টগ্রাম বিশ্ববিদ্যালয়ের সমাজতত্ত্ব বিভাগের ...,ইফতেখার উদ্দিনকে,অধ্যাপক ইফতেখার উদ্দিনকে
96,প্রশাসনিক এই সংকটের মধ্যেই এএপির অভ্যন্তরীণ সং...,জিতেন্দ্র সিং টোমারকে,জিতেন্দ্র সিং টোমারকে
97,ইত্তেফাক-এর মাঈনুল আলম ও ভোরের কাগজ-এর আঙ্গুর ...,"মাঈনুল আলম,ুর নাহার","মাঈনুল আলম,ুর নাহার"
98,উপমহাদেশের প্রখ্যাত নৃত্যগুরু প্রয়াত বুলবুল চৌ...,"বুলবুল চৌধুরীর, চৌধুরী","বুলবুল চৌধুরীর, চৌধুরী"


In [14]:
df.to_csv("prediction.csv",index = False)