In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from transformers import AdamW
from tqdm import tqdm  
import random

from torch.utils.data import random_split

from torch.utils.data import DataLoader

from transformers import TrainingArguments
from transformers import Trainer


# BERT BASE uncased
## Loading BERT Tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',Fast = True)

## Loading Pre Trained Model

In [2]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased',return_dict=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Data Pre Processing 
## Loading File and sub sampling 

In [None]:
with open('Roman_Urdu_Twitter.txt', 'r', encoding='utf-8') as fp:
    text = fp.read().split('\n')
sub_text = random.sample(text,101338)

## Creating Tokens

In [None]:
inputs = tokenizer(sub_text, return_tensors='pt', max_length=64, truncation=True, padding='max_length')

In [None]:
torch.save(inputs,'inputs_sub_new.pt')

In [3]:
inputs = torch.load('inputs_sub_new.pt')

## Creating Labels  

In [4]:
inputs['labels'] = inputs.input_ids.detach().clone()

## Masking

In [5]:
random_tensor = torch.rand(inputs['input_ids'].shape)

In [6]:
masked_tensor = (random_tensor < 0.15)*(inputs['input_ids'] != 101)*(inputs['input_ids'] != 102)*(inputs['input_ids'] != 0)

In [7]:
# getting all those indices from each row which are set to True, i.e. masked.
nonzeros_indices = []
for i in range(len(masked_tensor)):
    nonzeros_indices.append(torch.flatten(masked_tensor[i].nonzero()).tolist())

In [8]:
# setting the values at those indices to be a MASK token (103) for every row in the original input_ids.
for i in range(len(inputs['input_ids'])):
    inputs['input_ids'][i, nonzeros_indices[i]] = 103
    
# 101 CLS TOKEN  
# 103 MASK
# 102 SCP Token
# 0 PADDING TOKEN

## Custom Dataset Class Pytorch

In [9]:
class RomanUrduDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        input_ids = self.encodings['input_ids'][index]
        labels = self.encodings['labels'][index]
        attention_mask = self.encodings['attention_mask'][index]
        token_type_ids = self.encodings['token_type_ids'][index]
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids}

In [10]:
dataset = RomanUrduDataset(inputs)

## Train and validation dataset split

In [23]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

In [13]:
train_dataset, val_dataset = random_split(dataset, [train_size,val_size])

## Hyperparameters

In [15]:
epochs = 10
batch_size = 12
#optimizer = AdamW(model.parameters(), lr=1e-5)

# Pytorch DataLoader

In [16]:
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True
)

### Device setting

In [17]:
model.to(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

device(type='cuda')

# Training using Hugging Face Trainer

In [19]:
args = TrainingArguments(
    output_dir='MLM/new/model',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size= batch_size,
    num_train_epochs=epochs,
    logging_dir = 'MLM/new/log',
    evaluation_strategy="steps",
    logging_steps=1000,
    logging_first_step=True,
    save_steps=1000,
)

In [21]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset, 
    eval_dataset = val_dataset
)

In [22]:
results = trainer.train()

***** Running training *****
  Num examples = 91204
  Num Epochs = 10
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 76010


Step,Training Loss,Validation Loss
1000,0.4009,0.298398
2000,0.2927,0.281184
3000,0.2867,0.272883
4000,0.2738,0.266143
5000,0.2675,0.26104
6000,0.2638,0.255184
7000,0.264,0.252878
8000,0.2418,0.254567
9000,0.2178,0.249827
10000,0.222,0.25039


***** Running Evaluation *****
  Num examples = 10134
  Batch size = 12
Saving model checkpoint to MLM/new/model\checkpoint-1000
Configuration saved in MLM/new/model\checkpoint-1000\config.json
Model weights saved in MLM/new/model\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10134
  Batch size = 12
Saving model checkpoint to MLM/new/model\checkpoint-2000
Configuration saved in MLM/new/model\checkpoint-2000\config.json
Model weights saved in MLM/new/model\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10134
  Batch size = 12
Saving model checkpoint to MLM/new/model\checkpoint-3000
Configuration saved in MLM/new/model\checkpoint-3000\config.json
Model weights saved in MLM/new/model\checkpoint-3000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10134
  Batch size = 12
Saving model checkpoint to MLM/new/model\checkpoint-4000
Configuration saved in MLM/new/model\checkpoint-4000\config.json
Model weigh

In [24]:
results

TrainOutput(global_step=76010, training_loss=0.11892884478704221, metrics={'train_runtime': 69813.9237, 'train_samples_per_second': 13.064, 'train_steps_per_second': 1.089, 'total_flos': 3.0006664683264e+16, 'train_loss': 0.11892884478704221, 'epoch': 10.0})

In [25]:
type(results)

transformers.trainer_utils.TrainOutput