In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

## Loading Model
<hr>

In [2]:
# Name of the pretrained checkpoint; also used to load the tokenizer and to
# derive the output directory name.
model_checkpoint = "bert-base-uncased"
# Load the checkpoint with a masked-language-modeling head. The recorded warning
# about unused `cls.seq_relationship.*` weights is reported as expected by the
# library message itself.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Tokenizer
<hr>

In [3]:
# Tokenizer matching the model checkpoint.
# `truncation` is an option of the tokenizer *call*, not of `from_pretrained`, so
# passing it at load time has no effect on tokenization and is dropped here.
# Texts are tokenized untruncated and regrouped into fixed-size chunks afterwards.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Dataset
<hr>

In [4]:
from datasets import load_dataset

# Load the local text file with the generic 'text' builder as a single 'train' split.
urdu_ds = load_dataset('text', data_files={'train': ['Roman_Urdu_Twitter.txt']} )

Using custom data configuration default-6d2b9b11ef8e8dc4
Reusing dataset text (C:\Users\Tayyab\.cache\huggingface\datasets\text\default-6d2b9b11ef8e8dc4\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


  0%|          | 0/1 [00:00<?, ?it/s]

## Preprocessing
<hr>

In [5]:
def preProcess(text):
    """Return a new record whose "text" has no surrounding whitespace.

    Args:
        text: Mapping with a "text" entry holding the raw string.

    Returns:
        A fresh dict with the single key "text" mapped to the input string with
        leading and trailing whitespace (tabs included) removed.
    """
    # str.strip() without arguments already removes tabs together with every
    # other leading/trailing whitespace character.
    return {"text": text["text"].strip()}

# Apply preProcess across the loaded dataset to strip surrounding whitespace.
cleaned = urdu_ds.map(preProcess)

Loading cached processed dataset at C:\Users\Tayyab\.cache\huggingface\datasets\text\default-6d2b9b11ef8e8dc4\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-98b2a4f37384c637.arrow


In [6]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook-level tokenizer.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    return tokenizer(examples["text"])

# Use batched=True to activate fast multithreading!
# The raw "text" column is removed so only the tokenizer outputs remain.
tokenized_datasets = cleaned.map(tokenize_function, batched=True, remove_columns=["text"])
# Display the summary of the tokenized dataset.
tokenized_datasets

Loading cached processed dataset at C:\Users\Tayyab\.cache\huggingface\datasets\text\default-6d2b9b11ef8e8dc4\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-c68c74a2aed6aaf5.arrow


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 3040167
    })
})

In [7]:
# Number of tokens per example after regrouping; read as a global by group_texts.
chunk_size = 64

In [8]:
def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size chunks.

    Reads the module-level ``chunk_size``.

    Args:
        examples: Batch mapping of column name -> list of per-text token lists.
            Must contain an "input_ids" column.

    Returns:
        Dict with the same columns, each a list of ``chunk_size``-long lists, plus
        a "labels" column that is a shallow copy of the chunked "input_ids".
        Trailing tokens that do not fill a whole chunk are dropped.
    """
    columns = list(examples.keys())
    # Flatten every column in one linear pass. sum(lists, []) rebuilds the
    # accumulated list on each addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: [token for sequence in examples[k] for token in sequence] for k in columns
    }
    # Length of the concatenated batch, measured on the first column
    # (columns are assumed to be equally long).
    total_length = len(concatenated_examples[columns[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into consecutive chunk_size-long pieces
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
# Regroup the tokenized texts into chunk_size-token examples and add labels.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
# Display the summary of the regrouped dataset.
lm_datasets

Loading cached processed dataset at C:\Users\Tayyab\.cache\huggingface\datasets\text\default-6d2b9b11ef8e8dc4\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-4279f5072cd1d942.arrow


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1615951
    })
})

In [10]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modeling; 0.15 is the masking probability handed to it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

## Train/Test Split
<hr>

In [11]:
# Sizes of the down-sampled splits; the test split is 10% of the train split.
train_size = 200_000
test_size = int(0.1 * train_size)

# Draw both splits from the chunked training data with a fixed seed.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size,
    test_size=test_size,
    seed=14,
)
# Display the summary of the two splits.
downsampled_dataset

Loading cached split indices for dataset at C:\Users\Tayyab\.cache\huggingface\datasets\text\default-6d2b9b11ef8e8dc4\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-88f56ff9f3823e75.arrow and C:\Users\Tayyab\.cache\huggingface\datasets\text\default-6d2b9b11ef8e8dc4\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-a9b57ffe17ce1c52.arrow


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 200000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 20000
    })
})

In [12]:
# Per-device batch size, used for both training and evaluation.
batch_size = 14
# Last "/"-separated component of the checkpoint name; used to build the output directory.
model_name = model_checkpoint.split("/")[-1]

# Optimizer settings passed to TrainingArguments.
learning_rate = 1e-5
weight_decay = 0.01

## Training Arguments
<hr>

In [13]:
from transformers import TrainingArguments

# Training configuration. Results are written to "<model_name>-finetuned-Roman_Urdu",
# and an existing output directory is overwritten.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-Roman_Urdu",
    overwrite_output_dir=True,
    # Optimization
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    # Batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate on a step schedule; log every 1000 steps and at the first step.
    evaluation_strategy="steps",
    logging_steps=1000,
    logging_first_step=True,
)

In [14]:
from transformers import Trainer

# Bundle the model, the training arguments, the down-sampled splits and the
# masked-language-modeling collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

## Perplexity before Training
<hr>

In [None]:
import math

# Evaluate on the test split before any fine-tuning has been run.
eval_results = trainer.evaluate()
# Perplexity is reported as the exponential of the evaluation loss.
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

## Training
<hr>

In [15]:
# Run fine-tuning with the configuration held by the trainer.
trainer.train()

***** Running training *****
  Num examples = 200000
  Num Epochs = 3
  Instantaneous batch size per device = 14
  Total train batch size (w. parallel, distributed & accumulation) = 14
  Gradient Accumulation steps = 1
  Total optimization steps = 42858


Step,Training Loss,Validation Loss
1000,4.3056,3.923249
2000,3.9004,3.721309
3000,3.7681,3.596276
4000,3.6502,3.512527
5000,3.5711,3.429344
6000,3.519,3.390066
7000,3.459,3.328017
8000,3.4349,3.289398
9000,3.3823,3.248232
10000,3.3293,3.230811


Saving model checkpoint to bert-base-uncased-finetuned-Roman_Urdu\checkpoint-500
Configuration saved in bert-base-uncased-finetuned-Roman_Urdu\checkpoint-500\config.json
Model weights saved in bert-base-uncased-finetuned-Roman_Urdu\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 14
Saving model checkpoint to bert-base-uncased-finetuned-Roman_Urdu\checkpoint-1000
Configuration saved in bert-base-uncased-finetuned-Roman_Urdu\checkpoint-1000\config.json
Model weights saved in bert-base-uncased-finetuned-Roman_Urdu\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to bert-base-uncased-finetuned-Roman_Urdu\checkpoint-1500
Configuration saved in bert-base-uncased-finetuned-Roman_Urdu\checkpoint-1500\config.json
Model weights saved in bert-base-uncased-finetuned-Roman_Urdu\checkpoint-1500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 14
Saving model checkpoint to bert-base-uncased-finetuned-

TrainOutput(global_step=42858, training_loss=3.213156647085615, metrics={'train_runtime': 50905.7241, 'train_samples_per_second': 11.786, 'train_steps_per_second': 0.842, 'total_flos': 1.974036096e+16, 'train_loss': 3.213156647085615, 'epoch': 3.0})

## Perplexity after Training
<hr>

In [18]:
import math
# Evaluate on the test split again, now that training has finished.
eval_results = trainer.evaluate()
# Perplexity is reported as the exponential of the evaluation loss.
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 20000
  Batch size = 14


>>> Perplexity: 18.16


In [1]:
# NOTE(review): the recorded run of this cell has execution count 1 and ended in a
# NameError, i.e. it ran before eval_results existed in the kernel. Re-run it after
# the evaluation cell, or remove it.
eval_results

NameError: name 'eval_results' is not defined

In [None]:
# Scratch note: these values match global_step, training_loss and epoch in the
# TrainOutput recorded after trainer.train().
42858 , 3.213156647085615 , 3.0

<hr>