<a href="https://colab.research.google.com/github/OmkarDhekane/Data-science-projects/blob/main/pretraining_step_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Importing libraries

In [None]:
# Mount Google Drive at /content/drive; the training corpus loaded later
# in this notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip3 install transformers
!pip3 install datasets
!pip3 install huggingface-hub

In [None]:
import random
import logging

import tensorflow as tf
from tensorflow import keras

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments,Trainer
from transformers import set_seed


# Only log error messages
tf.get_logger().setLevel(logging.ERROR)
# Set random seed.
# The Keras helper is kept for the TensorFlow side; `set_seed` from
# transformers additionally seeds PyTorch, which is the framework the
# masked-LM model in this notebook is actually loaded and trained with.
tf.keras.utils.set_random_seed(42)
set_seed(42)

# 2. Defining variables

In [None]:
# Disabled: these two are only referenced by the commented-out
# tokenizer-training cells in the "Extra" section.
# TOKENIZER_BATCH_SIZE = 1024  # Batch-size to train the tokenizer on
# TOKENIZER_VOCABULARY = 30522  # Total number of unique subwords the tokenizer can have
MAX_LENGTH = 512  # Maximum number of tokens in an input sample after padding

# Disabled: batch size and learning rate are set directly in TrainingArguments.
# TRAIN_BATCH_SIZE = 8  # Batch-size for pretraining the model on
# LEARNING_RATE = 5e-4  # Learning rate for training the model

MODEL_CHECKPOINT = "bert-base-cased"  # Name of the pretrained model on the Hugging Face Model Hub

# 3. Loading dataset

In [None]:
# Corpus selection: exactly one `train_file` assignment should be active.
# Active: n-gram-selected data (realnews + imdb) stored on the mounted Drive.
train_file = '/content/drive/MyDrive/L3 Cube/ngram_selected_data_realnews_imdb.txt'
#train_file = '/content/drive/MyDrive/L3cube_folder/L3Cube/ngram_selected_data_realnews_imdb.txt'
# Alternative: randomly selected realnews data.
#train_file = '/content/drive/MyDrive/L3cube_folder/L3Cube/random_selected_data_realnews.txt'

# Load the plain-text file with the generic "text" builder; the result is a
# DatasetDict with a single `train` split and one `text` column.
dataset = load_dataset("text", data_files=train_file)

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-35579358f20fc31c/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-35579358f20fc31c/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the loaded DatasetDict (splits, columns and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 200001
    })
})

# 4. Tokenization and splitting the dataset

In [None]:
# Use the MODEL_CHECKPOINT constant from the configuration cell rather than
# repeating the checkpoint name as a string literal, so changing the
# checkpoint only requires editing one place.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def tokenize(examples):
  """Tokenize a batch of raw text rows.

  Reads `examples["text"]` and passes it to the notebook-level `tokenizer`,
  truncating and padding every entry to `MAX_LENGTH` tokens.
  """
  texts = examples["text"]
  return tokenizer(
      texts,
      max_length=MAX_LENGTH,
      padding="max_length",
      truncation=True,
  )

# Tokenize the train split in batches, drop the raw "text" column, then
# shuffle the tokenized rows with a fixed seed.
tokenized_dataset = (
    dataset['train']
    .map(tokenize, batched=True, num_proc=1, remove_columns=["text"])
    .shuffle(seed=42)
)

In [None]:
# Hold out 10% of the tokenized rows for evaluation. The seed is passed
# explicitly (same value used elsewhere in this notebook) so the split is
# identical on every run instead of depending on the session's RNG state.
realnews = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = realnews['train']
eval_dataset  = realnews['test']

# Sanity check: sizes of the two splits.
print(len(train_dataset),len(eval_dataset))

180000 20001


In [None]:
# Display the full tokenized dataset (all rows, i.e. before the train/eval split).
tokenized_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200001
})

# 5. Define MLM Objective and BERT-Model

In [None]:
# Masked-language-modeling settings for the batch collator.
MLM_PROB = 0.2  # Probability with which tokens are masked in MLM
MLM_FLAG = True  # Enable the masked-LM objective in the collator

# Collator handed to the Trainer below; it builds each batch using the
# tokenizer and the masking settings above.
collater = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=MLM_FLAG,
    mlm_probability=MLM_PROB,
)

In [None]:
# Load the masked-LM model from the MODEL_CHECKPOINT constant defined in the
# configuration cell rather than repeating the checkpoint name as a literal.
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 6. Train Model

In [None]:
training_args = TrainingArguments(
    # --- checkpointing ---
    output_dir='/content/outputs',     # directory where model checkpoints are written
    overwrite_output_dir=True,
    save_strategy='epoch',             # save once per epoch
    # --- evaluation ---
    evaluation_strategy='epoch',       # evaluate once per epoch
    per_device_eval_batch_size=32,     # evaluation batch size
    # --- batch size / memory ---
    per_device_train_batch_size=12,    # training batch size; as high as GPU memory allows
    gradient_accumulation_steps=8,     # accumulate 8 batches (12 x 8 = 96 samples per device) per weight update
    gradient_checkpointing=True,
    fp16=True,
    # --- optimisation ---
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=2,                # number of training epochs, feel free to tweak
)

In [None]:
# Wire the model, the training configuration, the MLM collator and the two
# dataset splits into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collater,
)

In [None]:
# Run training and keep the returned value for inspection in the next cell.
results = trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,2.4384,2.233584


In [None]:
# Show what trainer.train() returned.
print(results)

# 7. Evaluate Model


In [None]:
# Evaluate the trained model; with no arguments this presumably uses the
# eval_dataset given to the Trainer (NOTE(review): confirm against the
# installed transformers version).
trainer.evaluate()

# 8. Saving model to huggingface-hub

In [None]:
## list of pretraining files to upload on hub

import os
# os.listdir('/content/outputs/checkpoint-2388')

['rng_state.pth',
 'scheduler.pt',
 'training_args.bin',
 'generation_config.json',
 'pytorch_model.bin',
 'scaler.pt',
 'optimizer.pt',
 'config.json',
 'trainer_state.json']

In [None]:
## create a local folder that acts as local repository

# os.mkdir('/content/bert-base-cased-realnews-1M-ngram-pretrained')

In [None]:
## login to your account

# from huggingface_hub import notebook_login
# notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
## create a remote repository on hub

# from huggingface_hub import create_repo
# create_repo("OmkarDhekane/bert-base-realnews-1M-ngram-pretrained" ,repo_type="model")

RepoUrl('https://huggingface.co/OmkarDhekane/bert-base-realnews-1M-ngram-pretrained', endpoint='https://huggingface.co', repo_type='model', repo_id='OmkarDhekane/bert-base-realnews-1M-ngram-pretrained')

In [None]:
## create an object that refer to both local and remote repos.
## `local_dir`  -> local repo.
## `clone_from` -> remote repo.
## note: generally keep the local and remote repo names the same; here they are kept different.

# from huggingface_hub import Repository
# repo = Repository(local_dir="/content/bert-base-cased-realnews-1M-ngram-pretrained", clone_from="OmkarDhekane/bert-base-realnews-1M-ngram-pretrained")

Cloning https://huggingface.co/OmkarDhekane/bert-base-realnews-1M-ngram-pretrained into local empty directory.


In [None]:
## copying the model files i.e. from outputs/<filenames>  to    local repository/folder created in last cell

# import shutil
# outputfiles = ['rng_state.pth',
#               'scheduler.pt',
#               'training_args.bin',
#               'generation_config.json',
#               'pytorch_model.bin',
#               'scaler.pt',
#               'optimizer.pt',
#               'config.json',
#               'trainer_state.json']

# for f in outputfiles:
#   src = f'/content/outputs/checkpoint-2388/{f}'
#   dst = f'/content/bert-base-cased-realnews-1M-ngram-pretrained/{f}'
#   shutil.copyfile(src,dst)

In [None]:
## create a zip of the model (optional; not used for pushing the model, but run it once to be safe in case the Colab session crashes)
#shutil.make_archive('bert-base-realnews-1M-ngram-pretrained', 'zip', '/content/bert-base-realnews-1M-ngram-pretrained')

'/content/bert-base-realnews-1M-ngram-pretrained.zip'

In [None]:
## fetch the latest changes from remote repo.

# from huggingface_hub import Repository
# repo.git_pull()

In [None]:
## finally!! push from local repo. to remote repo.

#repo.push_to_hub(commit_message="First Commit")

## and done!

Upload file optimizer.pt:   0%|          | 1.00/827M [00:00<?, ?B/s]

Upload file scheduler.pt:   0%|          | 1.00/627 [00:00<?, ?B/s]

Upload file scaler.pt:   0%|          | 1.00/557 [00:00<?, ?B/s]

Upload file rng_state.pth:   0%|          | 1.00/14.2k [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.50k [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 1.00/413M [00:00<?, ?B/s]

To https://huggingface.co/OmkarDhekane/bert-base-realnews-1M-ngram-pretrained
   820f334..d0430d1  main -> main

   820f334..d0430d1  main -> main



'https://huggingface.co/OmkarDhekane/bert-base-realnews-1M-ngram-pretrained/commit/d0430d18e2f62aa39ad03eaf496bdad4a404f2fc'

In [None]:
#repo.push_to_hub(commit_message="First Commit")

In [None]:
#from transformers import AutoModel
#pretrained_bert = AutoModel.from_pretrained('OmkarDhekane/bert-base-realnews-1M-ngram-pretrained')

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at OmkarDhekane/bert-base-realnews-1M-ngram-pretrained were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Extra...

## Training new tokenizer from realnews dataset

In [None]:
# all_texts = [ doc for doc in dataset['train']['text'] if len(doc) > 0 ]

# def batch_iterator():
#     for i in range(0, len(all_texts), TOKENIZER_BATCH_SIZE):
#         yield all_texts[i : i + TOKENIZER_BATCH_SIZE]

In [None]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
## train new tokenizer from existing tokenizer(bert) for realnews set
## it just changes the vocab while keeping the same tokenizer for bert

# tokenizer = tokenizer.train_new_from_iterator(
#     batch_iterator(),
#     vocab_size=TOKENIZER_VOCABULARY)

### Saving and loading tokenizer to/from huggingface hub

In [None]:
# tokenizer.save_pretrained("realnews-ngram-selected-tokenizer")

('realnews-ngram-selected-tokenizer/tokenizer_config.json',
 'realnews-ngram-selected-tokenizer/special_tokens_map.json',
 'realnews-ngram-selected-tokenizer/vocab.txt',
 'realnews-ngram-selected-tokenizer/added_tokens.json',
 'realnews-ngram-selected-tokenizer/tokenizer.json')

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# from huggingface_hub import create_repo
# create_repo("OmkarDhekane/realnews-ngram-selected-tokenizer" ,repo_type=None)

RepoUrl('https://huggingface.co/OmkarDhekane/realnews-ngram-selected-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='OmkarDhekane/realnews-ngram-selected-tokenizer')

In [None]:
# tokenizer.push_to_hub("OmkarDhekane/realnews-ngram-selected-tokenizer",
#                       commit_message="First Commit")

CommitInfo(commit_url='https://huggingface.co/OmkarDhekane/realnews-ngram-selected-tokenizer/commit/1cd202cb8a4266f0e0237c72dca78fda4cd31320', commit_message='First Commit', commit_description='', oid='1cd202cb8a4266f0e0237c72dca78fda4cd31320', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# load_tokenizer = AutoTokenizer.from_pretrained('OmkarDhekane/realnews-ngram-selected-tokenizer')
# load_tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='OmkarDhekane/realnews-ngram-selected-tokenizer', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
# del trainer
# del training_args
# del model

In [None]:
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()