# Import Libraries

In [None]:
# Mount Google Drive at /content/drive; google.colab is only importable inside Google Colab.
# NOTE(review): the later cells read and write relative paths such as "data.json" —
# confirm whether this mount is still required for the notebook to run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import datasets
import transformers
import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

from transformers import RobertaConfig
from transformers import RobertaForMaskedLM # RobertaLM for learning
from transformers import RobertaTokenizerFast # After training tokenizern we will wrap it so it can be used by Roberta model

from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import Trainer, TrainingArguments

# Parameters for Training

In [None]:
# Hyper-parameters for the masked-language-model training run.
# Every constant is defined exactly once so a later duplicate cannot silently
# override an earlier value.
TRAIN_BATCH_SIZE = 64    # per-device batch size used while training
VALID_BATCH_SIZE = 256   # per-device batch size used while evaluating
VAL_EPOCHS = 1           # NOTE(review): not referenced in the visible cells — confirm it is still needed
LEARNING_RATE = 1e-4     # learning rate passed to TrainingArguments
TRAIN_EPOCHS = 20        # number of passes over the training captions
WEIGHT_DECAY = 0.01      # weight decay passed to TrainingArguments

MAX_LEN = 128            # maximum number of tokens kept per caption when encoding
SUMMARY_LEN = 20         # maximum length of caption generated by the model (unused in the visible cells)

In [None]:
# JSON file holding the captions; the loading cell reads it as a mapping from
# image path to a list of caption strings.
caption_path = "data.json"

# Preparing the Dataset

In [None]:
import os
import json


# Read the caption file explicitly as UTF-8 so the result does not depend on
# the platform's default text encoding.
with open(caption_path, 'r', encoding='utf-8') as openfile:
    json_object = json.load(openfile)

# Directory prefix that every image key is re-rooted under.
images_path = "Flicker8k_Dataset/"

raw_captions = dict(json_object)
# Original (un-rewritten) keys, kept for parity with the previous cell contents.
images = list(raw_captions.keys())

# Build a fresh mapping instead of popping/re-inserting keys in place: renaming
# keys inside the dict being iterated can overwrite an entry that has not been
# processed yet when the rewritten key collides with an existing one.
# Keys that do not end in 'jpg' are dropped; the rest keep only their file name,
# prefixed with images_path.
images_caption_dict = {
    images_path + image_path.split('/')[-1]: image_captions
    for image_path, image_captions in raw_captions.items()
    if image_path.endswith('jpg')
}

In [None]:


# Flatten the {image: [caption, ...]} mapping into one (image, caption) pair
# per caption, stripping the '<s> ' / '  <e>' sentence markers on the way.
image_caption_pairs = [
    (image_key, raw_caption.replace('<s> ','').replace('  <e>','').strip())
    for image_key, caption_list in images_caption_dict.items()
    for raw_caption in caption_list
]

images = [image_key for image_key, _ in image_caption_pairs]
captions = [cleaned for _, cleaned in image_caption_pairs]

# Two aligned columns: the image path and one of its cleaned captions.
df = pd.DataFrame({'images': images, 'captions': captions})

# ROBERTA
### Training the Decoder Model for Language Understanding and Building the Vocabulary

### Tokenizer
#### Converting captions into .txt files for training the tokenizer

In [None]:
# Store values in a dataframe column (Series object) to files, one file per record
def column_to_files(column, prefix, txt_files_dir = "./text_split"):
    """Write each value of ``column`` to its own UTF-8 text file.

    Files are named ``<id>.txt`` inside ``txt_files_dir``, where ``id`` starts
    at ``prefix`` and increases by one per row (including rows that could not
    be written, so IDs stay aligned with row positions).

    Args:
        column: object exposing ``to_list()`` (e.g. a pandas Series) whose
            values are expected to be strings.
        prefix: integer ID used for the first file.
        txt_files_dir: directory receiving the files; created if missing.

    Returns:
        The next unused ID, i.e. ``prefix`` plus the number of rows.
    """
    # Without this the directory may not exist and every open() would fail.
    os.makedirs(txt_files_dir, exist_ok=True)

    next_id = prefix
    for row in column.to_list():
        file_name = os.path.join(txt_files_dir, str(next_id) + '.txt')
        try:
            # Encode before opening so a non-string row (e.g. NaN for an empty
            # record) does not leave an empty file behind.
            payload = row.encode('utf-8')
            with open(file_name, 'wb') as f:
                f.write(payload)
        except (AttributeError, UnicodeError, OSError) as e:
            # Best effort: report the offending row and keep going.
            print(row, e)
        next_id += 1
    return next_id

# Flatten embedded newlines so each caption occupies a single line in its file.
# Series.replace("\n", " ") only swaps values that are exactly "\n"; the .str
# accessor performs the intended substring replacement.
data = df["captions"].str.replace("\n", " ", regex=False)
# Start numbering the files at 0
prefix = 0
# Create one file per caption; prefix then holds the next unused ID
prefix = column_to_files(data, prefix)

#### Training tokenizer

In [None]:
%%time 
# Collect every per-caption text file as a plain string path.
paths = list(map(str, Path(".").glob("text_split/*.txt")))

# Tokens reserved at the start of the vocabulary, in this exact order.
special_tokens = ["<s>", "<pad>", "<e>", "<unk>", "<mask>"]

# Byte-level BPE tokenizer that lower-cases its input.
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Learn merges from the caption files: at most 10000 vocabulary entries,
# ignoring pairs seen fewer than 2 times.
tokenizer.train(
    files=paths,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=special_tokens,
)

Wall time: 2min 54s


#### Save Tokenizer

In [None]:
# Persist the trained BPE tokenizer (vocab.json and merges.txt) into the
# Byte_tokenizer_finetuned directory so it can be reloaded later.
tokenizer.save_model('Byte_tokenizer_finetuned')

['Byte_tokenizer_finetuned\\vocab.json',
 'Byte_tokenizer_finetuned\\merges.txt']

## Decoder
#### Initialization & Training

In [None]:

# Architecture of the RoBERTa-style masked language model: a 6-layer encoder
# with 12 attention heads and a 10000-entry vocabulary (same size requested
# when training the tokenizer).
config = RobertaConfig(
    vocab_size=10000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Build the model directly from the config (randomly initialised weights).
# Loading "bert-base-uncased" into RobertaForMaskedLM transfers nothing — the
# checkpoint's parameter names do not match, so every weight was reported as
# unused / newly initialised — while still downloading the BERT checkpoint.
model = RobertaForMaskedLM(config=config)
print('Num parameters: ', model.num_parameters())

# Wrap the trained BPE files in a RoBERTa tokenizer; model_max_length is the
# documented name for the length limit previously passed as max_len.
tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer_finetuned', model_max_length=MAX_LEN)

loading weights file pytorch_model.bin from cache at C:\Users\giaco/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\pytorch_model.bin
Some weights of the model checkpoint at bert-base-uncased were not used when initializing RobertaForMaskedLM: ['bert.encoder.layer.8.output.dense.weight', 'bert.encoder.layer.3.attention.self.value.bias', 'bert.encoder.layer.4.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.weight', 'bert.encoder.layer.10.output.LayerNorm.bias', 'bert.encoder.layer.1.intermediate.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.5.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.embeddings.LayerNorm.weight', 'bert.encoder.layer.7.intermediate.dense.weight', 'bert.encoder.layer.11.attention.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.e

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['encoder.layer.4.output.dense.bias', 'encoder.layer.1.attention.output.dense.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.2.attention.output.LayerNorm.weight', 'lm_head.dense.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.5.output.LayerNorm.bias', 'lm_head.bias', 'encoder.layer.5.output.LayerNorm.weight', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.1.attention.self.key.weight', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.1.attention.output.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'encoder.layer.3.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.3.attention.self.query.weight', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.output

Num parameters:  51206416


In [None]:
class CustomDataset(Dataset):
    """Dataset of tokenized captions for masked-language-model training.

    Each caption is encoded once, up front, into a list of token ids that is
    truncated to ``max_length``. Sequences are stored unpadded, so they may
    differ in length.

    Args:
        df: object exposing ``.values`` (e.g. a pandas Series) that yields one
            caption string per item.
        tokenizer: tokenizer providing ``encode_plus`` whose result exposes
            ``input_ids``.
        max_length: maximum number of tokens kept per caption. Defaults to the
            notebook-level ``MAX_LEN`` when omitted.
    """

    def __init__(self, df, tokenizer, max_length=None):
        if max_length is None:
            max_length = MAX_LEN
        # padding=True is omitted on purpose: with a single sequence there is
        # nothing to pad against, so it had no effect here.
        self.examples = [
            tokenizer.encode_plus(text, max_length=max_length, truncation=True).input_ids
            for text in df.values
        ]

    def __len__(self):
        """Return the number of encoded captions."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of caption ``i`` as a 1-D tensor."""
        return torch.tensor(self.examples[i])

In [None]:
# The first 38000 captions train the language model; everything after that
# position is held out for evaluation.
caption_series = df['captions']
train_dataset = CustomDataset(caption_series.iloc[:38000], tokenizer)
eval_dataset = CustomDataset(caption_series.iloc[38000:], tokenizer)

#### Batching Data

In [None]:
from transformers import DataCollatorForLanguageModeling

# Define the Data Collator
# mlm=True selects the masked-language-modelling objective; mlm_probability=0.15
# is the probability with which each input token is chosen for masking.
# NOTE(review): the dataset returns unpadded tensors, so batching presumably
# relies on this collator to pad them — confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Training the Decoder

In [None]:
# Directory receiving checkpoints and the final model.
model_folder = "RobertaMLM_finetuned"

# Training configuration, collected in one mapping: evaluate once per epoch,
# write a checkpoint every 8192 steps and keep at most one of them.
training_options = {
    "output_dir": model_folder,
    "overwrite_output_dir": True,
    "evaluation_strategy": 'epoch',
    "num_train_epochs": TRAIN_EPOCHS,
    "learning_rate": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
    "per_device_train_batch_size": TRAIN_BATCH_SIZE,
    "per_device_eval_batch_size": VALID_BATCH_SIZE,
    "save_steps": 8192,
    "save_total_limit": 1,
}
training_args = TrainingArguments(**training_options)

# Trainer wiring the model, the masking collator and both datasets together.
trainer_options = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = Trainer(**trainer_options)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the training loop with the arguments configured on the trainer; the
# returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 38000
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 11880
  Number of trainable parameters = 51206416


Epoch,Training Loss,Validation Loss
1,5.0778,4.000446
2,3.8707,3.391426
3,3.395,3.165023
4,3.1257,2.840739
5,2.9487,2.747571
6,2.6836,2.641423
7,2.5716,2.600869
8,2.5279,2.544489
9,2.4723,2.491435
10,2.3691,2.424572


***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256
Saving model checkpoint to RobertaMLM_finetuned\checkpoint-8192


TrainOutput(global_step=11880, training_loss=2.560482644232034, metrics={'train_runtime': 1349.3729, 'train_samples_per_second': 563.225, 'train_steps_per_second': 8.804, 'total_flos': 4875959524403712.0, 'train_loss': 2.560482644232034, 'epoch': 20.0})

#### Check Perplexity score of the model

In [None]:
import math

# Evaluate on the held-out captions and report exp(eval loss) as perplexity.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

***** Running Evaluation *****
  Num examples = 2455
  Batch size = 256


Perplexity: 9.36


### Saving tokenizer & Model to use in Encoder Decoder architecture

In [None]:
# Save the wrapped tokenizer (config, special-token map, vocab and merges)
# next to the raw BPE files so it can be reloaded by directory name.
tokenizer.save_pretrained('Byte_tokenizer_finetuned')

tokenizer config file saved in Byte_tokenizer_finetuned\tokenizer_config.json
Special tokens file saved in Byte_tokenizer_finetuned\special_tokens_map.json


('Byte_tokenizer_finetuned\\tokenizer_config.json',
 'Byte_tokenizer_finetuned\\special_tokens_map.json',
 'Byte_tokenizer_finetuned\\vocab.json',
 'Byte_tokenizer_finetuned\\merges.txt',
 'Byte_tokenizer_finetuned\\added_tokens.json',
 'Byte_tokenizer_finetuned\\tokenizer.json')

In [None]:
# Write the trained model's configuration and weights into model_folder.
trainer.save_model(model_folder)

Saving model checkpoint to RobertaMLM_finetuned
Configuration saved in RobertaMLM_finetuned\config.json
Model weights saved in RobertaMLM_finetuned\pytorch_model.bin


# Evaluating Decoder(ROBERTA)

In [None]:
from transformers import pipeline

# Reload the saved model and tokenizer from their directories into a
# fill-mask pipeline, which proposes tokens for a <mask> placeholder.
fill_mask = pipeline(
    "fill-mask",
    model= r'RobertaMLM_finetuned',
    tokenizer= 'Byte_tokenizer_finetuned'
)

loading configuration file RobertaMLM_finetuned\config.json
Model config RobertaConfig {
  "_name_or_path": "RobertaMLM_finetuned",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 10000
}

loading configuration file RobertaMLM_finetuned\config.json
Model config RobertaConfig {
  "_name_or_path": "RobertaMLM_finetuned",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_toke

In [None]:
# Sanity check: ask the model for the most likely fillers of the masked word.
fill_mask("a girl going into a <mask> building")

[{'score': 0.17281213402748108,
  'token': 340,
  'token_str': ' white',
  'sequence': 'a girl going into a white building'},
 {'score': 0.11950363218784332,
  'token': 491,
  'token_str': ' large',
  'sequence': 'a girl going into a large building'},
 {'score': 0.09886179864406586,
  'token': 377,
  'token_str': ' red',
  'sequence': 'a girl going into a red building'},
 {'score': 0.06501814723014832,
  'token': 488,
  'token_str': ' yellow',
  'sequence': 'a girl going into a yellow building'},
 {'score': 0.06376089155673981,
  'token': 402,
  'token_str': ' blue',
  'sequence': 'a girl going into a blue building'}]