### DATA ACQUISITION

##### PARANMT_50M_DATASET

https://paperswithcode.com/dataset/paranmt-50m



In [1]:
import os
from typing import Optional
from datasets import load_dataset, IterableDataset
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

# current working directory changes when imported from other modules, so to ensure para_nmt_path is correct we store
# the absolute path to the module for reference.
package_directory = os.path.dirname(os.path.abspath('__file__'))


class PARANMT_50M_DATASET(LightningDataModule):
    
    file_path = os.path.join(package_directory, "para-nmt-5m-processed.zip")

    def __init__(self, opt_type, batch_size, steps_per_epoch, num_workers=0, seed=69, pre_tokenize=True):
        """
        Parameters
        ----------
        opt_name: str
            name of the OPT model type (i.e. facebook/opt-350m)
        batch_size: int
            batch_size output by dataloader
        steps_per_epoch: int
            dataset_size = steps_per_epoch * batch_size
            Since we do not know the dataset size we simply leave it to the user to determine how many steps per epoch
            we should have.
        num_workers: int
            refer to note above on PR https://github.com/huggingface/datasets/pull/4375
        seed: int
            haha funny number
        pre_tokenize: bool
            should we tokenize the texts (if true: dataset will return tokenized ids instead of source text)
        """

        super().__init__()
        self.opt_type = opt_type
        self.batch_size = batch_size
        self.steps_per_epoch = steps_per_epoch
        self.num_workers = num_workers
        self.seed = seed
        self.pre_tokenize = pre_tokenize

        # init None to make pycharm happy
        self.tokenizer = None
        self.dataset = None

    def prepare_data(self) -> None:
        # download and cache
        GPT2Tokenizer.from_pretrained(self.opt_type)

    def setup(self, stage: Optional[str] = None) -> None:
        # load tokenizer (should be cached)
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.opt_type, use_fast=False)

        # preprocess function for the dataset's entries
        def preprocess(examples):
            # list of len batch
            batch = examples['text']
            processed_batch = list()
            for i in batch:
                # replace the \t splitting with a '</s>' token to denote source-target
                processed_batch.append(str.replace(i, "\t", self.tokenizer.eos_token))

            if self.pre_tokenize:
                outputs = self.tokenizer(
                    processed_batch,
                    truncation=True,
                    max_length=69,
                )
            else:
                outputs = {"source": processed_batch}
            return outputs

        # init dataset in streaming mode
        self.dataset = load_dataset("text", data_files=self.file_path, streaming=True)['train']
        
        # elements within buffer size will be shuffled as they are loaded in
        self.dataset = self.dataset.shuffle(seed=self.seed, buffer_size=10_000)
        
        # preprocessing will take place while being streamed by dataloader
        self.dataset = self.dataset.map(preprocess, batched=True, remove_columns=['text'])
        
        # ensure pytorch tensors are returned
        self.dataset = self.dataset.with_format("torch")

        # monkeypatch of __len__ function in the dataloader so that the trainer knows how many
        # steps there are per epoch. Sure this violates many programming paradigms but it works.
        n = self.steps_per_epoch

        def __len__(self):
            return n

        IterableDataset.__len__ = __len__

    # dataloaders are basically all the same since we cannot split a streamed dataset
    def train_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def val_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def test_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def predict_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader


if __name__ == "__main__":
    model_name = "facebook/opt-1.3b"
    datamodule = PARANMT_50M_DATASET(model_name, 1, 1000, seed=1337)
    datamodule.setup()
    dl = datamodule.val_dataloader()
    it = iter(dl)

    for i in range(10):
        print(datamodule.tokenizer.batch_decode(next(it)['input_ids'])[0])

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-12387731b9302b2b


</s>where a reference is made to this paragraph, article 5 of regulation ( eu ) no 182/2011 shall apply.</s>where reference is made to this article, article 5 of regulation ( eu ) no 182 / 2011 shall apply.
</s>don't worry, pete, i'm coming to save you.</s>don't worry, pete.
</s>opening of sitting the sitting opened at 09.00. 2.</s>start of the session the session started at 9 p.m.
</s>it'il get as fat as you are.</s>he 'll be as fat as you.
</s>that had struck jess and will as funny, because everyone knew that your posterior was the scientific name for your situpon.</s>this seemed to be jess and will's funny, since everyone knew that the back tie was a scientific name for the butt.
</s>his long, lethal fingers rhythmically clawed the ground as they gained strength.</s>the long, deadly fingers moved rhythmically into the soil, and the original forces were slow.
</s>oh... honey.</s>oh, baby...
</s>i couldn't tell half the time if he was talking... or you were reading his mind.</s>i coul

##### PARABANK_Dataset
https://paperswithcode.com/dataset/parabank

In [2]:
"""
README from parabank-2.0.zip
The TSV file contains ParaBank 2, a diverse collection of paraphrases generated
through bilingual generation. Details of the dataset and how it's created can
be found here:
Hu, J. E., A. Singh, N. Holzenberger, M. Post, & B. Van Durme. 2019. Large-scale,
Diverse, Paraphrastic Bitexts via Sampling and Clustering. Proceedings of CoNLL 2019,
Hong Kong, Nov 3 – Nov 4, 2019.
Each line of the file contains a bilingual dual-condition score, a reference
sentence, and paraphrases of the same reference sentence. A reference sentence may
have between one to five distinct paraphrases. The lines are in descending
order of the dual-conditioned score, a measurement of the quality of the
original bilingual sentence pair. Within the same line, paraphrases are ranked by
model score as described in the paper - i.e., the first paraphrase from left
to right correspond to the system with subscript "1" in evaluation, and the
last to "5". All sentences are raw text (untokenized). The reference sentences
appear in ascending order of their bidirectional model scores (the lower the
better), which we use to filter the bilingual resource used to generate ParaBank 2.
"""
from typing import Optional
from datasets import load_dataset, IterableDataset
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling


class PARABANK_Dataset(LightningDataModule):
    """
    LightningDataModule for parabank dataset for causal language modelling
    Note on num_workers: https://github.com/huggingface/datasets/pull/4375
    IterableDatasets do not support Dataloaders with num_workers > 0. Watch the PR to see if the fix will be merged.
    """
    parabank_url = "http://cs.jhu.edu/~vandurme/data/parabank-2.0.zip"

    def __init__(self, opt_name, batch_size, steps_per_epoch, num_workers=0, seed=69, pre_tokenize=True):
        """
        Parameters
        ----------
        opt_name: str
            name of the OPT model type (i.e. facebook/opt-350m)
        batch_size: int
            batch_size output by dataloader
        steps_per_epoch: int
            dataset_size = steps_per_epoch * batch_size
            Since we do not know the dataset size we simply leave it to the user to determine how many steps per epoch
            we should have.
        num_workers: int
            refer to note above on PR https://github.com/huggingface/datasets/pull/4375
        seed: int
            haha funny number
        pre_tokenize: bool
            should we tokenize the texts (if true: dataset will return tokenized ids instead of source text)
        """

        super().__init__()
        self.opt_name = opt_name
        self.batch_size = batch_size
        self.steps_per_epoch = steps_per_epoch
        self.num_workers = num_workers
        self.seed = seed
        self.pre_tokenize = pre_tokenize

        # init None to make pycharm happy
        self.tokenizer = None
        self.dataset = None

    def prepare_data(self) -> None:
        # download and cache
        GPT2Tokenizer.from_pretrained(self.opt_name)

    def setup(self, stage: Optional[str] = None) -> None:
        # load tokenizer (should be cached)
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.opt_name, use_fast=False)

        # preprocess function for the dataset's entries
        def preprocess(examples):
            # list of len batch
            batch = examples['text']
            processed_batch = list()
            for i in batch:
                # split by \t (it is a tsv file) and omit the initial dual-condition score (it is useless)
                i = i.split('\t')[1:]
                # filter entries without paraphrases and split them with a '</s>' token to denote source-target
                if len(i) > 1:
                    processed_batch.append(i[0] + self.tokenizer.eos_token + i[1])

            if self.pre_tokenize:
                outputs = self.tokenizer(
                    processed_batch,
                    truncation=True,
                    max_length=69,
                )
            else:
                outputs = {"source": processed_batch}
            return outputs

        # init dataset in streaming mode
        self.dataset = load_dataset("text", data_files=self.parabank_url, streaming=True)['train']
        # elements within buffer size will be shuffled as they are loaded in
        self.dataset = self.dataset.shuffle(seed=self.seed, buffer_size=10_000)
        # preprocessing will take place while being streamed by dataloader
        self.dataset = self.dataset.map(preprocess, batched=True, remove_columns=['text'])
        # ensure pytorch tensors are returned
        self.dataset = self.dataset.with_format("torch")

        # monkeypatch of __len__ function in the dataloader so that the trainer knows how many
        # steps there are per epoch. Sure this violates many programming paradigms but it works.
        n = self.steps_per_epoch

        def __len__(self):
            return n

        IterableDataset.__len__ = __len__

    # dataloaders are basically all the same since we cannot split a streamed dataset
    def train_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def val_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def test_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def predict_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader


if __name__ == "__main__":
    model_name = "facebook/opt-1.3b"
    datamodule = PARABANK_Dataset(model_name, 1, 1000, seed=1337)
    datamodule.setup()
    dl = datamodule.val_dataloader()
    it = iter(dl)

    for i in range(10):
        print(datamodule.tokenizer.batch_decode(next(it)['input_ids'])[0])

Using custom data configuration default-a233796b5026b737


</s>E-3612/10 (RO) Elena Oana Antonescu (PPE) to the Commission (25 May 2010)</s>
</s>Much of the poor majority of the world is mired in a vicious circle of disease, poverty, and political instability.</s>A great deal of the poor majority of the world is drowned in an enchanted ring of ills, poverty, and political instalency.
</s>You promised me 24 hours.</s>I've been promised 24 hours.
</s>Management of deep-sea fish stocks (vote)</s>Management of fishing stocks in deep waters (vote)
</s>Forty seconds!</s>40 seconds!
</s>Regulation (EEC) No 3846/87 should therefore be amended accordingly.</s>Regulation (EEC) No 3846/87 should therefore be amended in accordance with the above-mentioned provisions.
</s>Richard saved me.</s>Richard rescued me.
</s>He had a good teacher.</s>He had a good tutor.
</s>For the first time since 1974.</s>First since 1974.
</s>You are in my hospital.</s>You are in my Hospital.


##### QUORA_Dataset

##### DataCombiner

In [3]:
from typing import List, Type, Optional

from datasets import IterableDataset, interleave_datasets
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling


class DataCombiner(LightningDataModule):
    """
    LightningDataModule for combining different datasets for causal language modelling
    Note on num_workers: https://github.com/huggingface/datasets/pull/4375
    IterableDatasets do not support Dataloaders with num_workers > 0. Watch the PR to see if the fix will be merged.
    """
    def __init__(self, opt_name, batch_size, steps_per_epoch, datamodules: List[Type[LightningDataModule]],
                 probabilities: List[float], num_workers=0, seed=69, pre_tokenize=True):
        """
        Parameters
        ----------
        opt_name: str
            Name of model type
        batch_size: int
            batch_size output by dataloader
        steps_per_epoch: int
            dataset_size = steps_per_epoch * batch_size
            Since we do not know the dataset size we simply leave it to the user to determine how many steps per epoch
            we should have.
        datamodules: List[Type[LightningDataModule]]
            List specifying the datamodules whose datasets will be interleaved
        probabilities: List[float]
            List of probabilities for respective datamodules that should sum to 1
        num_workers: int
            refer to note above on PR https://github.com/huggingface/datasets/pull/4375
        seed: int
            haha funny number
        pre_tokenize: bool
            should we tokenize the texts (if true: dataset will return tokenized ids instead of source text)
        """
        super().__init__()
        self.opt_name = opt_name
        self.batch_size = batch_size
        self.steps_per_epoch = steps_per_epoch
        self.num_workers = num_workers
        self.seed = seed
        self.pre_tokenize = pre_tokenize
        self.datamodules = datamodules
        self.probabilities = probabilities
        self.tokenizer = None
        self.dataset = None

        # sanity check
        assert sum(self.probabilities) == 1, "Probabilities for interleaved datasets do not sum to 1.0"

    def prepare_data(self) -> None:
        # download and cache
        GPT2Tokenizer.from_pretrained(self.opt_name)

    def setup(self, stage: Optional[str] = None) -> None:
        # tokenizer is not actually used once instantiated but to stay consistent with other datamodule implementations
        # we instantiate it anyway
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.opt_name, use_fast=False)

        # instantiate all the datamodules and extract the dataset from them
        datasets = list()
        for datamodule in self.datamodules:
            dm = datamodule(self.opt_name, self.batch_size, self.steps_per_epoch,
                            seed=self.seed, pre_tokenize=self.pre_tokenize)
            dm.setup()
            datasets.append(dm.dataset)

        self.dataset = interleave_datasets(datasets, probabilities=self.probabilities, seed=self.seed)
        self.dataset = self.dataset.with_format("torch")

        # monkeypatch of __len__ function in the dataloader so that the trainer knows how many
        # steps there are per epoch. Sure this violates many programming paradigms but it works.
        n = self.steps_per_epoch

        def __len__(self):
            return n

        IterableDataset.__len__ = __len__

    # dataloaders are basically all the same since we cannot split a streamed dataset
    def train_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def val_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def test_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader

    def predict_dataloader(self):
        dataloader = DataLoader(self.dataset,
                                batch_size=self.batch_size,
                                num_workers=self.num_workers)
        if self.pre_tokenize: dataloader.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        return dataloader


if __name__ == "__main__":
    model_name = "facebook/opt-1.3b"
    datamodule = DataCombiner(model_name, 1, 1000, [PARABANK_Dataset, PARABANK_Dataset],
                                        probabilities=[0.35, 0.65], seed=1337, pre_tokenize=False)
    datamodule.setup()
    dl = datamodule.val_dataloader()
    it = iter(dl)

    for i in range(10):
        print(next(it))

Using custom data configuration default-a233796b5026b737
Using custom data configuration default-a233796b5026b737


{'source': ['E-3612/10 (RO) Elena Oana Antonescu (PPE) to the Commission (25 May 2010)</s>']}
{'source': ['E-3612/10 (RO) Elena Oana Antonescu (PPE) to the Commission (25 May 2010)</s>']}
{'source': ['Much of the poor majority of the world is mired in a vicious circle of disease, poverty, and political instability.</s>A great deal of the poor majority of the world is drowned in an enchanted ring of ills, poverty, and political instalency.']}
{'source': ["You promised me 24 hours.</s>I've been promised 24 hours."]}
{'source': ['Management of deep-sea fish stocks (vote)</s>Management of fishing stocks in deep waters (vote)']}
{'source': ['Much of the poor majority of the world is mired in a vicious circle of disease, poverty, and political instability.</s>A great deal of the poor majority of the world is drowned in an enchanted ring of ills, poverty, and political instalency.']}
{'source': ["You promised me 24 hours.</s>I've been promised 24 hours."]}
{'source': ['Management of deep-sea 

### FINE TUNE SETUP

In [1]:
import wandb
from pytorch_lightning import LightningModule
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import OPTForCausalLM

import torch
from torch.optim import Adam


class FineTuneOPT(LightningModule):
    """
    very straightforward direct fine tuning on the OPT model
    """
    def __init__(self, model_name="facebook/opt-350m"):
        super().__init__()
        self.model = OPTForCausalLM.from_pretrained(model_name)
        self.save_hyperparameters()

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        # we care only about the last token being predicted
        pred_token_logits = logits[:, -1, :]
        pred_token = torch.argmax(pred_token_logits, dim=-1)
        labels = batch["labels"][:, -1]

        self.log("val_loss", val_loss)

        return {"loss": val_loss, "preds": pred_token, "labels": labels}

    def configure_optimizers(self):
        optimizer = Adam(self.model.parameters(), **wandb.config["optimizer_params"])

        # configure learning rate scheduler
        lr_scheduler = ReduceLROnPlateau(optimizer, **wandb.config["lr_scheduler_params"])

        lr_scheduler_config = {"scheduler": lr_scheduler}
        lr_scheduler_config.update(wandb.config["lr_scheduler_config"])

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}

    """
    Note on following hooks (on_train_epoch_start and on_validation_epoch_start):
    Using the following code to access dataloaders: self.train_dataloader().dataset.set_epoch(self.current_epoch) 
    Results in an exception like such : pytorch_lightning.utilities.exceptions.MisconfigurationException: 
    `val_dataloader` must be implemented to be used with the Lightning Trainer 
    Although train_dataloader() is a valid hook, the hook is overridden only in the datamodule and we cannot reference
    that. We have to use self.trainer.train_dataloader.dataset which returns some CombinedDataset and then .datasets
    that one to get the original TorchIterableDataset.
    On the other hand, we can access validation dataloaders with self.trainer.val_dataloaders[0].dataset as that one is
    apparently a list and not a CombinedDataset.
    Pain.
    """

    def on_train_epoch_start(self) -> None:
        # reshuffle the dataset for every train epoch
        self.trainer.train_dataloader.dataset.datasets.set_epoch(self.trainer.current_epoch)

    def on_validation_epoch_start(self) -> None:
        # reshuffle the dataset for every validation epoch
        self.trainer.val_dataloaders[0].dataset.set_epoch(self.trainer.current_epoch)

  from .autonotebook import tqdm as notebook_tqdm


### Fine Tune In Action

In [5]:
import os
import torch
import wandb
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger


if __name__ == "__main__":
    # initialisation steps
    torch.cuda.empty_cache()
    AVAIL_GPUS = min(1, torch.cuda.device_count())

    run = wandb.init(project="fine-tune-opt")

    with run:
        datamodule = DataCombiner(wandb.config["model_name"], batch_size=wandb.config["batch_size"],
                                            steps_per_epoch=wandb.config["steps_per_epoch"],
                                            datamodules=[PARABANK_Dataset, PARANMT_50M_DATASET],
                                            probabilities=[0.5, 0.5])
        datamodule.setup()

        if (wandb.config["load_from_checkpoint"] is not None) and (os.path.isfile(wandb.config["load_from_checkpoint"])):
            model = FineTuneOPT.load_from_checkpoint(checkpoint_path=wandb.config["load_from_checkpoint"])
        else:
            model = FineTuneOPT(wandb.config["model_name"])

        checkpoint_callback = ModelCheckpoint(dirpath=wandb.config["checkpoint_save_dir"],
                                              save_top_k=2, monitor="val_loss",
                                              filename="fine-tune-opt-epoch={epoch:03d}-val_loss={val_loss:.3f}")

        # create wandb logger (obviously)
        wandb_logger = WandbLogger(checkpoint_callback=False)

        print("TRAINING MODEL")
        trainer = Trainer(max_epochs=wandb.config["max_epochs"], gpus=AVAIL_GPUS,
                          check_val_every_n_epoch=wandb.config["check_val_every_n_epoch"],
                          callbacks=[checkpoint_callback],
                          logger=wandb_logger)
        trainer.fit(model, datamodule=datamodule)

    wandb.finish()

    print("Training complete.")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mteosh-wp19[0m ([33mnlp-assignment[0m). Use [1m`wandb login --relogin`[0m to force relogin


Using custom data configuration default-a233796b5026b737
Using custom data configuration default-12387731b9302b2b
  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TRAINING MODEL


Using custom data configuration default-a233796b5026b737
Using custom data configuration default-12387731b9302b2b
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type           | Params
-----------------------------------------
0 | model | OPTForCausalLM | 1.3 B 
-----------------------------------------
1.3 B     Trainable params
0         Non-trainable params
1.3 B     Total params
5,263.032 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 29: 100%|██████████| 500/500 [4:23:16<00:00, 31.59s/it, loss=4.89, v_num=qu3d]      


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train_loss,▆▆▃▁▁▅▄▅▅▅▅▄▅█▅▅▄▄▆▄▃▆▃▃▂▃▅▃▃▂▃▃▂▂▄▂▂▂▂▄
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,▃▆▇▆█▆▆▃▃▃▂▂▁▁▁

0,1
epoch,29.0
train_loss,5.36745
trainer/global_step,7499.0
val_loss,4.8551


Training complete.


### Generate paraphrase examples

In [6]:
from transformers import GPT2Tokenizer

In [51]:
model = FineTuneOPT.load_from_checkpoint(checkpoint_path=r"C:\Users\HADOOP\Desktop\NLP\training_checkpoints\07-06-2022-optimize\fine-tune-opt-epoch=epoch=027-val_loss=val_loss=4.868.ckpt")
tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-1.3b")
tokenizer.padding_side = 'left'

In [52]:
model = model.to("cuda")
model = model.eval()
i = "I agree with that"
encoded_inputs = tokenizer([i + "</s> "], padding=True, return_tensors='pt')



In [53]:
output = model.model.generate(inputs=encoded_inputs['input_ids'].to("cuda"),
                                                max_length=100,
                                                use_cache=False)

outputs = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
outputs

["I agree with that illed's the Bol.......... Century. o....11. Century........lympics. Century..ics.ics...lympicsope Bol.... Century. Century.. Century Bol...lympics.lympics.lympics.lympics.lympics.lympics.lympics.lympics.lympics.lympics.lymp"]

['Dog is a cute and friendly animal. ']