In [1]:
# Install the Python packages imported by the cells below.
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases than the ones shown in this cell's recorded output.
! pip install transformers pytorch_lightning hf-hub-lightning datasets sentencepiece tokenizers
# git-lfs: the training log further down shows the checkpoint being pushed as a Git LFS file.
! apt install git-lfs
# Use git's "store" credential helper (presumably so pushes to the Hub do not re-prompt; confirm).
! git config --global credential.helper store
# Interactive Hugging Face login; this prompts for an access token at run time.
! huggingface-cli login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 14.6 MB/s 
[?25hCollecting pytorch_lightning
  Downloading pytorch_lightning-1.7.7-py3-none-any.whl (708 kB)
[K     |████████████████████████████████| 708 kB 56.4 MB/s 
[?25hCollecting hf-hub-lightning
  Downloading hf_hub_lightning-0.0.2-py3-none-any.whl (3.2 kB)
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 57.5 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 58.8 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.13.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.0 MB)
[K     |████████████████████████████████|

In [2]:
class Config:
  """Settings shared by the data, dataset, model and training cells."""

  # --- Data source: identifiers handed to `load_dataset` ---
  DATASET_ID = "mlqa"
  SUBSET_ID = "mlqa.en.hi"

  # --- Where the prepared CSV splits are written and later read back ---
  TRAIN_DATA_PATH = "/content/artifacts/data/train.csv"
  VALID_DATA_PATH = "/content/artifacts/data/valid.csv"
  TEST_DATA_PATH = "/content/artifacts/data/test.csv"

  # --- Model: checkpoint to load and Hub repo the callback pushes to ---
  MODEL_CKPT = "google/mt5-base"
  MODEL_OUT = "Vasanth/mlqa-model"

  # --- Tokenisation limits (in tokens) ---
  QUESTION_MAX_LEN = 396
  ANSWER_MAX_LEN = 32

  # --- Training loop ---
  BATCH_SIZE = 4
  NUM_EPOCHS = 5

In [3]:
# Module-level settings object; the classes defined in later cells read `config.<NAME>` directly.
config = Config()

In [4]:
# import config
from datasets import load_dataset
import pandas as pd

class DataGen:
    """Download the configured dataset and write train/test/validation CSVs.

    The dataset's ``test`` split is cut in two: the first ``train_size`` rows
    become the training set and the remainder the test set. The dataset's
    ``validation`` split is used unchanged as the validation set.

    Args:
        train_size: Number of leading rows of the ``test`` split used for
            training. Defaults to 4400, the value previously hard-coded.
    """

    def __init__(self, train_size=4400):
        self.dataset_id = config.DATASET_ID
        self.subset_id = config.SUBSET_ID
        self.train_size = train_size

    def load_data(self):
        """Return ``(train, test, valid)`` raw data.

        ``train`` and ``test`` are slices of the ``test`` split; ``valid`` is
        the ``validation`` split object as returned by ``load_dataset``.
        """
        train_test_data = load_dataset(self.dataset_id, self.subset_id, split="test")
        valid_data = load_dataset(self.dataset_id, self.subset_id, split="validation")
        cut = self.train_size
        return train_test_data[:cut], train_test_data[cut:], valid_data

    def data_df(self):
        """Load the three splits and store them as ``df_train``/``df_test``/``df_valid``."""
        train_data, test_data, valid_data = self.load_data()
        self.df_train = pd.DataFrame(train_data)
        self.df_test = pd.DataFrame(test_data)
        self.df_valid = pd.DataFrame(valid_data)

    @staticmethod
    def _flatten_answers(frame):
        """Return ``frame`` without ``id`` and with ``answers`` reduced to its first text.

        Each ``answers`` cell is indexed as ``cell["text"][0]``, so a row with
        an empty ``text`` list raises ``IndexError`` (unchanged behaviour).
        """
        flat = frame.drop("id", axis=1)
        flat["answers"] = flat["answers"].apply(lambda ans: ans["text"][0])
        return flat

    def final_data_prep(self):
        """Apply the same id-drop / answer-flattening to all three frames."""
        self.df_train = self._flatten_answers(self.df_train)
        self.df_test = self._flatten_answers(self.df_test)
        self.df_valid = self._flatten_answers(self.df_valid)

    def create_data_csv(self):
        """Build the three frames and write them to the paths given in ``config``.

        Missing parent directories are created first, so this works on a
        fresh runtime where the output folder does not exist yet.
        """
        import os  # local import: the notebook's import cell does not bring in `os`

        self.data_df()
        self.final_data_prep()

        targets = (
            (self.df_train, config.TRAIN_DATA_PATH),
            (self.df_test, config.TEST_DATA_PATH),
            (self.df_valid, config.VALID_DATA_PATH),
        )
        for frame, path in targets:
            parent = os.path.dirname(path)
            if parent:  # a bare filename has no directory to create
                os.makedirs(parent, exist_ok=True)
            frame.to_csv(path, index=False)

In [5]:
# import config
import pandas as pd
from torch.utils.data import Dataset
from transformers import MT5Tokenizer
import pytorch_lightning as pl


class MLQADataset(Dataset):
    """Torch dataset that tokenises one (question, context, answer) row per item.

    Args:
        data: Frame with ``question``, ``context`` and ``answers`` columns
            (these are the columns read in ``__getitem__``).
        tokenizer: Tokenizer called on the question/context pair and on the answer.
        question_max_token_len: Padded/truncated length of the encoder input.
        answer_max_token_len: Padded/truncated length of the label sequence.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: MT5Tokenizer,
        question_max_token_len = config.QUESTION_MAX_LEN,
        answer_max_token_len = config.ANSWER_MAX_LEN
                    ):
        self.tokenizer = tokenizer
        self.data = data
        self.max_qns_len = question_max_token_len
        self.max_ans_len = answer_max_token_len

    def __len__(self):
        """Number of rows in the backing frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the raw strings plus flattened ``input_ids``/``attention_mask``/``labels``."""
        row = self.data.iloc[index]

        # Question and context are encoded as a pair. "only_second" means only
        # the context is truncated when the pair exceeds the length limit.
        # The limit comes from the constructor argument; previously it was read
        # from `config` here, which silently ignored any value passed in.
        question_encoding = self.tokenizer(
            row["question"],
            row["context"],
            max_length=self.max_qns_len,
            padding="max_length",
            truncation="only_second",
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        answer_encoding = self.tokenizer(
            row["answers"],
            max_length=self.max_ans_len,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # Replace padding positions with -100, the default `ignore_index` of
        # PyTorch's cross-entropy loss, so padding does not contribute to the
        # loss. The pad id is taken from the tokenizer instead of assuming 0.
        labels = answer_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            context = row["context"],
            question = row["question"],
            answer = row["answers"],
            input_ids = question_encoding["input_ids"].flatten(),
            attention_mask = question_encoding["attention_mask"].flatten(),
            labels = labels.flatten()
        )

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [6]:
# import config
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
import pandas as pd
from transformers import MT5Tokenizer
# from dataset import MLQADataset
from torch.utils.data import DataLoader
import torch
import gc

class MultiQADataLoader(LightningDataModule):
    """Lightning data module serving the train/validation/test CSV splits.

    The CSVs named in ``config`` are read eagerly in ``__init__``; ``setup``
    wraps them in ``MLQADataset`` objects and the ``*_dataloader`` methods
    return ``DataLoader`` instances over those datasets.

    The loaders use ``self.batch_size`` (taken from ``config.BATCH_SIZE``).
    Previously every loader hard-coded ``batch_size=1``, so the configured
    value had no effect; set ``config.BATCH_SIZE = 1`` to reproduce the old
    behaviour (for example if GPU memory is tight).
    """

    def __init__(
        self,
        ):
        super().__init__()
        self.batch_size = config.BATCH_SIZE
        self.train_df = pd.read_csv(config.TRAIN_DATA_PATH)
        self.test_df = pd.read_csv(config.TEST_DATA_PATH)
        self.valid_df = pd.read_csv(config.VALID_DATA_PATH)
        self.tokenizer = MT5Tokenizer.from_pretrained(config.MODEL_CKPT)
        self.src_max_token_len = config.QUESTION_MAX_LEN
        self.tgt_max_token_len = config.ANSWER_MAX_LEN

    def _make_dataset(self, frame):
        """Wrap ``frame`` in an ``MLQADataset`` using this module's tokenizer and length limits."""
        return MLQADataset(
            frame,
            self.tokenizer,
            self.src_max_token_len,
            self.tgt_max_token_len
        )

    def _make_loader(self, dataset, shuffle=False):
        """Build a ``DataLoader`` over ``dataset`` with the shared loader settings."""
        # Best-effort memory cleanup kept from the original code; it runs only
        # when a loader is constructed, not per batch.
        torch.cuda.empty_cache()
        gc.collect()
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=4
        )

    def setup(self, stage=None):
        """Create the three datasets. ``stage`` is accepted but not used."""
        self.train_dataset = self._make_dataset(self.train_df)
        self.test_dataset = self._make_dataset(self.test_df)
        self.val_dataset = self._make_dataset(self.valid_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return self._make_loader(self.test_dataset)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._make_loader(self.val_dataset)

In [7]:
# import config
import pytorch_lightning as pl
from pytorch_lightning import LightningModule
from transformers import MT5ForConditionalGeneration, AdamW
import torch
import gc

class Model(LightningModule):
    """Lightning wrapper around ``MT5ForConditionalGeneration``.

    Loads the checkpoint named by ``config.MODEL_CKPT`` and reports the loss
    returned by the wrapped model as ``train_loss`` / ``val_loss`` /
    ``test_loss`` on the progress bar.

    The original code called ``torch.cuda.empty_cache()`` and ``gc.collect()``
    in ``forward`` and in every step, i.e. twice per batch. Those calls are
    removed: ``empty_cache`` only hands unused cached blocks back to the
    driver and does not reduce memory held by live tensors, while a full
    garbage-collection pass per batch slows training noticeably.
    """

    def __init__(self):
        super().__init__()
        self.model = MT5ForConditionalGeneration.from_pretrained(config.MODEL_CKPT)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)`` from its output."""
        output = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )

        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for ``batch`` and log it under ``metric_name``.

        ``batch`` must provide ``input_ids``, ``attention_mask`` and ``labels``.
        """
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        # Shown on the progress bar only; `logger=False` keeps it out of the logger.
        self.log(metric_name, loss, prog_bar=True, logger=False)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss for one batch."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """AdamW over all parameters with a fixed learning rate of 0.001."""
        return AdamW(self.parameters(), lr=0.001)
        
        

In [8]:
# import config
import pandas as pd
import pytorch_lightning as pl
from hf_hub_lightning import HuggingFaceHubCallback
import torch
import gc

if __name__ == "__main__":
    # End-to-end run: build the CSV splits, create the data module and model,
    # then train while the Hub callback pushes to `config.MODEL_OUT`.
    # The `torch.cuda.empty_cache()` / `gc.collect()` pairs that used to sit
    # between every step are dropped; they do not reduce memory held by live
    # objects and only added noise.

    print("Start")
    pl.seed_everything(0)

    datagen = DataGen()
    datagen.create_data_csv()

    print("Data CSV Created")

    multiqadataloader = MultiQADataLoader()
    # "fit" is the Lightning stage name for training; the previous value
    # "train" is not one of Lightning's stages. `Trainer.fit` runs `setup`
    # itself as well, so this early call is redundant but kept.
    multiqadataloader.setup(stage="fit")

    print("Data Loader Set")

    model = Model()

    trainer = pl.Trainer(
        callbacks=[HuggingFaceHubCallback(config.MODEL_OUT)],
        max_epochs=config.NUM_EPOCHS,
        # Replaces the deprecated `gpus=1` (deprecation warning in this cell's output).
        accelerator="gpu",
        devices=1
    )

    trainer.fit(model, multiqadataloader)

INFO:pytorch_lightning.utilities.seed:Global seed set to 0


Start


Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/114k [00:00<?, ?B/s]

Downloading and preparing dataset mlqa/mlqa.en.hi (download: 72.21 MiB, generated: 6.59 MiB, post-processed: Unknown size, total: 78.80 MiB) to /root/.cache/huggingface/datasets/mlqa/mlqa.en.hi/1.0.0/224fde9ea61350ffb013e4beff31d44c6e125ce82c3aa4af70298eceabc8f7f7...


Downloading data:   0%|          | 0.00/75.7M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4918 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/507 [00:00<?, ? examples/s]

Dataset mlqa downloaded and prepared to /root/.cache/huggingface/datasets/mlqa/mlqa.en.hi/1.0.0/224fde9ea61350ffb013e4beff31d44c6e125ce82c3aa4af70298eceabc8f7f7. Subsequent calls will reuse this data.




Data CSV Created


Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

Data Loader Set


Downloading:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Cloning https://huggingface.co/Vasanth/mlqa-model into local empty directory.
  rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                        | Params
------------------------------------------------------
0 | model | MT5ForConditionalGeneration | 582 M 
------------------------------------------------------
582 M     Trainable params
0         Non-trainable params
582 M     

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Pulling changes ...
Adding files tracked by Git LFS: ['lit_model.ckpt']. This may take a bit of time if the files are large.


Upload file lit_model.ckpt:   0%|          | 1.00/6.51G [00:00<?, ?B/s]

Upload file runs/events.out.tfevents.1663930180.da881e817ede.88.0:   2%|2         | 1.00/40.0 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   fdbbdbb..b177208  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   fdbbdbb..b177208  main -> main



Validation: 0it [00:00, ?it/s]

Pulling changes ...


Upload file lit_model.ckpt:   0%|          | 1.00/6.51G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   b177208..946757f  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   b177208..946757f  main -> main



Validation: 0it [00:00, ?it/s]

Pulling changes ...


Upload file lit_model.ckpt:   0%|          | 1.00/6.51G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   946757f..3e35350  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   946757f..3e35350  main -> main



Validation: 0it [00:00, ?it/s]

Pulling changes ...


Upload file lit_model.ckpt:   0%|          | 1.00/6.51G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   3e35350..9c52eeb  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   3e35350..9c52eeb  main -> main



Validation: 0it [00:00, ?it/s]

Pulling changes ...


Upload file lit_model.ckpt:   0%|          | 1.00/6.51G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   9c52eeb..da386e7  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/mlqa-model
   9c52eeb..da386e7  main -> main

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


ValueError: ignored