In [1]:
# List the GPUs visible to this runtime.
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-7439b042-394b-c141-348a-19580417a1b8)


In [2]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prepare dataset

In [None]:
# Fraction passed as train_size to train_test_split.
TRAIN_SIZE = 0.85
# Number of leading rows selected from the raw dataset before filtering.
TOTAL_SAMPLES = 500000
# Seed for the shuffled train/test split.
SEED = 123
# Column tokenized as the model input.
INPUT_COLUMN = 'query'
# Column tokenized as the labels.
TARGET_COLUMN = 'text'

In [None]:
from datasets import load_dataset

# Load the gzipped TSV from Drive with the generic 'csv' builder (tab-delimited).
# Column names are supplied explicitly: "url", then the input and target columns.
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/made_fake_documents/fulldocs.tsv.gz', 
                       delimiter='\t', column_names=["url", INPUT_COLUMN, TARGET_COLUMN])

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-2dd3879f1bc6e2e0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-2dd3879f1bc6e2e0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Only the tokenizer is loaded in the data-preparation section.
tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-large")

# Model loading is intentionally left disabled here.
# model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-large")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

In [None]:
# Keep at most TOTAL_SAMPLES leading rows. Clamping to the dataset length avoids
# an out-of-range selection when the file holds fewer rows than requested.
raw_train = dataset['train']
n_selected = min(TOTAL_SAMPLES, len(raw_train))

train_test_dataset = (
    raw_train.select(range(n_selected))
    # Drop rows where either field is not a string, so the tokenizer only sees text.
    .filter(lambda example: isinstance(example[INPUT_COLUMN], str) and isinstance(example[TARGET_COLUMN], str))
    # Reproducible shuffled split driven by TRAIN_SIZE and SEED.
    .train_test_split(shuffle=True, train_size=TRAIN_SIZE, seed=SEED)
)

Filter:   0%|          | 0/500000 [00:00<?, ? examples/s]

In [None]:
# Truncation limit (in tokens) applied to the input column in preprocess_data.
INPUT_MAX_LENGTH = 32
# Truncation limit (in tokens) applied to the target column in preprocess_data.
TARGET_MAX_LENGTH = 512

In [None]:
def preprocess_data(examples):
  """Tokenize a batch of examples and attach the target ids as ``labels``.

  The input column is truncated to INPUT_MAX_LENGTH tokens and the target
  column to TARGET_MAX_LENGTH tokens. The returned object is the input
  encoding with the target ``input_ids`` stored under the ``"labels"`` key.
  """
  # The two tokenizer calls are independent; targets are encoded first here.
  target_encoding = tokenizer(examples[TARGET_COLUMN], max_length=TARGET_MAX_LENGTH, truncation=True)
  batch_encoding = tokenizer(text=examples[INPUT_COLUMN], max_length=INPUT_MAX_LENGTH, truncation=True)

  batch_encoding["labels"] = target_encoding["input_ids"]
  return batch_encoding

In [None]:
import os

# Tokenize both splits and persist them to Drive so training can start from
# load_from_disk() without repeating this step.
# One worker per available CPU core instead of a fixed 128 processes, which
# oversubscribes machines that have fewer cores.
NUM_PROC = os.cpu_count() or 1

tokenized_dataset = train_test_dataset.map(
    preprocess_data,
    batched=True,
    num_proc=NUM_PROC,
    # Only the tokenized fields are kept; the raw text columns are dropped.
    remove_columns=["url", INPUT_COLUMN, TARGET_COLUMN],
)
tokenized_dataset.save_to_disk('/content/drive/MyDrive/made_fake_documents/T5_tokenized_dataset')

Map (num_proc=128):   0%|          | 0/419835 [00:00<?, ? examples/s]

Map (num_proc=128):   0%|          | 0/74089 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/419835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/74089 [00:00<?, ? examples/s]

# Train

In [4]:
import torch
from transformers import (T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModel, 
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback)

# NOTE(review): the data-preparation section tokenized with the
# "google/t5-v1_1-large" tokenizer while training uses "google/t5-v1_1-small";
# presumably both checkpoints share one vocabulary — confirm.
tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small")

# Model that is fine-tuned by the trainer below.
model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel

# Query/context encoder pair used by compute_metrics to score each generated
# text against the query it was generated from.
# The tokenizer is the same for the query and context encoder
tokenizer_bm25 = AutoTokenizer.from_pretrained('facebook/spar-marco-bm25-lexmodel-query-encoder')
query_encoder = AutoModel.from_pretrained('facebook/spar-marco-bm25-lexmodel-query-encoder')
context_encoder = AutoModel.from_pretrained('facebook/spar-marco-bm25-lexmodel-context-encoder')

Downloading (…)okenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
from datasets import load_from_disk
# Reload the tokenized train/test splits from the path the data-preparation section saved to.
train_test_dataset = load_from_disk('/content/drive/MyDrive/made_fake_documents/T5_tokenized_dataset')

In [7]:
import evaluate
# BLEU metric object; compute_metrics calls bleu.compute() on the decoded texts.
bleu = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [216]:
def compute_metrics(eval_pred, encoder_batch_size=64):
    """Score generated texts against their queries and their reference texts.

    Args:
        eval_pred: Evaluation output exposing ``label_ids``, ``predictions``
            and ``inputs`` arrays of token ids. ``inputs`` is requested from
            the trainer through ``include_inputs_for_metrics=True``.
        encoder_batch_size: Number of (query, generated text) pairs pushed
            through the query/context encoders per forward pass. Batching
            keeps memory bounded instead of encoding the whole evaluation
            set at once.

    Returns:
        dict with two entries:
        ``'bm_25'`` is the mean dot product between the query-encoder [CLS]
        embedding of each input and the context-encoder [CLS] embedding of
        the text generated for it; ``'BLEU'`` is the BLEU score of the
        generated texts against the decoded labels. Both are 0.0 when there
        is nothing to evaluate.
    """
    import numpy as np

    pad_id = tokenizer.pad_token_id

    def _decode(token_ids):
        """Decode a 2-D array of ids after replacing the -100 filler value."""
        ids = np.asarray(token_ids)
        # -100 is a filler for ignored/padded positions, not a vocabulary id.
        # Replace it on a copy so the arrays held by eval_pred stay untouched.
        ids = np.where(ids == -100, pad_id, ids)
        return tokenizer.batch_decode(ids, skip_special_tokens=True)

    queries = _decode(eval_pred.inputs)
    # Predictions get the same treatment as labels/inputs: they may carry the
    # -100 filler too, which the tokenizer cannot decode.
    contexts = _decode(eval_pred.predictions)
    decoded_labels = _decode(eval_pred.label_ids)

    n_examples = len(queries)
    if n_examples == 0:
        return {'bm_25': 0.0, 'BLEU': 0.0}

    step = max(1, int(encoder_batch_size))
    score_sum = 0.0
    with torch.no_grad():
        for start in range(0, n_examples, step):
            queries_input = tokenizer_bm25(queries[start:start + step], padding=True,
                                           truncation=True, return_tensors='pt')
            ctx_input = tokenizer_bm25(contexts[start:start + step], padding=True,
                                       truncation=True, return_tensors='pt')

            # Compute embeddings: take the last-layer hidden state of the [CLS] token
            query_emb = query_encoder(**queries_input).last_hidden_state[:, 0, :]
            ctx_emb = context_encoder(**ctx_input).last_hidden_state[:, 0, :]

            # Row-wise dot product query_emb[i] @ ctx_emb[i], summed over the batch.
            score_sum += (query_emb * ctx_emb).sum(dim=-1).sum().item()

    bm_25 = score_sum / n_examples

    try:
        result = bleu.compute(predictions=contexts, references=[[label] for label in decoded_labels])
        bleu_score = result['bleu']
    except ZeroDivisionError:
        # BLEU can divide by zero when there are no tokens to score (for
        # example when every prediction decodes to an empty string).
        bleu_score = 0.0

    return {
        'bm_25': bm_25,
        'BLEU': bleu_score
        }

In [217]:
# del trainer

In [218]:
import torch
# Release unused cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [219]:
import gc
# Run a full garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

0

In [220]:
# Batch collator for the trainer, built from the T5 tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [221]:
# Per-device batch size used for both training and evaluation.
batch_size = 32
# Output directory handed to the training arguments.
model_dir = "./t5"
# Number of training epochs.
epochs = 2

In [222]:
# T5 v1.1 checkpoints are prone to overflow (NaN/inf losses) under fp16 mixed
# precision, so use bf16 where the GPU supports it and plain fp32 otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Evaluate, log and checkpoint on the same 10000-step cadence so that
# load_best_model_at_end can pick the checkpoint with the best "bm_25" value.
# NOTE(review): no generation_max_length is set, so evaluation generates with
# the model's default generation length — confirm that is long enough for the
# BLEU comparison against targets of up to 512 tokens.
args = Seq2SeqTrainingArguments(
    evaluation_strategy="steps",
    eval_steps=10000,
    logging_strategy="steps",
    logging_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir=model_dir,
    weight_decay=0.01,
    optim='adamw_torch',
    num_train_epochs=epochs,
    # Evaluation runs generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    load_best_model_at_end=True,
    bf16=use_bf16,
    # Key returned by compute_metrics that decides which checkpoint is "best".
    metric_for_best_model="bm_25",
    report_to="tensorboard",
    # compute_metrics decodes eval_pred.inputs, so the inputs must be passed through.
    include_inputs_for_metrics=True
)

In [None]:
# Fine-tune the model on the train split; the 'test' split is used for the
# periodic evaluation scored by compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_test_dataset['train'],
    eval_dataset=train_test_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
# Start training; progress is reported in the table rendered below the cell.
trainer.train()

Step,Training Loss,Validation Loss


