In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer


In [2]:
filename = "./data/quora_paraphrase/quora_duplicate_questions.tsv"
import pandas as pd
question_pairs = pd.read_csv(filename, sep = '\t')
question_pairs.drop(['qid1','qid2'], axis=1, inplace=True)

question_pairs_correct_paraphrased = question_pairs[question_pairs['is_duplicate']==1]
question_pairs_correct_paraphrased.drop(['id','is_duplicate'], axis=1, inplace=True)

from sklearn.model_selection import train_test_split
train, test = train_test_split(question_pairs_correct_paraphrased, test_size=0.1)

train.to_csv('./data/quora_paraphrase/quora_train.csv', index = False)
test.to_csv('./data/quora_paraphrase/quora_test.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hesu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

In [5]:
from transformers import (AdamW, 
                          T5ForConditionalGeneration, 
                          T5Tokenizer,
                          get_linear_schedule_with_warmup)

In [7]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [10]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams
        
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
        
    def is_logger(self):
        return self.trainer.proc_rank <= 0
    
    def forward(self,
               input_ids, 
               attention_mask=None,
               decoder_input_ids=None,
               decoder_attention_mask=None,
               lm_labels=None):
        
        return self.model(input_ids,
                          attention_mask=attention_mask,
                          decoder_input_ids=decoder_input_ids,
                          decoder_attention_mask=decoder_attention_mask,
                          lm_labels=lm_labels,)
        
    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:,:] == self.tokenizer.pad_token_id] = -100
        
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch["target_mask"])
        
        
        loss = outputs[0]
        return loss
    
    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        
        tensorboard_logs = {"train_loss":loss}
        return {"loss":loss, "log": tensorboard_logs}
        
    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss":avg_train_loss, "log":tensorboard_logs, "progress_bar":tensorboard_logs}
    
    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss":loss}
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss":avg_loss, "log":tensorboard_logs, "progress_bar":tensorboard_logs}
            
    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params":[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay":self.hparams.weight_decay,
            },
            {
                "params":[p for n,p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay":0.0,
            },
        ]
        
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]
        
    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        
        optimizer.zero_grad() # 将每个batch的梯度初始化为0
        self.lr_scheduler.step()
        
    def get_tqdm_dict(self):
        tqdm_dict = {"loss":"{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        
        return tqdm_dict
    
    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, 
                                    type_path="./data/quora_paraphrase/quora_train",
                                    args=self.hparams)
        dataloader = DataLoader(train_dataset, 
                                batch_size=self.hparams.train_batch_size,
                                drop_last=True,
                                shuffle=True,
                                num_workers=4)
        
        t_total = ((len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                  // self.hparams.gradient_accumulation_steps
                  * float(self.hparams.num_train_epochs))
        
        scheduler = get_linear_schedule_with_warmup(
                    self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total)
        
        
        self.lr_scheduler = scheduler
        return dataloader
    
    def val_dataloader(self):
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="./data/quora_paraphrase/quora_test",args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)
    
        

In [11]:
logger = logging.getLogger(__name__)

In [13]:
class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("**** Validation results ****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {} \n".format(key, str(metrics[key])))
    
    def on_test_end(self, trainer, pl_module):
        logger.info("**** Test results ****")
        
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            
            output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))
    
    
    

In [33]:
args_dict = dict(
    data_dit = "",
    output_dir="./model/",
    model_name_or_path="./model/t5_paraphraser",
    tokenizer_name_or_path="./model/t5_paraphraser",
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=6,
    eval_batch_size=6,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False,
    opt_level="01",
    max_grad_norm=1.0,
    seed=42,
)

train_path = "./data/quora_paraphrase/quora_train.csv"
val_path = "./data/quora_paraphrase/quora_test.csv"
train = pd.read_csv(train_path)
print(train.head)


<bound method NDFrame.head of                                                 question1  \
0              Why is everyone on Quora obsessed with IQ?   
1       Can people still see questions that are marked...   
2       How do you reinstall the Apple app store on a ...   
3                           Why is mathematics important?   
4       Who makes the final decision on which question...   
...                                                   ...   
134331  What are some of the good books to read if one...   
134332  Why do people waste time waiting for answers o...   
134333                How did the Big Bang "create" time?   
134334  What should I do to improve my basic in chemis...   
134335  Is it possible to have civilization without laws?   

                                                question2  
0       What is with the obsession of people on Quora ...  
1       What happens to questions marked as needing im...  
2       How can I reinstall the app store on my iPad o...

In [35]:
tokenizer = T5Tokenizer.from_pretrained('./model/t5_paraphraser')

INFO:transformers.tokenization_utils:Model name './model/t5_paraphraser' not found in model shortcut name list (t5-small, t5-base, t5-large, t5-3b, t5-11b). Assuming './model/t5_paraphraser' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file ./model/t5_paraphraser/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file ./model/t5_paraphraser/special_tokens_map.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file ./model/t5_paraphraser/tokenizer_config.json. We won't load it.
INFO:transformers.tokenization_utils:loading file ./model/t5_paraphraser/spiece.model
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file None


In [50]:
class ParaphraseDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path, max_len=256):
        self.path = os.path.join(data_dir, type_path +".csv")
        self.source_column = "question1"
        self.target_column = "question2"
        self.data = pd.read_csv(self.path)
        
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()
        
        src_mask = self.inputs[index]["attention_mask"].squeeze()
        target_mask = self.targets[index]["attention_mask"].squeeze()
        
        return {"source_ids":source_ids, "source_mask":src_mask, 
                "target_ids":target_ids, "target_mask":target_mask}
    
    def _build(self):
        for idx in range(len(self.data)):
            input_, target = self.data.loc[idx, self.source_column],  self.data.loc[idx, self.target_column]
            
            input_ = "paraphrase: " + input_ + "</s>"
            target = target + "</s>"
            
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                    [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
            )
            tokenized_targets = self.tokenizer.batch_encode_plus(
                    [target], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
            )
            
            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)
    
    
    

In [51]:
dataset = ParaphraseDataset(tokenizer,'./data/quora_paraphrase', 'quora_train')

In [38]:
print("Train dataset: ", len(dataset))


Train dataset:  134336


In [39]:
data = dataset[61]
print(tokenizer.decode(data["source_ids"]))
print(tokenizer.decode(data["target_ids"]))

paraphrase: How do you know if you're unconditionally in love with someone?
How can you know if you're in love or just attracted to someone?


In [40]:
if not os.path.exists('./model/t5_paraphrase'):
    os.makedirs("./model/t5_paraphrase")

In [41]:
args_dict.update({'data_dir':'./data/quora_paraphrase', 
                  'output_dir':'./model/t5_paraphrase', 
                  'num_train_epochs':2, 
                  'max_seq_length':256})

args = argparse.Namespace(**args_dict)
print(args_dict)

{'data_dit': '', 'output_dir': './model/t5_paraphrase', 'model_name_or_path': './model/t5_paraphraser', 'tokenizer_name_or_path': './model/t5_paraphraser', 'max_seq_length': 256, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 6, 'eval_batch_size': 6, 'num_train_epochs': 2, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': '01', 'max_grad_norm': 1.0, 'seed': 42, 'data_dir': './data/quora_paraphrase'}


In [42]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=args.output_dir,
                                                  prefix="checkpoint",
                                                  monitor="val_loss",
                                                  mode="min",
                                                  save_top_k=5)

In [43]:
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

In [44]:
def get_dataset(tokenizer, type_path,args):
    return ParaphraseDataset(tokenizer=tokenizer, 
                             data_dir=args.data_dir,
                             type_path=type_path, 
                             max_len=args.max_seq_length)

In [45]:
print("Initialize model")
model = T5FineTuner(args)

INFO:transformers.configuration_utils:loading configuration file ./model/t5_paraphraser/config.json
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Ger

Initialize model


INFO:transformers.tokenization_utils:Model name './model/t5_paraphraser' not found in model shortcut name list (t5-small, t5-base, t5-large, t5-3b, t5-11b). Assuming './model/t5_paraphraser' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file ./model/t5_paraphraser/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file ./model/t5_paraphraser/special_tokens_map.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file ./model/t5_paraphraser/tokenizer_config.json. We won't load it.
INFO:transformers.tokenization_utils:loading file ./model/t5_paraphraser/spiece.model
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file None


In [46]:
trainer = pl.Trainer(**train_params)

MisconfigurationException: 
                You requested GPUs: [0]
                But your machine only has: []
            