# Sentence completion training

In this notebook we take a T5 model that was already trained on the daily-dialog dataset and fine-tune it further on The Office transcript data for sentence completion.

All additional packages:

In [1]:
# Start from a clean state: remove any previous clone of the project repository.
! rm -rf NLP-project
# Fetch the project sources; later cells import from utils/ and read CSVs from resources/.
! git clone https://github.com/ReviBa/NLP-project
# Copy the repository contents into the working directory so `utils.*` is importable from here.
! cp -rf NLP-project/* .

Cloning into 'NLP-project'...
remote: Enumerating objects: 180, done.[K
remote: Counting objects: 100% (180/180), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 180 (delta 89), reused 113 (delta 33), pack-reused 0[K
Receiving objects: 100% (180/180), 9.67 MiB | 7.48 MiB/s, done.
Resolving deltas: 100% (89/89), done.


In [2]:
# Install the third-party packages used by this notebook (quiet output).
# NOTE(review): versions are not pinned, so a fresh runtime may resolve newer releases
# than the ones this notebook was originally run with.
!pip install --q transformers
!pip install --q pytorch_pretrained_bert
!pip install --q sentencepiece
!pip install --q sentence_transformers
!pip install --q pytorch_lightning
!pip install --q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:

# general
import os
import csv
import string
import random
import warnings
from typing import List,Tuple

# ML
import numpy as np
import pandas as pd

# visual
import matplotlib
from tqdm import tqdm

# DL
import torch
from torch.utils.data import DataLoader

# HF
from transformers import AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration

# SK
from sklearn.model_selection import train_test_split

#PL
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

In [4]:
from utils.tokens_utils import get_tokenizer_based_on_data, get_max_tokens_number

Set warnings filter to ignore:

In [5]:
# Silence every Python warning for the rest of the session.
# Note that this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')


Set random SEEDs:

In [6]:
# Single source of truth for every random number generator seeded in this cell.
SEED = 42

# Set the random seed for Python
random.seed(SEED)

# Set the random seed for numpy
np.random.seed(SEED)

# Set the random seed for torch to SEED (the call returns the default
# Generator, which is what the notebook shows as this cell's output)
torch.manual_seed(SEED)

<torch._C.Generator at 0x78f3f0b5ab70>

Set GPU:

In [7]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fail fast on a CPU-only runtime: the trainer further down requests accelerator="gpu".
assert DEVICE == 'cuda'
# DEVICE = 'cpu'

In [8]:
# Colab only: mount Google Drive so the checkpoint files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Helper Functions**

#### **The Office data loading**

In [9]:
# Location of The Office lines CSV inside the cloned repository.
file_path = "/content/NLP-project/resources/The-Office-Lines-V4.csv"

In [10]:
# Load the complete lines table; its 'speaker' and 'line' columns feed the tokenizer setup below.
full_df = pd.read_csv(file_path)

#### **Create tokenizer and DF for training**

In [11]:
# Build the tokenizer from the transcript's speaker and line columns using the helper
# imported from utils.tokens_utils.
tokenizer = get_tokenizer_based_on_data(full_df, speaker_col_name='speaker', line_col_name='line') # update tokenizer with new tokens
# Question/answer pairs used for fine-tuning; the CSV is produced by a separate notebook.
sentence_completion_df = pd.read_csv("/content/NLP-project/resources/sentence_completion_on_the_office_lines.csv") # there is a notebook that creates this dataset

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
# Uncomment if you want to save the tokenizer state or load from saved state
# tokenizer.save_pretrained('/content/drive/MyDrive/NLP_project/the-office-tokenizer/')
# tokenizer = tokenizer.from_pretrained('/content/drive/MyDrive/NLP_project/the-office-tokenizer/')

In [12]:
# Sanity check: show how the tokenizer splits a plain sentence into sub-word pieces.
print(tokenizer.tokenize("this is a test"))

['▁this', '▁is', '▁', 'a', '▁test']


In [13]:
# Preview the question/answer training pairs (the notebook shows a truncated view of the frame).
sentence_completion_df

Unnamed: 0.1,Unnamed: 0,question,answer
0,0,Michael:All right Jim.Michael:Your quarterlies...,"Jim:Oh, I told you.Jim:I couldn't close it.Jim..."
1,1,Michael:All right Jim.Michael:Your quarterlies...,Michael:So you've come to the master for guida...
2,2,Michael:All right Jim.Michael:Your quarterlies...,"Jim:Actually, you called me in here, but yeah.."
3,3,Michael:All right Jim.Michael:Your quarterlies...,"Michael:All right.Michael:Well, let me show yo..."
4,4,"Michael:I've, uh, I've been at Dunder Mifflin ...",Pam:Well.Pam:I don't know..
...,...,...,...
46464,46464,Kevin:Oscar.Kevin:Oscar.Kevin: I think I'm gay..,Oscar:Why do you say that?.
46465,46465,Kevin:Oscar.Kevin:Oscar.Kevin: I think I'm gay...,Kevin: It's just that I'm so emotional..
46466,46466,Kevin:Oscar.Kevin:Oscar.Kevin: I think I'm gay...,"Oscar:Yeah, but you're not gay.Oscar:You're no..."
46467,46467,Kevin:Oscar.Kevin:Oscar.Kevin: I think I'm gay...,"Kevin:No, but maybe the reason...."


#### This section was used for experiments
It builds Michael's dialogs with exactly one other speaker; the code is kept commented out for reference.

In [15]:
# CHARACTER = "Michael"
# dialog_data = []
# person2 = CHARACTER

# dialog = ""
# prev_scene = 1
# speakers_list = set()

# for index, row in full_df.iterrows():
#   if row['scene'] == prev_scene: # if we are in the same scene
#     new_speakers = speakers_list.copy()
#     new_speakers.add(row['speaker'])

#     if len(new_speakers) <= 2: # if we are still dealing with 2 speakers
#       speakers_list.add(row['speaker'])
#       speaker_str = person2 if row['speaker'] == CHARACTER else "Person1"
#       new_line = "{}:{}".format(speaker_str, row['line'])
#       if speaker_str == person2:
#         dialog_data.append((dialog, new_line))
#       dialog += new_line

#     else:
#       dialog = ""
#       speakers_list = set()

#   else: # if we moved to the next scene
#     speaker_str = person2 if row['speaker'] == (CHARACTER) else "Person1"
#     new_line = "{}:{}".format(speaker_str, row['line'])
#     dialog = ""
#     if speaker_str == person2:
#       dialog_data.append((dialog, new_line))
#     dialog += new_line
#     speakers_list = {row['speaker']}

#   prev_scene = row['scene']

# character_dialog_df = pd.DataFrame(dialog_data, columns=['question', 'answer'])

## **Dialog T5 model mask training**
All T5 objects are copied here rather than imported from the git repo because of conflicts that we couldn't fix.

In [14]:
# Token-length limits for the 'question' (model input) and 'answer' (model target) columns.
# NOTE(review): presumably the longest tokenized length found in each column - confirm in utils/tokens_utils.py.
INPUT_MAX_LEN, OUTPUT_MAX_LEN = get_max_tokens_number(sentence_completion_df, 'question', 'answer')

In [15]:
# Display the computed (input, output) length limits.
INPUT_MAX_LEN, OUTPUT_MAX_LEN

(907, 206)

In [16]:
# Training hyper-parameters read by the data module, the model wrapper and the trainer below.
TRAIN_BATCH_SIZE = 4 # batch size used for training
VAL_BATCH_SIZE = 4 # batch size used for validation
EPOCHS = 1 # number of training epochs
MODEL_NAME = "t5-base" # checkpoint name passed to T5ForConditionalGeneration.from_pretrained

In [17]:
class T5Dataset:
  """Map-style dataset that tokenizes (question, answer) text pairs on access.

  Every item is padded/truncated to the fixed input and output lengths and
  returned as a dictionary holding the raw strings plus 1-D tensors.

  Args:
    question: Indexable sequence of input texts.
    answer: Indexable sequence of target texts, aligned with ``question``.
  """

  # Label value that the Hugging Face seq2seq loss skips; used for padding positions.
  IGNORE_INDEX = -100

  def __init__(self,question,answer):
    self.question = question
    self.answer = answer
    # Tokenizer and length limits are taken from the notebook-level globals.
    self.tokenizer = tokenizer
    self.input_max_len = INPUT_MAX_LEN
    self.output_max_len = OUTPUT_MAX_LEN

  def __len__(self):
    """Return the number of (question, answer) pairs."""
    return len(self.question)

  def _encode(self, text, max_len):
    """Tokenize ``text`` to exactly ``max_len`` positions (padded or truncated)."""
    return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
        )

  def __getitem__(self,item):
    """Return the tokenized pair at position ``item``.

    Returns:
      dict with keys ``question`` and ``answer`` (the raw strings),
      ``input_ids`` and ``attention_mask`` (encoder inputs) and ``target``
      (label ids, with padding positions set to ``IGNORE_INDEX``).
    """
    question = str(self.question[item])
    answer = str(self.answer[item])

    input_tokenize = self._encode(question, self.input_max_len)
    output_tokenize = self._encode(answer, self.output_max_len)

    input_ids = input_tokenize["input_ids"].flatten()
    attention_mask = input_tokenize["attention_mask"].flatten()

    labels = output_tokenize['input_ids'].flatten()
    # Padding positions must not contribute to the loss: without this the
    # model is also trained to predict the pad token after the real answer.
    pad_positions = output_tokenize['attention_mask'].flatten() == 0
    labels[pad_positions] = self.IGNORE_INDEX

    # in case we will want to train the lightning module again, return out as a dictionary:
    out = {
            'question':question,
            'answer':answer,
            'input_ids': input_ids,
            'attention_mask':attention_mask,
            'target':labels
        }

    return out

In [18]:
class T5DataLoad(pl.LightningDataModule):
    """Lightning data module that serves the train and validation question/answer splits.

    Args:
        df_train: Frame with ``question`` and ``answer`` columns used for training.
        df_test: Frame with the same columns used for validation.
    """

    def __init__(self,df_train,df_test):
        super().__init__()
        self.df_train, self.df_test = df_train, df_test
        # Kept on the instance for reference; the datasets read the same globals themselves.
        self.tokenizer = tokenizer
        self.input_max_len, self.out_max_len = INPUT_MAX_LEN, OUTPUT_MAX_LEN

    @staticmethod
    def _as_dataset(frame):
        """Wrap the question/answer columns of ``frame`` in a ``T5Dataset``."""
        return T5Dataset(question=frame.question.values, answer=frame.answer.values)

    def setup(self, stage=None):
        """Create the train and validation datasets from the stored frames."""
        self.train_data = self._as_dataset(self.df_train)
        self.valid_data = self._as_dataset(self.df_test)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(self.train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation dataset, in its stored order."""
        return DataLoader(self.valid_data, batch_size=VAL_BATCH_SIZE)

In [19]:
class T5Model(pl.LightningModule):
    """Lightning wrapper around ``T5ForConditionalGeneration``.

    The wrapped network is created from ``MODEL_NAME`` and exposed as
    ``self.model``. Batches are expected to be dictionaries with the keys
    ``input_ids``, ``attention_mask`` and ``target``.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its ``(loss, logits)`` pair."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for ``batch`` and log it under ``metric_name``."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["target"])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` and return it under the ``loss`` key."""
        return {'loss': self._shared_step(batch, "train_loss")}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and return it under the ``val_loss`` key."""
        return {'val_loss': self._shared_step(batch, "val_loss")}

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters (learning rate 3e-4)."""
        # transformers.AdamW is deprecated upstream in favour of the PyTorch
        # implementation. eps and weight_decay are spelled out with the values
        # the deprecated class used by default, so the hyper-parameters do not
        # silently change with the switch.
        return torch.optim.AdamW(self.parameters(), lr=3e-4, eps=1e-6, weight_decay=0.0)


In [21]:
def trainT5(data):
    """Fine-tune the daily-dialog checkpoint on ``data``.

    Args:
        data: Frame with ``question`` and ``answer`` columns.

    Returns:
        Tuple ``(model, trainer)`` holding the fitted ``T5Model`` and the
        ``pl.Trainer`` that ran the fit.
    """
    # 80/20 train/validation split with a fixed random_state.
    train_part, val_part = train_test_split(data, test_size=0.2, random_state=100)
    data_module = T5DataLoad(train_part, val_part)
    data_module.setup()

    # Start from the weights saved after training on the daily-dialog corpus.
    net = T5Model.load_from_checkpoint(
        '/content/drive/MyDrive/NLP_project/Option3-trained-on-daily-dialog-splitted-corpus-second-epoch.ckpt'
    )
    net.to(DEVICE)
    # The tokenizer was extended with transcript tokens, so the embedding
    # matrix has to be resized to the new vocabulary size.
    net.model.resize_token_embeddings(len(tokenizer))

    # Checkpoint callback: keeps the best two checkpoints by validation loss.
    checkpoint_options = dict(
        dirpath="/content/drive/MyDrive/",
        filename='sentence-completion-above-daily-dialog2',
        save_top_k=2,
        verbose=True,
        monitor="val_loss",
        mode="min",
    )
    best_checkpoint = ModelCheckpoint(**checkpoint_options)

    fitter = pl.Trainer(callbacks=best_checkpoint, max_epochs=EPOCHS, accelerator="gpu")
    fitter.fit(net, data_module)

    return net, fitter

# Run the fine-tuning on the full sentence-completion dataset (needs a GPU runtime and the mounted Drive).
trained_model, trainer = trainT5(sentence_completion_df)

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.10 to v2.0.8. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file drive/MyDrive/NLP_project/Option3-trained-on-daily-dialog-splitted-corpus-second-epoch.ckpt`
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 40763. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
#trained_model.save_checkpoint("/content/drive/MyDrive/Masking-all-transcript-after-2-epochs-daily-dialog-better-vocab")

In [22]:
# Fresh T5Model wrapper built from MODEL_NAME; the fine-tuned weights are loaded into it in the next cell.
model2 = T5Model()
# Resize the embeddings to the extended tokenizer's vocabulary size
# (presumably so the saved state dict's embedding shapes match - confirm against the checkpoint).
model2.model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 40763. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(40763, 768)

The state_dict is loaded manually here because loading the checkpoint through Lightning raised an error that looks like a bug in the library.

In [23]:
# Read the raw checkpoint file and copy its 'state_dict' entry into model2.
# NOTE(review): consider torch.load(..., map_location='cpu') so the checkpoint tensors are not
# first materialised on the device they were saved from - confirm before changing.
checkpoint_test = torch.load('/content/drive/MyDrive/NLP_project/sentence-completion-above-daily-dialog.ckpt')
model2.load_state_dict(checkpoint_test['state_dict'])
# Inference only from here on: freeze the parameters, then move the model to DEVICE.
model2.freeze()
model2.to(DEVICE)

T5Model(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(40763, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(40763, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_features=30

Basic model test

In [24]:
def generate_question(question, model):
    """Generate the model's continuation for a dialog prompt.

    Args:
        question: Prompt text, e.g. ``"Person1:Who is the best boss?"``.
        model: Wrapper object whose ``.model`` attribute provides ``generate``
            (a ``T5Model`` in this notebook); it should already be on ``DEVICE``.

    Returns:
        The decoded generation as a single string. Special tokens are kept in
        the text because decoding uses ``skip_special_tokens=False``.
    """
    inputs_encoding =  tokenizer(
        question,
        add_special_tokens=True,
        max_length=INPUT_MAX_LEN,
        padding = 'max_length',
        truncation = True,
        return_attention_mask=True,
        return_tensors="pt"
        )

    generate_ids = model.model.generate(
        input_ids = inputs_encoding["input_ids"].to(DEVICE),
        attention_mask = inputs_encoding["attention_mask"].to(DEVICE),
        # Cap the generated sequence at the target length used for the training
        # labels; INPUT_MAX_LEN is the encoder-side limit and is far longer
        # than any answer the model was trained to produce.
        max_length = OUTPUT_MAX_LEN,
        num_return_sequences = 1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        )

    preds = [
        tokenizer.decode(gen_id,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]

    # `preds` holds one decoded string per returned sequence; with
    # num_return_sequences=1 the join simply unwraps that single string.
    return "".join(preds)

In [28]:
# Smoke test: send one "Speaker:line" style prompt to the fine-tuned model and print its reply.
ques = "Person1:Who is the best boss?"
print("Ques: ",ques)
print("BOT: ",generate_question(ques, model2))

Ques:  Person1:Who is the best boss?
BOT:  <pad> Michael:I'm the best boss..</s>
