In [None]:
# Install the third-party libraries imported by this notebook.
# NOTE(review): versions are unpinned; the Trainer arguments used further down
# (e.g. progress_bar_refresh_rate, gpus) presumably need an older
# pytorch_lightning release - confirm and pin the versions.
!pip install --quiet pytorch_lightning
!pip install --quiet transformers
!pip install --quiet sentencepiece

In [None]:
import glob
import os
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from sklearn.model_selection import train_test_split
import textwrap

from torch.utils.data import Dataset,DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

from transformers import(
    AdamW,
    T5Model,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
# Google Drive (Colab) working folder and the NER dataset stored inside it.
root_path = 'drive/MyDrive/Colab Notebooks/'
file_path = 'drive/MyDrive/Colab Notebooks/ner_data.csv'

# Alternative for a local file system:
# file_path = 'dataset/moviedata.csv'
# root_path = 'dataset/'

In [None]:
# Load the NER examples; the first CSV column becomes the row index.
df = pd.read_csv(file_path,index_col=0)
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,input_text,target_text,prefix
3588,what happened in the year 541,541+,ner
3589,what happened in the year 969,969+,ner
3590,what happened in the year 910,910+,ner
3591,what happened in the year 120,120+,ner
3592,what happened in the year 687,687+,ner


In [None]:
# Name of the pretrained T5 checkpoint; NERModel reads this global when it loads the weights.
m_name="t5-small"
# Tokenizer loaded for the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(m_name)

In [None]:
class NERDataset(Dataset):
  """Map-style dataset that tokenizes one NER example per access.

  Every row of ``data`` must provide the string columns ``prefix``,
  ``input_text`` and ``target_text``. The model input is
  ``"<prefix>: <input_text>"`` and the training target is ``target_text``.

  Args:
    data: Frame holding one example per row.
    tokenizer: Tokenizer used for both the source and the target text.
    source_max_token_length: Length the encoded source is padded/truncated to.
    target_max_token_length: Length the encoded target is padded/truncated to.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "T5Tokenizer",
      source_max_token_length: int = 396,
      target_max_token_length: int = 32
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_length = source_max_token_length
    self.target_max_token_length = target_max_token_length

  def __len__(self):
    """Return the number of examples (rows of ``data``)."""
    return len(self.data)

  def __getitem__(self,index: int):
    """Encode the example at positional ``index``.

    Returns:
      dict with the raw ``input_text`` / ``target_text`` strings and the 1-D
      tensors ``input_ids``, ``attention_mask`` and ``labels``.
    """
    data_row = self.data.iloc[index]
    source_text = data_row['prefix'] + ': ' + data_row['input_text']
    target_text = data_row['target_text']

    # Use the tokenizer handed to the constructor rather than a notebook
    # global, so the dataset works with whatever tokenizer it was given.
    # truncation=True: the input is a single sequence, so the pair-only
    # "only_second" strategy has nothing to truncate.
    source_encoding = self.tokenizer(
      source_text,
      max_length=self.source_max_token_length,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
    )

    target_encoding = self.tokenizer(
      target_text,
      max_length=self.target_max_token_length,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
    )

    # Replace padding positions by -100 (the default ignore index of
    # PyTorch's cross-entropy) so padding does not contribute to the loss.
    labels = target_encoding['input_ids']
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        input_text=source_text,
        target_text=target_text,
        input_ids=source_encoding['input_ids'].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )


In [None]:
class NERDataModel(pl.LightningDataModule):
  """LightningDataModule serving the train and held-out NER frames.

  Args:
    train_df: Examples used for training.
    test_df: Held-out examples, served by both the validation and the test
      dataloader.
    tokenizer: Tokenizer forwarded to every ``NERDataset``.
    batch_size: Batch size of the training dataloader.
    source_max_token_length: Forwarded to ``NERDataset``.
    target_max_token_length: Forwarded to ``NERDataset``.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int=8,
      source_max_token_length = 396,
      target_max_token_length = 32
  ):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    # Datasets are created in setup().
    self.train_dataset = None
    self.test_dataset = None
    self.tokenizer = tokenizer
    self.source_max_token_length = source_max_token_length
    self.target_max_token_length = target_max_token_length

  def setup(self, stage=None):
    """Build the train and held-out datasets.

    ``stage`` is accepted because Lightning passes it when it invokes the
    hook itself; it is not used, so a plain ``setup()`` call still works.
    """
    self.train_dataset = NERDataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_length,
        self.target_max_token_length
    )

    # Built from test_df (previously train_df), so validation and test
    # metrics are computed on data the model was not trained on.
    self.test_dataset = NERDataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_length,
        self.target_max_token_length
    )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=2
    )

  def val_dataloader(self):
    """Loader over the held-out dataset, one example per batch."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )

  def test_dataloader(self):
    """Loader over the held-out dataset, one example per batch."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )
    

In [None]:
BATCH_SIZE = 32
EPOCHS = 2
SEED = 42  # fixed so the split, and the rows inspected later by position, are reproducible

# NOTE(review): test_size=0.95 leaves only 5% of the rows for training - confirm this is intended.
train_df,test_df = train_test_split(df,test_size=0.95,random_state=SEED)
data_module = NERDataModel(train_df,test_df,tokenizer,batch_size=BATCH_SIZE)
# Build the datasets right away so the loaders can be used before trainer.fit().
data_module.setup()

In [None]:
class NERModel(pl.LightningModule):
  """LightningModule around ``T5ForConditionalGeneration``.

  The pretrained weights are loaded from the notebook-level ``m_name``.
  Training, validation and test steps all run a forward pass with labels and
  log the resulting loss under ``train_loss`` / ``val_loss`` / ``test_loss``.
  """

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(m_name, return_dict=True)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return ``(loss, logits)``."""
    result = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    return result.loss, result.logits

  def _loss_for(self, batch, metric_name):
    """Compute the loss of ``batch`` and log it as ``metric_name``."""
    loss, _ = self(
        batch["input_ids"],
        batch['attention_mask'],
        batch["labels"]
    )
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss

  def training_step(self, batch, batch_idx):
    return self._loss_for(batch, "train_loss")

  def validation_step(self, batch, batch_idx):
    return self._loss_for(batch, "val_loss")

  def test_step(self, batch, batch_idx):
    return self._loss_for(batch, "test_loss")

  def configure_optimizers(self):
    """AdamW over all parameters with a fixed learning rate of 0.001."""
    return AdamW(self.parameters(), lr=0.001)


In [None]:
# Fresh model initialised from the pretrained checkpoint named by m_name.
model = NERModel()

In [None]:
# ckpt_name = "ner-v1"
# model =  NERModel().load_from_checkpoint("checkpoints/"+ckpt_name)

In [None]:
# Keep only the single best checkpoint (lowest val_loss), saved as checkpoints/ner.ckpt.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="ner",
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode="min"
)

trainer = Trainer(
    # Callback instances are registered through `callbacks`; handing a
    # ModelCheckpoint object to `checkpoint_callback=` is deprecated.
    callbacks=[checkpoint_callback],
    max_epochs=EPOCHS,
    progress_bar_refresh_rate=10,
    gpus=1,
    accelerator='dp',
    auto_select_gpus=True,
    # NOTE(review): presumably only takes effect through trainer.tune() - confirm it is needed.
    auto_scale_batch_size=True
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# %load_ext tensorboard
# %tensorboard --logdir ./lightning_logs
# NOTE(review): kills a hard-coded PID (presumably a stale TensorBoard process from an
# earlier session); on a fresh runtime PID 286 may not exist or may be a different process.
!kill 286

In [None]:
!rm -r lightning_logs

rm: cannot remove 'lightning_logs': No such file or directory


In [None]:
# Train for EPOCHS epochs; the checkpoint callback saves the best model by val_loss.
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 5: val_loss reached 0.09387 (best 0.09387), saving model to "/content/checkpoints/ner.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1, global step 11: val_loss reached 0.00226 (best 0.00226), saving model to "/content/checkpoints/ner.ckpt" as top 1





1

In [None]:
# trainer.test()

In [None]:
# List the checkpoint files written during training.
ls checkpoints

ner.ckpt


In [None]:
# Restore the checkpoint saved during training and freeze it for inference.
trained_model = NERModel.load_from_checkpoint("checkpoints/ner.ckpt")
trained_model.freeze()

In [None]:
def generate_answer(data_row, model=None, tok=None):
  """Generate the tagged entity string for a single example.

  Args:
    data_row: Mapping or Series with the string fields ``prefix`` and
      ``input_text``.
    model: Object exposing the seq2seq network as ``.model``. Defaults to the
      notebook-level ``trained_model``.
    tok: Tokenizer used to encode the input and decode the output. Defaults
      to the notebook-level ``tokenizer``.

  Returns:
    The decoded generated sequences concatenated into one string.
  """
  model = trained_model if model is None else model
  tok = tokenizer if tok is None else tok

  # truncation=True: the input is a single sequence, so the pair-only
  # "only_second" strategy has nothing to truncate.
  source_encoding = tok(
      data_row['prefix'] + ': ' + data_row['input_text'],
      max_length=396,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  input_ids = source_encoding["input_ids"]
  attention_mask = source_encoding["attention_mask"]

  # Put the inputs on the model's device so generation also works when the
  # model has been moved to a GPU.
  device = getattr(model, "device", None)
  if device is not None:
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

  generated_ids = model.model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      num_beams=3,
      max_length=80,
      repetition_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )

  preds = [
      tok.decode(generated_id,skip_special_tokens=True,clean_up_tokenization_spaces=True)
      for generated_id in generated_ids
  ]

  return "".join(preds)


In [None]:
# Extracting NER
# > - Genres
# # - Persons
# + - Numbers
def extract_entities(text):
  """Split a tagged string into genre, person and number entities.

  Every entity in ``text`` is terminated by a one-character marker:
  ``>`` for genres, ``#`` for persons and ``+`` for numbers. Text that is not
  followed by a marker is ignored.

  Args:
    text: Tagged string, e.g. ``'sci-fi> ryan guzman# asghar farhadi#'``.

  Returns:
    dict with the keys ``genres``, ``persons`` and ``numbers``, each mapping
    to the list of whitespace-stripped entities in order of appearance.
  """
  marker_to_key = {'>': 'genres', '#': 'persons', '+': 'numbers'}
  entities = {key: [] for key in marker_to_key.values()}
  # Scan once for "<text><marker>" pairs. No sentinel string is inserted into
  # the text, so input that happens to contain "[cut]" is left intact.
  for match in re.finditer(r'([^>#+]*)([>#+])', text):
    value, marker = match.groups()
    entities[marker_to_key[marker]].append(value.strip())
  return entities

In [None]:
# Preview the first rows of the held-out split.
test_df.head()

Unnamed: 0,input_text,target_text,prefix
3125,what happened in the year 616,616+,ner
2108,suggest me some biography movies directed by w...,biography> wally pfister# jeremy saulnier#,ner
1824,suggest me some action movies directed by mayk...,action> mayko nguyen# leland orser#,ner
1530,suggest me some music movies directed by rasmu...,music> rasmus hardiker# pascal laugier#,ner
1422,suggest me some animation movies directed by j...,animation> justin timberlake# axel devillers#,ner


In [None]:
# .iloc is positional: this picks the row at position 1422 of test_df,
# not the row whose index label is 1422.
sample = test_df.iloc[1422]
# sample['input_text']='1945'
sample

input_text     suggest me some sci-fi movies directed by ryan...
target_text                sci-fi> ryan guzman# asghar farhadi# 
prefix                                                       ner
Name: 1821, dtype: object

In [None]:
# Generate the tagged entity string for the selected sample.
ans = generate_answer(sample)
ans 

'sci-fi> ryan guzman# asghar farhadi#'

In [None]:
# Parse the generated string into genres / persons / numbers.
extract_entities(ans)

{'genres': ['sci-fi'],
 'numbers': [],
 'persons': ['ryan guzman', 'asghar farhadi']}