In [1]:
# Install with the %pip magic so the packages land in the environment of the
# running kernel (a plain `!pip` resolves whatever `pip` is first on PATH).
# TODO(review): transformers and sentencepiece are not pinned; pin them once a
# known-good combination with pytorch-lightning 1.2.3 is confirmed.
%pip install pytorch-lightning==1.2.3
%pip install transformers
%pip install sentencepiece

Collecting pytorch-lightning==1.2.3
  Downloading pytorch_lightning-1.2.3-py3-none-any.whl (821 kB)
[K     |████████████████████████████████| 821 kB 876 kB/s eta 0:00:01
[?25hCollecting PyYAML!=5.4.*,>=5.1
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
[K     |████████████████████████████████| 269 kB 50.2 MB/s eta 0:00:01
Building wheels for collected packages: PyYAML
  Building wheel for PyYAML (setup.py) ... [?25ldone
[?25h  Created wheel for PyYAML: filename=PyYAML-5.3.1-cp37-cp37m-linux_x86_64.whl size=44620 sha256=a049cf8db518c432e741d4b4f890d4dcbfe2e5a53a812f0bfb9285721a06b555
  Stored in directory: /root/.cache/pip/wheels/5e/03/1e/e1e954795d6f35dfc7b637fe2277bff021303bd9570ecea653
Successfully built PyYAML
Installing collected packages: PyYAML, pytorch-lightning
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 5.4.1
    Uninstalling PyYAML-5.4.1:
      Successfully uninstalled PyYAML-5.4.1
  Attempting uninstall: pytorch-lightning
    Found existing instal

In [None]:
# -y answers apt's confirmation prompt, which cannot be answered from a notebook cell.
!apt install -y git-lfs
# Global git identity (the address previously ended in a doubled ".com").
!git config --global user.email "arkanfadhil080@gmail.com"
!git config --global user.name "Muhammad Fadhil Arkan"

In [3]:
import argparse
import csv
import glob
import json
import logging
import os
import random
import re
import time
from itertools import chain
from string import punctuation

import nltk
from nltk.tokenize import sent_tokenize

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

  When CUDA is available, the generators of all CUDA devices are seeded too.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

# Seed once, up front, for the whole notebook.
set_seed(42)

In [None]:
# Directory of the PAWS paraphrase data inside the indonesian_datasets repository
# (the repository itself is cloned by a later cell).
pth = "./indonesian_datasets/paraphrase/paws/data/"

In [None]:
class T5FineTuner(pl.LightningModule):
  """LightningModule that fine-tunes a T5 conditional-generation model.

  ``hparams`` is a namespace-like object. The attributes read by this class are
  ``model_name_or_path``, ``tokenizer_name_or_path``, ``weight_decay``,
  ``learning_rate``, ``adam_epsilon``, ``warmup_steps``, ``data_dir``,
  ``max_seq_length``, ``train_batch_size``, ``eval_batch_size``, ``n_gpu``,
  ``gradient_accumulation_steps`` and ``num_train_epochs``.

  The dataloader hooks rely on the notebook-level names ``CustomDataset`` and
  ``pth``, which must be defined before training starts.
  """

  def __init__(self, hparams):
    super().__init__()
    # Record of hook invocations; only optimizer_step appends to it.
    self.called = []

    # NOTE(review): direct assignment to ``hparams`` is tied to the pinned
    # Lightning release; confirm before upgrading pytorch-lightning.
    self.hparams = hparams

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
    """Delegate to the wrapped T5 model; ``lm_labels`` is passed on as ``labels``."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )

  def is_logger(self):
    """Always True; LoggingCallback checks this before reporting metrics."""
    return True

  def _step(self, batch):
    """Run one forward pass on ``batch`` and return the loss.

    ``batch`` must provide ``source_ids``, ``source_mask``, ``target_ids`` and
    ``target_mask``. The loss is taken as the first element of the model output.
    """
    # Work on a copy so the caller's batch is left untouched. Padding positions
    # in the labels are replaced by -100 (the label value the Hugging Face
    # seq2seq loss skips).
    lm_labels = batch["target_ids"].clone()
    lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask'],
    )

    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Return the training loss together with a ``log`` entry for it."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    """Average the per-step ``loss`` values collected over the epoch."""
    # NOTE(review): whether the pinned Lightning release accepts a return
    # value from this hook should be confirmed.
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Return the validation loss of one batch under the key ``val_loss``."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch ``val_loss`` values of the validation run."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Build the AdamW optimizer and return it in a one-element list.

    Parameters whose name contains ``bias`` or ``LayerNorm.weight`` get a weight
    decay of 0.0; all others use ``hparams.weight_decay``. The optimizer is
    also stored on ``self.opt`` because ``train_dataloader`` attaches the
    learning-rate schedule to it. No scheduler is returned from this hook.
    """
    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(
      self,
      epoch,
      batch_idx,
      optimizer,
      optimizer_idx,
      optimizer_closure,
      on_tpu,
      using_native_amp,
      using_lbfgs,
  ):
    """Run the default optimizer step, then record that this hook was called."""
    super().optimizer_step(
        epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs
    )
    self.called.append("optimizer_step")  # append after as closure calls other methods

  def get_tqdm_dict(self):
    """Return the running loss (3 decimals) and the scheduler's last learning rate.

    Reads ``self.trainer.avg_loss`` and ``self.lr_scheduler``; the latter exists
    only after ``train_dataloader`` has run.
    """
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training DataLoader and create the linear LR schedule.

    The schedule is attached to ``self.opt``, so ``configure_optimizers`` must
    have run first (presumably guaranteed by the Trainer; verify).
    """
    train_dataset = CustomDataset(tokenizer=self.tokenizer, type_path=pth+"PAW_Train_Global",data_dir=self.hparams.data_dir, max_len=self.hparams.max_seq_length)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                            num_workers=4)
    # Estimated number of optimizer updates over the whole run.
    t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
    )
    # NOTE(review): the scheduler is only stored; nothing in this class calls
    # its step(), so the learning rate stays at its initial value. Before
    # wiring step() in, confirm that hparams.gradient_accumulation_steps
    # matches the accumulation the Trainer actually uses, otherwise t_total is
    # too small and the rate would reach zero early.
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the (unshuffled) validation DataLoader from the PAW_Test_Global file."""
    val_dataset = CustomDataset(tokenizer=self.tokenizer, type_path=pth+"PAW_Test_Global",data_dir=self.hparams.data_dir, max_len=self.hparams.max_seq_length)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)
  



In [None]:
logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
  """Callback that reports ``trainer.callback_metrics`` through the module logger.

  Metrics are reported in sorted key order; the keys ``log`` and
  ``progress_bar`` are left out. Nothing beyond the header line is reported
  when ``pl_module.is_logger()`` is false.
  """

  def on_validation_end(self, trainer, pl_module):
    """Log every callback metric after a validation run."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return
    results = trainer.callback_metrics
    for name in sorted(results):
      if name in ("log", "progress_bar"):
        continue
      logger.info("{} = {}\n".format(name, str(results[name])))

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric after a test run and write them to ``test_results.txt``.

    The file is created inside ``pl_module.hparams.output_dir``.
    """
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return
    results = trainer.callback_metrics
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(results):
        if name in ("log", "progress_bar"):
          continue
        line = "{} = {}\n".format(name, str(results[name]))
        logger.info(line)
        writer.write(line)


In [None]:
# Hyper parameters: defaults for the fine-tuning run. A later cell overrides some
# of them and turns the mapping into an argparse.Namespace.
args_dict = {
    # --- locations ---
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    # --- checkpoint and tokenizer ---
    "model_name_or_path": 'panggi/t5-base-indonesian-summarization-cased',
    "tokenizer_name_or_path": 'panggi/t5-base-indonesian-summarization-cased',
    "max_seq_length": 512,
    # --- optimisation ---
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 6,
    "eval_batch_size": 6,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    # --- mixed precision ---
    # To enable 16-bit training, install apex and set fp_16 to True.
    "fp_16": False,
    # Optimisation levels are described at
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # With 16-bit training, set this to a sensible value; 0.5 is a good default.
    "max_grad_norm": 1.0,
    "seed": 42,
}

In [None]:
# Notebook-level tokenizer loaded from the same checkpoint name that the
# hyperparameters use for tokenizer_name_or_path.
tokenizer = T5Tokenizer.from_pretrained('panggi/t5-base-indonesian-summarization-cased')

class CustomDataset(Dataset):
    """Paraphrase pairs read from a header-less CSV file and tokenized up front.

    The first column of each row is the source sentence and the second column is
    the target sentence. Rows with fewer than two columns (for example blank
    lines) are skipped.

    Args:
        tokenizer: object providing ``batch_encode_plus``; it is called with one
            text at a time and must return ``input_ids`` and ``attention_mask``.
        data_dir: accepted for interface compatibility; not used, because
            ``type_path`` already carries the directory.
        type_path: path of the CSV file without the ``.csv`` extension.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=256):
        self.source_column = "question1"
        self.target_column = "question2"

        # newline="" is how the csv module expects its input file to be opened.
        with open(type_path + ".csv", "r", newline="") as csv_file:
            self.data = [row for row in csv.reader(csv_file, delimiter=',') if len(row) >= 2]

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ids and attention masks of example ``index`` as a dict.

        ``squeeze()`` drops the size-1 dimensions left over from encoding each
        text as a batch of one.
        """
        source = self.inputs[index]
        target = self.targets[index]
        return {
            "source_ids": source["input_ids"].squeeze(),
            "source_mask": source["attention_mask"].squeeze(),
            "target_ids": target["input_ids"].squeeze(),
            "target_mask": target["attention_mask"].squeeze(),
        }

    def _encode(self, text):
        """Tokenize one text, padded and truncated to ``self.max_len``, as PyTorch tensors."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every (source, target) row into ``self.inputs`` / ``self.targets``."""
        for example in self.data:
            # The source gets the task prefix; both texts get an explicit end marker.
            source_text = "paraphrase: " + example[0] + ' </s>'
            target_text = example[1] + " </s>"

            self.inputs.append(self._encode(source_text))
            self.targets.append(self._encode(target_text))





In [None]:
# Clone the dataset repository only when it is not already present, so that
# re-running the cell is a no-op instead of a git error.
![ -d indonesian_datasets ] || git clone https://github.com/Wikidepia/indonesian_datasets.git

In [None]:
# Data directory inside the cloned repository (same value as the earlier `pth` assignment).
pth = "./indonesian_datasets/paraphrase/paws/data/"

In [None]:
import csv


def _read_paraphrase_pairs(tsv_path):
  """Return ``(row[1], row[2])`` for every row whose fourth column equals "1".

  The file is read as tab-separated text and its first line is skipped as a
  header. Rows with fewer than four columns are ignored. The fourth column is
  presumably the paraphrase label of the PAWS data; verify against the dataset.
  """
  pairs = []
  # newline="" is how the csv module expects its input file to be opened.
  with open(tsv_path, "r", newline="", encoding="utf-8") as tsv_file:
    reader = csv.reader(tsv_file, delimiter="\t")
    next(reader, None)  # skip the header line (tolerates an empty file)
    for row in reader:
      if len(row) > 3 and row[3] == "1":
        pairs.append((row[1], row[2]))
  return pairs


# One list of sentence pairs per split.
train_examples = _read_paraphrase_pairs(pth + "final/train.tsv")
test_examples = _read_paraphrase_pairs(pth + "final/test.tsv")
dev_examples = _read_paraphrase_pairs(pth + "final/dev.tsv")






In [None]:
# Merge the dev pairs into the test pairs; the combined list is later written to
# PAW_Test_Global.csv.
# NOTE: not idempotent — re-running this cell prepends dev_examples again.
test_examples = dev_examples + test_examples

In [None]:
# Display the current number of pairs in test_examples.
len(test_examples)

In [None]:
# Write the training pairs as a header-less two-column CSV; this is the file the
# training dataloader reads. newline="" lets the csv module control the line
# endings, as its documentation asks for.
with open(pth+"PAW_Train_Global.csv","w", newline="") as csvfile:
  csv.writer(csvfile).writerows(train_examples)


In [None]:
# Write the evaluation pairs as a header-less two-column CSV; this is the file
# the validation dataloader reads. newline="" lets the csv module control the
# line endings, as its documentation asks for.
with open(pth+"PAW_Test_Global.csv","w", newline="") as csvfile:
  csv.writer(csvfile).writerows(test_examples)


In [None]:
# exist_ok=True replaces the check-then-create pair and keeps the cell re-runnable.
os.makedirs('t5_paw_Global', exist_ok=True)

# Override the defaults for this run and expose them as attributes (args.<name>).
args_dict.update({'output_dir': 't5_paw_Global','num_train_epochs':10,'max_seq_length':256})
args = argparse.Namespace(**args_dict)
print(args_dict)

In [None]:
# Checkpointing: monitor "val_loss" (lower is better) and keep the top 5
# checkpoints in the output directory.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    prefix="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments for pl.Trainer.
# NOTE(review): accumulate_grad_batches is fixed at 1 and does not use
# args.gradient_accumulation_steps — confirm which value is intended.
train_params = {
    "accumulate_grad_batches": 1,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}

In [None]:
# Build the Lightning module; its constructor loads the pretrained model and tokenizer named in args.
model = T5FineTuner(args)

In [None]:
# csv must be in scope before fit: the dataloaders instantiate CustomDataset, which reads CSV files.
import csv
# Build the trainer from the keyword arguments assembled in train_params.
trainer = pl.Trainer(**train_params)

print (" Training model")
# No dataloaders are passed here; the module's train_dataloader/val_dataloader hooks supply them.
trainer.fit(model)

print ("training finished")

print ("Saving model")
# Saves only the wrapped Hugging Face model (model.model), not the Lightning module.
# NOTE(review): this directory name is all lower-case while output_dir is 't5_paw_Global' — confirm the difference is intended.
model.model.save_pretrained('t5_paw_global')

print ("Saved model")

In [None]:
# The Hub token is read from the environment instead of being stored in the notebook.
# SECURITY: the token that was previously hardcoded in this cell has been exposed
# and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token before pushing.")
model.model.push_to_hub(use_auth_token=hf_token)

In [32]:
# Getting the output

# Reload the fine-tuned weights from the Hub; the tokenizer comes from the base checkpoint.
# Note: this rebinds `model` and `tokenizer`, which earlier cells used for other objects.
model = T5ForConditionalGeneration.from_pretrained('fadhilarkan/tmpvqruuuz0')
tokenizer = T5Tokenizer.from_pretrained('panggi/t5-base-indonesian-summarization-cased')

# Use the GPU when there is one and fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

sentence = "Ekonomi Neoklasik memandang ketidaksamaan dalam distribusi pendapatan sebagai timbul dari perbedaan nilai ditambahkan oleh tenaga kerja, modal, dan tanah."

# Same input format as the training data: task prefix plus explicit end marker.
text =  "paraphrase: " + sentence + " </s>"

# Upper bound for both the encoded input and the generated output.
max_len = 256

# A single sentence needs no padding; it is only truncated to max_len.
encoding = tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

# Sampling-based decoding: 5 candidates drawn with top_k=220 and top_p=1.
# (The name `beam_outputs` is kept although no beam search is used.)
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=max_len,
    top_k=220,
    top_p=1,
    early_stopping=True,
    num_return_sequences=5
)

print ("\nOriginal Question ::")
print (sentence)
print ("\n")
print ("Paraphrased Questions :: ")

# Keep each distinct candidate once, dropping any that equals the input (case-insensitive).
final_outputs = []
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))


Original Question ::
Ekonomi Neoklasik memandang ketidaksamaan dalam distribusi pendapatan sebagai timbul dari perbedaan nilai ditambahkan oleh tenaga kerja, modal, dan tanah.


Paraphrased Questions :: 
0: Ekonomi Neoklasik, berdasarkan perbedaan nilai pada distribusi pendapatan sebagai timbul dari perbedaan nilai yang disimpan oleh tenaga kerja, modal, dan tanah.
1: Ekonomi Neoklasik menilai ketidaksamaan dalam distribusi pendapatan sebagai bentuk dari perbedaan nilai yang ditambahkan oleh tenaga kerja, modal dan tanah.
2: Ekonomi Neoklasik tentang ketidaksamaan dalam distribusi pendapatan sebagai hasil dari perbedaan nilai dari tenaga kerja, modal, dan tanah.
3: Ekonomi Neoklasik memandang ketidaksamaan dalam distribusi pendapatan sebagai bagian dari perbedaan nilai yang ditambahkan oleh tenaga kerja, modal dan tanah.
4: Ekonomi Neoklasik melihat ketidaksamaan dalam anggaran sebagai timbul dari perbedaan nilai yang ditambahkan oleh tenaga kerja, modal, dan tanah untuk kebutuhan.


In [33]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer

class pipeLine_indo:
    """Bundle of three Indonesian models: question answering, question generation and paraphrasing.

    All models are loaded in the constructor. The paraphrase method uses the
    instance's own model and tokenizer, so it does not depend on notebook-level
    variables.
    """

    def __init__(self):
        self.model_answer_name = "fadhilarkan/qa-indo-k"
        self.model_question_name = "fadhilarkan/gq-indo-k"
        self.model_paraphrase = T5ForConditionalGeneration.from_pretrained('fadhilarkan/tmpvqruuuz0')
        self.model_paraphrase_token = T5Tokenizer.from_pretrained('panggi/t5-base-indonesian-summarization-cased')
        # Upper bound for both the encoded input and the generated paraphrase.
        self.paraphrase_maxlen = 256

        self.nlp_answer = pipeline('question-answering', model=self.model_answer_name, tokenizer=self.model_answer_name)
        self.nlp_question = pipeline('text2text-generation', model=self.model_question_name, tokenizer=self.model_question_name)

    def generate_question(self, context):
        """Return the text generated by the question pipeline for ``str(context)``."""
        generated = self.nlp_question(str(context))
        return generated[0]['generated_text']

    def predict_answer(self, context, question):
        """Return the ``answer`` field of the QA pipeline for ``question`` about ``context``."""
        answer = self.nlp_answer({'question': question, 'context': context})
        return answer['answer']

    def paraphrase(self, sentence):
        """Return up to five distinct sampled paraphrases of ``sentence``.

        Candidates equal to the input (ignoring case) and duplicates are
        dropped, so the list can be shorter than five or empty.
        """
        paraphrase_tokenizer = self.model_paraphrase_token
        paraphrase_model = self.model_paraphrase

        text = "paraphrase: " + sentence + " </s>"
        encoding = paraphrase_tokenizer.encode_plus(
            text, max_length=self.paraphrase_maxlen, truncation=True, return_tensors="pt"
        )
        # Inputs must live on the same device as the model.
        device = paraphrase_model.device
        input_ids = encoding["input_ids"].to(device)
        attention_masks = encoding["attention_mask"].to(device)

        sampled_outputs = paraphrase_model.generate(
            input_ids=input_ids, attention_mask=attention_masks,
            do_sample=True, max_length=self.paraphrase_maxlen, top_k=220, top_p=1,
            early_stopping=True, num_return_sequences=5
        )

        final_outputs = []
        for sampled_output in sampled_outputs:
            sent = paraphrase_tokenizer.decode(
                sampled_output, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            if sent.lower() != sentence.lower() and sent not in final_outputs:
                final_outputs.append(sent)

        return final_outputs

In [34]:
# Instantiate the pipeline; the constructor loads the paraphrase model and builds both Hugging Face pipelines.
a = pipeLine_indo()
