# Dataset generation

## NER model

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 30.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
from transformers import BartTokenizerFast, BartForConditionalGeneration

In [83]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [4]:
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

In [6]:
!wget https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt

--2021-11-25 01:10:47--  https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 241387 (236K) [text/plain]
Saving to: ‘clean.txt’


2021-11-25 01:10:47 (9.02 MB/s) - ‘clean.txt’ saved [241387/241387]



In [5]:
with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')

In [6]:
text = text[:50]

In [7]:
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [8]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [9]:
tokenizer.convert_ids_to_tokens(101)

'Ġlike'

In [10]:
from tqdm import tqdm

def get_mask(inputs):
  mask = []
  for sentense in tqdm(inputs['input_ids']):    
    text_tok = tokenizer.convert_ids_to_tokens(sentense)
    #print(text_tok)
    text_tok = [tok.replace("Ġ", "") for tok in text_tok if tok not in ['<s>','</s>','<pad>']] #<s> </s>?
    #print(text_tok)
    ner_results = nlp(" ".join(text_tok))
    res = [False] * 512
    for entity in ner_results:
      try:
        res[text_tok.index(entity['word'])] = True
      except ValueError:
        pass
    #print(ner_results)
    mask.append(res)
  return mask

In [11]:
inputs.labels

tensor([[    0,  7605,   127,  ...,     1,     1,     1],
        [    0,  7605,     5,  ...,     1,     1,     1],
        [    0,  7605,   127,  ...,     1,     1,     1],
        ...,
        [    0,  1106, 44761,  ...,     1,     1,     1],
        [    0,  1620, 13018,  ...,     1,     1,     1],
        [    0,  3084,  1181,  ...,     1,     1,     1]])

In [12]:
import torch

In [13]:
mask_arr = get_mask(inputs)
mask_arr = torch.Tensor(mask_arr)

100%|██████████| 50/50 [00:31<00:00,  1.56it/s]


In [14]:
mask_arr.size()

torch.Size([50, 512])

In [15]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [16]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [17]:
tokenizer.convert_ids_to_tokens(103)

'Ġsome'

In [18]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [19]:
dataset = MeditationsDataset(inputs)

In [20]:

loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

In [21]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
   

In [22]:
from transformers import AdamW
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)

In [23]:
from tqdm import tqdm  # for our progress bar

epochs = 10

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  """
Epoch 0: 100%|██████████| 50/50 [00:58<00:00,  1.17s/it, loss=3.31]
Epoch 1: 100%|██████████| 50/50 [00:58<00:00,  1.17s/it, loss=0.774]
Epoch 2:  30%|███       | 15/50 [00:18<00:42,  1.21s/it, loss=3.39]


KeyboardInterrupt: ignored

# Testing trained model

In [24]:
torch_device = 'cuda'

In [25]:
import google.colab.output

def bart_summarize(text, num_beams, length_penalty, max_length, min_length, no_repeat_ngram_size):
  
  text = text.replace('\n','')
  text_input_ids = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
  summary_ids = model.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))           
  summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
  return summary_txt

#register callback for Javascript
google.colab.output.register_callback('bart_summarize', bart_summarize)

In [26]:
sample_text = """Germany is mulling tighter Covid restrictions that could include lockdown measures with a decision expected on Wednesday. Meanwhile, Spain is tightening controls as infection numbers rise. Austria has opted for total lockdown and the Netherlands for a partial one.

Austria is, so far, the only country in Europe that will make Covid vaccines compulsory from February next year, although there have been calls in other countries for mandating vaccines.

WHO’s Butler said the health agency did not have a position on mandates but said they were a “very delicate” matter.

“It polarizes, you risk marginalizing [people] and it can come at the expense of trust and social inclusion. So it’s a very delicate measure, a last-resort measure. Lessons of history have shown us that where vaccines are mandated or made compulsory, there is an erosion of trust and we have seen this polarization,” he said.

European Commission President Ursula von der Leyen on Tuesday called for the deployment of booster shots and said other preventive measures must be embraced to keep infection numbers down.

“Further measures are necessary to prevent or slow the spread of the virus. In other words, social distancing, wearing masks and hygiene rules. All of these remain equally important. I know that many of us are really beginning to find it very difficult, but we mustn’t forget something. In the EU, 1,600 people die every day of Covid, 1,600 people, day after day,” she noted.

“Therefore, vaccination and hygiene measures are an act of solidarity, and they save lives,” she added."""

In [28]:
bart_summarize(sample_text, num_beams=4, length_penalty = 2.0, max_length=250, min_length=5, no_repeat_ngram_size=1)

", and the of in to.; that from it for which there all who are like a other socialatory is prevent toooll can fearations those with them's haveant away blame or moreion Butlyence anyation things care has also as about they notost necessity thanateure nor need required pain matters do by fit but avoid fearsest fall others much better habitsiness same what either this hands kind oneablyility if blood partent againstment\xa0 man only endut value excuse every overingly called life him amongful should worried being must last besides so distraction without thingius observe men less live: another intotax then no filled vaccine at call natureFrom thee evils stranger wantive still gravity time knowledge be yet make further aversionort pityainuriousct anythinged askite out words satisfied was freedom voluntarily concern equallyit wearile The possibleist dress when health And ablevesration their vaccines he far matter person universe changes throughrel action even on very death now these immediat

In [1]:
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

In [2]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [86]:
torch_device = 'cpu'

In [75]:
import google.colab.output

def bart_summarize(text, num_beams, length_penalty, max_length, min_length, no_repeat_ngram_size):
  
  text = text.replace('\n','')
  text_input_ids = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
  summary_ids = model.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))           
  summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
  return summary_txt

#register callback for Javascript
google.colab.output.register_callback('bart_summarize', bart_summarize)

In [29]:
from IPython.display import HTML

#spinner from https://codepen.io/vovchisko/pen/vROoYQ
spinner_css = """
<style>
@keyframes c-inline-spinner-kf {
  0% {
    transform: rotate(0deg);
  }
  100% {
    transform: rotate(360deg);
  }
}

.c-inline-spinner,
.c-inline-spinner:before {
  display: inline-block;
  width: 11px;
  height: 11px;
  transform-origin: 50%;
  border: 2px solid transparent;
  border-color: #74a8d0 #74a8d0 transparent transparent;
  border-radius: 50%;
  content: "";
  animation: linear c-inline-spinner-kf 300ms infinite;
  position: relative;
  vertical-align: inherit;
  line-height: inherit;
}
.c-inline-spinner {
  top: 3px;
  margin: 0 3px;
}
.c-inline-spinner:before {
  border-color: #74a8d0 #74a8d0 transparent transparent;
  position: absolute;
  left: -2px;
  top: -2px;
  border-style: solid;
}
</style>
"""

input_form = """
<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/pure-min.css" integrity="sha384-oAOxQR6DkCoMliIh8yFnu25d7Eq/PHS21PClpwjOTeU2jRSq11vu66rf90/cZr47" crossorigin="anonymous">

<div style="background-color:white; border:solid #ccc; width:800px; padding:20px; color: black;">
<p><strong>BART</strong> Seq2Seq model with SoTA summarization performance</p>
<textarea id="main_textarea" cols="75" rows="20" placeholder="Paste your text here..." style="font-family: 'Liberation Serif', 'DejaVu Serif', Georgia, 'Times New Roman', Times, serif; font-size: 13pt; padding:10px;"></textarea><br>
<div class="pure-form pure-form-aligned">
   <div class="pure-control-group">
     <label for="no_repeat_ngram_size"><strong>no_repeat_ngram_size:</strong></label>
     <input type="number" id="no_repeat_ngram_size" value="3" style="background-color: white;">
    </div>
    <div class="pure-control-group">
      <label for="num_beams"><strong>num_beams:</strong></label>
      <input type="number" min="0" max="10" step="1" id="num_beams" value="4" style="background-color: white;">
    </div>
    <div class="pure-control-group">
        <label for="length_penalty"><strong>length_penalty:</strong></label>
        <input type="number" min="0.0" max="10.0" step="0.1" id="length_penalty" value="2.0" style="background-color: white;">
    </div>
    <div class="pure-control-group">
        <label for="max_length"><strong>max_length:</strong></label>
        <input type="number" id="max_length" value="142" style="background-color: white;">
    </div>
     <div class="pure-control-group">
        <label for="min_length"><strong>min_length:</strong></label>
        <input type="number" id="min_length" value="56" style="background-color: white;">
    </div>
    <p><a target="_blank" href='https://pastebin.com/raw/BMPcUS6v'>Try to summarize this example article</a></p>
    <div style="width: 300px; display: block; margin-left: auto !important; margin-right: auto !important;">
        <p><button class="pure-button pure-button-primary" style="font-size: 125%%;" onclick="summarize()">Summarize</button>
        <span class="c-inline-spinner" style="visibility: hidden;" id="spinner"></span></p>
    </div>
</div>
</div>
"""

javascript = """
<script type="text/Javascript">


       function saveTextAsFile(textToWrite, fileNameToSaveAs)
    {
    	var textFileAsBlob = new Blob([textToWrite], {type:'text/plain'}); 
    	var downloadLink = document.createElement("a");
    	downloadLink.download = fileNameToSaveAs;
    	downloadLink.innerHTML = "Download File";
    	if (window.webkitURL != null)
    	{
    		// Chrome allows the link to be clicked
    		// without actually adding it to the DOM.
    		downloadLink.href = window.webkitURL.createObjectURL(textFileAsBlob);
    	}
    	else
    	{
    		// Firefox requires the link to be added to the DOM
    		// before it can be clicked.
    		downloadLink.href = window.URL.createObjectURL(textFileAsBlob);
    		downloadLink.onclick = destroyClickedElement;
    		downloadLink.style.display = "none";
    		document.body.appendChild(downloadLink);
    	}
    
    	downloadLink.click();
    }


    function summarize(){
        
        var text = document.getElementById('main_textarea').value;
        var no_repeat_ngram_size = document.getElementById('no_repeat_ngram_size').value;
        var num_beams = document.getElementById('num_beams').value;
        var length_penalty = document.getElementById('length_penalty').value;
        var max_length = document.getElementById('max_length').value;
        var min_length = document.getElementById('min_length').value;
        
        var kernel = google.colab.kernel;

        var resultPromise = kernel.invokeFunction("bart_summarize", [text,num_beams,length_penalty,max_length,min_length,no_repeat_ngram_size]); // developer, look here
        resultPromise.then(
            function(result) {
              document.getElementById('main_textarea').value = 'da resultado';
              document.getElementById('main_textarea').value = result.data["text/plain"];
              document.getElementById('spinner').style = "visibility: hidden;";
              saveTextAsFile(result.data["text/plain"], 'summary.txt')
        }).catch(function(error){document.getElementById('main_textarea').value = error;});
        document.getElementById('spinner').style = "visibility: visible;";
    };
</script>
""" 


HTML(spinner_css + input_form + javascript)