In [None]:
!pip install transformers
!pip install sentencepiece
!pip install fitz
#Restart runtime after cell runs installation

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
from transformers import pipeline

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint name feeds both the summarisation model and its tokenizer.
SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"

model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)

In [None]:
!gdown -O  t5_que_gen.zip --id 1vhsDOW9wUUO83IQasTPlkxb82yxmMH-V
!unzip t5_que_gen.zip

Downloading...
From: https://drive.google.com/uc?id=1vhsDOW9wUUO83IQasTPlkxb82yxmMH-V
To: /content/t5_que_gen.zip
100% 1.65G/1.65G [00:18<00:00, 87.3MB/s]
Archive:  t5_que_gen.zip
   creating: t5_que_gen_model/
   creating: t5_que_gen_model/t5_base_tok_que_gen/
  inflating: t5_que_gen_model/t5_base_tok_que_gen/spiece.model  
 extracting: t5_que_gen_model/t5_base_tok_que_gen/added_tokens.json  
 extracting: t5_que_gen_model/t5_base_tok_que_gen/tokenizer_config.json  
  inflating: t5_que_gen_model/t5_base_tok_que_gen/special_tokens_map.json  
   creating: t5_que_gen_model/t5_base_que_gen/
  inflating: t5_que_gen_model/t5_base_que_gen/config.json  
  inflating: t5_que_gen_model/t5_base_que_gen/pytorch_model.bin  
 extracting: t5_que_gen_model/logs.zip  
   creating: t5_ans_gen_model/
   creating: t5_ans_gen_model/t5_base_tok_ans_gen/
  inflating: t5_ans_gen_model/t5_base_tok_ans_gen/spiece.model  
  inflating: t5_ans_gen_model/t5_base_tok_ans_gen/added_tokens.json  
 extracting: t5_ans_ge

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
from itertools import chain
from string import punctuation
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#!pip install sentencepiece

In [None]:
class QueGenerator():
  """Generate question/answer pairs from a passage with two local T5 checkpoints.

  The answer model is run once per sentence (with that sentence highlighted) to
  extract answers; the question model then writes one question per answer.
  """

  def __init__(self):
    """Load both models and tokenizers from disk and move the models to GPU when available."""
    self.que_model = T5ForConditionalGeneration.from_pretrained('./t5_que_gen_model/t5_base_que_gen/')
    self.ans_model = T5ForConditionalGeneration.from_pretrained('./t5_ans_gen_model/t5_base_ans_gen/')

    self.que_tokenizer = T5Tokenizer.from_pretrained('./t5_que_gen_model/t5_base_tok_que_gen/')
    self.ans_tokenizer = T5Tokenizer.from_pretrained('./t5_ans_gen_model/t5_base_tok_ans_gen/')

    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    self.que_model = self.que_model.to(self.device)
    self.ans_model = self.ans_model.to(self.device)

  def generate(self, text):
    """Return a list of ``{'answer': str, 'question': str}`` dicts for ``text``.

    The list is empty when no answer could be extracted (e.g. empty text).
    """
    answers = self._get_answers(text)
    questions = self._get_questions(text, answers)
    return [{'answer': ans, 'question': que} for ans, que in zip(answers, questions)]

  @staticmethod
  def _strip_special_tokens(decoded, tokenizer):
    """Remove the tokenizer's pad/eos token strings from ``decoded`` and trim whitespace."""
    for token in (tokenizer.pad_token, tokenizer.eos_token):
      if token:
        decoded = decoded.replace(token, '')
    return decoded.strip()

  def _get_answers(self, text):
    """Extract answers from ``text``, one model input per highlighted sentence.

    Returns a flat list of non-empty answer strings without pad/eos markers.
    """
    # split into sentences
    sents = sent_tokenize(text)
    if not sents:
      return []

    # One example per sentence: the whole passage with sentence i wrapped in [HL] markers.
    examples = []
    for i in range(len(sents)):
      marked = ["[HL] %s [HL]" % sent if i == j else sent for j, sent in enumerate(sents)]
      examples.append(" ".join(marked).strip() + " </s>")

    # Pad only to the longest example of the batch and truncate explicitly at 512 tokens.
    batch = self.ans_tokenizer(examples, max_length=512, padding='longest', truncation=True, return_tensors="pt")
    with torch.no_grad():
      outs = self.ans_model.generate(input_ids=batch['input_ids'].to(self.device),
                                attention_mask=batch['attention_mask'].to(self.device),
                                max_length=32,
                                )

    answers = []
    for ids in outs:
      # Special tokens are kept while decoding so the '[SEP]' separator is certain to
      # survive; the pad/eos strings are removed from each piece afterwards.
      decoded = self.ans_tokenizer.decode(ids, skip_special_tokens=False)
      for piece in decoded.split('[SEP]'):
        piece = self._strip_special_tokens(piece, self.ans_tokenizer)
        # Pieces that held only pad/eos markers are not answers; drop them.
        if piece:
          answers.append(piece)
    return answers

  def _get_questions(self, text, answers):
    """Generate one question per entry of ``answers``, using ``text`` as context.

    Returns the questions in the same order as ``answers``, without pad/eos markers.
    """
    if not answers:
      return []

    examples = ["%s [SEP] %s </s>" % (ans, text) for ans in answers]

    batch = self.que_tokenizer(examples, max_length=512, padding='longest', truncation=True, return_tensors="pt")
    with torch.no_grad():
      outs = self.que_model.generate(input_ids=batch['input_ids'].to(self.device),
                                attention_mask=batch['attention_mask'].to(self.device),
                                max_length=32,
                                num_beams = 4)
    return [
      self._strip_special_tokens(self.que_tokenizer.decode(ids, skip_special_tokens=False), self.que_tokenizer)
      for ids in outs
    ]

In [None]:
# NOTE(review): this value is a file path, not passage text, yet later cells hand
# `text` directly to the summariser and to nltk.word_tokenize — presumably the
# PDF contents should be extracted first. TODO confirm the intended input.
text = "/content/1-s2.0-S004896972208367X-main-2-9.pdf"

### Do not run the cells from here on

In [None]:
# Build the question generator and the summarisation pipeline (max_length=80).
que_generator = QueGenerator()
pipe_summary = pipeline("summarization", model=model, tokenizer=tokenizer, max_length=80)

# Keep only the summary string of the first (and only) result.
output_summary = pipe_summary(text)[0]["summary_text"]

# Last expression of the cell, so the generated question/answer pairs are displayed.
que_generator.generate(output_summary)

Your max_length is set to 80, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'answer': '<pad>integration or retention of trees',
  'question': '<pad> What does agroforestry involve?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '</s>',
  'question': '<pad> What is agroforestry?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '<pad>Ecological interactions',
  'question': "<pad> What can benefit smallholder farmers' crop yields while minimising the need for farming inputs?</s>"},
 {'answer': '</s><pad><pad>',
  'question': '<pad> What is the purpose of agroforestry?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'}]

In [None]:
# Show the input passage and its summary one after the other.
for heading, body in (("Original Text:\n", text), ("Paraphrased Text:\n", output_summary)):
  print(heading, body)



Original Text:
 Agroforestry involves the integration or retention of trees in agricultural landscapes for socio-economic and ecological benefit (Schroth et al., 2004).Ecological interactions between trees, soils and crops in agroforests can benefit smallholder farmers' crop yields whilst minimising the need for farming inputs (Jose, 2009; Ajayi et al., 2011). 
Paraphrased Text:
  Agroforestry involves the integration or retention of trees in agricultural landscapes for socio-economic and ecological benefit (Schroth et al., 2004). Ecological interactions between trees, soils and crops in agroforests can benefit smallholder farmers' crop yields whilst minimising the need for farming inputs .


In [None]:
# Word-tokenize the passage and its summary, then print both token counts.
words_1, words_2 = (nltk.word_tokenize(passage) for passage in (text, output_summary))
print(len(words_1), len(words_2))

63 52


### End of the cells that should not be run

In [None]:
# PyMuPDF is the package that supplies the `fitz` module imported in the next cell.
!pip install PyMuPDF

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyMuPDF
  Downloading PyMuPDF-1.22.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.22.2


In [None]:
import fitz

def get_text_data(pdf_path):
  """Return the text of every page of the document at ``pdf_path``, concatenated in page order.

  The document is opened with ``fitz.open`` and closed again when the function returns.
  """
  with fitz.open(pdf_path) as doc:
    return "".join(page.get_text() for page in doc)

In [None]:
import nltk  #NLTK Library is for sentence and word tokenization.
nltk.download('punkt')
from nltk import tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Word budget of a paragraph: once the running paragraph holds at least this many
# word tokens, it is closed and the next sentence starts a new one.
MAX_PARAGRAPH_WORDS = 200

text = get_text_data('/content/test_main.pdf')

result = tokenize.sent_tokenize(text)

str_paragraph = ''
paragraphs = []

for sentence in result:
  # The budget is checked before the sentence is added, so a paragraph may exceed
  # it by the length of its last sentence.
  if len(tokenize.word_tokenize(str_paragraph)) >= MAX_PARAGRAPH_WORDS:
    paragraphs.append(str_paragraph)
    str_paragraph = ''

  str_paragraph = str_paragraph + ' ' + sentence

# Flush the paragraph still being built when the sentences run out; without this
# step the tail of the document never reaches `paragraphs`.
if str_paragraph:
  paragraphs.append(str_paragraph)

In [None]:
# Display the second entry of `paragraphs` to check the chunking.
paragraphs[1]

' A multi-model ensemble is constructed with the global climate models \nthat are found to best reproduce the wind climate in Northern Europe. The results anticipate an overall decline in \nwind power density, especially in the high-emissions scenario and in certain regions (up to 30% off Western \nIreland), which should be taken into account in planning future offshore wind deployments. As an exception, \nslight increases (around 10%) are projected in certain areas of the Baltic Sea. The general decline is less pro-\nnounced in the low-emissions scenario. Indeed, the results prove that reducing emissions as advocated by current \nclimate objectives would not only weaken the declining trend but also lead to a more stable resource. 1. Introduction \nCutting carbon emissions to mitigate climate change and reducing \nthe over-dependence on fossil fuels are listed as a priority in a great \nnumber of countries throughout the world. As a result, investments in \nthe energy sector are shifti

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

In [None]:
# Build the question generator and the summarisation pipeline (explicit truncation).
que_generator = QueGenerator()
pipe_summary = pipeline("summarization", model=model, tokenizer=tokenizer, truncation=True)

# Summarise the second paragraph and keep only the summary string of the first result.
output_summary = pipe_summary(paragraphs[1])[0]["summary_text"]

# Last expression of the cell, so the generated question/answer pairs are displayed.
que_generator.generate(output_summary)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'answer': '<pad>Northern Europe',
  'question': '<pad> Where is the wind climate found to be best reproduced?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '</s><pad><pad><pad><pad>',
  'question': '<pad> What type of wind power density is predicted to decline in some areas of the Baltic Sea?</s><pad><pad><pad><pad>'},
 {'answer': '<pad>high-emissions scenario',
  'question': '<pad> The results anticipate an overall decline in wind power density especially in what scenario?</s><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '</s>',
  'question': '<pad> What ensemble is constructed with the global climate models that are found to best reproduce the wind climate in Northern Europe?</s>'},
 {'answer': '<pad>around 10',
  'question': '<pad> How many light increases are projected in certain areas of the Baltic Sea?</s><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '</s><pad><pad><pad><pad>',
  'question': '<pad> What type of wind power density is predict

In [None]:
# Run question generation on the current `output_summary`; as the cell's last
# expression, the returned list of answer/question dicts is displayed.
# NOTE(review): the saved output of this cell covers a different topic than the
# summary produced in the cell above — presumably `output_summary` held another
# paragraph when it was executed; confirm with a fresh Restart & Run All.
que_generator.generate(output_summary)

[{'answer': '<pad>warmer',
  'question': '<pad> What do climate projections for tropical regions forecast?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
  'question': '<pad> What is the name of the game?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '<pad>NE South America, Central America,. Central America',
  'question': '<pad> Which regions may become drier by 2100?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'},
 {'answer': '</s>',
  'question': "<pad> What are the IPCC's recent multi-model mean projections?</s><pad><pad><pad><pad><pad><pad><pad>"},
 {'answer': '<pad>Central America',
  'question': '<pad> Along with Southern Africa and SE Asia, what tropical region is predicted to become wetter by 2100?</s>'},
 {'answer': '</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
  'question': '<pad> What is the name of the item that

In [None]:
# Show the source paragraph and its summary one after the other.
for heading, body in (("Original Text:\n", paragraphs[1]), ("Paraphrased Text:\n", output_summary)):
  print(heading, body)

Original Text:
  A multi-model ensemble is constructed with the global climate models 
that are found to best reproduce the wind climate in Northern Europe. The results anticipate an overall decline in 
wind power density, especially in the high-emissions scenario and in certain regions (up to 30% off Western 
Ireland), which should be taken into account in planning future offshore wind deployments. As an exception, 
slight increases (around 10%) are projected in certain areas of the Baltic Sea. The general decline is less pro-
nounced in the low-emissions scenario. Indeed, the results prove that reducing emissions as advocated by current 
climate objectives would not only weaken the declining trend but also lead to a more stable resource. 1. Introduction 
Cutting carbon emissions to mitigate climate change and reducing 
the over-dependence on fossil fuels are listed as a priority in a great 
number of countries throughout the world. As a result, investments in 
the energy sector are s