In [1]:
#Installing dependencies
!pip install sentence-splitter



In [2]:
!pip install transformers



In [3]:
!pip install SentencePiece



In [4]:
#importing the PEGASUS Transformer model
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

#setting up the model
def get_response(input_text,num_return_sequences):
  batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [5]:
#test input sentence
text = "I will be showing you how to build a web application in Python using the SweetViz and its dependent library."

In [6]:
#printing response
get_response(text, 5)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



['I will show you how to use the SweetViz and its dependent library to build a web application.',
 'I will show you how to use the SweetViz library to build a web application.',
 'I will show you how to build a web application using the SweetViz and its dependent library.',
 'I will show you how to use the SweetViz and its dependent library to build a web application in Python.',
 'I will show you how to build a web application in Python using the SweetViz library.']

In [7]:
get_response(text, 1)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



['I will show you how to use the SweetViz and its dependent library to build a web application.']

In [8]:
# Paragraph of text
context = "I will be showing you how to build a web application in Python using the SweetViz and its dependent library. Data science combines multiple fields, including statistics, scientific methods, artificial intelligence (AI), and data analysis, to extract value from data. Those who practice data science are called data scientists, and they combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights."
print(context)

I will be showing you how to build a web application in Python using the SweetViz and its dependent library. Data science combines multiple fields, including statistics, scientific methods, artificial intelligence (AI), and data analysis, to extract value from data. Those who practice data science are called data scientists, and they combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.


In [9]:

# Takes the input paragraph and splits it into a list of sentences
from sentence_splitter import SentenceSplitter, split_text_into_sentences

splitter = SentenceSplitter(language='en')

sentence_list = splitter.split(context)
sentence_list

['I will be showing you how to build a web application in Python using the SweetViz and its dependent library.',
 'Data science combines multiple fields, including statistics, scientific methods, artificial intelligence (AI), and data analysis, to extract value from data.',
 'Those who practice data science are called data scientists, and they combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.']

In [10]:
# Do a for loop to iterate through the list of sentences and paraphrase each sentence in the iteration
paraphrase = []

for i in sentence_list:
  a = get_response(i,1)
  paraphrase.append(a)

In [11]:
# Generating the paraphrased text
paraphrase

[['I will show you how to use the SweetViz and its dependent library to build a web application.'],
 ['Data science combines multiple fields, including statistics, scientific methods, and data analysis, to extract value from data.'],
 ['Data scientists combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.']]

In [12]:
#creating the second split
paraphrase2 = [' '.join(x) for x in paraphrase]
paraphrase2

['I will show you how to use the SweetViz and its dependent library to build a web application.',
 'Data science combines multiple fields, including statistics, scientific methods, and data analysis, to extract value from data.',
 'Data scientists combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.']

In [13]:

# Combine the above splitted lists into a paragraph
paraphrase3 = [' '.join(x for x in paraphrase2) ]
paraphrased_text = str(paraphrase3).strip('[]').strip("'")
paraphrased_text

'I will show you how to use the SweetViz and its dependent library to build a web application. Data science combines multiple fields, including statistics, scientific methods, and data analysis, to extract value from data. Data scientists combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.'

In [14]:
# Comparison of the original (context variable) and the paraphrased version (paraphrase3 variable)

print(context)
print(paraphrased_text)

I will be showing you how to build a web application in Python using the SweetViz and its dependent library. Data science combines multiple fields, including statistics, scientific methods, artificial intelligence (AI), and data analysis, to extract value from data. Those who practice data science are called data scientists, and they combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.
I will show you how to use the SweetViz and its dependent library to build a web application. Data science combines multiple fields, including statistics, scientific methods, and data analysis, to extract value from data. Data scientists combine a range of skills to analyze data collected from the web, smartphones, customers, sensors, and other sources to derive actionable insights.


In [15]:

#setting up the model
def get_response(input_text,num_return_sequences):
  batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

def paraphrase(text):
    splitter = SentenceSplitter(language='en')
    sentence_list = splitter.split(text)
    paraphrase = []
    output=""
    for i in sentence_list:
        a = get_response(i,1)
        paraphrase.append(a)
    for i in paraphrase:
        output=output+" "+i[0]
    return output
print(paraphrase("The introduction is the initial paragraph that begins the subsequent process of the project. Each project, each essay, or any article if it is written, receives an introductory paragraph that opens the way to successive paragraphs or topics of the project."))

 The introduction is the beginning of the project. Each project, each essay, or any article gets an introductory paragraph that opens the way to the next paragraphs or topics of the project.


In [17]:
import torch
import warnings
from flask import jsonify
from nltk.tokenize import sent_tokenize
import random
import nltk.data
import torch
import docx2txt
import PyPDF2
import os
import warnings
warnings.filterwarnings("ignore")
from flask import Flask, render_template, request, redirect, url_for, abort, \
    send_from_directory
from werkzeug.utils import secure_filename
import PyPDF2

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_splitter import SentenceSplitter, split_text_into_sentences

app = Flask(__name__)
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 2 * 1024 * 1024
app.config['UPLOAD_EXTENSIONS'] = ['.docx', '.txt', '.pdf']
app.config['UPLOAD_PATH'] = 'uploads'
text=""


model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)


#setting up the model
def get_response(input_text,num_return_sequences,num_beams=10):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

def paraphrase(text):
    sentence_list = sent_tokenize(text)
    paraphrase = []
    output=""
    for i in sentence_list:
        a = get_response(i,1)
        paraphrase.append(a)
    for i in paraphrase:
        output=output+i[0]+" "
    return output

@app.errorhandler(413)
def too_large(e):
    return "File is too large", 413

@app.route('/')
def index():
    a=''
    files = os.listdir(app.config['UPLOAD_PATH'])
    return render_template('index.html', files=files,a=a)

@app.route('/', methods=['GET','POST'])
def upload_files():
    a=''
    uploaded_file = request.files['file']
    filename = secure_filename(uploaded_file.filename)
    if filename != '':
        file_ext = os.path.splitext(filename)[1]
        if file_ext not in app.config['UPLOAD_EXTENSIONS'] :
            return "Invalid image", 400
        uploaded_file.save(os.path.join(app.config['UPLOAD_PATH'], filename))
        if file_ext==".pdf":
            pdfdoc = PyPDF2.PdfFileReader("uploads/"+filename)
            for i in range(pdfdoc.numPages):
                current_page = pdfdoc.getPage(i)
                print("===================")
                print("Content on page:" + str(i + 1))
                print("===================")
                a=a+current_page.extractText()
        if file_ext==".txt":
            with open ("uploads/"+filename, "r") as myfile:
                a=myfile.readlines()
        if file_ext==".docx":
            # extract text
            a = docx2txt.process("uploads/"+filename)
    print(a)
    return render_template('index.html', a=a)

@app.route('/uploads/<filename>')
def upload(filename):
    return send_from_directory(app.config['UPLOAD_PATH'], filename)

@app.route('/phrase', methods=['POST'])
def phrase():
    sen = request.get_json()
    print(sen['data'])
    pem = sen['data']
    print (pem)
    text = paraphrase(pem)
    ata = {'name':text}
    return jsonify(ata)



if __name__ == '__main__':
   app.run(debug=True)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Restarting with watchdog (windowsapi)


SystemExit: 1