<a href="https://colab.research.google.com/github/TomasMrkva/project/blob/colab-notebooks/Summarizer_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-24bfee76-d7cb-5195-7970-be411ddd35c9)


In [24]:
!pip install -U transformers -q
!pip install -U sentencepiece -q
!pip install -q keybert multi_rake git+https://github.com/LIAAD/yake
!pip install -q atomicwrites

In [None]:
import pprint as pp
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
import yake
from keybert import KeyBERT
from multi_rake import Rake

# Keyword extractor: KeyBERT built on the 'all-mpnet-base-v2' embedding model.
kw_model = KeyBERT(model='all-mpnet-base-v2')

# Summarisation model and tokenizer are loaded from the same checkpoint name.
hf_name = 'pszemraj/led-large-book-summary'
summary_model = AutoModelForSeq2SeqLM.from_pretrained(hf_name)
summary_tokenizer = AutoTokenizer.from_pretrained(hf_name)

# The pipeline is given device index 0 when CUDA is available, otherwise -1.
have_GPU = torch.cuda.is_available()
summarizer = pipeline(
    "summarization",
    model=summary_model,
    tokenizer=summary_tokenizer,
    device=0 if have_GPU else -1,
)

# Headline generator: T5 model moved onto the same kind of device as above.
device = torch.device("cuda" if have_GPU else "cpu")
headline_model = T5ForConditionalGeneration.from_pretrained(
    "Michau/t5-base-en-generate-headline"
)
headline_tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
headline_model = headline_model.to(device)

def make_headline(summary):
  """Generate a headline string for ``summary`` using the T5 headline model.

  The summary is prefixed with "headline: ", tokenized, and passed to
  ``headline_model.generate`` with beam search (4 beams) and a maximum
  output length of 30 tokens. A fixed list of words is supplied as
  ``bad_words_ids`` so that they are excluded from the generated text.

  :param summary: text to produce a headline for
  :return: the first generated sequence, decoded without special tokens
  """
  # Words passed to generate() as bad_words_ids (tokenized without special
  # tokens so only the word pieces themselves are banned).
  banned_words = ["Book", "-", "Story", "story", "Review", "Novel", "book", "novel", "review", "tale", "Tale", "A"]

  prompt = "headline: " + summary
  encoded = headline_tokenizer.encode_plus(prompt, return_tensors="pt")
  prompt_ids = encoded["input_ids"].to(device)
  prompt_mask = encoded["attention_mask"].to(device)
  banned_token_ids = headline_tokenizer(banned_words, add_special_tokens=False).input_ids

  generated = headline_model.generate(
      bad_words_ids=banned_token_ids,
      input_ids=prompt_ids,
      attention_mask=prompt_mask,
      max_length=30,
      num_beams=4,
      repetition_penalty=1.0,
  )

  best_sequence = generated[0]
  return headline_tokenizer.decode(best_sequence, skip_special_tokens=True)

def summarize_led(text):
  """Summarise ``text`` with the module-level ``summarizer`` pipeline.

  Prints 'running' before starting, then calls the pipeline with fixed
  generation settings and returns the text of the first result.

  :param text: the document to summarise
  :return: the ``'summary_text'`` field of the first pipeline result
  """
  print('running')

  # Fixed generation settings forwarded unchanged to the pipeline call.
  generation_options = {
      "min_length": 16,
      "max_length": 450,
      "no_repeat_ngram_size": 3,
      "encoder_no_repeat_ngram_size": 3,
      "clean_up_tokenization_spaces": True,
      "repetition_penalty": 3.7,
      "num_beams": 4,
      "early_stopping": True,
  }
  outputs = summarizer(text, **generation_options)
  first_output = outputs[0]
  return first_output['summary_text']

def keyword_extractios(text, n):
  """Return the top key phrase found in ``text`` by YAKE, RAKE and KeyBERT.

  Each extractor is asked for phrases of at most ``n`` words and only its
  first-ranked result is kept. If an extractor finds nothing (for example
  on very short or empty text) an empty string is returned in its place
  instead of raising ``IndexError``.

  :param text: the text to extract key phrases from
  :param n: maximum number of words in a key phrase
  :return: tuple ``(yake_kw, rake_kw, bert_kw)`` of strings
  """
  def _top_phrase(ranked):
    # Results are indexed as ranked[0][0]: first element of the first entry.
    # An empty result list means the extractor found no candidate phrase.
    return ranked[0][0] if ranked else ""

  yake_kw = _top_phrase(
      yake.KeywordExtractor(top=1, stopwords=None, n=n).extract_keywords(text)
  )
  rake_kw = _top_phrase(Rake(max_words=n).apply(text))
  bert_kw = _top_phrase(
      kw_model.extract_keywords(
          text, keyphrase_ngram_range=(1, n), stop_words=None, top_n=1
      )
  )
  return yake_kw, rake_kw, bert_kw


def run(text):
    """Summarise ``text`` and derive a headline and three keywords from it.

    The summary produced by ``summarize_led`` is the input to both the
    headline generator and the keyword extractors (phrases of up to 8 words).

    :param text: the raw text to process
    :return: tuple ``(headline, yake_kw, rake_kw, bert_kw)``
    """
    condensed = summarize_led(text)
    title = make_headline(condensed)
    keywords = keyword_extractios(condensed, n=8)
    yake_kw, rake_kw, bert_kw = keywords
    return title, yake_kw, rake_kw, bert_kw

In [None]:
# from inotify_simple import INotify, flags
import os
import json
from time import sleep
from os.path import exists
import shutil
import stat
import tempfile

def copy_with_metadata(source, target):
    """Duplicate a file, carrying over its metadata and ownership.

    Lifted from https://stackoverflow.com/a/43761127/2860309

    :param source: source file name
    :param target: target file name
    """
    # shutil.copy2 handles the contents plus stat info (mode, timestamps).
    shutil.copy2(source, target)

    # Ownership is not covered by copy2, so apply the source's uid/gid by hand.
    source_info = os.stat(source)
    os.chown(target, source_info.st_uid, source_info.st_gid)

def atomic_write(file_contents, target_file_path, mode="w"):
    """Write to a temporary file and rename it to avoid file corruption.

    Attribution: @therightstuff, @deichrenner, @hrudham

    The contents are written to a scratch file created in the target's
    directory, flushed and fsynced, and then moved over the target with
    ``os.replace``. When the target already exists it is first copied onto
    the scratch file with ``copy_with_metadata``, so its metadata is kept
    and, in mode "a", its current contents are appended to.

    :param file_contents: contents to be written to file
    :param target_file_path: the file to be created or replaced
    :param mode: the file mode defaults to "w", only "w" and "a" are supported
    """
    # Use the same directory as the destination file so that moving it across
    # file systems does not pose a problem. Only the reserved name is needed:
    # the handle is closed right away so it is not left open while the file
    # is reopened below and later replaced.
    with tempfile.NamedTemporaryFile(
            delete=False,
            dir=os.path.dirname(target_file_path)) as scratch:
        temp_path = scratch.name
    try:
        # preserve file metadata if it already exists
        if os.path.exists(target_file_path):
            copy_with_metadata(target_file_path, temp_path)
        with open(temp_path, mode) as f:
            f.write(file_contents)
            # Push the data to disk before the rename makes it visible.
            f.flush()
            os.fsync(f.fileno())

        os.replace(temp_path, target_file_path)
    finally:
        # Best-effort cleanup: after a successful replace the scratch file is
        # already gone, and a failed removal must not mask the real error.
        try:
            os.unlink(temp_path)
        except OSError:
            pass

# Files used to exchange data with whatever produces the text and reads the prompts.
INPUT_PATH = 'drive/MyDrive/3rd_yr_project/text_to_summarise.txt'
OUTPUT_PATH = 'drive/MyDrive/3rd_yr_project/prompts.json'

# Endless worker loop: wait for text to appear in the input file, consume it,
# run the pipeline and publish the results as JSON.
while True:
  # Nothing to do while the input file is empty; check again in 5 seconds.
  if os.stat(INPUT_PATH).st_size == 0:
    sleep(5)
    continue

  # Read the pending text and empty the file in the same open handle.
  with open(INPUT_PATH, 'r+') as f:
    text = f.read()
    f.truncate(0)
  print(text)

  headline, yake_kw, rake_kw, bert_kw = run(text)
  payload = {"headline": headline, "yake": yake_kw, "rake": rake_kw, "bert": bert_kw}
  atomic_write(json.dumps(payload), OUTPUT_PATH)
  print(headline, yake_kw, rake_kw, bert_kw, sep="\n")

In [None]:
# Re-run the pipeline on the most recently read `text` and publish the results again.
headline, yake_kw, rake_kw, bert_kw = run(text)
payload = {"headline": headline, "yake": yake_kw, "rake": rake_kw, "bert": bert_kw}
atomic_write(json.dumps(payload), 'drive/MyDrive/3rd_yr_project/prompts.json')
print(headline, yake_kw, rake_kw, bert_kw, sep="\n")