```bibtex @article{DBLP:journals/corr/abs-1910-13461, author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov and Luke Zettlemoyer}, title = {{BART:} Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension}, journal = {CoRR}, volume = {abs/1910.13461}, year = {2019}, url = {http://arxiv.org/abs/1910.13461}, eprinttype = {arXiv}, eprint = {1910.13461}, timestamp = {Thu, 31 Oct 2019 14:02:26 +0100}, biburl = {https://dblp.org/rec/journals/corr/abs-1910-13461.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }

# Параметры

In [1]:
url = '' # 'https://www.gutenberg.org/cache/epub/1259/pg1259.txt' # Ссылка на txt
filename = '/content/pg1259.txt' # Путь к txt
word_to_split_book_into_parts = 'Chapter' # Если книга не суммаризируется целиком, можно попробовать суммаризацию по частям путем введения слова разделителся на части (пример: Chapter)
max_words = 1000 # Количество слов суммаризации. Применение числа меньше 20 может привести к зацикливанию исполнения.
output_file = 'output.txt' # Имя файла, в который будет записана суммаризация
#model = "facebook/bart-large-cnn" #на текущипй момент не реализовано

# Суммаризация

In [2]:
%%capture
#@markdown set up auto-formatting of cells in notebook

from IPython.display import HTML, display


def set_css():
    display(
        HTML(
            """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
        )
    )
get_ipython().events.register("pre_run_cell", set_css)

In [3]:
#@markdown GPU stats
!nvidia-smi

Wed Feb  8 21:28:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    26W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!pip install -U -q transformers

In [5]:
#@markdown utils
from transformers.utils.logging import set_verbosity

set_verbosity(40)

import warnings
# ignore hf pipeline complaints
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')
warnings.filterwarnings("ignore", category=FutureWarning, module='transformers')

In [6]:
import torch
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1,)

In [7]:
params = {
    "max_length": 500,
    "min_length": 1,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    "repetition_penalty": 3.5,
    "length_penalty": 0.3,
    "encoder_no_repeat_ngram_size": 3,
    "num_beams": 4,
    "do_sample": False,
}

In [8]:
from urllib.request import urlopen

try:
  if url:
    df = urlopen(url)
    long_text = df.read().decode("utf8")
  elif filename:
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
      long_text = f.read()
  else:
    raise Exception
except Exception as error:
  with open(output_file, 'a+') as f:
    f.write(error + '\n')

In [9]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Split in segments
def sis(text):
    tokens = 0
    eq_1024 = list()
    total = []
    for sent in sent_tokenize(text):
        newtokens = len(sent.split())
        tokens += newtokens
        eq_1024.append(str(sent).strip())
        if tokens > 512:
            total.append(" ".join(eq_1024))
            eq_1024 = []
            tokens = 0
    if eq_1024:
        total.append(" ".join(eq_1024))
    return total

# Summarize by segments
def sbs(text):
    segments = sis(text)
    summarylist = summarizer(segments, max_length=100, min_length=30, do_sample=False)
    summary = summarizer(" ".join([summarylist[i]['summary_text'] for i in range(len(summarylist))]), max_length = 120, min_length = 30, do_sample = False)
    return summary

def sas(long_text, number_to_split):
  tmp = sent_tokenize(long_text.replace('“', '').replace('—', '').replace('é', '').replace('\t', '').replace('\r', '').replace('\n\n', ' ').replace('\n', ' '))
  eq_1024 = ''
  total = ''
  for line in tmp:
    if len(eq_1024.split(' ')) + len(line.split(' ')) > number_to_split:
      batch_sum = summarizer(eq_1024, **params)
      total += ' ' + batch_sum[0]['summary_text']
      eq_1024 = line + '.'
    else:
      eq_1024 += line + '.'
  last_sum = summarizer(eq_1024, **params)
  total += ' ' + last_sum[0]['summary_text']
  return total

In [11]:
%%time
!CUDA_LAUNCH_BLOCKING=1
while len(long_text.split()) > max_words:
  long_text = sas(long_text, 512)

print(long_text)

 The life of Cardinal Mazarin. The French monarch who would have had to be banished if possible. A jester sings: 'I think I'll blow Sieur Mazarin  away' Two men go on a search for their lost son, Aramis. D’ artagnan meets his ex-lover and learns about his former lovers. Raoul, Porthos, Athos and Gondy meet in Paris once again. The men break their swords but don't attack each other. Raoul tells Grimaud that he will fight with him against the enemy. They also discuss Mazarin and his relationship with Henrietta. Respect for Fallen Majesty tells the story of four Frenchmen who escape England. The men are trying to rescue their king Charles I from his clutches. Athos is one of the four fugitives, along with Aramis and D' artagnan. French queen wanted to abdicate, but Mazarin said she should remain in France. Two French soldiers are captured and held captive by Swiss soldiers. They devise a trick that leads them to Mazarin's hideout.
CPU times: user 10min 15s, sys: 2.2 s, total: 10min 18s
Wa

In [11]:
%%time
import os

if word_to_split_book_into_parts:
  book_parts = long_text.split(word_to_split_book_into_parts)
  for i in range(len(book_parts)):
    filename = 'part' + str(i).zfill(5) + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(book_parts[i])

  directory = '/content/'

  part_files = set()

  for filename in os.listdir(directory):
    f = ''
    if 'part' in filename:
      f = os.path.join(directory, filename)

    if os.path.isfile(f):
      part_files.add(f)
  
  long_text = ''
  for filename in sorted(part_files):
    if 'part00054' in filename:
      pass
    else:
      print(filename)
      with open(filename, 'r') as f:
        long_text += sas(f.read(), 512) + '\n\n'
  
while len(long_text.split()) > max_words:
  long_text = sas(long_text, 512)

print(long_text)

/content/part00000.txt
/content/part00001.txt
/content/part00002.txt
/content/part00003.txt
/content/part00004.txt
/content/part00005.txt
/content/part00006.txt
/content/part00007.txt
/content/part00008.txt
/content/part00009.txt
/content/part00010.txt
/content/part00011.txt
/content/part00012.txt
/content/part00013.txt
/content/part00014.txt
/content/part00015.txt
/content/part00016.txt
/content/part00017.txt
/content/part00018.txt
/content/part00019.txt
/content/part00020.txt
/content/part00021.txt
/content/part00022.txt
/content/part00023.txt
/content/part00024.txt
/content/part00025.txt
/content/part00026.txt
/content/part00027.txt
/content/part00028.txt
/content/part00029.txt
/content/part00030.txt
/content/part00031.txt
/content/part00032.txt
/content/part00033.txt
/content/part00034.txt
/content/part00035.txt
/content/part00036.txt
/content/part00037.txt
/content/part00038.txt
/content/part00039.txt
/content/part00040.txt
/content/part00041.txt
/content/part00042.txt
/content/pa

In [12]:
!wget -O test.txt https://www.gutenberg.org/cache/epub/1259/pg1259.txt

with open('test.txt', 'r') as f:
  test_text = f.read()
testing_parts = test_text.split(word_to_split_book_into_parts)
for i in range(len(testing_parts)):
  filename = 'test' + str(i).zfill(3) + '.txt'
  with open(filename, 'w', encoding='utf-8') as f:
      f.write(testing_parts[i])

--2023-02-08 20:09:57--  https://www.gutenberg.org/cache/epub/1259/pg1259.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1473597 (1.4M) [text/plain]
Saving to: ‘test.txt’


2023-02-08 20:09:58 (15.2 MB/s) - ‘test.txt’ saved [1473597/1473597]



In [27]:
for part in os.listdir(directory):
  fp = ''
  if 'part' in part:
    fp = os.path.join(directory, part)
  else: p = None
  if os.path.isfile(fp):
    with open(fp, 'r')as f:
      p = f.read()
  
  for test in os.listdir(directory):
    ft = ''
    if 'test' in test:
      ft = os.path.join(directory, test)
    else: t = None
    if os.path.isfile(ft):
      with open(ft, 'r')as f:
        t = f.read()
    if t is not None and p is not None:
      if len(p) == len(t):
        print(fp, ft)

/content/part00014.txt /content/test104.txt
/content/part00020.txt /content/test110.txt
/content/part00031.txt /content/test121.txt
/content/part00078.txt /content/test168.txt
/content/part00066.txt /content/test156.txt
/content/part00083.txt /content/test173.txt
/content/part00036.txt /content/test126.txt
/content/part00017.txt /content/test107.txt
/content/part00050.txt /content/test140.txt
/content/part00025.txt /content/test115.txt
/content/part00053.txt /content/test143.txt
/content/part00064.txt /content/test154.txt
/content/part00003.txt /content/test093.txt
/content/part00004.txt /content/test094.txt
/content/part00052.txt /content/test142.txt
/content/part00035.txt /content/test125.txt
/content/part00013.txt /content/test103.txt
/content/part00008.txt /content/test098.txt
/content/part00062.txt /content/test152.txt
/content/part00074.txt /content/test164.txt
/content/part00045.txt /content/test135.txt
/content/part00073.txt /content/test163.txt
/content/part00030.txt /content/

In [36]:
output_set = set()
with open('/content/part00054.txt', 'r')as f:
  char_seek = set(f.read())


for i in range(91):
  filename = os.path.join(directory, 'test' + str(i).zfill(3) + '.txt')
  if os.path.isfile(filename):
    with open(filename, 'r')as f:
      t = set(f.read())
  for ch in list(char_seek):
    if  ch in t:
      output_set.add(ch)
sorted(output_set)

['\n',
 ' ',
 '!',
 ',',
 '-',
 '.',
 ':',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'é',
 '—',
 '’',
 '“',
 '”']

In [35]:
'a' in set('aaabbccccc')

True