In [1]:
import os

import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, pipeline

In [2]:
# https://jmlr.org/papers/volume21/20-074/20-074.pdf
model_name = "Falconsai/text_summarization"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")



`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [3]:
def summarize_book(path):
    summarize_path = os.path.join("..", "summaries", path)

    if os.path.exists(summarize_path):
        print(f"{path} already summarized")
        return

    with open(os.path.join("..", "books", path), "r") as f:
        text = f.read()

    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        chunk_size=512, chunk_overlap=0, tokenizer=summarizer.tokenizer
    )

    chunks = splitter.split_text(text)
    print(f"{path} split into {len(chunks)} chunks")

    with open(summarize_path, "w") as f:
        for chunk in tqdm(chunks):
            summary = summarizer(chunk)
            f.write(summary[0]["summary_text"] + "\n")

In [4]:
for series in ["hp", "asoif"]:
    for book in os.listdir(os.path.join("..", "books", series)):
        summarize_book(os.path.join(series, book))

hp/Book 3 - Prisoner of Azkaban.txt already summarized
hp/Book 4 - Goblet of Fire.txt already summarized
hp/Book 2 - Chamber of Secrets.txt already summarized
hp/Book 5 - Order of the Phoenix.txt already summarized
hp/Book 7 - Deathly Hallows.txt already summarized
hp/Book 6 - Half Blood Prince.txt already summarized
hp/Book 1 - Philosophers Stone.txt already summarized
asoif/Book 2 - A Clash of Kings.txt already summarized
asoif/characters.txt already summarized
asoif/Book 5 - A Dance With Dragons.txt already summarized
asoif/Book 3 - A Storm of Swords.txt already summarized
asoif/Book 1 - A Game of Thrones.txt already summarized
asoif/Book 4 - A Feast for Crows.txt split into 1094 chunks


  1%|          | 10/1094 [00:15<29:03,  1.61s/it] You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 40%|███▉      | 437/1094 [11:40<18:24,  1.68s/it]Your max_length is set to 200, but your input_length is only 197. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)
100%|█████████▉| 1093/1094 [30:26<00:01,  1.84s/it]Your max_length is set to 200, but your input_length is only 176. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=88)
100%|██████████| 1094/1094 [30:29<00:00,  1.67s/it]
