### **Using T5-large with chunking to handle large paragraphs**

**Imports**

In [16]:
import multiprocessing as mp
from functools import partial

import rich
import torch
from accelerate import Accelerator
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import DatasetDict, load_dataset

2024-11-04 02:06:04.449872: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-04 02:06:04.456825: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-04 02:06:04.464837: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-04 02:06:04.467231: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-04 02:06:04.473970: I tensorflow/core/platform/cpu_feature_guar

**Loading Data**

In [None]:
dataset = load_dataset("ingoziegler/CRAFT-Summarization", "XL")

train_dataset = dataset["train"].select(range(15000))
test_dataset = dataset["train"].select(range(15000, 17000))

train_dataset = train_dataset.remove_columns(["instruction", "is_few_shot"])
test_dataset = test_dataset.remove_columns(["instruction", "is_few_shot"])
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['summary', 'long_but_clean_text'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['summary', 'long_but_clean_text'],
        num_rows: 2000
    })
})

**Setting Up Model**

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-large", legacy=False)
model = T5ForConditionalGeneration.from_pretrained("t5-large")
model.to(device)
rich.print(model.eval())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Customizing Dataset for Chunked Text Summarization**

In [None]:
class TextSummaryDataset(Dataset):
    def __init__(
        self,
        dataframe,
        tokenizer,
        max_article_len=512,
        max_summary_len=300,
        chunk_size=512,
        overlap=50,
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_article_len = max_article_len
        self.max_summary_len = max_summary_len
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.chunks = self.create_chunks()

    def create_chunks(self):
        with mp.Pool(mp.cpu_count()) as pool:
            chunks = list(
                tqdm(
                    pool.imap(partial(self.process_document), range(len(self.data))),
                    total=len(self.data),
                )
            )
        return [chunk for sublist in chunks for chunk in sublist]

    def process_document(self, i):
        chunks = []
        document = str(self.data["long_but_clean_text"][i])
        summary = str(self.data["summary"][i])
        if not document.strip():
            return chunks  # Skip empty documents
        tokenized_doc = self.tokenizer(document, return_tensors="pt", truncation=False)[
            "input_ids"
        ].squeeze()
        if tokenized_doc.dim() == 0:
            return chunks  # Skip if tokenization fails
        num_chunks = (len(tokenized_doc) - self.overlap) // (
            self.chunk_size - self.overlap
        ) + 1
        for j in range(num_chunks):
            start = j * (self.chunk_size - self.overlap)
            end = start + self.chunk_size
            chunk = tokenized_doc[start:end]
            chunks.append((chunk, summary))
        return chunks

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, index):
        chunk, summary = self.chunks[index]

        # Decode the chunk tensor back to a string
        chunk_text = self.tokenizer.decode(chunk, skip_special_tokens=True)

        inputs = self.tokenizer(
            chunk_text,
            max_length=self.max_article_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        targets = self.tokenizer(
            summary,
            max_length=self.max_summary_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze(),
        }

**Allocating Dataset**

In [9]:
train_dataset = TextSummaryDataset(
    dataset["train"],
    tokenizer,
    max_article_len=512,
    max_summary_len=300,
    chunk_size=512,
    overlap=30,
)
test_dataset = TextSummaryDataset(
    dataset["test"],
    tokenizer,
    max_article_len=512,
    max_summary_len=300,
    chunk_size=512,
    overlap=30,
)

100%|██████████| 15000/15000 [01:38<00:00, 152.27it/s]
100%|██████████| 2000/2000 [00:07<00:00, 255.72it/s]


#### **Training the model** (DO NOT RUN)

In [10]:
training_args = TrainingArguments(
    output_dir="./text_summarization_results",
    eval_strategy="epoch",
    overwrite_output_dir=True,
    learning_rate=3e-5,
    save_steps=3000,
    save_strategy="steps",
    per_device_train_batch_size=1,  # Adjust based on VRAM availability
    per_device_eval_batch_size=1,  # Adjust based on VRAM availability
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=2000,
    save_total_limit=1,
    gradient_accumulation_steps=2,
    logging_dir="./logs",
    logging_steps=300,
    log_level="info",  # Set log level to info
)

accelerator = Accelerator()
model, train_dataset, test_dataset = accelerator.prepare(
    model, train_dataset, test_dataset
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

2024-11-01 03:15:54.830158: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-01 03:15:54.837371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-01 03:15:54.845590: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-01 03:15:54.848174: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-01 03:15:54.854828: I tensorflow/core/platform/cpu_feature_guar

  0%|          | 0/28998 [00:00<?, ?it/s]

{'loss': 15.3454, 'grad_norm': 169.35281372070312, 'learning_rate': 4.5e-06, 'epoch': 0.03}
{'loss': 1.9012, 'grad_norm': 1.9761168956756592, 'learning_rate': 9e-06, 'epoch': 0.06}
{'loss': 0.8656, 'grad_norm': 1.152578592300415, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.09}
{'loss': 0.7416, 'grad_norm': 0.8779325485229492, 'learning_rate': 1.8e-05, 'epoch': 0.12}
{'loss': 0.7572, 'grad_norm': 0.5748788118362427, 'learning_rate': 2.25e-05, 'epoch': 0.16}
{'loss': 0.6583, 'grad_norm': 1.113865852355957, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.19}
{'loss': 0.6408, 'grad_norm': 2.7303810119628906, 'learning_rate': 2.9888880657826505e-05, 'epoch': 0.22}
{'loss': 0.6777, 'grad_norm': 0.7075058221817017, 'learning_rate': 2.955552263130602e-05, 'epoch': 0.25}
{'loss': 0.6682, 'grad_norm': 1.0706393718719482, 'learning_rate': 2.922216460478554e-05, 'epoch': 0.28}


Saving model checkpoint to ./text_summarization_results/checkpoint-3000
Configuration saved in ./text_summarization_results/checkpoint-3000/config.json
Configuration saved in ./text_summarization_results/checkpoint-3000/generation_config.json


{'loss': 0.6449, 'grad_norm': 2.264118194580078, 'learning_rate': 2.8888806578265056e-05, 'epoch': 0.31}


Model weights saved in ./text_summarization_results/checkpoint-3000/model.safetensors


{'loss': 0.6478, 'grad_norm': 0.4918314218521118, 'learning_rate': 2.8555448551744576e-05, 'epoch': 0.34}
{'loss': 0.6386, 'grad_norm': 0.6431795954704285, 'learning_rate': 2.822209052522409e-05, 'epoch': 0.37}
{'loss': 0.6407, 'grad_norm': 1.1144620180130005, 'learning_rate': 2.788873249870361e-05, 'epoch': 0.4}
{'loss': 0.6493, 'grad_norm': 1.2584564685821533, 'learning_rate': 2.7555374472183127e-05, 'epoch': 0.43}
{'loss': 0.616, 'grad_norm': 0.8022075891494751, 'learning_rate': 2.7222016445662643e-05, 'epoch': 0.47}
{'loss': 0.6043, 'grad_norm': 0.7667637467384338, 'learning_rate': 2.688865841914216e-05, 'epoch': 0.5}
{'loss': 0.673, 'grad_norm': 0.8478726148605347, 'learning_rate': 2.655530039262168e-05, 'epoch': 0.53}
{'loss': 0.5812, 'grad_norm': 1.1326345205307007, 'learning_rate': 2.6221942366101194e-05, 'epoch': 0.56}
{'loss': 0.6376, 'grad_norm': 1.1000229120254517, 'learning_rate': 2.588858433958071e-05, 'epoch': 0.59}


Saving model checkpoint to ./text_summarization_results/checkpoint-6000
Configuration saved in ./text_summarization_results/checkpoint-6000/config.json
Configuration saved in ./text_summarization_results/checkpoint-6000/generation_config.json


{'loss': 0.6111, 'grad_norm': 0.9140889644622803, 'learning_rate': 2.5555226313060226e-05, 'epoch': 0.62}


Model weights saved in ./text_summarization_results/checkpoint-6000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-3000] due to args.save_total_limit


{'loss': 0.5937, 'grad_norm': 0.9573311805725098, 'learning_rate': 2.5221868286539746e-05, 'epoch': 0.65}
{'loss': 0.6017, 'grad_norm': 1.1295498609542847, 'learning_rate': 2.4888510260019262e-05, 'epoch': 0.68}
{'loss': 0.6408, 'grad_norm': 1.083109736442566, 'learning_rate': 2.4555152233498778e-05, 'epoch': 0.71}
{'loss': 0.619, 'grad_norm': 0.785707414150238, 'learning_rate': 2.4221794206978294e-05, 'epoch': 0.74}
{'loss': 0.5741, 'grad_norm': 1.345826506614685, 'learning_rate': 2.3888436180457813e-05, 'epoch': 0.78}
{'loss': 0.5713, 'grad_norm': 1.2231944799423218, 'learning_rate': 2.355507815393733e-05, 'epoch': 0.81}
{'loss': 0.5823, 'grad_norm': 1.5391989946365356, 'learning_rate': 2.3221720127416845e-05, 'epoch': 0.84}
{'loss': 0.554, 'grad_norm': 0.5638049840927124, 'learning_rate': 2.288836210089636e-05, 'epoch': 0.87}
{'loss': 0.6014, 'grad_norm': 1.0157389640808105, 'learning_rate': 2.255500407437588e-05, 'epoch': 0.9}


Saving model checkpoint to ./text_summarization_results/checkpoint-9000
Configuration saved in ./text_summarization_results/checkpoint-9000/config.json
Configuration saved in ./text_summarization_results/checkpoint-9000/generation_config.json


{'loss': 0.5946, 'grad_norm': 1.1153615713119507, 'learning_rate': 2.2221646047855396e-05, 'epoch': 0.93}


Model weights saved in ./text_summarization_results/checkpoint-9000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-6000] due to args.save_total_limit


{'loss': 0.5181, 'grad_norm': 0.8269067406654358, 'learning_rate': 2.1888288021334912e-05, 'epoch': 0.96}
{'loss': 0.6317, 'grad_norm': 0.9822296500205994, 'learning_rate': 2.155492999481443e-05, 'epoch': 0.99}



***** Running Evaluation *****
  Num examples = 2574
  Batch size = 1


  0%|          | 0/2574 [00:00<?, ?it/s]

{'eval_loss': 0.5619871020317078, 'eval_runtime': 240.7665, 'eval_samples_per_second': 10.691, 'eval_steps_per_second': 10.691, 'epoch': 1.0}
{'loss': 0.593, 'grad_norm': 0.6144326329231262, 'learning_rate': 2.1221571968293948e-05, 'epoch': 1.02}
{'loss': 0.5223, 'grad_norm': 1.1786211729049683, 'learning_rate': 2.0888213941773464e-05, 'epoch': 1.06}
{'loss': 0.5157, 'grad_norm': 0.6738572120666504, 'learning_rate': 2.055485591525298e-05, 'epoch': 1.09}
{'loss': 0.5265, 'grad_norm': 0.5991089344024658, 'learning_rate': 2.0221497888732502e-05, 'epoch': 1.12}
{'loss': 0.5642, 'grad_norm': 2.5837929248809814, 'learning_rate': 1.988813986221202e-05, 'epoch': 1.15}
{'loss': 0.5148, 'grad_norm': 1.5288840532302856, 'learning_rate': 1.9554781835691534e-05, 'epoch': 1.18}
{'loss': 0.5446, 'grad_norm': 0.4517318606376648, 'learning_rate': 1.922142380917105e-05, 'epoch': 1.21}


Saving model checkpoint to ./text_summarization_results/checkpoint-12000
Configuration saved in ./text_summarization_results/checkpoint-12000/config.json
Configuration saved in ./text_summarization_results/checkpoint-12000/generation_config.json


{'loss': 0.5607, 'grad_norm': 1.363714337348938, 'learning_rate': 1.888806578265057e-05, 'epoch': 1.24}


Model weights saved in ./text_summarization_results/checkpoint-12000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-9000] due to args.save_total_limit


{'loss': 0.5445, 'grad_norm': 0.8184049129486084, 'learning_rate': 1.8554707756130086e-05, 'epoch': 1.27}
{'loss': 0.5627, 'grad_norm': 1.395644187927246, 'learning_rate': 1.8221349729609602e-05, 'epoch': 1.3}
{'loss': 0.5525, 'grad_norm': 1.755685567855835, 'learning_rate': 1.7887991703089118e-05, 'epoch': 1.33}
{'loss': 0.5589, 'grad_norm': 0.9236899614334106, 'learning_rate': 1.7554633676568637e-05, 'epoch': 1.37}
{'loss': 0.5504, 'grad_norm': 0.9872875213623047, 'learning_rate': 1.7221275650048153e-05, 'epoch': 1.4}
{'loss': 0.5446, 'grad_norm': 1.1711606979370117, 'learning_rate': 1.688791762352767e-05, 'epoch': 1.43}
{'loss': 0.526, 'grad_norm': 0.9562973976135254, 'learning_rate': 1.6554559597007185e-05, 'epoch': 1.46}
{'loss': 0.587, 'grad_norm': 0.9172770977020264, 'learning_rate': 1.6221201570486704e-05, 'epoch': 1.49}
{'loss': 0.5218, 'grad_norm': 0.5829291343688965, 'learning_rate': 1.588784354396622e-05, 'epoch': 1.52}


Saving model checkpoint to ./text_summarization_results/checkpoint-15000
Configuration saved in ./text_summarization_results/checkpoint-15000/config.json
Configuration saved in ./text_summarization_results/checkpoint-15000/generation_config.json


{'loss': 0.5117, 'grad_norm': 1.6153216361999512, 'learning_rate': 1.5554485517445736e-05, 'epoch': 1.55}


Model weights saved in ./text_summarization_results/checkpoint-15000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-12000] due to args.save_total_limit


{'loss': 0.5401, 'grad_norm': 0.8385534882545471, 'learning_rate': 1.5221127490925252e-05, 'epoch': 1.58}
{'loss': 0.5647, 'grad_norm': 1.0921636819839478, 'learning_rate': 1.488776946440477e-05, 'epoch': 1.61}
{'loss': 0.5654, 'grad_norm': 0.8114355206489563, 'learning_rate': 1.4554411437884288e-05, 'epoch': 1.64}
{'loss': 0.5745, 'grad_norm': 0.8882663249969482, 'learning_rate': 1.4221053411363805e-05, 'epoch': 1.68}
{'loss': 0.5668, 'grad_norm': 0.7894174456596375, 'learning_rate': 1.3887695384843323e-05, 'epoch': 1.71}
{'loss': 0.5941, 'grad_norm': 1.1886852979660034, 'learning_rate': 1.355433735832284e-05, 'epoch': 1.74}
{'loss': 0.5476, 'grad_norm': 0.8728538751602173, 'learning_rate': 1.3220979331802357e-05, 'epoch': 1.77}
{'loss': 0.5117, 'grad_norm': 0.9941262006759644, 'learning_rate': 1.2887621305281873e-05, 'epoch': 1.8}
{'loss': 0.5223, 'grad_norm': 0.6913861036300659, 'learning_rate': 1.255426327876139e-05, 'epoch': 1.83}


Saving model checkpoint to ./text_summarization_results/checkpoint-18000
Configuration saved in ./text_summarization_results/checkpoint-18000/config.json
Configuration saved in ./text_summarization_results/checkpoint-18000/generation_config.json


{'loss': 0.5287, 'grad_norm': 0.8681735992431641, 'learning_rate': 1.2220905252240906e-05, 'epoch': 1.86}


Model weights saved in ./text_summarization_results/checkpoint-18000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-15000] due to args.save_total_limit


{'loss': 0.5731, 'grad_norm': 1.0315507650375366, 'learning_rate': 1.1887547225720424e-05, 'epoch': 1.89}
{'loss': 0.5018, 'grad_norm': 0.801112711429596, 'learning_rate': 1.155418919919994e-05, 'epoch': 1.92}
{'loss': 0.5261, 'grad_norm': 0.8593601584434509, 'learning_rate': 1.1220831172679458e-05, 'epoch': 1.96}
{'loss': 0.5377, 'grad_norm': 0.857659637928009, 'learning_rate': 1.0887473146158974e-05, 'epoch': 1.99}



***** Running Evaluation *****
  Num examples = 2574
  Batch size = 1


  0%|          | 0/2574 [00:00<?, ?it/s]

{'eval_loss': 0.5496358275413513, 'eval_runtime': 241.3384, 'eval_samples_per_second': 10.666, 'eval_steps_per_second': 10.666, 'epoch': 2.0}
{'loss': 0.5437, 'grad_norm': 0.9241330027580261, 'learning_rate': 1.0554115119638492e-05, 'epoch': 2.02}
{'loss': 0.5066, 'grad_norm': 1.13649320602417, 'learning_rate': 1.0220757093118008e-05, 'epoch': 2.05}
{'loss': 0.4785, 'grad_norm': 0.8464525938034058, 'learning_rate': 9.887399066597527e-06, 'epoch': 2.08}
{'loss': 0.4891, 'grad_norm': 0.832268476486206, 'learning_rate': 9.554041040077043e-06, 'epoch': 2.11}
{'loss': 0.5045, 'grad_norm': 1.0776270627975464, 'learning_rate': 9.22068301355656e-06, 'epoch': 2.14}


Saving model checkpoint to ./text_summarization_results/checkpoint-21000
Configuration saved in ./text_summarization_results/checkpoint-21000/config.json
Configuration saved in ./text_summarization_results/checkpoint-21000/generation_config.json


{'loss': 0.511, 'grad_norm': 0.5666629672050476, 'learning_rate': 8.887324987036077e-06, 'epoch': 2.17}


Model weights saved in ./text_summarization_results/checkpoint-21000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-18000] due to args.save_total_limit


{'loss': 0.525, 'grad_norm': 0.8588789701461792, 'learning_rate': 8.553966960515594e-06, 'epoch': 2.2}
{'loss': 0.4932, 'grad_norm': 1.0380734205245972, 'learning_rate': 8.22060893399511e-06, 'epoch': 2.23}
{'loss': 0.488, 'grad_norm': 0.9239305853843689, 'learning_rate': 7.887250907474628e-06, 'epoch': 2.27}
{'loss': 0.4585, 'grad_norm': 0.7304684519767761, 'learning_rate': 7.5538928809541456e-06, 'epoch': 2.3}
{'loss': 0.4997, 'grad_norm': 1.6545072793960571, 'learning_rate': 7.2205348544336616e-06, 'epoch': 2.33}
{'loss': 0.5203, 'grad_norm': 0.6079597473144531, 'learning_rate': 6.887176827913179e-06, 'epoch': 2.36}
{'loss': 0.517, 'grad_norm': 0.9297361969947815, 'learning_rate': 6.553818801392696e-06, 'epoch': 2.39}
{'loss': 0.4994, 'grad_norm': 0.7760828137397766, 'learning_rate': 6.220460774872213e-06, 'epoch': 2.42}
{'loss': 0.5121, 'grad_norm': 0.8267377614974976, 'learning_rate': 5.88710274835173e-06, 'epoch': 2.45}


Saving model checkpoint to ./text_summarization_results/checkpoint-24000
Configuration saved in ./text_summarization_results/checkpoint-24000/config.json
Configuration saved in ./text_summarization_results/checkpoint-24000/generation_config.json


{'loss': 0.5136, 'grad_norm': 1.0768182277679443, 'learning_rate': 5.5537447218312466e-06, 'epoch': 2.48}


Model weights saved in ./text_summarization_results/checkpoint-24000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-21000] due to args.save_total_limit


{'loss': 0.5015, 'grad_norm': 0.6864096522331238, 'learning_rate': 5.220386695310763e-06, 'epoch': 2.51}
{'loss': 0.4863, 'grad_norm': 0.7314274907112122, 'learning_rate': 4.887028668790281e-06, 'epoch': 2.55}
{'loss': 0.4923, 'grad_norm': 0.4832019805908203, 'learning_rate': 4.553670642269798e-06, 'epoch': 2.58}
{'loss': 0.5001, 'grad_norm': 2.183259963989258, 'learning_rate': 4.220312615749315e-06, 'epoch': 2.61}
{'loss': 0.4845, 'grad_norm': 0.9278035759925842, 'learning_rate': 3.886954589228832e-06, 'epoch': 2.64}
{'loss': 0.5227, 'grad_norm': 0.8974775671958923, 'learning_rate': 3.553596562708349e-06, 'epoch': 2.67}
{'loss': 0.498, 'grad_norm': 0.9390460252761841, 'learning_rate': 3.220238536187866e-06, 'epoch': 2.7}
{'loss': 0.5226, 'grad_norm': 1.1870229244232178, 'learning_rate': 2.886880509667383e-06, 'epoch': 2.73}
{'loss': 0.5177, 'grad_norm': 0.9750829339027405, 'learning_rate': 2.5535224831468998e-06, 'epoch': 2.76}


Saving model checkpoint to ./text_summarization_results/checkpoint-27000
Configuration saved in ./text_summarization_results/checkpoint-27000/config.json
Configuration saved in ./text_summarization_results/checkpoint-27000/generation_config.json


{'loss': 0.5369, 'grad_norm': 1.6076221466064453, 'learning_rate': 2.220164456626417e-06, 'epoch': 2.79}


Model weights saved in ./text_summarization_results/checkpoint-27000/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-24000] due to args.save_total_limit


{'loss': 0.4861, 'grad_norm': 1.5000014305114746, 'learning_rate': 1.8868064301059339e-06, 'epoch': 2.82}
{'loss': 0.4828, 'grad_norm': 1.0314337015151978, 'learning_rate': 1.553448403585451e-06, 'epoch': 2.86}
{'loss': 0.5307, 'grad_norm': 1.3552531003952026, 'learning_rate': 1.2200903770649678e-06, 'epoch': 2.89}
{'loss': 0.5166, 'grad_norm': 0.873418927192688, 'learning_rate': 8.867323505444848e-07, 'epoch': 2.92}
{'loss': 0.515, 'grad_norm': 1.6022268533706665, 'learning_rate': 5.533743240240017e-07, 'epoch': 2.95}
{'loss': 0.5101, 'grad_norm': 0.781707763671875, 'learning_rate': 2.2001629750351878e-07, 'epoch': 2.98}


Saving model checkpoint to ./text_summarization_results/checkpoint-28998
Configuration saved in ./text_summarization_results/checkpoint-28998/config.json
Configuration saved in ./text_summarization_results/checkpoint-28998/generation_config.json
Model weights saved in ./text_summarization_results/checkpoint-28998/model.safetensors
Deleting older checkpoint [text_summarization_results/checkpoint-27000] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 2574
  Batch size = 1


  0%|          | 0/2574 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.5508502721786499, 'eval_runtime': 260.4858, 'eval_samples_per_second': 9.882, 'eval_steps_per_second': 9.882, 'epoch': 3.0}
{'train_runtime': 24787.0947, 'train_samples_per_second': 2.34, 'train_steps_per_second': 1.17, 'train_loss': 0.7269155495873368, 'epoch': 3.0}


TrainOutput(global_step=28998, training_loss=0.7269155495873368, metrics={'train_runtime': 24787.0947, 'train_samples_per_second': 2.34, 'train_steps_per_second': 1.17, 'total_flos': 1.25564082978816e+17, 'train_loss': 0.7269155495873368, 'epoch': 3.0})

**Saving the Model**

In [11]:
model.save_pretrained("models/fine-tuned-t5-CRAFT-summarizer-with_chunking")
tokenizer.save_pretrained("models/fine-tuned-t5-CRAFT-summarizer-with_chunking")

Configuration saved in models/fine-tuned-t5-CRAFT-summarizer-with_chunking/config.json
Configuration saved in models/fine-tuned-t5-CRAFT-summarizer-with_chunking/generation_config.json
Model weights saved in models/fine-tuned-t5-CRAFT-summarizer-with_chunking/model.safetensors
tokenizer config file saved in models/fine-tuned-t5-CRAFT-summarizer-with_chunking/tokenizer_config.json
Special tokens file saved in models/fine-tuned-t5-CRAFT-summarizer-with_chunking/special_tokens_map.json
added tokens file saved in models/fine-tuned-t5-CRAFT-summarizer-with_chunking/added_tokens.json


('models/fine-tuned-t5-CRAFT-summarizer-with_chunking/tokenizer_config.json',
 'models/fine-tuned-t5-CRAFT-summarizer-with_chunking/special_tokens_map.json',
 'models/fine-tuned-t5-CRAFT-summarizer-with_chunking/spiece.model',
 'models/fine-tuned-t5-CRAFT-summarizer-with_chunking/added_tokens.json')

#### **Loading the Model**

In [2]:
model = T5ForConditionalGeneration.from_pretrained(
    "models/fine-tuned-t5-CRAFT-summarizer-with_chunking"
)
tokenizer = T5Tokenizer.from_pretrained(
    "models/fine-tuned-t5-CRAFT-summarizer-with_chunking"
)

#### **Summary Generation**

**Generating Summary for a paragraph**

In [18]:
def generate_summary(
    paragraph, model, tokenizer, max_length=300, num_beams=6, chunk_size=512, overlap=50
):
    """
    Generate a summary for a given paragraph using the trained model.

    Args:
    - paragraph (str): The input paragraph to summarize.
    - model (T5ForConditionalGeneration): The trained T5 model.
    - tokenizer (T5Tokenizer): The tokenizer for the T5 model.
    - max_length (int): The maximum length of the generated summary.
    - num_beams (int): The number of beams for beam search.
    - chunk_size (int): The size of each chunk.
    - overlap (int): The overlap between chunks.

    Returns:
    - summary (str): The generated summary.
    """
    model.to(device)
    # Tokenize the input paragraph
    tokenized_paragraph = tokenizer(paragraph, return_tensors="pt", truncation=False)[
        "input_ids"
    ].squeeze()

    # Chunk the tokenized paragraph
    chunks = []
    num_chunks = (len(tokenized_paragraph) - overlap) // (chunk_size - overlap) + 1
    for i in range(num_chunks):
        start = i * (chunk_size - overlap)
        end = start + chunk_size
        chunk = tokenized_paragraph[start:end]
        chunks.append(chunk)

    # Generate summaries for each chunk
    summaries = []
    for chunk in chunks:
        inputs = tokenizer.decode(chunk, skip_special_tokens=True)
        inputs = tokenizer(
            inputs, return_tensors="pt", max_length=chunk_size, truncation=True
        ).to(device)
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    # Combine the summaries of all chunks
    combined_summary = " ".join(summaries)

    return combined_summary


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example usage
paragraph = "Artificial Intelligence (AI) has become an integral part of modern healthcare, revolutionizing the way medical professionals diagnose, treat, and manage patient care. The introduction of AI in healthcare has brought about a paradigm shift, offering unprecedented accuracy and efficiency. From predictive analytics to robotic surgeries, AI is transforming the medical landscape. Predictive analytics, for instance, uses complex algorithms to analyze historical data and predict future outcomes. This capability is particularly useful in identifying at-risk patients, enabling early intervention and personalized treatment plans. Furthermore, AI-driven diagnostic tools have demonstrated remarkable precision in detecting diseases such as cancer at an early stage, significantly improving patient outcomes."
summary = generate_summary(paragraph, model, tokenizer)
print("Summary:", summary)

Summary: Artificial Intelligence (AI) has become an integral part of modern healthcare, revolutionizing the way medical professionals diagnose, treat, and manage patient care. From predictive analytics to robotic surgeries, AI is transforming the medical landscape. Predictive analytics uses complex algorithms to analyze historical data and predict future outcomes. AI-driven diagnostic tools have demonstrated remarkable precision in detecting diseases such as cancer at an early stage, significantly improving patient outcomes.


**Generating Summary for a PDF**

In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import PyPDF2
from fpdf import FPDF

In [3]:
file_path = "sample.pdf"

In [4]:
def read_pdf(file_path):
    pdf_reader = PyPDF2.PdfReader(open(file_path, "rb"))
    text = ""
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()
    return text

In [5]:
def generate_summary(
    paragraph,
    model,
    tokenizer,
    max_length=300,
    num_beams=6,
    chunk_size=512,
    overlap=30,
):
    """
    Generate a summary for a given paragraph using the trained model.

    Args:
    - paragraph (str): The input paragraph to summarize.
    - model (T5ForConditionalGeneration): The trained T5 model.
    - tokenizer (T5Tokenizer): The tokenizer for the T5 model.
    - max_length (int): The maximum length of the generated summary.
    - num_beams (int): The number of beams for beam search.
    - chunk_size (int): The size of each chunk.
    - overlap (int): The overlap between chunks.

    Returns:
    - summary (str): The generated summary.
    """
    model.to(device)
    # Tokenize the input paragraph
    tokenized_paragraph = tokenizer(paragraph, return_tensors="pt", truncation=False)[
        "input_ids"
    ].squeeze()

    # Chunk the tokenized paragraph
    chunks = []
    num_chunks = (len(tokenized_paragraph) - overlap) // (chunk_size - overlap) + 1
    for i in range(num_chunks):
        start = i * (chunk_size - overlap)
        end = start + chunk_size
        chunk = tokenized_paragraph[start:end]
        chunks.append(chunk)

    # Generate summaries for each chunk
    summaries = []
    for chunk in chunks:
        inputs = tokenizer.decode(chunk, skip_special_tokens=True)
        inputs = tokenizer(
            inputs, return_tensors="pt", max_length=chunk_size, truncation=True
        ).to(device)
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    # Combine the summaries of all chunks
    combined_summary = " ".join(summaries)

    return combined_summary

In [6]:
def generate_pdf_summary(file_path, model, tokenizer):
    text = read_pdf(file_path)
    summary = generate_summary(text, model, tokenizer)
    return summary

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
summary = generate_pdf_summary(file_path, model, tokenizer)
print("Summary:", summary)

Summary: Artificial Intelligence (AI) has become an integral part of modern healthcare, revolutionizing the way medical professionals diagnose, treat, and manage patient care. From predictive analytics to robotic surgeries, AI is transforming the medical landscape. Predictive analytics uses complex algorithms to analyze historical data and predict future outcomes, enabling early intervention and personalized treatment plans. AI-driven diagnostic tools have demonstrated remarkable precision in detecting diseases such as cancer at an early stage, significantly improving patient outcomes. Robotic surgeries are performed with robotic arms controlled by surgeons, allowing for greater precision and minimal invasiveness. AI is also playing a crucial role in drug discovery and development, accelerating the process by analyzing vast amounts of data from clinical trials, medical literature, and patient records. Moreover, AI is transforming the way healthcare providers interact with patients, wit

In [32]:
summary = """ Artificial Intelligence (AI) has become an integral part of modern healthcare, revolutionizing the way medical professionals diagnose, treat, and manage patient care. From predictive analytics to robotic surgeries, AI is transforming the medical landscape. Predictive analytics uses complex algorithms to analyze historical data and predict future outcomes, enabling early intervention and personalized treatment plans. AI-driven diagnostic tools have demonstrated remarkable precision in detecting diseases such as cancer at an early stage, significantly improving patient outcomes. Robotic surgeries are performed with robotic arms controlled by surgeons, allowing for greater precision and minimal invasiveness. AI is also playing a crucial role in drug discovery and development, accelerating the process by analyzing vast amounts of data from clinical trials, medical literature, and patient records. Moreover, AI is transforming the way healthcare providers interact with patients, with virtual health assistants powered by AI providing instant access to medical advice and support. AI has the potential to significantly improve patient outcomes by enhancing diagnostic accuracy, improving surgical precision, accelerating drug discovery, and empowering patients. However, it is essential to address the challenges associated with its implementation to ensure that its benefits are realized in a safe and ethical manner.
"""

In [None]:
summary = generate_pdf_summary(file_path, model, tokenizer)
print("Summary:", summary)

Summary: The text is a systematic review of research in the field of text summarization published from 2008 to 2019. It includes 85 journal and conference publications and provides an in-depth explanation of the topics/trends, datasets, preprocessing, features, techniques, methods, evaluations, and problems in this field of research. The results provide references to public datasets, preprocessing, and features that have been used, and describes the techniques and methods that are often used by researchers as a comparison and means for developing methods. The text discusses various aspects of a research study conducted at King Saud University. The study was published in Elsevier B.V. on behalf of King Saud University. The author(s) used a review method, a research question, a search strategy, study selection, data extraction, a result, a paper study publication, a dataset, topics or trends research, and a preprocessing step. The text discusses various aspects of text summarization, inc

In [63]:
def save_summary_to_pdf(summary, original_file_path):
    pdf = FPDF()
    pdf.add_page()

    pdf.add_font("JetBrainsMono-Regular", "", "JetBrainsMono-Regular.ttf", uni=True)
    pdf.set_font("JetBrainsMono-Regular", size=16)

    pdf_title = (
        original_file_path.split("/")[-1].replace(".pdf", "").title().replace("_", " ")
    )

    pdf.cell(0, 10, pdf_title, align="C", ln=True)

    pdf.ln(3)

    pdf.set_font("JetBrainsMono-Regular", size=12)
    pdf.cell(0, 10, "Generated Summary", align="C", ln=True)

    pdf.ln(10)

    pdf.set_font("JetBrainsMono-Regular", size=10)

    pdf.multi_cell(0, 5, summary)

    output_file_path = original_file_path.replace(".pdf", "_summary_generated.pdf")
    pdf.output(output_file_path, "F")

    print(f"Summary saved as {output_file_path}")

In [64]:
save_summary_to_pdf(summary, file_path)

Summary saved as review_of_automatic_text_summarization_techniques_&_methods_summary_generated.pdf


### **Bleu & Rouge Scores**

In [9]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Download necessary NLTK data
nltk.download("punkt")


def compute_bleu(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference)
    candidate_tokens = nltk.word_tokenize(candidate)
    smoothie = SmoothingFunction().method1
    score = sentence_bleu(
        [reference_tokens], candidate_tokens, smoothing_function=smoothie
    )
    return score


def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores

[nltk_data] Downloading package punkt to /home/srajan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Define the reference summary
reference_summary = """
Artificial Intelligence (AI) has transformed modern healthcare by enhancing diagnostic accuracy, improving surgical precision, accelerating drug discovery, and empowering patients. AI-driven tools assist medical professionals in predicting patient outcomes, identifying diseases early, personalizing treatment plans, and providing real-time support through virtual health assistants. Despite these benefits, challenges such as data privacy, security, and the need for transparent AI systems must be addressed to ensure ethical and effective implementation.
"""

# Compute BLEU score
bleu_score = compute_bleu(reference_summary, summary)
print("BLEU Score:", bleu_score)

# Compute ROUGE scores
rouge_scores = compute_rouge(reference_summary, summary)
print("ROUGE Scores:", rouge_scores)

BLEU Score: 0.1326216164529091
ROUGE Scores: {'rouge1': Score(precision=0.28205128205128205, recall=0.7857142857142857, fmeasure=0.4150943396226415), 'rouge2': Score(precision=0.12886597938144329, recall=0.36231884057971014, fmeasure=0.19011406844106463), 'rougeL': Score(precision=0.13846153846153847, recall=0.38571428571428573, fmeasure=0.2037735849056604)}


### **ChatBot Using GPT-2**

In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


# Split summary into chunks
def chunk_text(text, max_length=512):
    sentences = text.split(". ")
    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_length:
            chunk += sentence + ". "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks


chunks = chunk_text(summary)

# Create embeddings for chunks
chunk_embeddings = embedding_model.encode(chunks)


def answer_question(question):
    question_embedding = embedding_model.encode([question])[0]
    from numpy import dot
    from numpy.linalg import norm

    similarities = [
        dot(question_embedding, chunk_emb)
        / (norm(question_embedding) * norm(chunk_emb))
        for chunk_emb in chunk_embeddings
    ]
    relevant_chunk = chunks[similarities.index(max(similarities))]

    prompt = f"Context: {relevant_chunk}\n\nQuestion: {question}\nAnswer without including any links or references to them:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    output_ids = model.generate(
        input_ids, max_length=512, num_beams=5, no_repeat_ngram_size=2
    )
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    answer = output.split("Answer:")[-1].strip()
    return answer


question = input("Ask a question: ")
print("Answer:", answer_question(question))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Context: Summary: The text is a systematic review of research in the field of text summarization published from 2008 to 2019. It includes 85 journal and conference publications and provides an in-depth explanation of the topics/trends, datasets, preprocessing, features, techniques, methods, evaluations, and problems in this field of research.

Question: when was this text released and what is it about
Answer without including any links or references to them: This text was published in 2008 and is available for download from the following link: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1701891/ (accessed April 1, 2017).
...
The text of this paper is published under the terms of a Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3-0-unported/), which permits the use, distribution and reproduction in any medium, provided the original work is properly cited and the source is not altered or altered for any other

In [2]:
summary = """Summary: The text is a systematic review of research in the field of text summarization published from 2008 to 2019. It includes 85 journal and conference publications and provides an in-depth explanation of the topics/trends, datasets, preprocessing, features, techniques, methods, evaluations, and problems in this field of research. The results provide references to public datasets, preprocessing, and features that have been used, and describes the techniques and methods that are often used by researchers as a comparison and means for developing methods. The text discusses various aspects of a research study conducted at King Saud University. The study was published in Elsevier B.V. on behalf of King Saud University. The author(s) used a review method, a research question, a search strategy, study selection, data extraction, a result, a paper study publication, a dataset, topics or trends research, and a preprocessing step. The text discusses various aspects of text summarization, including the use of various approaches and methods, the evaluation of text summarization results, and the role of evaluations in text summarization. The text discusses the development of automatic text summarization, which generates summaries containing important sentences and includes all relevant information from the original document. It has been studied since the mid-20th century and has seen various approaches, including single and multi-document summarization. A single document produces a summary that is sourced from one source document, while a multi-document summarization is taken from various sources or documents that discuss the same topic. The text discusses various research studies on extractive summarization, including those using Latent Semantic Analysis (LSA) and Non-Negative Matrix Factorization (NMF) for summarizing multiple documents, Qaroush et al. (2019), Verma and Om (2019), and Naik and Gaonkar (2017). Extractive summarization is a summary that consists entirely of extracted content, such as sentences or words obtained from the original text. The usual problem raised from the extraction problem is determining the position of the sentence and the frequency of words in the text. Abstractive summaries are complex and require extensive natural language processing, making them more difficult than extractive summaries. They can be generated using various techniques such as deep learning, linguistic approaches, and encoder-decoder frameworks. Real-time summarization is a trend in text summarizing research that allows for real-time summaries to be generated or updated when new information appears. The techniques used include fuzzy-based and machine learning. Fuzzy logic with classic Zadeh's calculus of linguistically quantified propositions is a fuzzy-based method that addresses trend extrac- tion and real-time problems but weak in semantic problems. Fuzzy Formal Concept Analysis (Fuzzy FCA) is a fuzzy-based method that excels at evaluations in f-measures with optimal recall and comparable precision. Machine learning methods include Incremental Short Text Summarization (IncreSTS) and Rank-biased precision-summarization (RBP-SUM) has advantages in overcoming redundancy but can only produce extractive summaries. Text summarization is a challenging task in NLP due to the need for precise text analysis, conciseness, and consideration of aspects such as non-redundancy, relevance, coverage, coherence, and readability. Research has shifted towards abstractive summarization and real-time summarization, but extractive summaries are still in demand. A clear literature study is needed to advance research in the field of text summarization. The study aims to identify and analyze research topics/trends in the field of summarizing texts, classify them, provide an overview of the various approaches to summarizing texts, briefly explain the methods that already exist in this field, and briefly explain the preprocessing stages and features. The text discusses a research study on text summarization conducted using Systematic Literature Review (SLR) to identify, evaluate, and interpret research results relevant to the topic field or research questions. The study aimed to explore more opportunities in research in this field by exploring more systematic, measurable, and diverse topics. The advantages of SLR over traditional review techniques are the use of scientific methods and their systematic workmanship, to minimize bias and the results are clear and can be accounted for. The text describes a literature review on text summarization using various sources, including sciencedirect.com, ieeexplore.ieee.org, and dl.acm.org. The research question (RQ) is prepared using PICOC meaningful criteria. RQ2 and RQ4 through RQ9 are the main study research questions, while RQ1 and RQ3 are assigned to help evaluate the context of the main study. RQ1 and RQ3 provide a synopsis of cer- tain areas of research in summarizing texts. 2.3. Search strategy Data sources are papers available on the sciencedirect.com site, ieexplore.iee.org, and dl.acm.org. The search string is adjusted to reduce the list of irrelevant studies. Specific requirements for database searches are based on title, abstract, and keywords. The text describes a study aimed at identifying and evaluating the effectiveness of text summarization in computer laboratories with small and large datasets. The study aimed to identify and evaluate the effectiveness of text summarization in computer laboratories with small and large datasets. The initial paper article obtained from an initial search by automatically filtering titles, abstracts, and key- words was 1338 studies. The main paper article that matches the complete contents of the entire text was chosen to produce 85 papers. The final results were summa- Table 1 PICOC Criteria. The text summarization study was conducted using Mendeley's software and included studies from 2008 to 2019. The study aimed to gather data from the main study to answer research questions. The results showed that 85 papers discussed text summarization from 2008 to 2019. The most research on text summarization is in 2018 with 18 publications. The study's research experienced a significant increase in 2015 with 15 publications out of 85 selected publications. The text summarization study used various datasets, including DUC, tweet, news, multilingual, and TAC2011, to test the performance of a proposed method. The public dataset is more widely used than private datasets, with 55 studies using public datasets and 30 studies using private datasets. The most popular public dataset in this research is 70% DUC, followed by tweet, news, multilingual, and TAC2011. There are several types of DUC datasets used in the past ten years, including DUC 2002 and DUC 2004. The text discusses the use of the DUC 2002 and DUC 2004 datasets for sentence segmentation in news documents. The DUC 2002 and DUC 2004 datasets contain more than one document containing news in the U.S. and every single news document is cut into sentences, allowing researchers to create sentence segmentation in doc- uments. However, DUC 2002 and DUC 2004 are not suitable for real-time summarization, as news data is not sequential and does not continue. The multilingual dataset is used for documents consisting of several languages. The text discusses the use of various datasets in text summarization, with a focus on news, trending topics, movie comments, and product-specific product comments. The text also mentions the importance of using novel datasets and datasets with language domains other than English for preprocessing. The graph in Fig. 7 shows the distribution of datasets used from year to year for the past 10 years, both public and private datasets. A significant increase in the public dataset occurred in 2015, which is comparable to the amount of research increase in text summarization. The text discusses various research topics related to text summarization, including multi-document summarization, optimization, domain, and real-time summarization. The most popular topic is multi documents, which is challenging and requires a large search space. The next favorite topic is extrac- tive text summarization, which is more objective without pre- senting viewpoints. The text is a collection of unrelated articles and reviews on various topics related to artificial intelligence, machine learning, and machine learning. Some of the articles include: Al-sabahi et al. (2018 ), Rastkar et al. (2014 ), Ren et al. (n.d.) Abstractive Barros et al. (2019 ), Azmi and Altmami (2018 ), Fuad et al. (2019 ), Sahoo et al. (2018 ), Jaafar and Bouzoubaa (2016 et al. (2018 ), Mori et al. (2018 ), Khan et al. (2019 ), Wei et al. (2019 ), Khan et al. (2015b ),Chi et al. (2018 ), Chen et al. (2018a,b ), S et al. (2017 ), Guo et al. (2019 ), Dilawari and Khan (2019 ), Zhang et al. (2013 ) Unsupervised learningSong et al. (2011 ),Yousefi-azar and Hamey (2017 ),Tayal et al. (2016 ),Alami et al The text is a collection of unrelated articles and reviews on various topics related to artificial intelligence and machine learning. Some of the articles discuss the development of artificial intelligence and machine learning, while others discuss the impact of artificial intelligence on various fields such as medicine, education, and sports. Some articles discuss the use of artificial intelligence and machine learning in various fields such as medicine, education, and sports. Some articles discuss the use of artificial intelligence and machine learning in various fields such as medicine, education, and sports. The text discusses various topics related to text summarization, including extractive summarization, which only chooses the most important words, sentences, and paragraphs to produce a summary, and real-time summarization, which must include all information without redundancy and be added/updated as soon as new information has appeared. Preprocessing is the initial step in preparing unstructured data for summarization. The most common preprocessing step is stop word removal, followed by stemming and tokenizing. Stop word removal is used to remove neglected words from the stop word list. Stemming is used to change words with affixes into basic forms or remove affixes that stick to the basic words. Tokenizing is used to divide sentences, paragraphs, or documents into certain tokens/parts. The text describes various preprocessing stages used in text summarization, including chapter segmentation, paragraph segmentation, sentence segmentation, lemmatize, term weight, word frequency, sentence by term matric, sentence selection, normalize, post tagging, proper noun set, and Bag of Word. The text discusses various features used in text summarization research from 2008 to 2019. The most popular features include sentence length, sentence position, title words, keywords, thematic words, proper nouns, numerical data, and sentence certainly. The text discusses various methods for extracting summaries from text, including machine learning, statistical approaches, and fuzzy logic. The main features include title words, keywords, thematic words, proper nouns, numerical data, sentence certainly, semantic terms, and frequent semantic. The text discusses various approaches to text summarization, including fuzzy-based, machine learning, statistics, graph-ics, topic modeling, and rule-based. The most popular approach is machine learning, which is automatic and learns to improve from experience. However, machine learning is not the only best approach, as there are other approaches with weaknesses in terms of semantics and repetition of sentences. The text discusses various research methods used in text summarization, including a hybridization of the Maximal marginal importance (MMI) method, PSO, and fuzzy logic, which produce more accurate summaries by relying on determining the most important sentences. The results of this research were tested on the DUC 2002 dataset and compared with Msword, sys19, and sys30 summarizers. The weakness of this system is the semantic problem, and future work can add semantic features by labeling semantic roles and lexical databases. The text discusses various approaches to text summarization, including abstractive with deep learning, rule-based, fuzzy based, and pointer-generator. Abstractive with deep learning predicts inaccuracies by giving a higher weight to semantic combinations. The method outperforms other machine learning approaches such as SeqtoSeq + attention baseline, abstractive model by Nallapati et al., 2016 and pointer-generator. Testing using the Gigaword dataset, this method excels with rouge-1 40.21, rouge-2 19.37, and rouge-L 38.29. Testing with CNN dataset, this method excels with rouge-1 39.93, rouge-2 18.21, and rouge-L 37.39. However, it loses rouge-1 measurements when compared to the baseline lead-3 sys- sys- compared to the baseline lead-3 sys- sys-. The less popular approach in research on text summarization in the last 10 years is rule-based, which has the advantage of being applied to a simple domain but has weaknesses when applied to a high level of complexity. Fuzzy based is the method most often used because it involves the role of humans to determine uncertainty. The text discusses various approaches to extractive summaries, including fuzzy combining with graphical approaches, fuzzy logic combining with machine learning, and statistics. Fuzzy methods include fuzzy hypergraphs, MDS System using Fuzzy Logic and combining with TF, fuzzy logic combining with machine learning approach, and fuzzy logic combining with machine learning approach. Statistics can be combined with machine learning or fuzzy-based to get the score or weight of a feature, such as term frequency with hybrid Cal- culation scoring + SVM, Hybrid TF-IDF with SumBasic, TF-IDF with K-Means, TF-IDF with Deep learning + AE (auto Encoder) and TF with fuzzy logic. The text discusses various techniques and methods used in text summarization, including machine learning, statistical, fuzzy-based, graphical, topic modeling, and rule-based approaches. Some of these techniques include extractive semantic fuzzy-based Fuzzy hypergraph, statistical LSA and NMF, machine learning LSA + ANN Deep Learning, optimization machine learning, hybrid graphical CGS, clustering machine learning, topic modeling, extraction fuzzy-based Fuzzy logic, graphical N-rank, rule-based, and machine learning. The text discusses various machine learning methods used for various tasks, including sentence ranking, sentence scoring, abstractive extraction, clustering, sentence scoring, semantic graphics, machine learning, word frequency, statistic index, sentiment machine learning, and keyword statistics. The text discusses various machine learning methods and their applications, including supervised learning, noise machine learning, semantic machine learning, and extraction machine learning. Some of the methods mentioned include supervised learning, noise machine learning, ambiguity machine learning, and semantic link network. The text discusses various research topics related to machine learning, semantic machine learning, and graph-based ranking algorithms. Some of the key points include: the importance of understanding the context of a data set, the importance of understanding the context of a data set, the importance of understanding the context of a data set, the importance of understanding the context of a data set, the importance of understanding the context of a data set, the importance of understanding the context of a data set, and the importance of understanding the context of a data set. The text discusses various techniques and methods for text summarization, including fuzzy logic, fuzzy formal concept analysis, machine learning, and clustering. Some of the techniques include fuzzy logic, real-time similarity, fuzzy FCA, machine learning, and fuzzy formal concept analysis. The text discusses various machine learning methods used in various domains, including domain extraction, graphic analysis, rule-based scoring, topic modeling, and clustering. Some of the methods include fuzzy logic, fuzzy logic, and Fuzzy Logic. Other methods include clustering, clustering ItemSet Biomedical summarization, and keyword analysis. The text discusses various problems in text summarization, including extraction, word frequency, sentiment analysis, noise, and sentence ranking. Some of the most common problems include determining new features to produce a summary, determining the combination of features to produce a good summary, and addressing semantic problems. The text discusses various methods used in text summarization research, including fuzzy logic, semantic link network, SRL, MMR, LSA, and NMF. The most widely used method is fuzzy logic, which is used to extract or determine the final value of words or sentences included in the summary. Fuzzy logic is the most popular method due to its ability to prevent data contradiction. Fuzzy summarization is a method for extracting or determining the final value of words or sentences in a text. It involves using multiple inputs from various features, such as frequency, similarity, position, sentence length, and sentence position, to produce summary sentences. Fuzzy systems can prevent data contradiction by involving humans to examine sen- tences and reach agreement on the choice of certain sentences to produce summary sentences. The latest research using fuzzy is from Goularte et al., which produces extractive summaries using various features such as frequency, similarity, position, and sentence length. The text discusses various research studies using fuzzy summaries and fuzzy combining with graphical approaches to produce extractive summaries. These studies compare fuzzy summaries with baseline, score, model, and a sentence using precision, recall, and f1/f-measure, and CI for F1 with summary sizes of 30%, 30%, and 20%. For sizes of 30% and 20%, the fuzzy method outperforms the state of the art. For a summary size of 30%, the fuzzy method produces precision 0.366, recall 0.496, F1 0.421, and CI for F1 0.389–0.450. For a summary size of 20%, the fuzzy method produces precision 0.417, recall 0.398, F-1 0.406, and CI for F1 0.369–0.436. However, for a summary size of 40%, the fuzzy method is inferior to the model system in terms of precision. The text discusses various methods for summarizing texts, including fuzzy hypergraphs, TF-IDF, and LSA. TF-IDF is a method that uses a statistical approach to analyze the relationship between a phrase/sentence with a collection of documents. LSA is a method that only prioritizes keywords contained in a sentence without regard to linguistic characteristics and word order. The text discusses various research studies in text summarization over the past 10 years. Some of the methods mentioned include CLA, Restricted Boltzmann Machine (RBM), Analytical Hierar- chy Process (AHP), AMR Abstract Meaning Representation (AMR), abstractive summarization (AS), single + dual train, MMR, SOF (WP/POS/NER/WF/HF), Recurrent neural network (RNN), Senti- ment Memory (SM), hierarchies agglomerative clustering (MOABC), TIDA, SVO, n- gram, NLP Parser, lowest common sub-summer (LCS), bag of word (BOW), Patsum, SSO, Non-Negative Matrix Factorization (NMF), rule-based, N-rank, Lex-rank, Text-rank, decision tree, Narrativeabstractive summarization (NATSUM), rank-biased precision- summarization (RBP-Sum), decay topic model (DTM), and evaluations. The text discusses various approaches to evaluating the results of machine summaries, including essential, extracting sentences, content-based, and task-based. Essential evaluation is done by comparing the results of the machine summary with an ideal summary from the expert. Extracting sentences evaluation is done by finding how many ideal sentences are in an automatic machine summary. Content evaluation compares the actual words in a sentence, not the whole sentence. Task-based evaluation measures the performance of an automatic summarizing machine by using sum- maries for specific tasks. The most evaluation approach taken is in terms of sentence extract and content- based. The text discusses the use of the SLR method to evaluate text summarization research. The method is based on a systematic review of the literature using various evaluation methods, including N-Gram match- ing, pyramid and cosine similarity, BLEU, METEOR, CR, and copyrate. BLEU evaluates N-grams that are appropriate or not and has the concept of paraphrasing. METEOR evaluates the correct token, WordNet synonyms, stemmed token, and then paraphrasing the lookup table. CR measures how short a compaction is. Copyrate measures how many pieces are copied to abstract sentences without paraphras. A lower copy rate copy score means more paraphrases involved in abstract sentences. The text discusses various research topics related to text summarization, including extractive summaries, fuzzy summaries, and abstractive summaries. Extractive summaries are considered easier than abstractive summaries due to the ease of exploration and re-analyzing. The most important features to produce a good summary are keywords, frequency, similarity, sentence position, sentence length, and semantics. Machine learning is a favorite technique due to automatic machine learning per- formance and learning to enhance the experience without being explicitly programmed. Statistics can be combined with machine learning or fuzzy based techniques. Future work includes solving feature problems, preprocessing, and developing new methods. The text discusses various methods for extractive document summarization, including fuzzy evolutionary cellular learning automata, hierarchical structured self-attentive model, and unsupervised neural networks. The authors acknowledge King Saud University for providing open access and acknowledge Universitas Dian Nuswantoro and STT Ronggolawe Cepu. The text discusses various research topics related to text summarization and machine learning. Some of the key findings include: the use of neural networks to enhance text summarization, the use of word embedding and ensemble learning in text summarization, the effectiveness of deep learning approaches for summarization of legal texts, and the use of population-based multicriteria optimization in extractive multi-document summarization. The text discusses various research topics in the field of text summarization, including the use of sentiment infusion in text summarization, the improvement of performance of text summarization, the use of cross-document timeline generation in narrative abstractive summarization, and the use of sentiment infusion in abstractive text summarization. The text discusses various research topics related to text summarization, including fuzzy swarm diversity hybrid model for text summarization, MMI diversity based text summarization, fuzzy ontology for news summarization, multi-criterion real time tweet summarization, and extractive broadcast news summarization. The text discusses various research topics related to summarization, including extractive broadcast news summarization leveraging recurrent neural network language modeling techniques, multi-view abstractive summarization model jointly considering semantics and sentiment, mining the usage of summary oriented features in abstractive summarization, summarization using term frequency-inverse document frequency (TF-IDF), automatic summarization of events from social media, Meteor Universal: Language Specific Translation Evaluation for Any Target, and abstractive summarization of video sequences. The text discusses various techniques for extractive text summarization, including word-sentence co-ranking for automatic extractive text summarization, neural sentence fusion for diversity driven abstractive multi-document summarization, fully abstractive approach to guided summarization, and framework for abstractive summarization using text-to-text generation. A text summarization method based on fuzzy rules and applicable to automated assessment. ExpertSyst. Appl. 115, 264–275. Goyal, P., Behera, L., Mcginnity, T.M., 2013. A context-based word indexing model for document summarization. IEEE Trans. Knowl. Data Eng. 25, 1693–1705. Guo, Q., Huang, J., Xiong, N., Wang, P., MS-pointer network: Abstractive text summarization based on multi-head self-attention. IEEE Access 7, 138603–138613. Gupta, V., Kaur, N., 2015. A novel hybrid text summarization system for punjabi text. Cognit. Comput. https://doi.org/10.1007/s12559-015-9359-3. Güran, Uysal, M., 2017. An additive FAHP based sentence score function for text summarization. J. Inf. Technol. Control. 46, 53–69. The text discusses various research topics related to abstractive summarization, including the use of fuzzy quantifiers in time series summarization, the use of semantic role labelling in multi-document summarization, and the use of TF-IDF in sentence scoring. A framework for multi-document abstractive summarization based on semantic role labelling. Khan, A., Salim, N., Kumar, Y.J., Khan, R., Qian, Y., Naeem, S., 2019. Extractive based text summarization using K- Means and TF-IDF. I.J. Inf. Eng. Electron. Bus, 33–44. Widyassari, A.P., Krishnaprasad, Sooryanarayan, A., Ramanujan, A., 2016. Malayalam Text Summarization : An Extractive Approach. Kutlu, M., Cigr, C., I., 2010. Generic text summarization for turkish. Comput. J. 53. https://doi.org/10. The text discusses various research topics related to text summarization, including attention history-based attention for abstractive text summarization, multi-document text summarization using topic model and fuzzy logic, and query-oriented text summarization using fuzzy hypergraphs. The text discusses various approaches to extractive speech summarization, including a concept-based approach for business-related tweets, a time-aware knowledge extraction for microblog summarization on twitter, and a hierarchical self-attentive neural extractive summarizer via reinforcement learning. The text discusses various research topics related to text summarization, machine learning, and artificial intelligence. Some of the main topics include: extractive text summarization by feature-based sentence extraction using rule-based, abstractive text summarization using sequence-to-sequence RNNs, SummaRuNNer, and a survey on automatic text summarization. Other topics include minimum redundancy and maximum relevance for single and multi-document Arabic text summarization, and association of deep learning algorithm with fuzzy logic for multidocument text summarization. The text discusses various research topics related to fuzzy logic, multi-document summarization, and machine translation. Some of the key findings include: the use of fuzzy logic in multi-document summarization, the use of fuzzy logic in multi-document summarization, the use of fuzzy logic in multi-document summarization, and the use of fuzzy logic in machine translation. The text discusses various research topics in the field of text mining and summarization, including automatic text summarization, deep learning based abstractive text summarization, and sentence relations for extractive summarization with deep neural networks. The text discusses various research topics related to text summarization, including automatic text summarization using term frequency and semantic similarity, automatic text summarization using key concepts in documents, and automatic text summarization using inverse document frequency. The text discusses various research topics related to text summarization, including NLP-based text summarization using semantic analysis, fuzzy evolutionary optimization modeling for unsupervised categorization, and extractive summarization using N-gram stemming. The text discusses various research studies in the field of text summarization, including the use of reinforcement ranking on semantic link networks for scientific paper summarization, the development of an approach based on soft computing for text summarization, the use of sentence features relevance for extractive text summarization using genetic algorithms, and the comparison of Hindi and English extractive text summarization. A systematic literature review of software defect prediction : research trends, datasets methods and frameworks. Wang, Xun, Yoshida, Yasuhisa, Hirao, Tsutomu, Sudoh, Katsuhito, Nagata, Masaaki, Sudoh, Katsuhito, Nagata, Masaaki, Sudoh, Katsuhito, Nagata, Masaaki, Sudoh, Katsuhito, Nagata, Masaaki, Sudoh, Katsuhito, Nagata, Masaaki, Sudoh, Katsuhito, Nagata, Masaaki, Sudoh, Katsuhito, Katsuhito, Cai, Su, Q.I, Sun, X., 2019. Regularizing output distribution of abstractive chinese social media text summarization. ACM Trans. Asian Low-Resour. Lang. Inf. The text discusses various research topics in the field of natural language processing, including multi-document summarization for the Indonesian language, continuous summarization for microblog streams, topic modeling based approach to novel document summarization, neural extractive text summarization with syntactic compression, and multi-task learning for abstractive text summarization with key information guide network. The text discusses various research topics related to text summarization, including the use of fuzzy logic and wordnet for extractive automatic text summarization, the use of dual encoding for abstractive text summarization, the use of deep reinforcement learning for extractive document summarization, and the use of unsupervised deep learning for document summarization. The text discusses several research articles published in the Journal of King Saud University - Computer and Information Sciences, including: 'Improving the classification performance on imbalanced data sets via new hybrid parameterisation model', 'A formal quantitative analysis of elastic cloud systems based on PSMaude', 'Arabic natural language processing: An overview', and 'A Signer Independent Sign Language Recognition with Co-articulation'. The authors of these articles were contacted after publication to request a Declaration of Competing Interest statement. The text discusses various research projects in the field of computer and information sciences at King Saud University in Saudi Arabia. These projects include a sign language recognition system, automatic text summarization techniques, a self-switching multi-strategic pedagogical agent, and a new design paradigm for provably secure keyless hash function with subsets and two variables polynomial function. The text discusses various research projects in the field of computer and information sciences at King Saud University, including the development of a real-time commit protocol, the implementation of modified OLSR protocol in AANETs for UDP and TCP environment, and the use of Linked Open Data for halal food products. The text discusses several research articles published in the Journal of King Saud University - Computer and Information Sciences, including one on microcalcification segmentation using modified U-net segmentation network from mammogram images. The authors were contacted after publication to request a Declaration of Competing Interest statement. Researchers at King Saud University have published several research papers in the field of computer and information sciences, including: 1. Segmentation of hippocampus guided by assembled and weighted coherent point drift registration; 2. Load balancing optimization based on hybrid Heuristic- Metaheuristic techniques in cloud environment; 3. Evolutionary computing approach to optimize superframe scheduling on industrial wireless sensor networks; 4. PSO based test case generation for critical path using improved combined fitness function; 5. Improving outliers detection in data streams using LiCS and voting; 6. A randomized CPA-secure asymmetric-key chaotic color image encryption scheme based on the Chebyshev mappings and one-time pad; 7. A randomized CPA-secure asymmetric-key chaotic color image encryption scheme based on the Chebyshev mappings and one-time pad; 8. Rank Criteria Improved Confidence-based centroid scheme for non Line of Sight node localizations in vehicular networks; 9. The text discusses various research projects in the field of computer and information sciences at King Saud University in Saudi Arabia. These projects include: - An adaptive cuckoo search based algorithm for placement of relay nodes in wireless body area networks - Improving the classification performance on imbalanced data sets - A formal quantitative analysis of elastic cloud systems - An overview of Arabic natural language processing - A signer independent sign language recognition with co-articulation elimination from live videos - A signer independent sign language recognition with co-articulation elimination from live videos - An Indian scenar- - A signer independent sign language recognition with co-articulation elimination from live videos - A signer independent sign language recognition with co-articulation elimination - A signer independent sign language recognition with co-articulation elimination from live videos - A signer independent sign language recognition with co-articulation elimination from live videos - A signer independent sign language recognition with co-articulation elimination from live videos - A signer independent sign language recognition with co-articulation elimination from live videos - WordCode using WordTrie The text is a collection of unrelated articles published in the Journal of King Saud University - Computer and Information Sciences, Volume 32, Issue 10, December 2020, Page 1227–1228. The articles cover various topics related to computers, information sciences, and information technology. The main topic is the role of computers in information and communication, and the authors discuss their research in the context of computer and information sciences. The text discusses the missing Declaration of Competing Interest statements in previously published articles in the Journal of King Saud University - Computer and Information Sciences. The authors of the articles have declared that they have no competing financial interests or relationships that could have appeared to influence the work reported in the articles. The text discusses various research papers published in various journals, including the Journal of King Saud University - Computer and Information Sciences (JKSUCI), the Journal of King Saud University - Computer and Information Sciences (JKSUCI), the Journal of King Saud University - Computer and Information Sciences (JKSUCI), the Journal of King Saud University - Computer and Information Sciences (JKSUCI), the Journal of King Saud University - Computer and Information Sciences (JKSUCI), and the Journal of King Saud University - Computer and Information Sciences. The text discloses that the authors have no competing interests or affiliations with any organizations or entities with financial interests. The text discloses that the authors have no competing financial interests or relationships with organizations with direct or indirect financial interest in the subject matter discussed in the manuscript. The text discloses that the authors have no conflicts of interest and have not received any financial support for their research. The authors have no affiliations with or involvement in any organization or entity with financial or non-financial interests in the subject matter or materials discussed in the manuscript.
"""