In [3]:
from pprint import pprint
from rouge_score import rouge_scorer 

In [1]:
pip install Flask transformers

Note: you may need to restart the kernel to use updated packages.


In [20]:
pip install Flask

Collecting FlaskNote: you may need to restart the kernel to use updated packages.

  Downloading flask-3.0.2-py3-none-any.whl.metadata (3.6 kB)
Collecting Werkzeug>=3.0.0 (from Flask)
  Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting itsdangerous>=2.1.2 (from Flask)
  Downloading itsdangerous-2.1.2-py3-none-any.whl.metadata (2.9 kB)
Collecting blinker>=1.6.2 (from Flask)
  Downloading blinker-1.7.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.0.2-py3-none-any.whl (101 kB)
   ---------------------------------------- 0.0/101.3 kB ? eta -:--:--
   ------------------------ --------------- 61.4/101.3 kB 1.7 MB/s eta 0:00:01
   ------------------------------------ --- 92.2/101.3 kB 1.7 MB/s eta 0:00:01
   -------------------------------------- 101.3/101.3 kB 968.7 kB/s eta 0:00:00
Downloading blinker-1.7.0-py3-none-any.whl (13 kB)
Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Downloading werkzeug-3.0.1-py3-none-any.whl (226 kB)
   ----------------

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
sample_text = '''
The majority of available text summarization datasets include short-form source documents that lack 
long-range causal and temporal dependencies, and often contain strong layout and stylistic biases. While relevant, 
such datasets will offer limited challenges for future generations of text summarization systems. We address these 
issues by introducing BookSum, a collection of datasets for long-form narrative summarization. Our dataset covers 
source documents from the literature domain, such as novels, plays and stories, and includes highly abstractive, 
human written summaries on three levels of granularity of increasing difficulty: paragraph-, chapter-, and book-level.
 The domain and structure of our dataset poses a unique set of challenges for summarization systems, which include:
   processing very long documents, non-trivial causal and temporal dependencies, and rich discourse structures. 
   To facilitate future work, we trained and evaluated multiple extractive and abstractive summarization models as baselines for our dataset.
''' 

In [6]:
X_token = tokenizer(sample_text,return_tensors="pt" )['input_ids']# like dictionary
X_token

tensor([[    0, 50118,   133,  1647,     9,   577,  2788, 39186,  1938, 42532,
           680,   765,    12,  3899,  1300,  2339,    14,  1762,  1437, 50118,
          3479,    12,  9435, 41214,     8, 41853, 45371,     6,     8,   747,
          5585,   670, 18472,     8, 15240,  5580, 31681,     4,   616,  4249,
             6,  1437, 50118, 16918, 42532,    40,   904,  1804,  2019,    13,
           499,  6808,     9,  2788, 39186,  1938,  1743,     4,   166,  1100,
           209,  1437, 50118, 40512,    30, 10345,  5972, 38182,     6,    10,
          2783,     9, 42532,    13,   251,    12,  3899,  7122, 39186,  1938,
             4,  1541, 41616,  4865,  1437, 50118, 17747,  2339,    31,     5,
         13144, 11170,     6,   215,    25, 19405,     6,  1974,     8,  1652,
             6,     8,  1171,  2200, 20372,  2088,     6,  1437, 50118, 19003,
          1982, 32933,  5119,    15,   130,  1389,     9, 17227, 42664,     9,
          2284,  9600,    35, 17818, 20551,  7285, 2

In [7]:
output_tensor = model.generate(X_token)

output = tokenizer.decode(output_tensor[0], skip_special_tokens = True)

In [8]:
pprint(output)

('BookSum is a collection of datasets for long-form narrative summarization. '
 'The dataset covers novels, plays and stories. It includes highly '
 'abstractive, human written summaries on three levels of granularity of '
 'increasing difficulty: paragraph-, chapter-, and book-level. The domain and '
 'structure of our dataset poses a unique set of challenges for summarization '
 'systems.')


In [9]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)
#instead of accuracy in classification we use this to measure the rouge score.

In [10]:
scores = scorer.score(sample_text, output)
print("ROUGE-1 (Unigram):", scores['rouge1'])
print("ROUGE-2 (Bigram):", scores['rouge2'])
print("ROUGE-L (Longest Common Subsequence):", scores['rougeLsum'])
#black box model

ROUGE-1 (Unigram): Score(precision=0.9629629629629629, recall=0.3586206896551724, fmeasure=0.5226130653266332)
ROUGE-2 (Bigram): Score(precision=0.8679245283018868, recall=0.3194444444444444, fmeasure=0.46700507614213194)
ROUGE-L (Longest Common Subsequence): Score(precision=0.9444444444444444, recall=0.35172413793103446, fmeasure=0.5125628140703516)


In [11]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    pprint (f" Trainable params: {trainable_params} \n All params: {all_param} \n Trainable%: {100 * trainable_params / all_param}")

In [12]:
from peft import LoraConfig, get_peft_model 

config = LoraConfig(
    r=380,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="Seq2Seq"
)

peft_model = get_peft_model(model, config)
print_trainable_parameters(peft_model)

(' Trainable params: 56033280 \n'
 ' All params: 462323712 \n'
 ' Trainable%: 12.119923453115033')


In [13]:
peft_model.train()

PeftModel(
  (base_model): LoraModel(
    (model): BartForConditionalGeneration(
      (model): BartModel(
        (shared): Embedding(50264, 1024, padding_idx=1)
        (encoder): BartEncoder(
          (embed_tokens): Embedding(50264, 1024, padding_idx=1)
          (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
          (layers): ModuleList(
            (0-11): 12 x BartEncoderLayer(
              (self_attn): BartSdpaAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=380, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (defaul

In [14]:
peft_tensor = peft_model.generate(X_token)
peft_output = tokenizer.decode(peft_tensor[0], skip_special_tokens = True)

In [15]:
scores = scorer.score(sample_text, peft_output)
print("ROUGE-1 (Unigram):", scores['rouge1'])
print("ROUGE-2 (Bigram):", scores['rouge2'])
print("ROUGE-L (Longest Common Subsequence):", scores['rougeLsum'])

ROUGE-1 (Unigram): Score(precision=0.9629629629629629, recall=0.3586206896551724, fmeasure=0.5226130653266332)
ROUGE-2 (Bigram): Score(precision=0.9056603773584906, recall=0.3333333333333333, fmeasure=0.4873096446700508)
ROUGE-L (Longest Common Subsequence): Score(precision=0.9629629629629629, recall=0.3586206896551724, fmeasure=0.5226130653266332)


In [16]:
pprint(peft_output)

('BookSum is a collection of datasets for long-form narrative summarization. '
 'Our dataset covers novels, plays and stories. It includes highly '
 'abstractive, human written summaries on three levels of granularity of '
 'increasing difficulty: paragraph-, chapter-, and book-level. The domain and '
 'structure of our dataset poses a unique set of challenges for summarization '
 'systems.')


peft_model generally has higher ROUGE scores across all metrics (precision, recall, and F1 measure) for both unigram (ROUGE-1) and bigram (ROUGE-2) comparisons. This suggests that, based on the ROUGE metric, peft_model is performing better in terms of generating summaries that match reference summaries.

The F1 measure is often used as a balance between precision and recall. In this case, peft_model has higher F1 scores, indicating a better balance between precision and recall.

In [17]:
import torch

In [18]:
torch.save(peft_model,"saved_peft_model")

Use "model = torch.load("saved_peft_model")" to load the model