In [1]:
import os

os.environ["HF_HOME"] = "D:/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "D:/hf_cache/datasets"
os.environ["TRANSFORMERS_CACHE"] = "D:/hf_cache/models"

In [2]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")

dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

# Training Part

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-small')



In [4]:
sample_text = dataset['train'][0]['article']

inputs = tokenizer(
    'summarize: ' + sample_text, #tell the model your task is summarization
    max_length = 512,
    truncation = True, # Automatically cut the remaining part.above 512
    return_tensors = 'pt' #Return output as PyTorch(pt) tensors.
)

| Task               | Input Given to Model                   |
| ------------------ | -------------------------------------- |
| Translation        | `"translate English to German: Hello"` |
| Summarization      | `"summarize: Long article..."`         |
| Question Answering | `"question: Who invented AI?"`         |
| Grammar Fix        | `"fix grammar: he go school"`          |


In [5]:
print(inputs)


'''
input_ids

Numbers representing words.

Example:

"hello world"
â†’ [8774, 296]

attention_mask

Tells model:

1 = real token
0 = padding'''

{'input_ids': tensor([[21603,    10,   301, 24796,  4170,     6,  2789,    41, 18844,    61,
          1636,  8929, 16023,  2213,  4173,  6324, 12591,    15, 11391,   592,
            12,     3,     9,  2196,  3996,  1755,   770,  8785,   591, 11039,
           770,    61, 13462,    38,     3,    88,  5050,   507,    30,  2089,
             6,    68,     3,    88, 10419,     7,     8,   540,   751,    31,
            17,  4061,     3,     9, 10783,    30,   376,     5,  4173,  6324,
         12591,    15,    38,  8929, 16023,    16,    96, 15537,   651, 16023,
            11,     8,  5197,    13,     8, 12308,   121,   304,     8, 19142,
            13, 29517,  6710,   343,     7,   300,     8,   296,     6,     8,
          1021,  7556,   845,     3,    88,    65,   150,  1390,    12,  9030,
            17,   449,   112,  1723,   550,    30,  1006,  2948,     6,  3281,
            11, 17086,  2251,     5,    96,   196,   278,    31,    17,   515,
            12,    36,    80,    13,  

'\ninput_ids\n\nNumbers representing words.\n\nExample:\n\n"hello world"\nâ†’ [8774, 296]\n\nattention_mask\n\nTells model:\n\n1 = real token\n0 = padding'

In [6]:
tokenizer.decode(inputs['input_ids'][0])

'summarize: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported Â£20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [7]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

'''This model already knows:

grammar

language structure

summarization patterns'''



'This model already knows:\n\ngrammar\n\nlanguage structure\n\nsummarization patterns'

In [8]:
outputs = model.generate(
    inputs['input_ids'],
    max_length=50, #Maximum length of generated summary
    num_beams=4, #Try 4 possible summaries Choose best
    early_stopping=True #Stops generation when summary logically ends.
)

In [9]:
outputs

tensor([[    0,     8,  1021,  7556,   845,     3,    88,    65,   150,  1390,
            12,  9030,    17,   449,   112,  1723,   550,    30,  1006,  2948,
             6,  3281,    11, 17086,  2251,     3,     5,     3,    88,    56,
            36,     3,   179,    12, 24068,    16,     3,     9,  2653,     6,
           805,     3,     9,  3281,    16,     3,     9, 11943,    42,   217]])

In [10]:
summary = tokenizer.decode(outputs[0],skip_special_tokens=True)
print(summary)


'''
without skip_special_tokens=true 

<pad> <pad> The match ended dramatically </s>'''

the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. he will be able to gamble in a casino, buy a drink in a pub or see


'\nwithout skip_special_tokens=true \n\n<pad> <pad> The match ended dramatically </s>'

In [None]:
def generate_summary(text,max_len=50,beams=4):
    
    inputs = tokenizer(
        'summarize: ' + text,
        max_length = 512,
        truncation = True,
        return_tensors = 'pt'
    )
    
    outputs = model.generate(
        inputs['input_ids'],
        max_length = max_len,
        num_beams = beams,
        early_stopping = True
    )
    
    return tokenizer.decode(outputs[0],skip_special_tokens=True)

In [12]:
article = dataset['train'][1]['article']

summary = generate_summary(article,beams=4)

print(summary)

inmates with the most severe mental illnesses are incarcerated until they're ready to appear in court. most often, they face drug charges or charges of assaulting an officer. they end up on the ninth floor severely mentally


# Fine-Tuning Part

Dataset

   â†“

Tokenization

   â†“

Input + Target labels

   â†“

Model Training

   â†“

Loss Calculation

   â†“

Weight Update

   â†“

Better Summaries

In [13]:
small_dataset = dataset['train'].select(range(500))

In [14]:
small_dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 500
})

In [None]:
def preprocess_function(examples):
    
    inputs = ['summarize: '+ doc for doc in examples['article']]
    
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation = True
    )
    
    labels = tokenizer(
        examples['highlights'],
        max_length=128,
        truncation = True
    )
    
    model_inputs['labels']=labels['input_ids']
    
    return model_inputs
    

In [16]:
tokenized_data = small_dataset.map(
    preprocess_function,
    batched=True #Dataset sends a group (batch) of rows together.
)
tokenized_data

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [17]:
from transformers import TrainingArguments,Trainer


training_args = TrainingArguments(
    output_dir="./results",
    
    learning_rate=2e-5,
    
    per_device_train_batch_size=2,   # small for CPU
    per_device_eval_batch_size=2,
    
    num_train_epochs=1,              # keep small initially
    
    weight_decay=0.01,
    
    logging_dir="./logs",
    logging_steps=50,
    
    save_total_limit=2,
    save_strategy="epoch",
    
    evaluation_strategy="no",        # skip eval for speed
    
    fp16=False                       # GPU only feature
)

| Parameter     | Meaning                |
| ------------- | ---------------------- |
| learning_rate | how fast model learns  |
| batch_size=2  | laptop-friendly memory |
| epochs=1      | one pass over data     |
| weight_decay  | prevents overfitting   |
| logging_steps | print progress         |
| fp16=False    | CPU compatibility      |


In [30]:
from transformers import DataCollatorForSeq2Seq
data_collector = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model= model
)

In [31]:
trainer = Trainer(
    model = model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
    data_collator=data_collector
)
trainer.train()

'''Trainer handles:

âœ… forward pass
âœ… loss calculation
âœ… backpropagation
âœ… optimization
'''

  0%|          | 0/250 [04:29<?, ?it/s]
 17%|â–ˆâ–‹        | 42/250 [02:31<12:05,  3.49s/it]

KeyboardInterrupt: 

1. Encoder reads article
2. Decoder predicts summary
3. Compare with real summary
4. Calculate loss
5. Update attention weights

In [19]:
trainer.save_model("artifacts/model")
tokenizer.save_pretrained("artifacts/tokenizer")

('artifacts/tokenizer\\tokenizer_config.json',
 'artifacts/tokenizer\\special_tokens_map.json',
 'artifacts/tokenizer\\tokenizer.json')

# Pretrained vs Fine tuned model

In [20]:
from transformers import AutoModelForSeq2SeqLM

base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")



In [21]:
trained_model = AutoModelForSeq2SeqLM.from_pretrained("artifacts/model")

In [22]:
def generate_with_model(model, text):
    
    inputs = tokenizer(
        "summarize: " + text,
        max_length=512,
        truncation=True,
        return_tensors="pt"
    )

    outputs = model.generate(
        inputs["input_ids"],
        max_length=50,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [23]:
test_article = dataset["test"][0]["article"]
true_summary = dataset["test"][0]["highlights"]

print("TRUE SUMMARY:\n", true_summary)

TRUE SUMMARY:
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .


In [24]:
print("\n--- BEFORE TRAINING ---\n")
print(generate_with_model(base_model, test_article))

print("\n--- AFTER TRAINING ---\n")
print(generate_with_model(trained_model, test_article))


--- BEFORE TRAINING ---

the palestinians signed the ICC's founding Rome Statute in January. the ICC also accepted its jurisdiction over alleged crimes committed in the occupied territories. the ICC opened a preliminary examination into the situation

--- AFTER TRAINING ---

the palestinians signed the ICC's founding Rome Statute in January. the ICC also accepted its jurisdiction over alleged crimes committed in the occupied territories. the ICC opened a preliminary examination into the situation


# Evaluate Rouge

ROUGE = Recall-Oriented Understudy for Gisting Evaluation

It measures how similar your generated summary is to the human-written summary.

| Metric  | Meaning                       |
| ------- | ----------------------------- |
| ROUGE-1 | word overlap                  |
| ROUGE-2 | phrase overlap                |
| ROUGE-L | sentence structure similarity |


In [25]:
from datasets import load_metric

rouge = load_metric("rouge")

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [26]:
def evaluate_model(model, dataset, num_samples=50):
    
    predictions = []
    references = []

    for i in range(num_samples):

        article = dataset["test"][i]["article"]
        true_summary = dataset["test"][i]["highlights"]

        inputs = tokenizer(
            "summarize: " + article,
            max_length=512,
            truncation=True,
            return_tensors="pt"
        )

        outputs = model.generate(
            inputs["input_ids"],
            max_length=50,
            num_beams=4,
            early_stopping=True
        )

        pred_summary = tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )

        predictions.append(pred_summary)
        references.append(true_summary)

    results = rouge.compute(
        predictions=predictions,
        references=references
    )

    return results

In [27]:
base_scores = evaluate_model(base_model, dataset)
print(base_scores)
trained_scores = evaluate_model(trained_model, dataset)
print(trained_scores)

{'rouge1': AggregateScore(low=Score(precision=0.2792186771007231, recall=0.29468321511191997, fmeasure=0.28257986593597795), mid=Score(precision=0.3185451989817666, recall=0.3373348577705798, fmeasure=0.3199628192125831), high=Score(precision=0.35793511071305434, recall=0.3802107844461455, fmeasure=0.3549506196486023)), 'rouge2': AggregateScore(low=Score(precision=0.09716931887432873, recall=0.10408049257874563, fmeasure=0.09888870543372635), mid=Score(precision=0.13046343534250726, recall=0.13826674269561823, fmeasure=0.13053908138963083), high=Score(precision=0.16448057421086426, recall=0.17358430253014573, fmeasure=0.16239301968749695)), 'rougeL': AggregateScore(low=Score(precision=0.2076877861844154, recall=0.220030705015694, fmeasure=0.2114717887856373), mid=Score(precision=0.24016090488535685, recall=0.25499034872618603, fmeasure=0.2414675921975874), high=Score(precision=0.2747611558099274, recall=0.2925878982878196, fmeasure=0.27202877950667786)), 'rougeLsum': AggregateScore(low