In [1]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv
/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv
/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/config.json
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/spiece.model
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/training_args.bin
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/tokenizer.json
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/tokenizer_config.json
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/model.safetensors
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/special_tokens_map.json
/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1/generation_config.json
/kaggle/input/cnn-preprocessed-transformer/transformer_data.csv
/kaggle/input/cnn-preprocessed-transformer/transformer_data_special_removed.csv
/kaggle/input/bart-fine-tuned/transformers/bart_cnn_20k/1/config

In [None]:
!pip install textacy
!pip install contractions

In [3]:
import textacy
from textacy import preprocessing as prep
import re
import contractions

In [4]:
# defining regex function for the pipeline
#removing html tags
def remove_html(text):
    """Replace HTML tags and character entities in ``text`` with a space.

    Tags are any ``<...>`` span. Entities are matched strictly as
    ``&name;``, ``&#123;`` or ``&#x1F;`` (terminating semicolon required), so
    an ampersand inside ordinary text such as ``fish&chips`` no longer
    swallows every following non-whitespace character.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        ``text`` with each tag and each entity replaced by one space.
    """
    text = re.sub(r"<[^>]+>", " ", text)
    # Named, decimal and hexadecimal character references only.
    return re.sub(r"&(?:#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);", " ", text)
#removing emails
def remove_email(text):
    """Strip e-mail addresses from ``text``.

    An address is a non-empty local part, ``@``, and a dotted domain with any
    top-level domain. The previous pattern only handled ``.com`` and, because
    its dot was unescaped, could also clip ordinary words (``me@welcome``).

    Parameters
    ----------
    text : str
        Text that may contain e-mail addresses.

    Returns
    -------
    str
        ``text`` with every matched address removed (nothing is inserted in
        its place).
    """
    return re.sub(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", "", text)

#removing hashtags
def remove_hash(text):
    """Drop every ``#`` character from ``text``; the word after it is kept."""
    hash_pattern = re.compile(r"\#*")
    return hash_pattern.sub("", text)

#removing special symbols except () '' "" ? ! - .
def remove_special(text):
    """Delete every character that is not a word character, whitespace,
    or one of ``( ) ' " ? ! - .``."""
    disallowed = re.compile(r"[^\w\s()\'\"\?\!\-\.]")
    return disallowed.sub("", text)

#replacing _ with space
def replace_(text):
    """Turn each underscore in ``text`` into a single space."""
    underscore = re.compile(r"[_]")
    return underscore.sub(" ", text)

In [5]:
#Expand Contractions
def expand(text):
    """Expand contractions in ``text`` one whitespace-separated token at a time.

    The text is split on whitespace, each token is passed through
    ``contractions.fix``, and the results are joined back with single spaces
    (so any run of whitespace in the input collapses to one space).
    """
    return ' '.join(contractions.fix(token) for token in text.split())

In [6]:
#Lowering word
def low(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

In [7]:
# Text-cleaning pipeline built from textacy preprocessors and the local regex
# helpers defined above. The steps are order-sensitive (presumably applied top
# to bottom -- confirm against the textacy `make_pipeline` documentation).
text_pipe = prep.make_pipeline(
    remove_email,
    prep.replace.emojis,
    prep.replace.urls,
    prep.replace.phone_numbers,
    remove_hash,
    prep.replace.currency_symbols,
    remove_html,
    low,
    prep.normalize.hyphenated_words, # re-join words that were split in two by a '-'
    prep.normalize.quotation_marks, # normalize all single and double quotes in the text to their ASCII representation
    prep.normalize.unicode,
    prep.normalize.bullet_points,
    # replace_ must come before remove_special: the `\w` in remove_special's
    # pattern keeps underscores, so they are turned into spaces here first.
    replace_,
    remove_special,
    prep.normalize.whitespace
)

In [None]:
data = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')

In [None]:
def pre_process(df):
    """Clean the ``article`` and ``highlights`` columns of ``df``.

    Each column is run through the ``text_pipe`` cleaning pipeline and then
    through ``expand``. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with text columns ``article`` and ``highlights``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns cleaned.
    """
    # Use ``text_pipe`` explicitly. The bare name ``pipe`` is not the cleaning
    # pipeline: it is undefined at this point in the notebook and is bound
    # further down to a Hugging Face summarization pipeline.
    for column in ('article', 'highlights'):
        df[column] = df[column].apply(text_pipe).apply(expand)
    return df

In [None]:
data.sample(frac=0.35).reset_index(drop=True).shape

In [None]:
data.sample()

In [None]:
# Fixed seed so the 35% subsample (and the CSV written from it) is reproducible.
df = data.sample(frac=0.35, random_state=21).reset_index(drop=True)

In [None]:
df

In [None]:
df = pre_process(df)

In [None]:
df.to_csv('transformer_data_special_removed.csv',index=False)

In [None]:
df.iloc[0,1]

In [2]:
df = pd.read_csv('/kaggle/input/cnn-preprocessed-transformer/transformer_data_special_removed.csv')

In [3]:
df.shape

(100490, 3)

In [4]:
df.sample().iloc[0,1],df.sample().iloc[0,2]

("you are top of the bundesliga tree by 11 points projected to win the title in the coming months so there remains plenty of time to spread some christmas cheer. bayern munich's squad of stars including thomas muller arjen robben and xabi alonso took part in a sing-along wishing their supporters a merry christmas. singing in english the video shows the team going along to 'we wish you a merry christmas' as fans from around the world help out too. bayern munich's squad of stars took part in a sing-along wishing their supporters a merry christmas . arjen robben (pictured wearing black) was among those singing 'we wish you a merry christmas' thomas muller (left) and bastian schweinsteiger (right) sing during the video clip of the german stars . bayern are comfortably 11 points ahead of wolfsburg in germany and 30 ahead of second-bottom borussia dortmund who may not be having quite as cheerful a christmas this time around. pep guardiola's men next play against second-top wolfsburg on janua

# Transformer 

In [None]:
!pip install -U transformers
!pip install datasets

In [6]:
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)
from datasets import Dataset, load_dataset
import torch

2024-04-23 05:47:41.189219: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 05:47:41.189318: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 05:47:41.322150: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# Fixed seed so the 20% training subsample is the same on every run.
data = df.sample(frac=0.2, random_state=21).reset_index(drop=True)

In [8]:
data.shape

(20098, 3)

In [9]:
data.sample().iloc[0,2]

'tech giants google oracle and red hat helping to fix healthcare.gov . house republicans seek security documents from sebelius and contractors . obama "i take full responsibility for making sure it gets fixed as soon as possible" sebelius says she was wrong to tell obama the website was ready .'

In [10]:
dataset = Dataset.from_pandas(data)

In [11]:
dataset

Dataset({
    features: ['id', 'article', 'highlights'],
    num_rows: 20098
})

In [12]:
# Shuffle once with a fixed seed, then carve out a 90/10 train/validation split.
# (The same seeded shuffle was previously computed twice.)
shuffled_dataset = dataset.shuffle(seed=21)
split_index = int(0.9 * len(dataset))
train_dataset = shuffled_dataset.select(range(0, split_index))
validation_dataset = shuffled_dataset.select(range(split_index, len(dataset)))

# validation_dataset = dataset.shuffle(seed=21).select(range(int(0.8 * len(dataset)), int(0.9 * len(dataset))))

# test_dataset = dataset.shuffle(seed=21).select(range(int(0.9 * len(dataset)), len(dataset)))

### t5

In [13]:
model_ckpt = 'google-t5/t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
article_len = [len(i.split()) for i in dataset['article']]
summary_len = [len(i.split()) for i in dataset['highlights']]

In [15]:
def get_features(batch):
    """Tokenize a batch of articles (inputs) and highlights (targets).

    Calls the module-level ``tokenizer`` with ``max_length=1024`` and
    truncation enabled, then returns only the ``input_ids``,
    ``attention_mask`` and ``labels`` entries of its output.
    """
    tokenized = tokenizer(
        batch['article'],
        text_target=batch['highlights'],
        max_length=1024,
        truncation=True,
    )
    wanted_keys = ('input_ids', 'attention_mask', 'labels')
    return {key: tokenized[key] for key in wanted_keys}

In [None]:
dataset_pt = dataset.map(get_features,batched=True)

In [16]:
train_dataset_pt = train_dataset.map(get_features,batched=True)

Map:   0%|          | 0/18088 [00:00<?, ? examples/s]

In [17]:
validation_dataset_pt = validation_dataset.map(get_features,batched=True)

Map:   0%|          | 0/2010 [00:00<?, ? examples/s]

In [18]:
dataset

Dataset({
    features: ['id', 'article', 'highlights'],
    num_rows: 20098
})

In [None]:
dataset_pt

In [19]:
train_dataset

Dataset({
    features: ['id', 'article', 'highlights'],
    num_rows: 18088
})

In [20]:
train_dataset_pt

Dataset({
    features: ['id', 'article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 18088
})

In [21]:
validation_dataset

Dataset({
    features: ['id', 'article', 'highlights'],
    num_rows: 2010
})

In [22]:
validation_dataset_pt

Dataset({
    features: ['id', 'article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2010
})

In [None]:
columns = ['input_ids','attention_mask','labels']
dataset_pt.set_format(type='torch',columns=columns)

In [23]:
columns = ['input_ids','attention_mask','labels']
train_dataset_pt.set_format(type='torch',columns=columns)
validation_dataset_pt.set_format(type='torch',columns=columns)

In [24]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer,model = model)

In [25]:
from transformers import TrainingArguments,Trainer

In [26]:
# Training configuration for the T5 run.
training_args = TrainingArguments(
    output_dir='/kaggle/working/t5_cnn',
    num_train_epochs=1,
    # One sample per device, with gradients accumulated over 8 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_steps=250,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # Very large save interval; the model is saved explicitly after training.
    save_steps=1e6,
)

In [30]:
# Wire the model, tokenizer, collator and the tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset_pt,
    eval_dataset=validation_dataset_pt,
)

In [31]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ·························································································································································································································································································································


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 312
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,1.3709,1.18071
1000,1.3437,1.170336
1500,1.2996,1.161005
2000,1.3683,1.156293


TrainOutput(global_step=2261, training_loss=1.380703934575013, metrics={'train_runtime': 3710.1956, 'train_samples_per_second': 4.875, 'train_steps_per_second': 0.609, 'total_flos': 1.7778388837248e+16, 'train_loss': 1.380703934575013, 'epoch': 1.0})

In [32]:
model.save_pretrained('t5_cnn_20k')

In [33]:
tokenizer.save_pretrained('t5_cnn_20k')

('t5_cnn_20k/tokenizer_config.json',
 't5_cnn_20k/special_tokens_map.json',
 't5_cnn_20k/spiece.model',
 't5_cnn_20k/added_tokens.json',
 't5_cnn_20k/tokenizer.json')

In [None]:
trainer.save_model('t5_cnn_model')

In [34]:
model_t5_path = '/kaggle/input/t5-model'
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_t5_path)
tokenizer_t5 = AutoTokenizer.from_pretrained(model_t5_path)

In [35]:
pipe = pipeline('summarization',model = model_t5,tokenizer = tokenizer_t5)
gen_kwargs = {'length_penalty':0.8,'num_beams':8,'max_length':125}

In [36]:
a,s = df.sample().iloc[0,1:]

In [37]:
a

'only in space would 2 million miles be considered a close call. an asteroid with an estimated diameter of three football fields zoomed by earth late monday missing our home by about that distance. it traveled at some 27000 miles per hour. the asteroid came just about a year after a relatively small asteroid blew up over russia. the roughly 60-foot space rock plunged into earth\'s atmosphere and exploded over the city of chelyabinsk with the force of about 30 early nuclear bombs. the blast left more than 1500 injured mostly by glass from shattered windows and raised concerns about humanity\'s vulnerability to stray asteroids. "on a practical level a previously unknown undiscovered asteroid seems to hit our planet and because damage or injury once a century or so as we witnessed on june 20 1908 and february 15 2013" said bob berman slooh host and astronomer. slooh.com tracks potentially hazardous objects like asteroids and comets. berman added "every few centuries an even more massive a

In [38]:
print(pipe(a,**gen_kwargs))

[{'summary_text': "an asteroid with an estimated diameter of three football fields zoomed by earth late monday . it traveled at some 27000 miles per hour . the space rock plunged into earth's atmosphere and exploded over chelyabinsk ."}]


In [39]:
s

'the asteroid misses earth by some 2 million miles . it comes just about a year after an asteroid blew up over russia . the space rock has an estimated diameter of three football fields .'

### bart
865 tokens

In [None]:
model_ckpt = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [None]:
# Training configuration for the BART run (same settings as the T5 run,
# different output directory).
training_args = TrainingArguments(
    output_dir='/kaggle/working/bart_cnn',
    num_train_epochs=1,
    # One sample per device, with gradients accumulated over 8 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_steps=250,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # Very large save interval; the model is saved explicitly after training.
    save_steps=1e6,
)

In [None]:
# The tokenized splits and the collator built in the T5 section hold T5 token
# ids and are bound to the T5 model/tokenizer, so rebuild both here with the
# BART `tokenizer` / `model` loaded above before training BART.
bart_columns = ['input_ids', 'attention_mask', 'labels']
train_dataset_bart = train_dataset.map(get_features, batched=True)
validation_dataset_bart = validation_dataset.map(get_features, batched=True)
train_dataset_bart.set_format(type='torch', columns=bart_columns)
validation_dataset_bart.set_format(type='torch', columns=bart_columns)
data_collator_bart = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator_bart,
    train_dataset=train_dataset_bart,
    eval_dataset=validation_dataset_bart,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model('bart_cnn_model')

In [None]:
pipe_bart = pipeline('summarization','/kaggle/working/bart_cnn_model')
gen_kwargs = {'length_penalty':0.8,'num_beams':8,'max_length':125}

In [None]:
a,s = df.sample().iloc[0,1:]

In [None]:
a

In [None]:
# Let the tokenizer truncate to the model's input limit instead of cutting the
# article at a hand-tuned word count (word count does not bound token count).
print(pipe_bart(a, truncation=True, **gen_kwargs))

In [None]:
s

### Testing the fine-tuned models

In [9]:
import torch
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer

)

2024-04-24 05:52:23.954959: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-24 05:52:23.955051: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-24 05:52:24.081513: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
path = "/kaggle/input/t5-fine-tuned/transformers/t5_cnn/1"
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(path)
tokenizer_t5 = AutoTokenizer.from_pretrained(path)
pipe_t5 = pipeline("summarization",model=model_t5,tokenizer=tokenizer_t5)

In [22]:
path_bart = "/kaggle/input/bart-fine-tuned/transformers/bart_cnn_20k/1"
model_bart = AutoModelForSeq2SeqLM.from_pretrained(path_bart)
tokenizer_bart = AutoTokenizer.from_pretrained(path_bart)
pipe_bart = pipeline("summarization", model=model_bart,tokenizer=tokenizer_bart)

In [11]:
gen_kwargs = {'length_penalty':0.8,'num_beams':8,'max_length':125}

In [55]:
txt = """
what should a comedian do when a baby cradled in its mother's arms starts making noises in the middle of a set?american comedian arj barker was faced with that awkward situation during a show in australia on saturday night at the melbourne international comedy festival.in the middle of his hour-long gig the comedian known for his observational and satirical style made the decision to eject the mother and her 7-month-old baby prompting some in the audience to leave the show in solidarity with the mother. other patrons reportedly heckled her as she left.the interaction sparked fierce debate in australia about the rights of mothers to take their babies wherever they like and an entertainer's right to perform without interruptions from infants who some argue should be left at home.barker has repeatedly defended his decision to ask trish faranda and her baby clara to leave the event which specified a minimum age of 15 for attendees.speaking to melbourne radio station 3aw monday barker said "it wasn't easy" to make the decision to remove the baby but he "did it for the show" and the audience who deserved to see what they had paid for uninterrupted."i can understand that it was difficult and embarrassing for her and i do feel bad about that" barker said.the californian comedian a regular on the international comedy circuit was performing to a crowd of several hundred people at the athenaeum theater when the baby's noises interrupted his train of thought barker told the australian broadcasting corporation (abc)."on behalf of the other 700 people there who had paid to see the gig i politely told her the baby couldn't stay" he told the national broadcaster.barker said at the time he couldn't see that the mother was breastfeeding due to the theater's bright lights and dismissed criticism that his decision was related to anything other than noise."i have nothing against babies number one the breastfeeding thing is a non-issue it should be inadmissible and i had no idea if she 
was breastfeeding or not because i was on a lit stage" he told cnn affiliate nine news."all i could see was a woman likely holding a baby the breastfeeding was never part of it. if it were the father i would have acted the exact same way it had to do with the baby making noise."it was purely an audio issue it had nothing to do with her being a mom i have nothing against moms."faranda told multiple local media outlets that she initially thought it was just a joke when barker asked her and her infant daughter to leave his show.faranda told cnn affiliate seven news that while barker was in the middle of his performance he stopped and said "is there a baby here?"he then said "'i speak fluent baby and it said take me outside'" faranda recalled adding she had laughed along not knowing whether he was being serious.faranda said her baby "wasn't yelling" and she started to breastfeed her to try and calm the infant but by then she was also packing up to leave the fourth row of the theater."i didn't want to ruin anyone's night it was never my intention to go and disrupt people or create a scene" she told seven news."he was intimidating and he was standing right in front of me" she told 3aw. 
a witness who spoke with nine news said a few people in the crowd heckled the mother to leave.faranda said she attended the show with a group of friends and about a dozen others all women who were mothers or grandmothers as well as one "lovely gentleman" walked out in solidarity with her.the incident has sparked sympathy for the mother who claimed she was just trying to have a fun night out while others argued it was basic manners for audience members no matter how young to avoid distracting performers.when asked on 3aw if she would go to one of barker's gigs again faranda said "no and the sad bit is i've been to lots of his shows before children and you kind of lose yourself a bit when you have kids and i was just trying to get back to something i enjoyed before i had kids."australian politician ellen sandell said she was "livid" when she heard about the incident."it's hard enough for new mums to participate in society with all the barriers put in front of them to be humiliated like this for just trying to enjoy the comedy festival is awful" she said on x.to online critics who say the baby should have been left at home sandell said "women have a right to participate in society while breastfeeding.""a note to men who don't get it when a baby's breastfeeding you literally have to be attached to them most of time can't be separated for more than an hour or so" she added. "so if you don't allow breastfed babies in public places you're actually saying womenmums aren't allowed in those places."one user jumped in the conversation on x saying "i am a mother and grandmother and i consider it complete arrogance to think your rights to bring a noisy child to a show outstrip the rights of hundreds of others that have paid to see a show."in a post on facebook barker thanked his supporters for agreeing with his decision with several people commenting below that the mother had acted entitled."in full support! 
this has nothing to do with breast feeding it's about a comedy act for adults. it's not a wiggles concert! what on earth was she thinking?" said one of his supporters on facebook.the melbourne international comedy festival in which barker was part of the lineup said "any interaction between performers and their audiences requires sensitivity and respect.""in our festival managed venues babes in arms are generally allowed but we do ask people to sit up the back with their child so they can quickly and easily leave if the baby gets noisy so as not to disturb the artist and other patrons" the statement said.
"""

In [56]:
txt

'\nwhat should a comedian do when a baby cradled in its mother\'s arms starts making noises in the middle of a set?american comedian arj barker was faced with that awkward situation during a show in australia on saturday night at the melbourne international comedy festival.in the middle of his hour-long gig the comedian known for his observational and satirical style made the decision to eject the mother and her 7-month-old baby prompting some in the audience to leave the show in solidarity with the mother. other patrons reportedly heckled her as she left.the interaction sparked fierce debate in australia about the rights of mothers to take their babies wherever they like and an entertainer\'s right to perform without interruptions from infants who some argue should be left at home.barker has repeatedly defended his decision to ask trish faranda and her baby clara to leave the event which specified a minimum age of 15 for attendees.speaking to melbourne radio station 3aw monday barker 

In [57]:
sum_t5 = pipe_t5(txt,**gen_kwargs)

In [58]:
len(txt.split())

1007

In [59]:
# Let the tokenizer truncate to the model's input limit instead of cutting the
# text at a hand-tuned word count (word count does not bound token count).
sum_bart = pipe_bart(txt, truncation=True, **gen_kwargs)

In [60]:
sum_t5[0]['summary_text']

'american comedian arj barker was performing at the melbourne international comedy festival . he asked a mother and her 7-month-old baby to leave the show . the incident sparked fierce debate in australia about the rights of mothers to take their babies wherever they like .'

In [61]:
sum_bart[0]['summary_text']

"american comedian arj barker made the decision to eject the mother and her 7-month-old baby . other patrons reportedly heckled her as she left . the interaction sparked fierce debate in australia about the rights of mothers to take their babies wherever they like and an entertainer's right to perform without interruptions ."