Imports

In [1]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd 
import numpy as np
model_name = "google/pegasus-xsum"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('articles.csv')

In [3]:
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [4]:
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading (…)"pytorch_model.bin";: 100%|██████████| 2.28G/2.28G [12:00<00:00, 3.16MB/s]
Downloading (…)neration_config.json: 100%|██████████| 259/259 [00:00<00:00, 68.3kB/s]


In [6]:
df.head()

Unnamed: 0,title,link,published,text,text_length
0,"Chevron, Comcast, and 10 More Companies That R...",https://www.barrons.com/articles/chevron-comca...,2023-01-28,,0
1,This janitor in Vermont amassed an $8M fortune...,https://finance.yahoo.com/news/janitor-vermont...,2023-01-28,warren buffett is reported to have once said ...,4369
2,New 'PELOSI' bill may have a spiteful name — b...,https://finance.yahoo.com/news/american-people...,2023-01-28,in a mic drop moment in the debate around insi...,5610
3,Lucid stock soars amid Saudi buyout speculation,https://finance.yahoo.com/news/lucid-stock-soa...,2023-01-28,lucid lcid shares soared as much as on f...,2198
4,Billionaire Adani's Empire Loses $51 Billion i...,https://www.thestreet.com/technology/billionai...,2023-01-28,suspicion is creating chaos in the empire buil...,6857


In [7]:
# drop nan values
df = df.dropna()
df.head()

Unnamed: 0,title,link,published,text,text_length
1,This janitor in Vermont amassed an $8M fortune...,https://finance.yahoo.com/news/janitor-vermont...,2023-01-28,warren buffett is reported to have once said ...,4369
2,New 'PELOSI' bill may have a spiteful name — b...,https://finance.yahoo.com/news/american-people...,2023-01-28,in a mic drop moment in the debate around insi...,5610
3,Lucid stock soars amid Saudi buyout speculation,https://finance.yahoo.com/news/lucid-stock-soa...,2023-01-28,lucid lcid shares soared as much as on f...,2198
4,Billionaire Adani's Empire Loses $51 Billion i...,https://www.thestreet.com/technology/billionai...,2023-01-28,suspicion is creating chaos in the empire buil...,6857
7,J.P. Morgan Says Now Could Be a Good Time to B...,https://finance.yahoo.com/news/j-p-morgan-says...,2023-01-28,in todays digital world there will always be ...,7355


In [9]:
# for each text in the dataframe, generate a summary
summaries = []
for text in df['text']:
    tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    translated = model.generate(**tokens)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    summaries.append(tgt_text[0])

# add the summaries to the dataframe
df['summary'] = summaries
df.head()



Unnamed: 0,title,link,published,text,text_length,summary
1,This janitor in Vermont amassed an $8M fortune...,https://finance.yahoo.com/news/janitor-vermont...,2023-01-28,warren buffett is reported to have once said ...,4369,If you want to be ultra rich you don t have to...
2,New 'PELOSI' bill may have a spiteful name — b...,https://finance.yahoo.com/news/american-people...,2023-01-28,in a mic drop moment in the debate around insi...,5610,The latest attempt to stop members of congress...
3,Lucid stock soars amid Saudi buyout speculation,https://finance.yahoo.com/news/lucid-stock-soa...,2023-01-28,lucid lcid shares soared as much as on f...,2198,Shares in electric car maker lucid have been o...
4,Billionaire Adani's Empire Loses $51 Billion i...,https://www.thestreet.com/technology/billionai...,2023-01-28,suspicion is creating chaos in the empire buil...,6857,Billionaire gautam adani has been caught up in...
7,J.P. Morgan Says Now Could Be a Good Time to B...,https://finance.yahoo.com/news/j-p-morgan-says...,2023-01-28,in todays digital world there will always be ...,7355,cybersecurity has become a major priority for ...


In [10]:
# create summarized_length column
df['summarized_length'] = df['summary'].apply(lambda x: len(x.split()))
df.head()

Unnamed: 0,title,link,published,text,text_length,summary,summarized_length
1,This janitor in Vermont amassed an $8M fortune...,https://finance.yahoo.com/news/janitor-vermont...,2023-01-28,warren buffett is reported to have once said ...,4369,If you want to be ultra rich you don t have to...,58
2,New 'PELOSI' bill may have a spiteful name — b...,https://finance.yahoo.com/news/american-people...,2023-01-28,in a mic drop moment in the debate around insi...,5610,The latest attempt to stop members of congress...,26
3,Lucid stock soars amid Saudi buyout speculation,https://finance.yahoo.com/news/lucid-stock-soa...,2023-01-28,lucid lcid shares soared as much as on f...,2198,Shares in electric car maker lucid have been o...,17
4,Billionaire Adani's Empire Loses $51 Billion i...,https://www.thestreet.com/technology/billionai...,2023-01-28,suspicion is creating chaos in the empire buil...,6857,Billionaire gautam adani has been caught up in...,54
7,J.P. Morgan Says Now Could Be a Good Time to B...,https://finance.yahoo.com/news/j-p-morgan-says...,2023-01-28,in todays digital world there will always be ...,7355,cybersecurity has become a major priority for ...,18


In [11]:
print("Average length of articles: ", df['text_length'].mean())
print("Average length of summaries: ", df['summarized_length'].mean())


Average length of articles:  4865.363636363636
Average length of summaries:  31.181818181818183


In [12]:
# save the dataframe to a csv file
df.to_csv('articles_summarized.csv', index=False)

In [15]:
df.iloc[1]['summary']

'The latest attempt to stop members of congress from using their position to get rich on the stock market has been introduced in the US Senate.'