# Extraktiver Transformer

In [82]:
import pandas as pd

df = pd.read_csv("drive_download/summarization_data_newsroom.csv")
df = df.drop(columns=['density_bin'])
df

Unnamed: 0,text,summary
0,By MATT SCHWARTZ in Houston and WENDELL JAMIES...,"Bleeding from a massive chest wound, Tejano st..."
1,"By HOLLY RAMER, Associated Press\n\nCONCORD, N...","By HOLLY RAMER, Associated Press CONCORD, N.H...."
2,Men in battle-fatigues have raided the Moscow ...,Men in battle-fatigues have&nbsp;raided the Mo...
3,"Like all gadgets, cellphones can break. In fac...",Simple home remedies for repairing your mobile...
4,"I visited Saudi Arabia in September 2008, arri...","Hurricane Ike, Ramadan and the billionaire pri..."
...,...,...
108857,"Ahumdinger TV season wrapped Wednesday night, ...","Ahumdinger TV season wrapped Wednesday night, ..."
108858,A Senate panel reached bipartisan agreement on...,A Senate panel reached bipartisan agreement on...
108859,In a dramatic break with the ideological warfa...,"Fragile bipartisan compromise, at best, has pr..."
108860,"In 1967, LaDonna Davis's boyfriend went on a t...",Get style news headlines from The Washington P...


In [83]:
from summarizer import Summarizer
model = Summarizer('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [84]:
def zsm_df(txt):
    zsm_txt = model(txt, ratio=0.6)
    return zsm_txt

In [85]:
df = df[:500]
df

Unnamed: 0,text,summary
0,By MATT SCHWARTZ in Houston and WENDELL JAMIES...,"Bleeding from a massive chest wound, Tejano st..."
1,"By HOLLY RAMER, Associated Press\n\nCONCORD, N...","By HOLLY RAMER, Associated Press CONCORD, N.H...."
2,Men in battle-fatigues have raided the Moscow ...,Men in battle-fatigues have&nbsp;raided the Mo...
3,"Like all gadgets, cellphones can break. In fac...",Simple home remedies for repairing your mobile...
4,"I visited Saudi Arabia in September 2008, arri...","Hurricane Ike, Ramadan and the billionaire pri..."
...,...,...
495,"Starbucks doesn’t offer bank accounts, but peo...",And the amount has doubled rapidly.
496,"It was 1350º, 1665º, 2960º, and 1270º in the g...",He showed he could take the heat
497,"Right now, Joshua has the perfect resumé: Lond...",The warrior in Anthony Joshua is focused on on...
498,"Halle Berry, Gabriel Aubry and Nahla in London...",Berry and Aubry are co-parenting their daughte...


In [86]:
df["zsm"] = df.apply(lambda row : zsm_df(row["text"]), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["zsm"] = df.apply(lambda row : zsm_df(row["text"]), axis = 1)


In [87]:
df = df.drop(index=df[df["zsm"]==""].index).reset_index(drop=True)

In [88]:
df

Unnamed: 0,text,summary,zsm
0,By MATT SCHWARTZ in Houston and WENDELL JAMIES...,"Bleeding from a massive chest wound, Tejano st...",By MATT SCHWARTZ in Houston and WENDELL JAMIES...
1,"By HOLLY RAMER, Associated Press\n\nCONCORD, N...","By HOLLY RAMER, Associated Press CONCORD, N.H....","By HOLLY RAMER, Associated Press\n\nCONCORD, N..."
2,Men in battle-fatigues have raided the Moscow ...,Men in battle-fatigues have&nbsp;raided the Mo...,Men in battle-fatigues have raided the Moscow ...
3,"Like all gadgets, cellphones can break. In fac...",Simple home remedies for repairing your mobile...,"In fact, our habit of carrying our phones cons..."
4,"I visited Saudi Arabia in September 2008, arri...","Hurricane Ike, Ramadan and the billionaire pri...","I visited Saudi Arabia in September 2008, arri..."
...,...,...,...
495,"Starbucks doesn’t offer bank accounts, but peo...",And the amount has doubled rapidly.,"Starbucks doesn’t offer bank accounts, but peo..."
496,"It was 1350º, 1665º, 2960º, and 1270º in the g...",He showed he could take the heat,"It was 1350º, 1665º, 2960º, and 1270º in the g..."
497,"Right now, Joshua has the perfect resumé: Lond...",The warrior in Anthony Joshua is focused on on...,"Right now, Joshua has the perfect resumé: Lond..."
498,"Halle Berry, Gabriel Aubry and Nahla in London...",Berry and Aubry are co-parenting their daughte...,"Halle Berry, Gabriel Aubry and Nahla in London..."


In [89]:
import evaluate

rouge = evaluate.load('rouge')
results = rouge.compute(predictions=df["zsm"],
                        references=df["summary"])
print(results)

{'rouge1': 0.17833193557790705, 'rouge2': 0.11801871710245598, 'rougeL': 0.15161806024084706, 'rougeLsum': 0.1554151438424187}


---
# Abstraktiver Transformer

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
tokenizer = AutoTokenizer.from_pretrained('t5-base')

2023-07-26 16:52:20.478419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
text = """New York (CNN Business)Netflix is synonymous with streaming, but its competitors have a distinct advantage that threatens the streaming leader's position at the top.
Disney has Disney+, but it also has theme parks, plush Baby Yoda dolls, blockbuster Marvel movies and ESPN. Comcast (CMCSA), Amazon (AMZN), ViacomCBS (VIACA), CNN's parent company WarnerMedia and Apple (AAPL) all have their own streaming services, too, but they also have other forms of revenue.
As for Netflix (NFLX), its revenue driver is based entirely on building its subscriber base. It's worked out well for the company — so far. But it's starting to look like the king of streaming will soon need something other than new subscribers to keep growing.
The streaming service reported Tuesday it now has 208 million subscribers globally, after adding 4 million subscribers in the first quarter of 2021. But that number missed expectations and the forecasts for its next quarter were also pretty weak.
That was a big whiff for Netflix — a company coming off a massive year of growth thanks in large part to the pandemic driving people indoors — and Wall Street's reaction has not been great.
The company's stock dropped as much as 8% on Wednesday, leading some to wonder what the future of the streamer looks like if competition continues to gain strength, people start heading outdoors and if, most importantly, its growth slows.
"If you hit a wall with [subscriptions] then you pretty much don't have a super growth strategy anymore in your most developed markets," Michael Nathanson, a media analyst and founding partner at MoffettNathanson, told CNN Business. "What can they do to take even more revenue out of the market, above and beyond streaming revenues?"
Or put another way, the company's lackluster user growth last quarter is a signal that it wouldn't hurt if Netflix — a company that's lived and died with its subscriber numbers — started thinking about other ways to make money.
An ad-supported Netflix? Not so fast
There are ways for Netflix to make money other than raising prices or adding subscribers. The most obvious: selling advertising.
Netflix could have 30-second commercials on their programming or get sponsors for their biggest series and films. TV has worked that way forever, why not Netflix?
That's probably not going to happen, given that CEO Reed Hastings has been vocal about the unlikelihood of an ad-supported Netflix service. His reasoning: It doesn't make business sense.
"It's a judgment call... It's a belief we can build a better business, a more valuable business [without advertising]," Hastings told Variety in September. "You know, advertising looks easy until you get in it. Then you realize you have to rip that revenue away from other places because the total ad market isn't growing, and in fact right now it's shrinking. It's hand-to-hand combat to get people to spend less on, you know, ABC and to spend more on Netflix."
Hastings added that "there's much more growth in the consumer market than there is in advertising, which is pretty flat."
He's also expressed doubts about Netflix getting into live sports or news, which could boost the service's allure to subscribers, so that's likely out, too, at least for now.
So if Netflix is looking for other forms of near-term revenue to help support its hefty content budget ($17 billion in 2021 alone) then what can it do? There is one place that could be a revenue driver for Netflix, but if you're borrowing your mother's account you won't like it.
Netflix could crack down on password sharing — a move that the company has been considering lately.
"Basically you're going to clean up some subscribers that are free riders," Nathanson said. "That's going to help them get to a higher level of penetration, definitely, but not in long-term."
Lackluster growth is still growth
Missing projections is never good, but it's hardly the end of the world for Netflix. The company remains the market leader and most competitors are still far from taking the company on. And while Netflix's first-quarter subscriber growth wasn't great, and its forecasts for the next quarter alarmed investors, it was just one quarter.
Netflix has had subscriber misses before and it's still the most dominant name in all of streaming, and even lackluster growth is still growth. It's not as if people are canceling Netflix in droves.
Asked about Netflix's "second act" during the company's post-earnings call on Tuesday, Hastings again placed the company's focus on pleasing subscribers.
"We do want to expand. We used to do that thing shipping DVDs, and luckily we didn't get stuck with that. We didn't define that as the main thing. We define entertainment as the main thing," Hastings said.
He added that he doesn't think Netflix will have a second act in the way Amazon has had with Amazon shopping and Amazon Web Services. Rather, Netflix will continue to improve and grow on what it already does best.
"I'll bet we end with one hopefully gigantic, hopefully defensible profit pool, and continue to improve the service for our members," he said. "I wouldn't look for any large secondary pool of profits. There will be a bunch of supporting pools, like consumer products, that can be both profitable and can support the title brands."""

In [4]:
print(tokenizer.model_max_length)

512


In [5]:
import math
mole = tokenizer.model_max_length
x = math.ceil(len(text.split())/mole)

In [6]:
compression = 0.6

ftextlen = math.ceil((1-compression)*len(text.split()))

In [8]:
textlist = []
for i in range(1,x+1):
    txt = " ".join(str(x) for x in text.split()[mole*(i-1):mole*i])
    tokens_input = tokenizer.encode("summarize: " + txt,
                                return_tensors='pt',
                                max_length=mole,
                                )
    
    wahrs = round(len(txt.split())/len(text.split()),2)
    print(f"Min Length:{math.ceil((wahrs*ftextlen)-(wahrs*ftextlen)*0.05)}, Max length:{math.ceil((wahrs*ftextlen)+(wahrs*ftextlen)*0.05)}")
    summary_ids = model.generate(tokens_input,  min_length=math.ceil((wahrs*ftextlen)-(wahrs*ftextlen)*0.05), max_length=math.ceil((wahrs*ftextlen)+(wahrs*ftextlen)*0.05),num_beams=2)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(len(summary.split()))
    textlist.append(summary)
finaltxt = " ".join(str(x) for x in textlist)

Min Length:437, Max length:483


ValueError: The following `model_kwargs` are not used by the model: ['truncation'] (note: typos in the generate arguments will also show up in this list)

In [None]:
len(finaltxt.split())

531

In [None]:
ftextlen

792

In [None]:
len(text.split())

879

---

# Summarizer + Paraphrasing

In [90]:
from transformers import pipeline
from summarizer import Summarizer
import nltk
nltk.download('punkt')

from nltk import tokenize

def summarizer(txt,r,Summodel,pipe):
    zwstxt = Summodel(txt, ratio=r)

    zwstxtl = tokenize.sent_tokenize(zwstxt)

    output = " ".join([pipe("paraphrase:"+ sent)[0]["generated_text"] for sent in zwstxtl])
    
    return output


[nltk_data] Downloading package punkt to /home/rechcons/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [91]:
import pandas as pd

df = pd.read_csv("drive_download/summarization_data_newsroom.csv")
df = df.drop(columns=['density_bin'])
df = df[:500]
df

Unnamed: 0,text,summary
0,By MATT SCHWARTZ in Houston and WENDELL JAMIES...,"Bleeding from a massive chest wound, Tejano st..."
1,"By HOLLY RAMER, Associated Press\n\nCONCORD, N...","By HOLLY RAMER, Associated Press CONCORD, N.H...."
2,Men in battle-fatigues have raided the Moscow ...,Men in battle-fatigues have&nbsp;raided the Mo...
3,"Like all gadgets, cellphones can break. In fac...",Simple home remedies for repairing your mobile...
4,"I visited Saudi Arabia in September 2008, arri...","Hurricane Ike, Ramadan and the billionaire pri..."
...,...,...
495,"Starbucks doesn’t offer bank accounts, but peo...",And the amount has doubled rapidly.
496,"It was 1350º, 1665º, 2960º, and 1270º in the g...",He showed he could take the heat
497,"Right now, Joshua has the perfect resumé: Lond...",The warrior in Anthony Joshua is focused on on...
498,"Halle Berry, Gabriel Aubry and Nahla in London...",Berry and Aubry are co-parenting their daughte...


In [92]:
Summodel = Summarizer('distilbert-base-uncased')
model_name = "google/flan-t5-base"

pipe = pipeline("text2text-generation",model= model_name)

df["zsm"] = df.apply(lambda row : summarizer(row["text"],0.6,Summodel,pipe), axis = 1)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [93]:
df = df.drop(index=df[df["zsm"]==""].index).reset_index(drop=True)

In [94]:
df

Unnamed: 0,text,summary,zsm
0,By MATT SCHWARTZ in Houston and WENDELL JAMIES...,"Bleeding from a massive chest wound, Tejano st...","Bleeding from a massive chest wound, Tejano st..."
1,"By HOLLY RAMER, Associated Press\n\nCONCORD, N...","By HOLLY RAMER, Associated Press CONCORD, N.H....","CONCORD, N.H. -- A sick American engineer who ..."
2,Men in battle-fatigues have raided the Moscow ...,Men in battle-fatigues have&nbsp;raided the Mo...,men in battle-fatigues raided the headquarters...
3,"Like all gadgets, cellphones can break. In fac...",Simple home remedies for repairing your mobile...,our habit of carrying our phones constantly--e...
4,"I visited Saudi Arabia in September 2008, arri...","Hurricane Ike, Ramadan and the billionaire pri...","I visited Saudi Arabia in September 2008, arri..."
...,...,...,...
495,"Starbucks doesn’t offer bank accounts, but peo...",And the amount has doubled rapidly.,"Starbucks is a coffee chain, but it does offer..."
496,"It was 1350º, 1665º, 2960º, and 1270º in the g...",He showed he could take the heat,"It was 1350o, 1665o, 2960o, and 1270o in Now, ..."
497,"Right now, Joshua has the perfect resumé: Lond...",The warrior in Anthony Joshua is focused on on...,Joshua has the perfect resumé: London 2012 Oly...
498,"Halle Berry, Gabriel Aubry and Nahla in London...",Berry and Aubry are co-parenting their daughte...,Halle Berry and her ex Gabriel Aubry are repor...


In [95]:
import evaluate

rouge = evaluate.load('rouge')
results = rouge.compute(predictions=df["zsm"],
                        references=df["summary"])
print(results)

{'rouge1': 0.19431937791423598, 'rouge2': 0.08712874610235541, 'rougeL': 0.14453400582566894, 'rougeLsum': 0.14502872371943992}
