In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Using device: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU. Using CPU.")

GPU is available!
Using device: NVIDIA GeForce GTX 1650 Ti


# 1. Loading the Data

In [2]:
fake_news = pd.read_csv(r"C:\Users\neupa\OneDrive\Desktop\data606_capstone_teamC\Data\fake_news_1\News _dataset\Fake.csv")
real_news = pd.read_csv(r"C:\Users\neupa\OneDrive\Desktop\data606_capstone_teamC\Data\fake_news_1\News _dataset\True.csv")

In [3]:
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
fake_news.iloc[0]["text"]

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

In [5]:
real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
fake_news['label'] = 1
real_news['label'] = 0

df = pd.concat([fake_news, real_news], ignore_index=True)
del fake_news, real_news

In [7]:
df.reset_index(drop=True, inplace=True)

In [8]:
df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


# 2. Tokenization

We use the [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) embedding model from huggingface. 

This embedding model is relatively lightweight, one of the most popular ones, and converts an input text (max 512 tokens) into a 384 dimensional dense vector embedding. 


The produced embedding can be used for checking sentence similarity, clustering, information retrieval, or text classification by passing in the embeddings as input features.

In [13]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [10]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
df.isnull().sum()   

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [12]:
all_title = df["title"].fillna("").values
all_text = df["text"].fillna("").values

In [14]:
total_title_tokens = []

for current_title in tqdm(all_title):
    encoded = tokenizer.encode(current_title, return_tensors='pt')
    total_title_tokens.append(len(encoded[0]))

100%|██████████| 44898/44898 [00:08<00:00, 5291.99it/s]


In [15]:
max(total_title_tokens)

63

This is good. So, title is within the 512 input token limit for our embedding model. Lets check for text.

In [16]:
total_text_tokens = []

for current_text in tqdm(all_text):
    encoded = tokenizer.encode(current_text, return_tensors='pt')
    total_text_tokens.append(len(encoded[0]))

  0%|          | 0/44898 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (730 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 44898/44898 [01:25<00:00, 522.57it/s] 


In [17]:
max(total_text_tokens)

14285

In [25]:
largest = sorted(set(total_text_tokens), reverse=True)

In [26]:
largest[0], largest[1], largest[2]

(14285, 10292, 9746)

In [27]:
total_text_token_np = np.array(total_text_tokens)

In [28]:
total_text_token_np[total_text_token_np > 512].shape

(17854,)

Not good! Looks like we need to summarize this again.

In [29]:
available_tokens_for_text_summarization = []
summarize_text = []

for text_token_count, title_token_count in tqdm(zip(total_text_tokens, total_title_tokens)):
    if text_token_count + title_token_count <= 512:
        perform_summarization = False
    else:
        perform_summarization = True


    available_tokens_for_text = 512 - title_token_count
    available_tokens_for_text_summarization.append(available_tokens_for_text)
    summarize_text.append(perform_summarization)

44898it [00:00, 1446914.03it/s]


In [30]:
true_indices = [idx for idx, val in enumerate(summarize_text) if val]
all_text_true = [all_text[idx] for idx in true_indices]
max_tokens = [available_tokens_for_text_summarization[idx] for idx in true_indices]

Now we have all the big texts, and max tokens available to summarize them. Lets do that using Groq.

I will use llama-3.1-8b-instant from Groq to do the summarization

In [42]:
def return_prompt(
    title: str,
    text: str,
    title_token_count: int,
    text_token_count: int,
    available_tokens_for_summarization: int,
    max_tokens: int = 512
):
    """
    Generates a prompt to summarize or rewrite the input news text to fit within
    the token limit of an embedding model, preserving key information.
    """
    prompt = f"""Input details:
    
    Title: {title}\n
    Title Token Count: {title_token_count}\n
    Original Text: {text}\n
    Original Text Token Count: {text_token_count}\n
    Max Allowed Total Tokens (Title + Text): {max_tokens}\n
    Available Tokens for Text Summary: {available_tokens_for_summarization}\n
    
    Now rewrite the article text only (not the title) within the available token budget. Your output should be clean, self-contained, and informative. 
    Do not add any information that is not present in the original news text or do not censor the text.
    Your output must be exactly text summary and nothing else. You are also not allowed to water-down or censor the text as the text may contain sensitive information related 
    to politics, race or religion. Your job is to just summarize the text and not censor/water-down the text in any way shape or form.
    The summary should be concise, informative, and maintain the original meaning.
    """
    return prompt


In [43]:
labels = df['label'].values

In [44]:
''' 
This loop is used to find the first text that requires summarization.
It will break once it finds a text that needs summarization.
'''
for i in range(len(summarize_text)):
    if summarize_text[i]:
        current_available_tokens = available_tokens_for_text_summarization[i]
        current_title = all_title[i]
        current_text = all_text[i]
        current_title_token_count = total_title_tokens[i]
        current_text_token_count = total_text_tokens[i]
        prompt = return_prompt(
            current_title,
            current_text,
            current_title_token_count,
            current_text_token_count,
            current_available_tokens,
        )
        
        break

In [45]:
print(labels[i])

1


In [46]:
prompts = []
maximum_allocated_tokens = []

for i in range(0, len(all_text)):
    if summarize_text[i]:
        current_available_tokens = available_tokens_for_text_summarization[i]
        current_title = all_title[i]
        current_text = all_text[i]
        current_title_token_count = total_title_tokens[i]
        current_text_token_count = total_text_tokens[i]
        prompts.append(
                return_prompt(
                current_title,
                current_text,
                current_title_token_count,
                current_text_token_count,
                current_available_tokens,
            )
        )
        maximum_allocated_tokens.append(current_available_tokens)

In [47]:
len(summarize_text)

44898

In [48]:
print(prompt)

Input details:

    Title:  Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing

    Title Token Count: 17

    Original Text: Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as y

In [49]:
len(tokenizer.encode(prompt, return_tensors='pt')[0])

921

Hmmm.. our prompt iself is ~921 tokens. So, hopefully, our model will not hallucinate and give us proper summary.

In [50]:
from groq import Groq

llm_api_key = os.getenv("LLM_API_KEY")
client = Groq(api_key=llm_api_key)

In [51]:
system_prompt = '''You are an expert summarizer helping a user prepare input for a sentence embedding model with a strict 512-token input limit.
The user will always include the full **title** of the news article. Your job is to **rewrite or summarize the news article text only**, using no more than the available tokens provided. The goal is to preserve **all important meaning** from the article without exceeding the token budget.

You must:
- Leave the title unchanged
- Output **only the rewritten article text**
- Not include anything non-relevant stuff in your response

You will be provided Input details.
'''

In [52]:
def run_groq_summary(prompt_text, max_tokens, model="llama3-70b-8192"):
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content":system_prompt},
            {"role": "user", "content": prompt_text}
        ],
        model=model,
        temperature=0.1,
        top_p=1,
        max_completion_tokens=max_tokens # this is the maximum number of tokens possible (title tokens + max_tokens = 512)
    )
    return response.choices[0].message.content

In [53]:
print(current_text)

BRUSSELS (Reuters) - NATO allies on Tuesday welcomed President Donald Trump s decision to commit more forces to Afghanistan, as part of a new U.S. strategy he said would require more troops and funding from America s partners. Having run for the White House last year on a pledge to withdraw swiftly from Afghanistan, Trump reversed course on Monday and promised a stepped-up military campaign against  Taliban insurgents, saying:  Our troops will fight to win .  U.S. officials said he had signed off on plans to send about 4,000 more U.S. troops to add to the roughly 8,400 now deployed in Afghanistan. But his speech did not define benchmarks for successfully ending the war that began with the U.S.-led invasion of Afghanistan in 2001, and which he acknowledged had required an   extraordinary sacrifice of blood and treasure .  We will ask our NATO allies and global partners to support our new strategy, with additional troops and funding increases in line with our own. We are confident they w

In [54]:
print(prompt)

Input details:

    Title:  Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing

    Title Token Count: 17

    Original Text: Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as y

In [55]:
%%time

output = run_groq_summary(prompt, current_available_tokens)

CPU times: total: 93.8 ms
Wall time: 1.15 s


In [56]:
print(output)

Donald Trump's New Year's Eve message sparked outrage as he wished a Happy New Year to his "enemies, haters, and the very dishonest Fake News Media." The tweet was met with criticism, with many calling it "despicable, petty, and infantile." Trump's lack of decency was on full display as he couldn't even rise above his grievances to wish Americans a happy new year. This is not a new behavior for Trump, who has a history of directing messages to his "enemies" and "haters" on various holidays. His un-presidential tweets have become a hallmark of his presidency.


<b> Original </b>

Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t even allow him to rise above the gutter long enough to wish the American citizens a happy new year!  Bishop Talbert Swan (@TalbertSwan) December 31, 2017no one likes you  Calvin (@calvinstowell) December 31, 2017Your impeachment would make 2018 a great year for America, but I ll also accept regaining control of Congress.  Miranda Yaver (@mirandayaver) December 31, 2017Do you hear yourself talk? When you have to include that many people that hate you you have to wonder? Why do the they all hate me?  Alan Sandoval (@AlanSandoval13) December 31, 2017Who uses the word Haters in a New Years wish??  Marlene (@marlene399) December 31, 2017You can t just say happy new year?  Koren pollitt (@Korencarpenter) December 31, 2017Here s Trump s New Year s Eve tweet from 2016.Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love!  Donald J. Trump (@realDonaldTrump) December 31, 2016This is nothing new for Trump. He s been doing this for years.Trump has directed messages to his  enemies  and  haters  for New Year s, Easter, Thanksgiving, and the anniversary of 9/11. pic.twitter.com/4FPAe2KypA  Daniel Dale (@ddale8) December 31, 2017Trump s holiday tweets are clearly not presidential.How long did he work at Hallmark before becoming President?  Steven Goodine (@SGoodine) December 31, 2017He s always been like this . . . the only difference is that in the last few years, his filter has been breaking down.  Roy Schulze (@thbthttt) December 31, 2017Who, apart from a teenager uses the term haters?  Wendy (@WendyWhistles) December 31, 2017he s a fucking 5 year old  Who Knows (@rainyday80) December 31, 2017So, to all the people who voted for this a hole thinking he would change once he got into power, you were wrong! 70-year-old men don t change and now he s a year older.Photo by Andrew Burton/Getty Images.


<b> Summary </b>

Donald Trump's New Year's Eve message sparked outrage as he wished a Happy New Year to his "enemies, haters, and the very dishonest Fake News Media." The tweet was met with criticism, with many calling it "despicable, petty, and infantile." Trump's lack of decency was on full display as he couldn't even rise above his grievances to wish Americans a happy new year. This is not a new behavior for Trump, who has a history of directing messages to his "enemies" and "haters" on various holidays. His un-presidential tweets have become a hallmark of his presidency.

In [57]:
print("The token count of original text of the news article: ", len(tokenizer.encode(current_text, return_tensors='pt')[0]))
print("The token count of new summarized text of the news article: ", len(tokenizer.encode(output, return_tensors='pt')[0]))

The token count of original text of the news article:  559
The token count of new summarized text of the news article:  141


In [59]:
print("The token count of original title of the news article: ", total_title_tokens[-1])

The token count of original title of the news article:  14


In [60]:
total_title_tokens[-1] + len(tokenizer.encode(output, return_tensors='pt')[0])

155

<b> Which is less than 512.  Perfect !!!

In [61]:
llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

def get_token_count_for_prompt(prompt_text):
    """
    Returns the token count for a given prompt text.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt_text},
    ]

    prompt = llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded = llm_tokenizer.encode(prompt, return_tensors='pt')
    return len(encoded[0])

In [62]:
texts = []
labels_for_text_summarization = []
title_tokens= []
text_tokens = []
new_title = []
new_text = []
prompt_token_count = []

for i in tqdm(range(0, len(all_text))):
    current_title = all_title[i]
    current_text = all_text[i]
    current_label = labels[i]
    combined_text = f"{current_title}. {current_text}"

    if summarize_text[i]:
        current_available_tokens = available_tokens_for_text_summarization[i]
        current_title_token_count = total_title_tokens[i]
        current_text_token_count = total_text_tokens[i]
        current_prompt = return_prompt(
            current_title,
            current_text,
            current_title_token_count,
            current_text_token_count,
            current_available_tokens,
        )
        

        texts.append(current_prompt)
        new_title.append(current_title)
        new_text.append(current_text)
        title_tokens.append(current_title_token_count)
        text_tokens.append(current_text_token_count)
        prompt_token_count.append(get_token_count_for_prompt(current_prompt))        
        labels_for_text_summarization.append(current_label)

100%|██████████| 44898/44898 [01:30<00:00, 494.94it/s] 


In [68]:
df_for_summarization = pd.DataFrame({
    'title': new_title, 
    'text': new_text,
    'label': labels_for_text_summarization,
    "prompt": texts,
    "title_tokens": title_tokens,
    "text_tokens": text_tokens,
    "prompt_token_count": prompt_token_count
})

In [69]:
df_for_summarization

Unnamed: 0,title,text,label,prompt,title_tokens,text_tokens,prompt_token_count
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,Input details:\n\n Title: Donald Trump Sen...,17,730,1109
1,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,Input details:\n\n Title: Sheriff David Cl...,19,930,1299
2,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1,Input details:\n\n Title: Trump Is So Obse...,20,706,1050
3,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,1,Input details:\n\n Title: Former CIA Direc...,25,574,924
4,Heiress To Disney Empire Knows GOP Scammed Us...,Abigail Disney is an heiress with brass ovarie...,1,Input details:\n\n Title: Heiress To Disne...,20,630,989
...,...,...,...,...,...,...,...
19243,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,0,"Input details:\n\n Title: U.S., North Korea...",19,889,1237
19244,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,0,"Input details:\n\n Title: U.S., North Korea...",19,889,1237
19245,"U.S., North Korea clash at U.N. arms forum on ...",GENEVA (Reuters) - North Korea and the United ...,0,"Input details:\n\n Title: U.S., North Korea...",20,810,1160
19246,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,0,Input details:\n\n Title: North Korea shipm...,17,554,906


In [70]:
df_for_summarization["prompt_token_count"].max()

np.int64(10636)

In [66]:
# saving the texts that need summarization to a CSV file which wiill be used later for doing the summarization
pd.DataFrame({
    'text': texts,
    'label': labels_for_text_summarization
}).to_csv("Summary_prompts_FinalTesting.csv", index=False)

In [67]:
pd.DataFrame({
    'text': texts,
    'label': labels_for_text_summarization
})

Unnamed: 0,text,label
0,Input details:\n\n Title: Donald Trump Sen...,1
1,Input details:\n\n Title: Sheriff David Cl...,1
2,Input details:\n\n Title: Trump Is So Obse...,1
3,Input details:\n\n Title: Former CIA Direc...,1
4,Input details:\n\n Title: Heiress To Disne...,1
...,...,...
19243,"Input details:\n\n Title: U.S., North Korea...",0
19244,"Input details:\n\n Title: U.S., North Korea...",0
19245,"Input details:\n\n Title: U.S., North Korea...",0
19246,Input details:\n\n Title: North Korea shipm...,0


<b> We will now proceed to notebook llm_summarization.ipynb which will do the summarization

In [71]:
summarized_df = pd.read_csv("llama3_2_3b_summary_results_FinalTesting.csv")

In [72]:
summarized_df.head()

Unnamed: 0,Prompts,Results
0,<|begin_of_text|><|start_header_id|>system<|en...,Donald Trump wished Americans a Happy New Year...
1,<|begin_of_text|><|start_header_id|>system<|en...,Former Milwaukee Sheriff David Clarke is facin...
2,<|begin_of_text|><|start_header_id|>system<|en...,Donald Trump announced he'd return to work the...
3,<|begin_of_text|><|start_header_id|>system<|en...,Many people have raised alarm about Donald Tru...
4,<|begin_of_text|><|start_header_id|>system<|en...,"Abigail Disney, an heiress to the Disney empir..."


In [73]:
new_title = []
new_text = []
new_label = []
summarized_text = []
title_tokens = []
text_tokens = []

for i, (title, text, label) in enumerate(tqdm(zip(all_title, all_text, labels))):
    if not summarize_text[i]:
        new_title.append(title)
        new_text.append(text)
        new_label.append(label)
        title_tokens.append(total_title_tokens[i])
        text_tokens.append(total_text_tokens[i])
        summarized_text.append(np.nan)

df1 = pd.DataFrame({
    'title': new_title, 
    'text': new_text,
    'label': new_label,
    "title_tokens": title_tokens,
    "text_tokens": text_tokens,
    'text_summary': summarized_text
})

44898it [00:00, 334412.18it/s]


In [74]:
df1

Unnamed: 0,title,text,label,title_tokens,text_tokens,text_summary
0,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,13,387,
1,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,13,477,
2,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,1,18,367,
3,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,1,19,481,
4,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,1,19,454,
...,...,...,...,...,...,...
25645,Headless torso could belong to submarine journ...,COPENHAGEN (Reuters) - Danish police said on T...,0,13,353,
25646,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",0,13,168,
25647,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,0,9,422,
25648,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,0,11,236,


In [75]:
print(df_for_summarization["prompt_token_count"].max()) # this is way too long text; we are excluding any news article with tokens larger than 20000
df_for_summarization = df_for_summarization[df_for_summarization["prompt_token_count"] < 20000]

10636


In [76]:
df_for_summarization = df_for_summarization.reset_index(drop=True)

In [77]:
df_for_summarization.shape, summarized_df.shape

((19248, 7), (19248, 2))

In [80]:
df_for_summarization["text_summary"] = summarized_df['Results']

In [81]:
print(df_for_summarization.sample(1, random_state=42)["text"].values[0])

Establishment Republican pollster Frank Luntz looked more like a CNN host than a Republican pollster in a room full of committed Roy Moore voters. Luntz was obviously frustrated, as Alabama voters dug in their heels, and refused to back down on their support for Judge Roy Moore in today s Senate election to fill AG Jeff Sessions seat in Alabama.MONTGOMERY, Alabama   Frank Luntz, a GOP establishment messaging consultant, was visibly flabbergasted as every single one of his focus group participants in a Birmingham area Vice News-produced panel backed Judge Roy Moore for U.S. Senate. Titled  Why These Alabama Voters Are Sticking By Roy Moore,  Luntz s Vice News focus group aired on Vice News Tonight on Dec. 8 on HBO. Are you all Christians here?  Luntz opens the seven-and-a-half-minute long segment.  Yes,  all of the focus group participants, who joined Luntz in a Birmingham area restaurant, replied. Is Roy Moore a good Christian?  he followed up. Yes,  one woman replied.  Absolutely,  an

<b> Original </b>

Establishment Republican pollster Frank Luntz looked more like a CNN host than a Republican pollster in a room full of committed Roy Moore voters. Luntz was obviously frustrated, as Alabama voters dug in their heels, and refused to back down on their support for Judge Roy Moore in today s Senate election to fill AG Jeff Sessions seat in Alabama.MONTGOMERY, Alabama   Frank Luntz, a GOP establishment messaging consultant, was visibly flabbergasted as every single one of his focus group participants in a Birmingham area Vice News-produced panel backed Judge Roy Moore for U.S. Senate. Titled  Why These Alabama Voters Are Sticking By Roy Moore,  Luntz s Vice News focus group aired on Vice News Tonight on Dec. 8 on HBO. Are you all Christians here?  Luntz opens the seven-and-a-half-minute long segment.  Yes,  all of the focus group participants, who joined Luntz in a Birmingham area restaurant, replied. Is Roy Moore a good Christian?  he followed up. Yes,  one woman replied.  Absolutely,  another said. Absolutely?  Luntz followed up in disbelief.  Yes,  the woman shot back.  Without any doubt whatsoever?  Luntz asked again.After some more back and forth, a man in the focus group spoke up. Scottie Porter, a real estate developer, said:He s not my choice, I m not voting for him because I like him. I m voting for him because I don t want Doug Jones. But Roy Moore is entitled to the presumption of innocence in the law and in the Bible just like anybody else should be. There are only accusations. There have been no charges filed. All you have is a group of women who have come forward. How many? How many?  Luntz pressed Porter. Seven,  he replied. There s really only three,  one woman yelled out. How many women have to come forward before you say  wait a minute, where there s smoke there s fire ?  Luntz asked the group.Chuck Moore, a retired sales consultant, replied:  It s about the legitimacy, not just how many. How many are not being paid? Or being coerced to do this? How many of them do you think are being paid?  Luntz asked the group. All of them,  some replied in unison. By a show of hands, how many of you think all the women are being paid?  Luntz asked the full group.Three hands in the group went up. Seriously?  Luntz asked in disbelief, before the camera turned to homemaker Jane Wade. To me, there are only two women that have a smoking gun but the women s their reputations are questionable at the time,  Wade said. Is this how you want to be treated as a woman if something were to happen to you? Do you want to be dismissed that way?  Luntz asked Gina Doran, a retired school bus driver. You better have proof,  Doran fired back at Luntz.Watch: Breitbart News

In [82]:
print(df_for_summarization.sample(1, random_state=42)["text_summary"].values[0])

Establishment Republican pollster Frank Luntz looked frustrated as Alabama voters dug in their heels, refusing to back down on their support for Judge Roy Moore in the Senate election. Luntz's focus group, titled "Why These Alabama Voters Are Sticking By Roy Moore," aired on Vice News Tonight on HBO, featuring a panel of committed Moore supporters. When asked if they were Christians, all participants replied "yes." Luntz pressed on, asking if Moore was a good Christian, with one woman responding "absolutely." A real estate developer, Scottie Porter, explained he's voting for Moore because he doesn't want Doug Jones, but believes Moore is entitled to the presumption of innocence. Porter questioned the legitimacy of the accusations against Moore, suggesting that only those who have been paid or coerced should be considered guilty. A homemaker, Jane Wade, expressed concern that women who make accusations against Moore are being treated unfairly. A retired school bus driver, Gina Doran, to

<b> Summarized </b>

Establishment Republican pollster Frank Luntz looked frustrated as Alabama voters dug in their heels, refusing to back down on their support for Judge Roy Moore in the Senate election. Luntz's focus group, titled "Why These Alabama Voters Are Sticking By Roy Moore," aired on Vice News Tonight on HBO, featuring a panel of committed Moore supporters. When asked if they were Christians, all participants replied "yes." Luntz pressed on, asking if Moore was a good Christian, with one woman responding "absolutely." A real estate developer, Scottie Porter, explained he's voting for Moore because he doesn't want Doug Jones, but believes Moore is entitled to the presumption of innocence. Porter questioned the legitimacy of the accusations against Moore, suggesting that only those who have been paid or coerced should be considered guilty. A homemaker, Jane Wade, expressed concern that women who make accusations against Moore are being treated unfairly. A retired school bus driver, Gina Doran, told Luntz to "you better have proof" before believing the accusations.

In [83]:
df_for_summarization = df_for_summarization[df1.columns]

In [84]:
final_df = pd.concat([df1, df_for_summarization], ignore_index=True)

In [85]:
final_df.to_csv("final_data_with_summarization_FinalTesting1.csv", index=False)

In [86]:
final_df

Unnamed: 0,title,text,label,title_tokens,text_tokens,text_summary
0,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,13,387,
1,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,13,477,
2,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,1,18,367,
3,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,1,19,481,
4,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,1,19,454,
...,...,...,...,...,...,...
44893,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,0,19,889,North Korea and the US clashed at a U.N. forum...
44894,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,0,19,889,North Korea and the United States clashed at a...
44895,"U.S., North Korea clash at U.N. arms forum on ...",GENEVA (Reuters) - North Korea and the United ...,0,20,810,North Korea and the US clashed at the United N...
44896,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,0,17,554,Two North Korean shipments to a Syrian governm...
