In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Using device: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU. Using CPU.")

GPU is available!
Using device: NVIDIA GeForce GTX 1650 Ti


# 1. Loading the Data

In [3]:
df = pd.read_csv(r"C:\Users\neupa\OneDrive\Desktop\data606_capstone_teamC\Data\fake_news_0\WELFake_Dataset.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


So, we have primarily to main input features: title and text.

We can either choose to only train on title or text. The length of text is too big.

If we wish to apply pretrained embeddings, we need to ensure that our input text is not too big and not exceed the max input tokens of the embedding model. Otherwise, our text will be truncated and thats no good.

In [5]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [6]:
# we don't need rows where both 'title' and 'text' are NaN
df = df.dropna(subset=['title', 'text'], how='all')

In [7]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

Well, it looks like either the title is null or text is null but not both. Thats fine. Our approach will utilize both.

In [8]:
random_titles = df['title'].sample(50, random_state=80).values
random_texts = df['text'].sample(50, random_state=80).values

In [9]:
random_titles[0]

"Trump say appeals court decision on travel ban was 'political'"

In [10]:
random_texts[0]

'WASHINGTON (Reuters) - President Donald Trump on Thursday called the appellate court ruling that upheld the suspension of his order restricting travel from seven Muslim-majority countries a “political decision,” and vowed his administration would ultimately prevail. “We’ll see them in court,” Trump told reporters who had gathered outside his press secretary’s office. “It’s a political decision.” Trump said he did not view the ruling as a major setback for his White House. “This is just a decision that came down, but we’re going to win the case,” he said. '

# 2. Tokenization 

We use the [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) embedding model from huggingface. 

This embedding model is relatively lightweight, one of the most popular ones, and converts an input text (max 512 tokens) into a 384 dimensional dense vector embedding. 


The produced embedding can be used for checking sentence similarity, clustering, information retrieval, or text classification by passing in the embeddings as input features.

In [11]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

In [12]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [13]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
tokenizer

BertTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

We can see that the model or its tokenizor has more than 30k tokens, and the maximum tokens it can take in as on a input sequence is 512. If less than that, it can pad the text and if bigger, it will truncate. 

In [15]:
random_titles[0]

"Trump say appeals court decision on travel ban was 'political'"

In [16]:
tokenizer.encode(random_titles[0], return_tensors='pt')

tensor([[ 101, 8398, 2360, 9023, 2457, 3247, 2006, 3604, 7221, 2001, 1005, 2576,
         1005,  102]])

The above is the id/index in the tokenizer's vocabulary of the corresponding text. For instance, the text is tokenized as follows:

`['[CLS]', 'Trump', 'say', 'appeals', 'court', 'decision', 'on', 'travel', 'ban', 'was', "'", 'political', "'", '[SEP]']`

In [17]:
ids = [101, 8398, 2360, 9023, 2457, 3247, 2006, 3604, 7221, 2001, 1005, 2576, 1005, 102]
decoded = tokenizer.decode(ids)
print(decoded)

[CLS] trump say appeals court decision on travel ban was ' political ' [SEP]


In [18]:
from tqdm import tqdm

for current_title in tqdm(random_titles):
    encoded = tokenizer.encode(current_title, return_tensors='pt')
    print("Number of tokens for current title:", len(encoded[0]))

100%|██████████| 50/50 [00:00<00:00, 3396.31it/s]

Number of tokens for current title: 14
Number of tokens for current title: 24
Number of tokens for current title: 15
Number of tokens for current title: 14
Number of tokens for current title: 13
Number of tokens for current title: 22
Number of tokens for current title: 42
Number of tokens for current title: 24
Number of tokens for current title: 36
Number of tokens for current title: 21
Number of tokens for current title: 13
Number of tokens for current title: 15
Number of tokens for current title: 10
Number of tokens for current title: 19
Number of tokens for current title: 21
Number of tokens for current title: 23
Number of tokens for current title: 20
Number of tokens for current title: 21
Number of tokens for current title: 12
Number of tokens for current title: 24
Number of tokens for current title: 13
Number of tokens for current title: 18
Number of tokens for current title: 18
Number of tokens for current title: 17
Number of tokens for current title: 15
Number of tokens for curr




Titles aren't big so we don't hit the max token limit. Lets look at text.

In [19]:
for current_text in tqdm(random_texts):
    encoded = tokenizer.encode(current_text, return_tensors='pt')
    print("Number of tokens for current text:", len(encoded[0]))

  0%|          | 0/50 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (700 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 50/50 [00:00<00:00, 407.05it/s]

Number of tokens for current text: 123
Number of tokens for current text: 700
Number of tokens for current text: 570
Number of tokens for current text: 946
Number of tokens for current text: 41
Number of tokens for current text: 1034
Number of tokens for current text: 27
Number of tokens for current text: 153
Number of tokens for current text: 588
Number of tokens for current text: 861
Number of tokens for current text: 117
Number of tokens for current text: 513
Number of tokens for current text: 972
Number of tokens for current text: 272
Number of tokens for current text: 910
Number of tokens for current text: 747
Number of tokens for current text: 315
Number of tokens for current text: 1998
Number of tokens for current text: 26
Number of tokens for current text: 666
Number of tokens for current text: 253
Number of tokens for current text: 10
Number of tokens for current text: 270
Number of tokens for current text: 408
Number of tokens for current text: 265
Number of tokens for curren




Looks like some of the texts are too big (exceed the input token limit). Hmm. I do not want to lose data by truncating. Lets see if i can use LLMs to summarize the title and text and create a new feature instead that is <500 input tokens.

I created an API key on groq.com and set it up in the current virtual environment. Grok is a platform that provides high-throughput and performance endpoints to use and test multiple open-source LLMs for free. There are rate limits to the free account but it is enough for our use case.

In [20]:
api_key = os.getenv("LLM_API_KEY") # this is my GROQ API KEY

In [21]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [22]:
all_title = df["title"].fillna("").values
all_text = df["text"].fillna("").values

In [23]:
# titles_exceeding_512_token_limit = []
# titles_token_count = []

# for title in tqdm(all_title):
#     encoded = tokenizer.encode(title, return_tensors='pt')
#     titles_token_count.append(len(encoded[0]))
#     if len(encoded[0]) > 512:
#         titles_exceeding_512_token_limit.append(title)

In [24]:
titles_exceeding_512_token_limit = []
titles_token_count = []
texts_token_count = []

for i, (current_title, current_text) in enumerate(tqdm(zip(all_title, all_text))):
    encoded_title = tokenizer.encode(current_title, return_tensors='pt')
    encoded_text = tokenizer.encode(current_text, return_tensors='pt')

    titles_token_count.append(len(encoded_title[0]))
    texts_token_count.append(len(encoded_text[0]))

    if len(encoded_title[0]) > 512:
        titles_exceeding_512_token_limit.append(current_title)

72134it [03:25, 350.76it/s]


In [25]:
titles_exceeding_512_token_limit, min(titles_token_count), max(titles_token_count)

([], 2, 109)

Perfect. All of the titles are good. None of them exceed the input limit. The biggest title has 109 tokens whereas there are some empty titles which is why there are just two tokens that are start and end default tokens.

I will keep the title as is. And, i will calculate the maximum number of tokens possible for each text so that my title+text will be <= 512. 

If a title + text is already <=512, it will be left untouched wheres the text that exceeds this limit will be passed to an LLM to summarize to <=512 by text specific prompting.

In [26]:
# available_tokens_for_text = 512 - np.array(titles_token_count)

In [27]:
available_tokens_for_text_summarization = []
summarize_text = []

for text_token_count, title_token_count in tqdm(zip(texts_token_count, titles_token_count)):
    if text_token_count + title_token_count <= 512:
        perform_summarization = False
    else:
        perform_summarization = True


    available_tokens_for_text = 512 - title_token_count
    available_tokens_for_text_summarization.append(available_tokens_for_text)
    summarize_text.append(perform_summarization)

72134it [00:00, 1231858.85it/s]


In [28]:
sum(summarize_text)

36879

In [29]:
true_indices = [idx for idx, val in enumerate(summarize_text) if val]
all_text_true = [all_text[idx] for idx in true_indices]
max_tokens = [available_tokens_for_text_summarization[idx] for idx in true_indices]

In [30]:
len(all_text_true)

36879

Now we have all the big texts, and max tokens available to summarize them. Lets do that using Groq.

I will use llama-3.1-8b-instant from Groq to do the summarization

In [31]:
def return_prompt(
    title: str,
    text: str,
    title_token_count: int,
    text_token_count: int,
    available_tokens_for_summarization: int,
    max_tokens: int = 512
):
    """
    Generates a prompt to summarize or rewrite the input news text to fit within
    the token limit of an embedding model, preserving key information.
    """
    prompt = f"""Input details:
    
    Title: {title}\n
    Title Token Count: {title_token_count}\n
    Original Text: {text}\n
    Original Text Token Count: {text_token_count}\n
    Max Allowed Total Tokens (Title + Text): {max_tokens}\n
    Available Tokens for Text Summary: {available_tokens_for_summarization}\n
    
    Now rewrite the article text only (not the title) within the available token budget. Your output should be clean, self-contained, and informative. 
    Do not add any information that is not present in the original news text. Your output must be exactly text summary and nothing else
    """
    return prompt


In [34]:
''' 
This loop is used to find the first text that requires summarization.
It will break once it finds a text that needs summarization.
'''
for i in range(len(summarize_text)):
    if summarize_text[i]:
        current_available_tokens = available_tokens_for_text_summarization[i]
        current_title = all_title[i]
        current_text = all_text[i]
        current_title_token_count = titles_token_count[i]
        current_text_token_count = texts_token_count[i]
        prompt = return_prompt(
            current_title,
            current_text,
            current_title_token_count,
            current_text_token_count,
            current_available_tokens,
        )
        
        break

In [35]:
# prompts = []
# maximum_allocated_tokens = []

# for i in range(0, len(all_text)):
#     if summarize_text[i]:
#         current_available_tokens = available_tokens_for_text_summarization[i]
#         current_title = all_title[i]
#         current_text = all_text[i]
#         current_title_token_count = titles_token_count[i]
#         current_text_token_count = texts_token_count[i]
#         prompts.append(
#                 return_prompt(
#                 current_title,
#                 current_text,
#                 current_title_token_count,
#                 current_text_token_count,
#                 current_available_tokens,
#             )
#         )
#         maximum_allocated_tokens.append(current_available_tokens)

In [36]:
print(prompt)

Input details:

    Title: LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]

    Title Token Count: 35

    Original Text: No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show

In [37]:
len(tokenizer.encode(prompt, return_tensors='pt')[0])

1330

Hmmm.. our prompt iself is ~1500 tokens. So, hopefully, our model will not hallucinate and give us proper summary.

In [38]:
from groq import Groq

llm_api_key = os.getenv("LLM_API_KEY")
client = Groq(api_key=llm_api_key)

In [39]:
def run_groq_summary(prompt_text, max_tokens, model="llama3-70b-8192"):
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": '''
            You are an expert summarizer helping a user prepare input for a sentence embedding model with a strict 512-token input limit. 
            The user will always include the full **title** of the news article. Your job is to **rewrite or summarize the news article text only**, using no more than the available tokens provided. The goal is to preserve **all important meaning** from the article without exceeding the token budget.
            
            You must:
            - Leave the title unchanged
            - Output **only the rewritten article text**
            - Not include anything non-relevant stuff in your response
             
            You will be provided Input details.
            '''
            },
            {"role": "user", "content": prompt_text}
        ],
        model=model,
        temperature=0.1,
        top_p=1,
        max_completion_tokens=max_tokens # this is the maximum number of tokens possible (title tokens + max_tokens = 512)
    )
    return response.choices[0].message.content

In [41]:
print(current_text)

No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show, callers clearly call for  lynching  and  killing  of white people.A 2:39 minute clip from the radio show can be heard here. It was provided to Breitbart Texas by someone who would like to be referred to 

In [42]:
print(prompt)

Input details:

    Title: LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]

    Title Token Count: 35

    Original Text: No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show

In [45]:
%%time

output = run_groq_summary(prompt, current_available_tokens)

CPU times: total: 0 ns
Wall time: 1.11 s


In [46]:
print(output)

Members of #FYF911 and #BlackLivesMatter movements called for the lynching and hanging of white people and cops on a radio show. They encouraged killing white people and cops to send a message about the killing of black people in America. One organizer, Sunshine, urged supporters to "call now" and "dismantle the illusion of white supremacy. During the show, callers clearly called for lynching and killing of white people. An unidentified black man said they should target white people when they're alone and hang them from a tree, saying "we might as well utilize them for that s**t and turn the tables on these n**ers." He also said that black people are good at starting trends and that would give them the upper-hand. Another black man spoke up saying they needed to kill cops that are killing us. Sunshine was previously upset when racist white people infiltrated and disrupted one of her conference calls, and she released the phone number of one of them, leading to threatening calls.


<b>Original: </b>

No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show, callers clearly call for  lynching  and  killing  of white people.A 2:39 minute clip from the radio show can be heard here. It was provided to Breitbart Texas by someone who would like to be referred to as  Hannibal.  He has already received death threats as a result of interrupting #FYF911 conference calls.An unidentified black man said  when those mother f**kers are by themselves, that s when when we should start f***ing them up. Like they do us, when a bunch of them ni**ers takin  one of us out, that s how we should roll up.  He said,  Cause we already roll up in gangs anyway. There should be six or seven black mother f**ckers, see that white person, and then lynch their ass. Let s turn the tables. They conspired that if  cops started losing people,  then  there will be a state of emergency. He speculated that one of two things would happen,  a big-ass [R s?????] war,  or  ni**ers, they are going to start backin  up. We are already getting killed out here so what the f**k we got to lose? Sunshine could be heard saying,  Yep, that s true. That s so f**king true. He said,  We need to turn the tables on them. Our kids are getting shot out here. Somebody needs to become a sacrifice on their side.He said,  Everybody ain t down for that s**t, or whatever, but like I say, everybody has a different position of war.  He continued,  Because they don t give a f**k anyway.  He said again,  We might as well utilized them for that s**t and turn the tables on these n**ers. He said, that way  we can start lookin  like we ain t havin  that many casualties, and there can be more causalities on their side instead of ours. They are out their killing black people, black lives don t matter, that s what those mother f**kers   so we got to make it matter to them. Find a mother f**ker that is alone. Snap his ass, and then f***in hang him from a damn tree. Take a picture of it and then send it to the mother f**kers. We  just need one example,  and  then people will start watchin .  This will turn the tables on s**t, he said. He said this will start  a trickle-down effect.  He said that when one white person is hung and then they are just  flat-hanging,  that will start the  trickle-down effect.  He continued,  Black people are good at starting trends. He said that was how  to get the upper-hand. Another black man spoke up saying they needed to kill  cops that are killing us. The first black male said,  That will be the best method right there. Breitbart Texas previously reported how Sunshine was upset when  racist white people  infiltrated and disrupted one of her conference calls. She subsequently released the phone number of one of the infiltrators. The veteran immediately started receiving threatening calls.One of the #F***YoFlag movement supporters allegedly told a veteran who infiltrated their publicly posted conference call,  We are going to rape and gut your pregnant wife, and your f***ing piece of sh*t unborn creature will be hung from a tree. Breitbart Texas previously encountered Sunshine at a Sandra Bland protest at the Waller County Jail in Texas, where she said all white people should be killed. She told journalists and photographers,  You see this nappy-ass hair on my head?   That means I am one of those more militant Negroes.  She said she was at the protest because  these redneck mother-f**kers murdered Sandra Bland because she had nappy hair like me. #FYF911 black radicals say they will be holding the  imperial powers  that are actually responsible for the terrorist attacks on September 11th accountable on that day, as reported by Breitbart Texas. There are several websites and Twitter handles for the movement. Palmetto Star  describes himself as one of the head organizers. He said in a YouTube video that supporters will be burning their symbols of  the illusion of their superiority,  their  false white supremacy,  like the American flag, the British flag, police uniforms, and Ku Klux Klan hoods.Sierra McGrone or  Nocturnus Libertus  posted,  you too can help a young Afrikan clean their a** with the rag of oppression.  She posted two photos, one that appears to be herself, and a photo of a black man, wiping their naked butts with the American flag.For entire story: Breitbart News

<b>Summarized:</b>

Members of #FYF911 and #BlackLivesMatter movements called for the lynching and hanging of white people and cops on a radio show. They encouraged killing white people and cops to send a message about the killing of black people in America. One organizer, Sunshine, urged supporters to "call now" and "dismantle the illusion of white supremacy. During the show, callers clearly called for lynching and killing of white people. An unidentified black man said they should target white people when they're alone and hang them from a tree, saying "we might as well utilize them for that s**t and turn the tables on these n**ers." He also said that black people are good at starting trends and that would give them the upper-hand. Another black man spoke up saying they needed to kill cops that are killing us. Sunshine was previously upset when racist white people infiltrated and disrupted one of her conference calls, and she released the phone number of one of them, leading to threatening calls.


Not bad at all!!!. So, lets see the token count in the summarized text.

In [47]:
print("The token count of original text of the news article: ", len(tokenizer.encode(current_text, return_tensors='pt')[0]))
print("The token count of new summarized text of the news article: ", len(tokenizer.encode(output, return_tensors='pt')[0]))

The token count of original text of the news article:  1200
The token count of new summarized text of the news article:  215


In [48]:
print("The token count of original title of the news article: ", titles_token_count[-1])

The token count of original title of the news article:  11


In [49]:
titles_token_count[-1] + len(tokenizer.encode(output, return_tensors='pt')[0])

226

<b> Which is less than 512.  Perfect !!!

In [50]:
labels = df.label.values

In [51]:
import csv

with open('new_training_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    for i in tqdm(range(0, len(all_text))):
        current_title = all_title[i]
        current_text = all_text[i]
        current_label = labels[i]
        combined_text = f"{current_title}. {current_text}"

        if summarize_text[i]:
            current_available_tokens = available_tokens_for_text_summarization[i]
            current_title_token_count = titles_token_count[i]
            current_text_token_count = texts_token_count[i]
            current_prompt = return_prompt(
                current_title,
                current_text,
                current_title_token_count,
                current_text_token_count,
                current_available_tokens,
            )
            summarized_text = run_groq_summary(current_prompt, current_available_tokens)
            combined_text = f"{current_title}. {summarized_text}"

        writer.writerow([current_title, current_text, combined_text, current_label])

  0%|          | 41/72134 [04:49<141:18:04,  7.06s/it]


KeyboardInterrupt: 

# 3. Generating Embeddings

# 4. Train/Valid/Test

# 5. Model Training, Optimization and Evaluation

# OLD

In [None]:
X = df["text"].values[:1000]

In [None]:
text_embeddings = []

from tqdm import tqdm

for i in tqdm(X):
  text_embeddings.append(model.encode(i))

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
X_emb = np.array(text_embeddings)
y = df["label"].values[:1000]

In [None]:
np.unique_counts(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_emb, y, test_size=0.15, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
log_reg = LogisticRegression(n_jobs=-1)
log_reg.fit(X_train, y_train)

In [None]:
train_preds = log_reg.predict(X_train)
val_preds = log_reg.predict(X_val)
test_preds = log_reg.predict(X_test)

In [None]:
print(classification_report(y_train, train_preds))

In [None]:
print(classification_report(y_val, val_preds))

In [None]:
print(classification_report(y_test, test_preds))

# XGBoost

In [None]:
!pip install xgboost --quiet

In [None]:
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=1000,
                      early_stopping_rounds=30,
                       tree_method="gpu_hist",   # Use GPU
    predictor="gpu_predictor" # Optional: optimize prediction speed on GPU
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)


In [None]:
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)
test_preds = model.predict(X_test)

In [None]:
print("XGBoost")
print(classification_report(y_train, train_preds))

In [None]:
print("XGBoost")
print(classification_report(y_val, val_preds))

In [None]:
print("XGBoost")
print(classification_report(y_test, test_preds))