In [1]:
import os
import torch
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from vllm import LLM, SamplingParams

In [None]:
# Hugging Face credentials: never hardcode the token in the notebook.
# Reuse HF_TOKEN when it is already exported in the environment (the previous
# unconditional assignment replaced it with a placeholder string); otherwise
# prompt for it so the secret is not stored in the saved notebook.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [3]:
# Show the GPU model, driver/CUDA versions and current memory usage of this machine.
!nvidia-smi

Sun Jun 22 18:33:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20             Driver Version: 570.133.20     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10G                    On  |   00000000:00:1E.0 Off |                    0 |
|  0%   25C    P8              9W /  300W |       0MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Loading our LLM with vLLM and preparing the tokenizer

Selected LLM: [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)

In [4]:
# Report whether PyTorch can see a CUDA device, and name it when it can.
if not torch.cuda.is_available():
    print("No GPU. Using CPU.")
else:
    print("GPU is available!")
    print(f"Using device: {torch.cuda.get_device_name(0)}")

GPU is available!
Using device: NVIDIA A10G


In [3]:
# Hugging Face model id used for both the tokenizer and the vLLM engine below.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# Tokenizer of the selected checkpoint; used for chat templating and token counting.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Decoding settings shared by every generation call in this notebook:
# low temperature, no nucleus truncation, at most 512 generated tokens.
sampling_params = SamplingParams(
    max_tokens=512,
    temperature=0.1,
    top_p=1,
)

In [7]:
# Load the model into a vLLM engine in bfloat16.
# max_model_len is raised to 25000 because some prompts are very long; prompts of
# 20000+ tokens are filtered out later in this notebook before generation.
llm = LLM(model=model_id, dtype=torch.bfloat16, max_model_len=25000)

INFO 06-22 18:33:54 [config.py:823] This model supports multiple tasks: {'generate', 'reward', 'score', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 06-22 18:33:54 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-22 18:34:00 [__init__.py:244] Automatically detected platform cuda.
INFO 06-22 18:34:03 [core.py:455] Waiting for init message from front-end.
INFO 06-22 18:34:03 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=25000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:36<00:36, 36.85s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:47<00:00, 21.71s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:47<00:00, 23.98s/it]



INFO 06-22 18:34:54 [default_loader.py:272] Loading weights took 48.03 seconds
INFO 06-22 18:34:54 [gpu_model_runner.py:1624] Model loading took 6.0165 GiB and 48.859937 seconds
INFO 06-22 18:35:02 [backends.py:462] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/e9dfdcf700/rank_0_0 for vLLM's torch.compile
INFO 06-22 18:35:02 [backends.py:472] Dynamo bytecode transform time: 8.19 s
INFO 06-22 18:35:06 [backends.py:161] Cache the graph of shape None for later use
INFO 06-22 18:35:29 [backends.py:173] Compiling a graph for general shape takes 26.13 s
INFO 06-22 18:35:37 [monitor.py:34] torch.compile takes 34.32 s in total
INFO 06-22 18:35:38 [gpu_worker.py:227] Available KV cache memory: 12.61 GiB
INFO 06-22 18:35:39 [kv_cache_utils.py:715] GPU KV cache size: 118,080 tokens
INFO 06-22 18:35:39 [kv_cache_utils.py:719] Maximum concurrency for 25,000 tokens per request: 4.72x
INFO 06-22 18:36:01 [gpu_model_runner.py:2048] Graph capturing finished in 23 secs, took 0.45 G

# Testing on a single prompt for speed

In [None]:
# System message used for every summarization request in this notebook.
# NOTE(review): this text is sent to the model verbatim as the "system" message,
# so its wording is deliberately left unchanged here.
system_prompt = '''You are an expert summarizer helping a user prepare input for a sentence embedding model with a strict 512-token input limit.
The user will always include the full **title** of the news article. Your job is to **rewrite or summarize the news article text only**, using no more than the available tokens provided. The goal is to preserve **all important meaning** from the article without exceeding the token budget.

You must:
- Leave the title unchanged
- Output **only the rewritten article text**
- Not include anything non-relevant stuff in your response

You will be provided Input details.
'''

prompt = '''Input details:

    Title: LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]

    Title Token Count: 35

    Original Text: No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show, callers clearly call for  lynching  and  killing  of white people.A 2:39 minute clip from the radio show can be heard here. It was provided to Breitbart Texas by someone who would like to be referred to as  Hannibal.  He has already received death threats as a result of interrupting #FYF911 conference calls.An unidentified black man said  when those mother f**kers are by themselves, that s when when we should start f***ing them up. Like they do us, when a bunch of them ni**ers takin  one of us out, that s how we should roll up.  He said,  Cause we already roll up in gangs anyway. There should be six or seven black mother f**ckers, see that white person, and then lynch their ass. Let s turn the tables. They conspired that if  cops started losing people,  then  there will be a state of emergency. He speculated that one of two things would happen,  a big-ass [R s?????] war,  or  ni**ers, they are going to start backin  up. We are already getting killed out here so what the f**k we got to lose? Sunshine could be heard saying,  Yep, that s true. That s so f**king true. He said,  We need to turn the tables on them. Our kids are getting shot out here. 
Somebody needs to become a sacrifice on their side.He said,  Everybody ain t down for that s**t, or whatever, but like I say, everybody has a different position of war.  He continued,  Because they don t give a f**k anyway.  He said again,  We might as well utilized them for that s**t and turn the tables on these n**ers. He said, that way  we can start lookin  like we ain t havin  that many casualties, and there can be more causalities on their side instead of ours. They are out their killing black people, black lives don t matter, that s what those mother f**kers   so we got to make it matter to them. Find a mother f**ker that is alone. Snap his ass, and then f***in hang him from a damn tree. Take a picture of it and then send it to the mother f**kers. We  just need one example,  and  then people will start watchin .  This will turn the tables on s**t, he said. He said this will start  a trickle-down effect.  He said that when one white person is hung and then they are just  flat-hanging,  that will start the  trickle-down effect.  He continued,  Black people are good at starting trends. He said that was how  to get the upper-hand. Another black man spoke up saying they needed to kill  cops that are killing us. The first black male said,  That will be the best method right there. Breitbart Texas previously reported how Sunshine was upset when  racist white people  infiltrated and disrupted one of her conference calls. She subsequently released the phone number of one of the infiltrators. The veteran immediately started receiving threatening calls.One of the #F***YoFlag movement supporters allegedly told a veteran who infiltrated their publicly posted conference call,  We are going to rape and gut your pregnant wife, and your f***ing piece of sh*t unborn creature will be hung from a tree. Breitbart Texas previously encountered Sunshine at a Sandra Bland protest at the Waller County Jail in Texas, where she said all white people should be killed. 
She told journalists and photographers,  You see this nappy-ass hair on my head?   That means I am one of those more militant Negroes.  She said she was at the protest because  these redneck mother-f**kers murdered Sandra Bland because she had nappy hair like me. #FYF911 black radicals say they will be holding the  imperial powers  that are actually responsible for the terrorist attacks on September 11th accountable on that day, as reported by Breitbart Texas. There are several websites and Twitter handles for the movement. Palmetto Star  describes himself as one of the head organizers. He said in a YouTube video that supporters will be burning their symbols of  the illusion of their superiority,  their  false white supremacy,  like the American flag, the British flag, police uniforms, and Ku Klux Klan hoods.Sierra McGrone or  Nocturnus Libertus  posted,  you too can help a young Afrikan clean their a** with the rag of oppression.  She posted two photos, one that appears to be herself, and a photo of a black man, wiping their naked butts with the American flag.For entire story: Breitbart News

    Original Text Token Count: 1200

    Max Allowed Total Tokens (Title + Text): 512

    Available Tokens for Text Summary: 477


    Now rewrite the article text only (not the title) within the available token budget. Your output should be clean, self-contained, and informative.
    Do not add any information that is not present in the original news text. Your output must be exactly text summary and nothing else'''

In [6]:
# Two-turn chat for the single-prompt test: fixed system instructions followed
# by the user message holding the article details.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=prompt),
]

In [7]:
# Render the chat messages into one prompt string (not token ids) and append
# the generation prompt so the model continues with its own reply.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

In [11]:
# Throughput probe: submit the same templated prompt 100 times in one batch.
outputs = llm.generate([prompt]*100, sampling_params)

Adding requests: 100%|██████████| 100/100 [00:00<00:00, 341.67it/s]
Processed prompts: 100%|██████████| 100/100 [00:08<00:00, 11.31it/s, est. speed input: 16734.03 toks/s, output: 2348.41 toks/s]


In [12]:
# Inspect the generated text of the last request in the batch.
print(outputs[-1].outputs[0].text)

Members of the #FYF911 and #BlackLivesMatter movements called for the lynching and hanging of white people and cops, urging others to send a message about the killing of black people in America. A radio show host, Sunshine, encouraged supporters to kill white people and cops, saying "Find a mother f**ker that is alone. Snap his ass, and then f***in hang him from a damn tree." 

A caller on the show said, "When those mother f**kers are by themselves, that's when we should start f***ing them up." Another caller suggested lynching cops that are killing black people. Sunshine previously released the phone number of a veteran who infiltrated one of her conference calls, and he received threatening calls. 

Sunshine has made previous statements, including saying all white people should be killed and that she was at a Sandra Bland protest because "these redneck mother-f**kers murdered Sandra Bland because she had nappy hair like me." The movement plans to hold the imperial powers accountable 

In [13]:
# Number of tokens in the last generated summary, as counted by the tokenizer.
len(tokenizer.encode(outputs[-1].outputs[0].text))

221

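Processing 100 prompts took under 10 seconds (about 11 prompts per second) at roughly 2,350 output tokens per second, which is not bad at all. Note that this batch repeats a single prompt, so the figure is only a rough estimate. Let's process all prompts now.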

# Reading file and preparing prompts

In [12]:
# Load the prepared user prompts; the frame has a `text` column (the prompt) and a `label` column.
df = pd.read_csv("Summary_prompts.csv")

In [13]:
# Sanity check: row/column counts and the first few rows.
print(df.shape)
df.head()

(36879, 2)


Unnamed: 0,text,label
0,Input details:\n\n Title: LAW ENFORCEMENT O...,1
1,"Input details:\n\n Title: Bobby Jindal, rai...",0
2,Input details:\n\n Title: Latest Pipeline L...,1
3,Input details:\n\n Title: GOP Senator Just...,1
4,Input details:\n\n Title: Schumer calls on ...,0


In [14]:
# Show the first user prompt in full to check its layout.
print(df["text"].iloc[0])

Input details:

    Title: LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]

    Title Token Count: 35

    Original Text: No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called  Sunshine.  She has a radio blog show hosted from Texas called,  Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to  Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite   #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show

In [15]:
from tqdm import tqdm

In [16]:
# Build the chat-templated prompt for every row and record its token length.
#
# Loop-local names (`user_prompt`, `chat_messages`, `chat_prompt`) are used on
# purpose: the earlier single-prompt test cells define notebook-level `prompt`
# and `messages`, and rebinding them here would make those cells give different
# results when re-run.
total_prompts = []  # templated prompt strings, in the same row order as df
token_count = []    # token length of each templated prompt

for user_prompt in tqdm(df["text"].values):
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    chat_prompt = tokenizer.apply_chat_template(
        chat_messages, tokenize=False, add_generation_prompt=True
    )
    total_prompts.append(chat_prompt)

    # Only the length is needed, so take it from the plain id list instead of
    # materializing a torch tensor per prompt; the resulting count is the same.
    token_count.append(len(tokenizer.encode(chat_prompt)))

100%|██████████| 36879/36879 [03:18<00:00, 185.53it/s]


In [17]:
# Attach the per-row token length and the templated prompt to the frame.
df = df.assign(token_count=token_count, prompt=total_prompts)

In [18]:
# Distribution of prompt lengths, plus the 99.9th percentile to see how rare the very long prompts are.
df["token_count"].describe(), df["token_count"].quantile(0.999)

(count    36879.000000
 mean      1436.214838
 std        958.622055
 min        555.000000
 25%        969.000000
 50%       1183.000000
 75%       1589.000000
 max      42427.000000
 Name: token_count, dtype: float64,
 np.float64(10766.0))

In [19]:
# Keep only prompts shorter than 20000 tokens; longer ones are dropped from the run.
df = df.loc[df["token_count"].lt(20000)]

In [20]:
# Templated prompts of the rows that survived the length filter, as an array.
total_prompts = df["prompt"].to_numpy()

# Summarization using vLLM + Llama-3.2-3B-Instruct

In [21]:
# Spot-check one fully templated prompt near the end of the list.
print(total_prompts[-10])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 22 Jun 2025

You are an expert summarizer helping a user prepare input for a sentence embedding model with a strict 512-token input limit.
The user will always include the full **title** of the news article. Your job is to **rewrite or summarize the news article text only**, using no more than the available tokens provided. The goal is to preserve **all important meaning** from the article without exceeding the token budget.

You must:
- Leave the title unchanged
- Output **only the rewritten article text**
- Not include anything non-relevant stuff in your response

You will be provided Input details.<|eot_id|><|start_header_id|>user<|end_header_id|>

Input details:

    Title:  Florida Judge Blames Rape Victim For Attending Music Festival

    Title Token Count: 12

    Original Text: The Ultra Music Festival in Miami is one of the largest electronic dance music festivals in 

In [22]:
# Spot-check the last templated prompt.
print(total_prompts[-1])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 22 Jun 2025

You are an expert summarizer helping a user prepare input for a sentence embedding model with a strict 512-token input limit.
The user will always include the full **title** of the news article. Your job is to **rewrite or summarize the news article text only**, using no more than the available tokens provided. The goal is to preserve **all important meaning** from the article without exceeding the token budget.

You must:
- Leave the title unchanged
- Output **only the rewritten article text**
- Not include anything non-relevant stuff in your response

You will be provided Input details.<|eot_id|><|start_header_id|>user<|end_header_id|>

Input details:

    Title: Goldman Sachs Endorses Hillary Clinton For President

    Title Token Count: 11

    Original Text: Goldman Sachs Endorses Hillary Clinton For President For Goldman Sachs, was there really any other cho

In [25]:
%%time

# Summarize every remaining prompt in one batched call (results are kept in memory only).
# NOTE(review): the prompts are already chat-templated strings that begin with
# <|begin_of_text|>; if vLLM adds special tokens again when tokenizing text
# prompts, the model would see a duplicated BOS token — TODO confirm.
outputs = llm.generate(total_prompts, sampling_params)

Adding requests: 100%|██████████| 36868/36868 [02:13<00:00, 277.17it/s]
Processed prompts: 100%|██████████| 36868/36868 [3:29:01<00:00,  2.94it/s, est. speed input: 4200.93 toks/s, output: 694.15 toks/s]    

CPU times: user 3min 35s, sys: 14.8 s, total: 3min 50s
Wall time: 3h 31min 14s





In [28]:
# Collect the generated text of the first completion of every request, in request order.
results = []
for request_output in outputs:
    results.append(request_output.outputs[0].text)

In [29]:
# Inspect the summary produced for the last prompt.
print(results[-1])

Goldman Sachs CEO Lloyd Blankfein endorsed Hillary Clinton for president, a move she had been working towards for years. The endorsement comes after Clinton gave speeches to Goldman Sachs and other Wall Street banks, praising their talents and explaining her positions on financial regulation. In 2013, Clinton told Goldman Sachs that Dodd-Frank had to be passed for "political reasons" and that regulations should be written by those who know the industry best. She also stated that banks are scared of regulations and that this is hindering economic growth. Clinton has taken a tough public position on Wall Street during the campaign, but her private comments to Goldman Sachs suggest she may be open to giving them influence.


In [30]:
# Pair every templated prompt with its generated summary, one row per request.
result_df = pd.DataFrame(dict(Prompts=total_prompts, Results=results))

In [32]:
# Display the prompt/summary table.
result_df

Unnamed: 0,Prompts,Results
0,<|begin_of_text|><|start_header_id|>system<|en...,Members of the #FYF911 and #BlackLivesMatter m...
1,<|begin_of_text|><|start_header_id|>system<|en...,"Louisiana Gov. Bobby Jindal, a Hindu by birth,..."
2,<|begin_of_text|><|start_header_id|>system<|en...,Underscoring the dangers of America’s unreliab...
3,<|begin_of_text|><|start_header_id|>system<|en...,Sen. Ben Sasse (R-Neb.) took aim at Alt-Right ...
4,<|begin_of_text|><|start_header_id|>system<|en...,"Charles Schumer, the top Democrat in the U.S. ..."
...,...,...
36863,<|begin_of_text|><|start_header_id|>system<|en...,Hackers believed to be working for the Russian...
36864,<|begin_of_text|><|start_header_id|>system<|en...,Giuliani demands that Democrats apologize for ...
36865,<|begin_of_text|><|start_header_id|>system<|en...,Thousands of migrants flooded into a train sta...
36866,<|begin_of_text|><|start_header_id|>system<|en...,Donald Trump's combative style has buffeted Me...


In [33]:
# Persist prompts and summaries to CSV, without the DataFrame index column.
result_df.to_csv("llama3_2_3b_summary_results.csv", index=False)

In [26]:
# Disabled alternative (setup): write results to a CSV file batch by batch
# instead of holding everything in memory. This part creates the output file
# and writes its header row. Kept commented out; not used by the run above.
# import csv
# from tqdm import tqdm

# batch_size = 100
# output_file = "summarized_output.csv"

# # Write headers once
# with open(output_file, "w", newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(["original_text", "summary"])

In [27]:
# Disabled alternative (loop): generate in slices of `batch_size` prompts and
# append each slice's (prompt, summary) rows to the CSV right after it finishes.
# Kept commented out; not used by the run above.
# for i in tqdm(range(0, len(total_prompts), batch_size)):
#     batch_prompts = total_prompts[i:i + batch_size]

#     outputs = llm.generate(batch_prompts, sampling_params)

#     summaries = [out.outputs[0].text if out.outputs else "" for out in outputs]

#     with open(output_file, "a", newline='', encoding='utf-8') as f:
#         writer = csv.writer(f)
#         for original, summary in zip(batch_prompts, summaries):
#             writer.writerow([original, summary])