# BART Summarize News Articles

This notebook:
1) Loads your dataset
2) Extracts the `body` text
3) Summarizes each article to ~500 words with **facebook/bart-large-cnn**


## 0) Install & Imports

In [1]:

# If running locally: uncomment to install
# !pip install --upgrade pip
# !pip install ray[default] transformers torch accelerate pandas tqdm openai python-dotenv

import os
import math
import json
import time
from dataclasses import dataclass
from typing import List, Dict, Any
import pandas as pd
from tqdm import tqdm

# Hugging Face summarization
from transformers import pipeline

# OpenAI Responses API (modern SDK)
from openai import OpenAI

# Optional: load .env file if present
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass


  from .autonotebook import tqdm as notebook_tqdm


## 1) Config

In [None]:

# Path to your CSV
CSV_PATH = "datatset2_NullFilled_fullcontent.csv" #"balanced_training_dataset.csv"  # change if needed

# Which column has the article text?
TEXT_COLUMN = "full_content"  #body # auto-detected from your upload

# Which model to use for summarization
HF_SUMMARY_MODEL = "facebook/bart-large-cnn"

# Target ~words for the summary (we'll trim by words after generation)
SUMMARY_TARGET_WORDS = 500
MAX_NEW_TOKENS = 400
MIN_NEW_TOKENS = 300 #150 #300

# Batch sizes
SUMMARIZE_BATCH = 32   # adjust based on your GPU/CPU/RAM
CLASSIFY_BATCH  = 16  # small batches help rate limits

# Active learning thresholds
MIN_CONFIDENCE_ACCEPT = 0.60   # if max class prob < 0.60 -> needs_review
MIN_MARGIN_ACCEPT     = 0.15   # if (top1 - top2) < 0.15 -> needs_review

# Output paths
OUT_CSV_ALL   = "labeled_with_summaries.csv"
OUT_CSV_REVIEW = "needs_review.csv"


## 2) Load Data

In [20]:

df = pd.read_csv(CSV_PATH)
assert TEXT_COLUMN in df.columns, f"Column '{TEXT_COLUMN}' not in CSV columns: {df.columns}"
df = df.copy()
print("Rows:", len(df), "| Columns:", list(df.columns))

# Basic cleaning of text
def clean_text(x: str) -> str:
    if not isinstance(x, str):
        return ""
    return ' '.join(x.split())

df[TEXT_COLUMN] = df[TEXT_COLUMN].map(clean_text)
df = df[df[TEXT_COLUMN].str.len() > 0].reset_index(drop=True)
print("After cleaning, rows:", len(df))

df.head()

Rows: 2882 | Columns: ['Unnamed: 0', 'id', 'category', 'full_content']
After cleaning, rows: 2882


Unnamed: 0.1,Unnamed: 0,id,category,full_content
0,0,0,Climate,"Environmental Defenders Face Harassment, Intim..."
1,1,1,Weather,"Piers, Roxie, and Ryuki rock Pasio with a publ..."
2,2,2,Weather,It goes without saying that boots are one of t...
3,3,3,Climate,Giraffes are the world's tallest mammals and a...
4,4,4,Music,Sleep disturbances and subsequent fatigue are ...


## 3) Summarize to ~500 words with BART
We control length by setting a **token** max/min and then trimming by **words** to ~500.

In [21]:
import os, time
import math
import pandas as pd
import torch
import ray
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# ---------- Worker definition ----------
@ray.remote(num_gpus=1)
class BartWorker:
    def __init__(self, model_name=HF_SUMMARY_MODEL, dtype="float16"):
        # Ray sets CUDA_VISIBLE_DEVICES to the assigned GPU automatically.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = getattr(torch, dtype) if device == "cuda" else torch.float32
        self.device = device

        self.tok = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            torch_dtype=torch_dtype
        ).to(device).eval()

    @torch.inference_mode()
    def summarize_batch(self, texts, max_new_tokens=MAX_NEW_TOKENS,
                        min_new_tokens=MIN_NEW_TOKENS, target_words=SUMMARY_TARGET_WORDS):
        # Clean + tokenize
        texts = [" ".join(str(t).split()) for t in texts]
        inputs = self.tok(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024
        ).to(self.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            num_beams=4,
            do_sample=False,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        dec = [self.tok.decode(o, skip_special_tokens=True) for o in outputs]

        # Trim to ~target_words
        trimmed = []
        for s in dec:
            w = s.split()
            trimmed.append(" ".join(w[:target_words]) if len(w) > target_words else s)
        return trimmed

    def summarize_chunks(self, texts, batch_size=SUMMARIZE_BATCH):
        out = []
        for i in range(0, len(texts), batch_size):
            out.extend(self.summarize_batch(texts[i:i+batch_size]))
        return out


# ---------- Orchestration ----------
def ray_summarize_dataframe(df: pd.DataFrame,
                            text_col: str = TEXT_COLUMN,
                            batch_size: int = SUMMARIZE_BATCH):
    assert text_col in df.columns, f"Missing column: {text_col}"

    # Start Ray (single-node). num_gpus=None â†’ Ray auto-detects.
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True)

    num_gpus = int(ray.available_resources().get("GPU", 0))
    if num_gpus < 1:
        raise RuntimeError("No GPUs visible to Ray. Check your job allocation / CUDA_VISIBLE_DEVICES.")

    print(f"ðŸŸ¢ Ray initialized | GPUs detected: {num_gpus}")

    # Create one BartWorker per GPU
    workers = [BartWorker.remote() for _ in range(num_gpus)]

    # Create many small chunks (more chunks than GPUs keeps all workers busy)
    texts = df[text_col].astype(str).tolist()
    n = len(texts)
    CHUNK_ROWS = batch_size * 32   # each task â‰ˆ 32 inference batches
    chunks = [(i, texts[i:i+CHUNK_ROWS]) for i in range(0, n, CHUNK_ROWS)]

    # Dispatch chunks round-robin to workers
    futures = []
    for k, (start_idx, chunk_texts) in enumerate(chunks):
        w = workers[k % num_gpus]
        futures.append((start_idx, w.summarize_chunks.remote(chunk_texts, batch_size=batch_size)))

    # Gather results and place them back by index
    out = [None] * n
    t0 = time.time()
    for start_idx, fut in tqdm(futures, desc="Collecting summaries"):
        summaries = ray.get(fut)
        out[start_idx:start_idx+len(summaries)] = summaries

    print(f"âœ… Done: {n} rows in {(time.time()-t0)/60:.2f} min")
    return out


# ----------- RUN -----------
# df: your dataframe already loaded
if "summary_500" not in df.columns or df["summary_500"].isna().any():
    df = df.copy()
    df["summary_500"] = ray_summarize_dataframe(df, TEXT_COLUMN, SUMMARIZE_BATCH)
else:
    print("Column 'summary_500' already present, skipping summarization.")

print(df[["summary_500"]].head(2))

# When done
# ray.shutdown()


2025-10-30 03:35:10,877	INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


ðŸŸ¢ Ray initialized | GPUs detected: 5


Collecting summaries:   0%|          | 0/3 [00:00<?, ?it/s][36m(BartWorker pid=1378859)[0m `torch_dtype` is deprecated! Use `dtype` instead!
Collecting summaries: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [08:07<00:00, 162.36s/it]

âœ… Done: 2882 rows in 8.12 min
                                         summary_500
0  Environmental Defenders Face Harassment, Intim...
1  Piers, Roxie, and Ryuki rock Pasio with a publ...





In [23]:
# When done
ray.shutdown()

In [None]:
df.to_csv("BART_summarized_dataset.csv")