In [1]:
import os
os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

from langchain_community.document_loaders import WebBaseLoader
import re
import textwrap

In [2]:
# Step 1: Load the webpage
url = "https://understandingwar.org/research/russia-ukraine/russian-offensive-campaign-assessment-november-4-2025/"
loader = WebBaseLoader(url)
docs = loader.load()

# Step 2: Extract raw text
raw_text = docs[0].page_content

In [28]:
# Step 3: Clean the text
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+|https\S+", "", text)

    # Remove [number] citations
    text = re.sub(r"\[\d+\]", "", text)

    # Remove everything after "WARNING:" (case-insensitive, greedy match)
    text = re.split(r"\bWARNING\b[:\s]*", text, flags=re.IGNORECASE)[0]

    # Remove button-like phrases (e.g., "Download", "Share", "Subscribe", "Donate", "Read More")
    button_keywords = [
    "Download", "Share", "Subscribe", "Donate", "Read More", "Sign Up",
    "Back to Top", "Close Search", "Menu", "Get Involved", "Join Our Network"
    ]

    for keyword in button_keywords:
        # Match the keyword followed by optional punctuation or short phrases, but stop before deleting full sentences
        text = re.sub(rf"\b{keyword}\b(?:[\s:–-]{1,3}\w{{1,10}})?[.!]?", "", text, flags=re.IGNORECASE)


    # Remove isolated link text (e.g., "About ISW", "Map Room")
    link_labels = [
        "About ISW", "Analysis", "Education", "Get Involved", "Newsroom", "Map Room",
        "Skip to contentSkip to Content", "Options"
    ]

    for label in link_labels:
        # Match the label only if it's isolated or followed by short UI-like text
        text = re.sub(rf"\b{label}\b(?:[\s:–-]{1,3}\w{{1,10}})?[.!]?", "", text, flags=re.IGNORECASE)

    # Remove extra whitespace
    text = re.sub(r"\s{2,}", " ", text).strip()

    return text

In [29]:
print(f"Original Length: {len(raw_text)} characters")
cleaned_text = clean_text(raw_text)
print(f"Cleaned Length: {len(cleaned_text)} characters\n")

Original Length: 65033 characters
Cleaned Length: 43015 characters



In [30]:
print(f"Original Length: {len(raw_text)} characters")

cleaned_text = clean_text(raw_text)
print(f"Cleaned Length: {len(cleaned_text)} characters\n")

cleaned_text = textwrap.fill(cleaned_text, width=180)
print(cleaned_text)

Original Length: 65033 characters
Cleaned Length: 43015 characters

Russian Offensive Campaign Assessment, Nov. 4, 2025 | ISW PAGE Objectivity, Truth, Insight Our Capabilities ISW’s Comparative Advantage Our History Improving the National Security
Debate Who We Are Meet the Team Careers Join the Mission Open Positions ISW produces the world’s premier open-source conflict maps SEE MAPS The General Jack Keane Center for
National Security Dive into ISW's Complete Map Catalog Briefing Room Videos, Podcasts, and Interactive Content Research Library Browse ISW's Entire Body of Work TEAMS/PORTFOLIOS
Russia & Ukraine Middle East China & Taiwan Adversary Entente Contemporary & Future War Geospatial Intelligence Cognitive Warfare The General David H. Petraeus Center for Emerging
Leaders Hertog War Studies ISW’s Premier Educational Program for Undergraduates Fellowships Enrichment and Advancement Opportunities for Entry-Level and Mid-Career Professionals
Internships Launch your Career at ISW with

In [27]:
print(textwrap.fill(raw_text, width=180))

        Russian Offensive Campaign Assessment, Nov. 4, 2025 | ISW                                                                            Options
DOWNLOAD PAGE                              Share               Skip to contentSkip to Content             Donate  Menu           Menu  About ISW    About ISW Objectivity, Truth,
Insight    Our Capabilities ISW’s Comparative Advantage   Our History Improving the National Security Debate     Who We Are Meet the Team   Careers Join the Mission   Open
Positions        MAP ROOM ISW produces the world’s premier open-source conflict maps   SEE MAPS         Analysis    Analysis The General Jack Keane Center for National Security
Map Room Dive into ISW's Complete Map Catalog   Briefing Room Videos, Podcasts, and Interactive Content   Research Library Browse ISW's Entire Body of Work    TEAMS/PORTFOLIOS
Russia & Ukraine   Middle East   China & Taiwan     Adversary Entente   Contemporary & Future War   Geospatial Intelligence   Cognitive Warfare     