### Getting url from extension

In [1]:
import requests
from bs4 import BeautifulSoup as bs
# import readability
from readability import Document

In [4]:
url = "https://www.foxnews.com/politics/judge-grants-19-ags-preliminary-injunction-against-doge-access-treasury-payment-system"


### Scraper

In [5]:
def fetch_webpage(url):
    # Define headers to simulate a browser
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36")
    }
    
    try:
        # Perform GET request with headers and a timeout
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for bad responses (4xx, 5xx)
        
        # Return the raw HTML content
        return response.text
        
    except requests.exceptions.HTTPError as http_err:
        # Log HTTP errors with status codes
        print(f"HTTP error occurred: {http_err}")
        return None
        
    except requests.exceptions.RequestException as req_err:
        # Handle other request-related errors (e.g., connection errors, timeouts)
        print(f"Request error occurred: {req_err}")
        return None


In [6]:
# Regex
import re

def parse_webpage(html_content):
    # Initialize BeautifulSoup with the HTML content
    soup = bs(html_content, "html.parser")
    
    # # Remove unwanted elements
    # for element in soup(["script", "style", "noscript", "header", "footer", "nav"]):
    #     element.decompose()
    
    # Optionally, you can target specific tags that likely contain the main content,
    # like <article> or <main>. For example:
    main_content = soup.find("article") or soup.find("main") or soup.find("p")
    if main_content:
        text = main_content.get_text(separator="\n")
    else:
        text = soup.get_text(separator="\n")
    
    # For simplicity, here we'll extract text from the entire cleaned soup
    text = soup.get_text(separator="\n")
    
    # Clean up the extracted text:
    # 1. Remove leading/trailing whitespace from each line
    lines = [line.strip() for line in text.splitlines()]
    # 2. Remove empty lines
    clean_lines = [line for line in lines if line]
    # 3. Join lines into a single string
    clean_text = "\n".join(clean_lines)
    
    # Further cleaning: collapse multiple spaces into one
    clean_text = re.sub(r'\s{2,}', ' ', clean_text)
    
    return clean_text


In [7]:
def parse_with_readability(html_content):
    # Use readability to extract the main content
    doc = Document(html_content)
    # Get the cleaned HTML of the main content
    summary_html = doc.summary()
    # Optionally, get the title extracted by readability
    title = doc.title()
    
    # Parse the summary HTML with BeautifulSoup
    soup = bs(summary_html, "lxml")
    text = soup.get_text(separator="\n")
    
    # Clean up the extracted text:
    # 1. Strip whitespace from each line
    lines = [line.strip() for line in text.splitlines()]
    # 2. Filter out empty lines
    clean_lines = [line for line in lines if line]
    # 3. Join the lines back into a single string
    clean_text = "\n".join(clean_lines)
    
    # Further cleaning: collapse multiple spaces into one
    clean_text = re.sub(r'\s{2,}', ' ', clean_text)
    
    return clean_text, title

In [8]:

# Example usage:
html_content = fetch_webpage("https://www.foxnews.com/politics/judge-grants-19-ags-preliminary-injunction-against-doge-access-treasury-payment-system")  # Use your previous function here
# parsed_text = parse_webpage(html_content)
extracted_text, article_title = parse_with_readability(html_content)
# print(parsed_text)


In [None]:
from flask import Flask, request, jsonify
import re


In [11]:
article_title

'DOGE temporaily blocked from accessing Treasury payment system | Fox News'

https://edition.cnn.com/2025/02/21/politics/trump-fires-top-us-general-cq-brown/index.html

curl -X POST http://127.0.0.1:5000/scrape -H "Content-Type: application/json" -d "{\"url\": \"https://edition.cnn.com/2025/02/21/politics/trump-fires-top-us-general-cq-brown/index.html\"}"


In [10]:
print(extracted_text)

Fox News national correspondent Bryan Llenas has the latest on 19 states suing to stop DOGE from accessing certain information on ‘America Reports.’
A
federal judge
on Friday granted an injunction requested by 19 attorneys general to prevent the Elon Musk-led Department of Government Efficiency (DOGE) from having access to the Treasury Department's central payment system.
The ruling by U.S. District Judge Jeannette Vargas extends the pause by issuing a preliminary injunction, a legal step that blocks access to the records while the case is litigated on the merits.
In her 64-page decision, Vargas noted she was granting the preliminary injunction
preventing DOGE
from accessing the payment records because of the possible disclosure of the states’ bank records. However, she also said the plaintiffs "have not demonstrated that they are entitled to the broad and sweeping relief they seek, which would far exceed the scope of the present TRO (Temporary restraining order)."
The White House on W

In [5]:

# Example usage:
url = "https://www.foxnews.com/politics/judge-grants-19-ags-preliminary-injunction-against-doge-access-treasury-payment-system"
html_content = fetch_webpage(url)
if html_content:
    print("Successfully fetched webpage content.")
else:
    print("Failed to fetch webpage content.")


Successfully fetched webpage content.




In [17]:
url

'https://www.foxnews.com/politics/judge-grants-19-ags-preliminary-injunction-against-doge-access-treasury-payment-system'

In [None]:
request = requests.get(url)

curl -X POST http://127.0.0.1:5000/scrape \ -H "Content-Type: application/json" \ -d '{"url": "https://www.foxnews.com/politics/judge-grants-19-ags-preliminary-injunction-against-doge-access-treasury-payment-system"}'

### ___

In [16]:
from flask import Flask, request, jsonify
from readability import Document
from bs4 import BeautifulSoup
import requests
import re

app = Flask(__name__)

def fetch_webpage(url):
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/90.0.4430.93 Safari/537.36")
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for bad responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

def parse_with_readability(html_content):
    # Use readability to extract the main content and title
    doc = Document(html_content)
    summary_html = doc.summary()
    title = doc.title()
    
    # Parse the summary HTML using BeautifulSoup
    soup = BeautifulSoup(summary_html, "lxml")
    text = soup.get_text(separator="\n")
    
    # Clean up the extracted text:
    lines = [line.strip() for line in text.splitlines()]
    clean_lines = [line for line in lines if line]
    clean_text = "\n".join(clean_lines)
    clean_text = re.sub(r'\s{2,}', ' ', clean_text)
    
    return clean_text, title

@app.route('/scrape', methods=['POST'])
def scrape():
    # Get JSON data from the POST request
    data = request.get_json()
    url = data.get("url")
    if not url:
        return jsonify({"error": "Missing URL parameter"}), 400

    # Fetch the webpage content
    html_content = fetch_webpage(url)
    if not html_content:
        return jsonify({"error": "Failed to fetch webpage content"}), 500

    # Parse the webpage content using readability and BeautifulSoup
    parsed_text, title = parse_with_readability(html_content)

    # Prepare the result dictionary
    result = {
        "url": url,
        "title": title,
        "text": parsed_text
    }

    # Return the result as a JSON response
    return jsonify(result)

if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

In [14]:

if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1