In [7]:
import requests

# Baseline request sent without any custom headers; the recorded output of
# this cell shows the status code the site answered with.
url = 'https://www.visitmaryland.org/'
# Pass an explicit timeout: requests waits indefinitely by default, which
# would leave the cell hanging if the server never responds.
response = requests.get(url, timeout=10)
print('HTTP status code:', response.status_code)


HTTP status code: 403


In [8]:
import requests
from bs4 import BeautifulSoup


def get_http_status(url, timeout=10):
    """Fetch ``url`` with a custom User-Agent and return its status and body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout the request could block
            indefinitely on an unresponsive host.

    Returns:
        A ``(status_code, text)`` tuple: the numeric HTTP status and the
        response body as text. Error statuses (4xx/5xx) are returned like any
        other status rather than raised, so callers must check the code.

    Raises:
        Any exception raised by ``requests.get`` (for example on a connection
        failure or when the timeout expires) propagates to the caller.
    """
    # Identify the client explicitly instead of using the library default.
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyBot/0.1; +http://example.com/bot)"
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    return resp.status_code, resp.text

def extract_visible_text(html_content):
    """Return the page's text with <script> and <style> content removed.

    The markup is parsed with ``html.parser`` and every <script>/<style>
    element is dropped from the tree. The remaining text is collected with a
    newline between text pieces, each line is trimmed, and blank lines are
    discarded. The result is the surviving lines joined by newlines.
    """
    document = BeautifulSoup(html_content, "html.parser")

    # Remove elements whose contents are code/CSS rather than page text.
    for hidden in document.find_all(["script", "style"]):
        hidden.decompose()

    raw_text = document.get_text(separator="\n")

    # Trim every line and keep only the ones that still contain something.
    return "\n".join(
        trimmed
        for trimmed in (piece.strip() for piece in raw_text.splitlines())
        if trimmed
    )

def extract_headings(html_content, levels=("h1","h2","h3")):
    """Collect headings from the page as ``(tag, text)`` pairs.

    Results are grouped by level in the order given in ``levels`` (every
    match for the first level, then every match for the second, and so on)
    rather than interleaved across levels. Heading text has its surrounding
    whitespace stripped.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    return [
        (tag_name, node.get_text().strip())
        for tag_name in levels
        for node in parsed.find_all(tag_name)
    ]

def extract_links(html_content):
    """Return the distinct ``href`` values of the page's <a> tags, sorted.

    Only anchors that actually carry an ``href`` attribute are considered.
    Values are returned exactly as written in the markup (they are not
    resolved or normalised), de-duplicated and sorted as strings.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    unique_hrefs = {anchor["href"] for anchor in parsed.find_all("a", href=True)}
    return sorted(unique_hrefs)

def save_first_paragraph(html_content, filename):
    """Save the first non-empty <p> of a Wikipedia article body to a file.

    Args:
        html_content: HTML of a Wikipedia article page.
        filename: Path of the text file to (over)write, encoded as UTF-8.

    Returns:
        The paragraph text that was written, stripped of surrounding
        whitespace.

    Raises:
        ValueError: If the ``mw-content-text`` div is missing, if it contains
            no <p> tag, or if none of its <p> tags contains any text. Nothing
            is written to ``filename`` in these cases.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # The article body is looked up by the id "mw-content-text".
    content_div = soup.find("div", {"id": "mw-content-text"})
    if content_div is None:
        raise ValueError("Could not find main content div")

    paragraphs = content_div.find_all("p")
    if not paragraphs:
        raise ValueError("No <p> tag found in Wikipedia content")

    # The very first <p> may be an empty placeholder (or whitespace only);
    # taking it blindly would save an empty file, so use the first paragraph
    # that actually has text.
    first_para = next(
        (text for text in (p.get_text().strip() for p in paragraphs) if text),
        None,
    )
    if first_para is None:
        raise ValueError("No non-empty <p> tag found in Wikipedia content")

    # Write only after validation so a failure never leaves a stray file.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(first_para)
    return first_para


def main():
    """Run the scraping demo against VisitMaryland and a Wikipedia article.

    For the VisitMaryland home page, prints the HTTP status code and up to
    2000 characters of its visible text. For the Wikipedia "Natural language
    processing" article, prints the status code, the h1-h3 headings, up to 30
    links plus the total link count, then saves the first paragraph to
    ``nlp_intro.txt`` and prints it.
    """
    # --- VisitMaryland: status code and a preview of the visible text ---
    md_status, md_html = get_http_status("https://www.visitmaryland.org/")
    print(f"VisitMaryland.org HTTP status code: {md_status}\n")

    md_text = extract_visible_text(md_html)
    print("Visible Text from VisitMaryland.org:")
    print("-" * 40)
    print(md_text[:2000])  # preview only: first 2000 characters
    print("\n--- End of visible text preview ---\n")

    # --- Wikipedia NLP article: status code and headings ---
    wiki_status, wiki_html = get_http_status(
        "https://en.wikipedia.org/wiki/Natural_language_processing"
    )
    print(f"Wikipedia NLP page HTTP status code: {wiki_status}\n")

    wiki_headings = extract_headings(wiki_html)
    print("Headings (h1, h2, h3) from Wikipedia NLP page:")
    for level, title in wiki_headings:
        print(f"{level}: {title}")
    print()

    # --- Wikipedia NLP article: links (numbered from 1, first 30 shown) ---
    wiki_links = extract_links(wiki_html)
    print("Some links from the Wikipedia NLP page:")
    for position, href in enumerate(wiki_links[:30], start=1):
        print(f"{position}: {href}")
    print(f"... (Total links found: {len(wiki_links)})\n")

    # --- Wikipedia NLP article: save and show the introductory paragraph ---
    output_path = "nlp_intro.txt"
    intro = save_first_paragraph(wiki_html, output_path)
    print(f"First paragraph of NLP page saved to {output_path}:")
    print(intro)

# Run the demo only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


VisitMaryland.org HTTP status code: 403

Visible Text from VisitMaryland.org:
----------------------------------------
Just a moment...
Enable JavaScript and cookies to continue

--- End of visible text preview ---

Wikipedia NLP page HTTP status code: 200

Headings (h1, h2, h3) from Wikipedia NLP page:
h1: Natural language processing
h2: Contents
h2: History
h2: Approaches: Symbolic, statistical, neural networks
h2: Common NLP tasks
h2: General tendencies and (possible) future directions
h2: See also
h2: References
h2: Further reading
h2: External links
h3: Symbolic NLP (1950s – early 1990s)
h3: Statistical NLP (1990s–present)
h3: Statistical approach
h3: Neural networks
h3: Text and speech processing
h3: Morphological analysis
h3: Syntactic analysis
h3: Lexical semantics (of individual words in context)
h3: Relational semantics (semantics of individual sentences)
h3: Discourse (semantics beyond individual sentences)
h3: Higher-level NLP applications
h3: Cognition

Some links from the