In [7]:
import requests
from bs4 import BeautifulSoup
import re

# Function to filter out irrelevant links
def is_relevant_link(url):
    irrelevant_patterns = [
        r"^/$",
        r"/(sitemap|adoption-subsidy|content|news|profile)",
        r".*?facebook\.com",
        r".*?twitter\.com",
        r".*?youtube\.com",
        r".*?instagram\.com",
        r".*?linkedin\.com"
    ]
    for pattern in irrelevant_patterns:
        if re.match(pattern, url):
            return False
    return True

# Function to scrape all links from a webpage
def scrape_links(url):
    links = []
    page_number = 1
    
    while True:
        page_url = f"{url}?page={page_number}"
        
        # Send a GET request to the URL
        response = requests.get(page_url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all anchor tags (links) on the page
            page_links = soup.find_all('a', href=True)
            
            # Extract and append the URLs
            for link in page_links:
                href = link['href']
                if is_relevant_link(href):
                    links.append(href)
            
            # Check if there is a next page
            next_page_link = soup.find('a', text='Next')
            if next_page_link:
                page_number += 1
            else:
                break
        else:
            print("Failed to retrieve page:", page_url)
            break
    
    return links

# URL of the sitemap
sitemap_url = "https://dcs.az.gov/sitemap"

# Call the function to scrape links from the sitemap
sitemap_links = scrape_links(sitemap_url)

# Print the scraped links
for link in sitemap_links:
    print("Link:", link)


Link: #main-content
Link: https://dcs.az.gov/about
Link: https://dcs.az.gov/about
Link: https://dcs.az.gov/about/administration/dcsdirector
Link: https://dcs.az.gov/about/administration/dcsCEO
Link: https://dcs.az.gov/about/engage
Link: /about/offices
Link: /about/policy
Link: /about/policy
Link: /about/policy/rulemaking
Link: /about/policy/eeo
Link: /about/policy/non-discrimination
Link: /about/policy/lep
Link: /about/procurement
Link: /about/strategic-plan
Link: /about/community-advisory-committee
Link: /about/volunteer
Link: /about/volunteer
Link: /about/volunteer/interest-form
Link: /about/volunteer/giving-tree
Link: https://dcs.az.gov/LuggageofLove
Link: /about/volunteer/community-screening-partners
Link: https://dcs.az.gov/fact
Link: /about/contact
Link: /careers
Link: /careers
Link: /careers/search/child-safety-specialist
Link: /careers/search/ocwi
Link: https://dcs.az.gov/careers/search
Link: /careers/compassioneers
Link: /reports
Link: /reports
Link: https://dcs.az.gov/settlem

In [9]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Function to filter out irrelevant links
def is_relevant_link(url):
    irrelevant_patterns = [
        r"^/$",
        r"/(sitemap|adoption-subsidy|content|news|profile)",
        r".*?facebook\.com",
        r".*?twitter\.com",
        r".*?youtube\.com",
        r".*?instagram\.com",
        r".*?linkedin\.com"
    ]
    for pattern in irrelevant_patterns:
        if re.match(pattern, url):
            return False
    return True

# Function to scrape text from a webpage
def scrape_text(url):
    # Check if the URL is a relative URL or an anchor link
    if url.startswith("#") or url.startswith("/"):
        # Construct the complete URL using the base URL of the sitemap
        complete_url = sitemap_url + url
    else:
        complete_url = url

    # Send a GET request to the complete URL
    response = requests.get(complete_url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all text content on the page
        text_content = soup.get_text()
        
        return text_content
    else:
        print("Failed to retrieve page:", complete_url)
        return None


# URL of the sitemap
sitemap_url = "https://dcs.az.gov/sitemap"

# Call the function to scrape links from the sitemap
sitemap_links = scrape_links(sitemap_url)

# Scrape text content from each relevant link
corpus = ""
for link in sitemap_links:
    if is_relevant_link(link):
        text = scrape_text(link)
        if text:
            corpus += text + "\n\n"

# Clean and preprocess the corpus (remove HTML tags, special characters, etc.)
# Tokenize the corpus into words
tokens = word_tokenize(corpus.lower())  # Convert to lowercase to maintain consistency
clean_tokens = [token for token in tokens if token.isalnum()]  # Remove non-alphanumeric tokens

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in clean_tokens if token not in stop_words]

# Join tokens back into a string
cleaned_corpus = ' '.join(filtered_tokens)

# Save the corpus to a file
with open("corpus.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_corpus)

print("Corpus created and saved successfully!")


[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [13]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

def is_relevant_link(url):
    if not isinstance(url, str):  # Check if the URL is a string
        print("Invalid URL type:", url)
        return False
    
    irrelevant_patterns = [
        r"^/$",
        r"/(sitemap|adoption-subsidy|content|news|profile)",
        r".*?facebook\.com",
        r".*?twitter\.com",
        r".*?youtube\.com",
        r".*?instagram\.com",
        r".*?linkedin\.com"
    ]
    for pattern in irrelevant_patterns:
        if re.match(pattern, url):
            return False
    return True


# Function to scrape text from a webpage
def scrape_text(url):
    # Check if the URL is a relative URL or an anchor link
    if url.startswith("#") or url.startswith("/"):
        # Construct the complete URL using the base URL of the sitemap
        complete_url = sitemap_url + url
    else:
        complete_url = url

    # Send a GET request to the complete URL
    try:
        response = requests.get(complete_url, timeout=10)  # Set timeout to 10 seconds
    except requests.exceptions.RequestException as e:
        print(f"Error occurred for: {complete_url} - {e}")
        return None

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all text content on the page
        text_content = soup.get_text()
        
        return text_content
    else:
        print("Failed to retrieve page:", complete_url)
        return None

# URL of the sitemap
sitemap_url = "https://dcs.az.gov/sitemap"

# Call the function to scrape links from the sitemap
sitemap_links = scrape_links(sitemap_url)

# Initialize progress bar
pbar = tqdm(total=len(sitemap_links), desc="Scraping Progress")

# Scrape text content from each relevant link
corpus = ""
for link in sitemap_links:
    pbar.update(1)  # Increment progress bar
    if is_relevant_link(link):
        text = scrape_text(link)
        if text:
            corpus += text + "\n\n"
        else:
            print("Skipping link:", link)

# Close progress bar
pbar.close()

# Clean and preprocess the corpus (remove HTML tags, special characters, etc.)
# Tokenize the corpus into words
tokens = word_tokenize(corpus.lower())  # Convert to lowercase to maintain consistency
clean_tokens = [token for token in tokens if token.isalnum()]  # Remove non-alphanumeric tokens

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in clean_tokens if token not in stop_words]

# Join tokens back into a string
cleaned_corpus = ' '.join(filtered_tokens)

# Save the corpus to a file
with open("corpus.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_corpus)

print("Corpus created and saved successfully!")


[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Scraping Progress:  69%|██████████████████████████████████████▊                 | 208/300 [10:04:40<4:27:27, 174.43s/it]
Scraping Progress:  21%|█████████████                                                  | 62/300 [00:38<13:52,  3.50s/it]

Error occurred for: https://www.adoptuskids.org/meet-the-children/search-for-children/search - HTTPSConnectionPool(host='www.adoptuskids.org', port=443): Read timed out. (read timeout=10)
Skipping link: https://www.adoptuskids.org/meet-the-children/search-for-children/search


Scraping Progress:  68%|██████████████████████████████████████████▎                   | 205/300 [01:28<00:22,  4.14it/s]

Invalid URL type: None


Scraping Progress: 100%|██████████████████████████████████████████████████████████████| 300/300 [01:55<00:00,  2.60it/s]


Corpus created and saved successfully!
