In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import re
import pandas as pd

In [2]:
df = pd.DataFrame()

In [3]:
active =[]
links = []
paragraphs = []
times = []
bases = []
images = []

In [4]:

def is_active_url(url):
    """
    Check if the URL is active by sending a HEAD request.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False



In [5]:
def find_all_links(url, soup):
    """
    Find all the hyperlinks in the current page.
    """
    links = set()
    for link in soup.find_all('a', href=True):
        full_url = urljoin(url, link['href'])
        if url in full_url:  # Only consider URLs in the same domain
            links.add(full_url)
    return links



In [6]:
def crawl_website(base_url, max_pages=100):
    """
    Crawl a website to count active pages.
    
    Parameters:
        base_url (str): The starting URL of the website to crawl.
        max_pages (int): The maximum number of pages to crawl.
    
    Returns:
        int: The count of active pages.
    """
    visited_urls = set()
    urls_to_visit = {base_url}
    active_pages_count = 0

    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop()
        
        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=5)
            if response.status_code == 200:
                print(f"Active URL: {current_url}")
                active_pages_count += 1
                soup = BeautifulSoup(response.content, 'html.parser')
                links = find_all_links(current_url, soup)
                urls_to_visit.update(links - visited_urls)
            else:
                print(f"Inactive URL: {current_url} - Status Code: {response.status_code}")
        
        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")

        # Be polite to the server, avoid hammering it
        time.sleep(1)
    
    print(f"Total active pages: {active_pages_count}")
    return active_pages_count



In [7]:
def getPageTime(url):
    response = requests.get(url)
    html_content = response.text
    
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Check for meta tags related to modification date
    last_modified = soup.find('meta', {'http-equiv': 'last-modified'})
    date_meta = soup.find('meta', {'name': 'date'})
    
    if last_modified:
        return last_modified.get('content')
    elif date_meta:
        return date_meta.get('content')
    else:
        return None


In [8]:
def extract_base_url(url):
    # Regular expression pattern for extracting base URL
    pattern = re.compile(r'^(https?:\/\/[^\/]+)')
    
    # Search for the pattern in the provided URL
    match = pattern.search(url)
    
    # Return the base URL if a match is found, otherwise None
    return match.group(1) if match else None

In [9]:
def getPageText(url):    
    response = requests.get(url)
    html_content = response.text
    
    # Step 2: Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Step 3: Remove Unwanted Tags
    # Find and remove <header>, <footer>, and <nav> tags
    for tag_name in ['header', 'footer', 'nav']:
        for element in soup.find_all(tag_name):
            element.decompose()  # Removes the tag and its content
    
    # Step 4: Extract <p> Tags
    # Find all <p> tags
    paragraphs = soup.find_all('p')
    
    return "\n".join([i.get_text(strip=True) for i in paragraphs])

In [16]:
def getImages(url):
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
    
        # Find all <img> tags on the page
        img_tags = soup.find_all('img')
    
        # Extract the URLs of the images
        img_urls = []
        for img in img_tags:
            img_url = img.get('src')
            # Handle relative URLs by converting them to absolute URLs
            img_url = urljoin(url, img_url)
            img_urls.append(img_url)
    
        # Print the collected image URLs
        print(f"Found {len(img_urls)} images:")
        return img_urls
    else:
        print(f"Failed to retrieve webpage. Status code: {response.status_code}")
        return []

In [17]:
visited_urls = set()

In [18]:
start = time.time()

In [23]:
def crawl_website(base_url, max_pages=100):
    
    global visited_urls

    try:
        response = requests.get(base_url, timeout=5)
        if response.status_code == 200:
            print(f"Active URL: {base_url}")
            soup = BeautifulSoup(response.content, 'html.parser')
            current_links = find_all_links(base_url, soup)
            visited_urls.add(base_url)
        else:
            print(f"Inactive URL: {base_url} - Status Code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Failed to fetch {base_url}: {e}")

    for i in [ j for j in current_links if j not in visited_urls]:
        if(is_active_url(i)):
            paragraphs.append(getPageText(i))
            links.append(i)
            crawl_website(i)
            times.append(getPageTime(i))
            bases.append(extract_base_url(i))
            images.append(getImages(i))

In [24]:
crawl_website("https://www.buckner.org/", 10)

Active URL: https://www.buckner.org/
Active URL: https://www.buckner.org/foster-care-adoption-post-adoption
Found 4 images:
Active URL: https://www.buckner.org/family-hope-centers-faq
Found 4 images:
Active URL: https://www.buckner.org/blog
Active URL: https://www.buckner.org/blog/empowering-dads-at-the-2024-fatherhood-summit
Active URL: https://www.buckner.org/blog/empowering-dads-at-the-2024-fatherhood-summit#slide0
Found 7 images:
Found 7 images:
Active URL: https://www.buckner.org/blog/houston-church-celebrating-more-than-two-decades-of-impacting-vulnerable-children
Active URL: https://www.buckner.org/blog/houston-church-celebrating-more-than-two-decades-of-impacting-vulnerable-children#slide0
Found 8 images:
Found 8 images:
Active URL: https://www.buckner.org/blog/giving-and-receiving-grace
Active URL: https://www.buckner.org/blog/giving-and-receiving-grace#slide0
Found 6 images:
Found 6 images:
Active URL: https://www.buckner.org/blog/page,5/
Found 16 images:
Active URL: https://

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 images:
Active URL: https://www.buckner.org/#slide0
Found 10 images:
Active URL: https://www.buckner.org/blog/trusting-god-with-our-fears
Found 6 images:
Active URL: https://www.buckner.org/retirement-services/senior-independent-living
Found 11 images:
Active URL: https://www.buckner.org/guas-para-redes-sociales-de-buckner
Found 4 images:
Active URL: https://www.buckner.org/foster-care-adoption
Found 8 images:
Active URL: https://www.buckner.org/volunteer
Found 15 images:
Active URL: https://www.buckner.org/financial-accountability
Found 6 images:
Active URL: https://www.buckner.org/family-hope-centers-blog
Found 14 images:
Active URL: https://www.buckner.org/radio
Found 7 images:


KeyboardInterrupt: 

In [25]:
end = time.time()

In [26]:
print(end - start)

1653.4505710601807


In [27]:
len(links)

215

In [28]:
len(paragraphs)

215

In [29]:
len(times)

214

In [33]:
df = pd.DataFrame({"links":links[0:20], "paragraphs":paragraphs[0:20], "time": times[0:20], "base_urls":bases[0:20],  "img_urls":images[0:20]})

In [34]:
df

Unnamed: 0,links,paragraphs,time,base_urls,img_urls
0,https://www.buckner.org/where-we-work,We're excited to share with you all the ways y...,,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
1,https://www.buckner.org/dallas-buckner-center-...,Did you know you can change the life of a chil...,,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
2,https://www.buckner.org/foster-care-adoption-p...,"""And my God will meet all your needs according...",,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
3,https://www.buckner.org/family-hope-centers-faq,Families meet with a Buckner social worker who...,,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
4,https://www.buckner.org/blog,Read more\nRead more\nRead more\nRead more\nRe...,,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
5,https://www.buckner.org/blog/empowering-dads-a...,"On August 8, theBuckner Rio Grande Children’s ...",,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
6,https://www.buckner.org/blog/empowering-dads-a...,"On August 8, theBuckner Rio Grande Children’s ...",,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
7,https://www.buckner.org/blog/houston-church-ce...,"South Main Baptist Church inHouston, Texas, ha...",,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
8,https://www.buckner.org/blog/houston-church-ce...,"South Main Baptist Church inHouston, Texas, ha...",,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...
9,https://www.buckner.org/blog/giving-and-receiv...,We often define grace as an undeserved gift fr...,,https://www.buckner.org,[https://ct.pinterest.com/v3/?event=init&tid=2...


Full URL: https://example.com/path/to/resource?query=1#fragment
Base URL: https://example.com

Full URL: http://www.example.com:8080/path?query=123
Base URL: http://www.example.com:8080

Full URL: https://sub.example.com/path/to/page
Base URL: https://sub.example.com

Full URL: ftp://example.com/file
Base URL: None



In [14]:

url = "https://www.buckner.org/"
# Send a GET request to fetch the webpage content


Found 10 images:
https://ct.pinterest.com/v3/?event=init&tid=2612808026275&pd[em]=<hashed_email_address
https://www.buckner.org/images/main/buckner-intl-logo-horizontal-blue.svg
https://www.buckner.org/images/main/buckner-intl-logo-horizontal-white.svg
https://www.buckner.org/images/r/sept-header-1/c1080x405g0-0-1079-405/sept-header-1.jpg
https://www.buckner.org/images/r/sept-header-1/60x60g0-0-1079-405/sept-header-1.jpg
https://www.buckner.org/images/r/sept-header-1/60x60g0-0-1079-405/sept-header-1.jpg
https://www.buckner.org/images/r/helping-others/c480x320g8-136-1160-784/helping-others.jpg
https://www.buckner.org/images/r/children-at-school/c480x320g0-135-1152-783/children-at-school.jpg
https://www.buckner.org/images/r/grace/c480x320g0-205-1152-853/grace.jpg
https://www.buckner.org/images/main/buckner-intl-logo-stacked-blue.svg
