In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import re
import pandas as pd

In [2]:
df = pd.DataFrame()

In [3]:
active =[]
links = []
paragraphs = []
times = []
bases = []
images = []

In [4]:

def is_active_url(url):
    """
    Check if the URL is active by sending a HEAD request.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False



In [5]:
def find_all_links(url, soup):
    """
    Find all the hyperlinks in the current page.
    """
    links = set()
    for link in soup.find_all('a', href=True):
        full_url = urljoin(url, link['href'])
        if url in full_url:  # Only consider URLs in the same domain
            links.add(full_url)
    return links



In [6]:
def getPageTime(url):
    response = requests.get(url)
    html_content = response.text
    
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Check for meta tags related to modification date
    last_modified = soup.find('meta', {'http-equiv': 'last-modified'})
    date_meta = soup.find('meta', {'name': 'date'})
    
    if last_modified:
        return last_modified.get('content')
    elif date_meta:
        return date_meta.get('content')
    else:
        return None


In [7]:
def extract_base_url(url):
    # Regular expression pattern for extracting base URL
    pattern = re.compile(r'^(https?:\/\/[^\/]+)')
    
    # Search for the pattern in the provided URL
    match = pattern.search(url)
    
    # Return the base URL if a match is found, otherwise None
    return match.group(1) if match else None

In [8]:
def getPageText(url):    
    response = requests.get(url)
    html_content = response.text
    
    # Step 2: Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Step 3: Remove Unwanted Tags
    # Find and remove <header>, <footer>, and <nav> tags
    for tag_name in ['header', 'footer', 'nav']:
        for element in soup.find_all(tag_name):
            element.decompose()  # Removes the tag and its content
    
    # Step 4: Extract <p> Tags
    # Find all <p> tags
    paragraphs = soup.find_all('p')
    
    return "\n".join([i.get_text(strip=True) for i in paragraphs])

In [9]:
def getImages(url):
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
    
        # Find all <img> tags on the page
        img_tags = soup.find_all('img')
    
        # Extract the URLs of the images
        img_urls = []
        for img in img_tags:
            img_url = img.get('src')
            # Handle relative URLs by converting them to absolute URLs
            img_url = urljoin(url, img_url)
            img_urls.append(img_url)
    
        # Print the collected image URLs
        print(f"Found {len(img_urls)} images:")
        return img_urls
    else:
        print(f"Failed to retrieve webpage. Status code: {response.status_code}")
        return []

In [10]:
visited_urls = set()

In [11]:
start = time.time()

In [12]:
def crawl_website(base_url, max_pages=100):
    
    global visited_urls

    try:
        response = requests.get(base_url, timeout=5)
        if response.status_code == 200:
            print(f"Active URL: {base_url}")
            soup = BeautifulSoup(response.content, 'html.parser')
            current_links = find_all_links(base_url, soup)
            visited_urls.add(base_url)
        else:
            print(f"Inactive URL: {base_url} - Status Code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Failed to fetch {base_url}: {e}")

    for i in [ j for j in current_links if j not in visited_urls]:
        if(is_active_url(i)):
            paragraphs.append(getPageText(i))
            links.append(i)
            crawl_website(i)
            times.append(getPageTime(i))
            bases.append(extract_base_url(i))
            images.append(getImages(i))

In [13]:
crawl_website("https://www.nation.co.ke", 10)

Active URL: https://www.nation.co.ke
Active URL: https://www.nation.co.ke/kenya/account#purchases
Found 1 images:
Active URL: https://www.nation.co.ke/kenya/counties/mountain
Found 23 images:
Active URL: https://www.nation.co.ke/kenya/blogs-opinion/opinion/we-must-take-stand-on-world-issues-4748904
Active URL: https://www.nation.co.ke/kenya/blogs-opinion/opinion/we-must-take-stand-on-world-issues-4748904#summary
Found 10 images:
Active URL: https://www.nation.co.ke/kenya/blogs-opinion/opinion/we-must-take-stand-on-world-issues-4748904#related
Found 10 images:
Active URL: https://www.nation.co.ke/kenya/blogs-opinion/opinion/we-must-take-stand-on-world-issues-4748904#story
Found 10 images:
Found 10 images:
Active URL: https://www.nation.co.ke/kenya/life-and-style/wellness
Found 27 images:
Active URL: https://www.nation.co.ke/kenya/audio
Active URL: https://www.nation.co.ke/kenya/audio/how-to-be-a-good-son-in-law-4638366
Found 4 images:
Active URL: https://www.nation.co.ke/kenya/audio/wha

KeyboardInterrupt: 

In [14]:
end = time.time()

In [15]:
print(end - start)

334.07860922813416


In [16]:
df = pd.DataFrame({"links":links[0:20], "paragraphs":paragraphs[0:20], "time": times[0:20], "base_urls":bases[0:20],  "img_urls":images[0:20]})

In [23]:
df.to_parquet("../data/nation.parquet")

In [24]:
df.to_csv("../data/nation.csv")