<a href="https://colab.research.google.com/github/Thomas13t/webcrawler/blob/main/webcrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# =======================================
# STEP 0: Install Dependencies
# =======================================
!pip install requests beautifulsoup4 ipywidgets

import os
import re
import time
import requests
import ipywidgets as widgets
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime
from IPython.display import display, clear_output
from google.colab import drive

# =======================================
# Utility Functions
# =======================================
def is_same_domain(url, base_domain):
    """Check if the given URL is on the same domain as the base domain.

    The comparison is made on the URL's full network location (``netloc``,
    which includes any port). Host names are case-insensitive, so both sides
    are lower-cased first: ``https://Example.com/a`` matches ``example.com``.

    Args:
        url: URL to test.
        base_domain: Network location to compare against, e.g. the
            ``netloc`` of the crawl's start URL.

    Returns:
        True if the URL's ``netloc`` equals ``base_domain`` ignoring case.
        URLs without a network location (``mailto:``, ``javascript:`` ...)
        never match a non-empty ``base_domain``.
    """
    return urlparse(url).netloc.lower() == base_domain.lower()

def clean_filename(name):
    """Return `name` with filename-unsafe characters replaced.

    Every run of characters other than ASCII letters, digits, underscore,
    hyphen, dot, or space is collapsed into a single underscore; leading and
    trailing whitespace is then trimmed from the result.
    """
    unsafe_run = re.compile(r'[^A-Za-z0-9_\-\. ]+')
    sanitized = unsafe_run.sub('_', name)
    return sanitized.strip()

def crawl_website(start_url, max_pages=None, delay=1):
    """
    Crawl a website starting from `start_url`, extracting:
      - The <title> of each page
      - The entire text content of each page (in the same domain).

    Args:
        start_url: Absolute URL where the crawl begins. Its network location
            decides which discovered links count as "same domain".
        max_pages: Optional cap on the number of URLs attempted. A falsy
            value (None or 0) means no limit.
        delay: Seconds to sleep after each request that completed without
            raising.

    Returns: {url: (title, text)} dictionary with one entry per HTML page
    that was fetched successfully. URLs are keyed without their #fragment.
    """
    base_domain = urlparse(start_url).netloc

    visited = set()
    # A #fragment only addresses a position inside a page and is never sent
    # to the server, so strip it; otherwise "page" and "page#top" would be
    # fetched and stored as two different pages.
    to_visit = [start_url.split('#', 1)[0]]
    page_data = {}  # {url: (title, text)}

    while to_visit:
        # pop() takes from the end, so the most recently found link goes first.
        url = to_visit.pop()

        if url in visited:
            continue
        visited.add(url)

        # Respect max_pages if provided
        if max_pages and len(visited) > max_pages:
            break

        try:
            response = requests.get(url, timeout=10)
            time.sleep(delay)  # Polite crawling: wait between requests
        except Exception as e:
            # Deliberately broad: one bad URL must not abort the whole crawl.
            print(f"Failed to retrieve {url}: {e}")
            continue

        # Error pages (404, 500, ...) usually come back as HTML too; do not
        # store them as if they were site content.
        if not response.ok:
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        # Only parse HTML content
        content_type = response.headers.get('Content-Type', '').lower()
        if 'text/html' not in content_type:
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the page title if it exists; otherwise use the URL
        title_tag = soup.find('title')
        page_title = title_tag.get_text(strip=True) if title_tag else url

        # Extract all text from the page
        page_text = soup.get_text(separator="\n", strip=True)

        # Store the data
        page_data[url] = (page_title, page_text)

        # Find new links on the same domain. Relative hrefs are resolved
        # against the final URL after redirects, since that is the location
        # the page's links are relative to.
        for link_tag in soup.find_all('a', href=True):
            link_url = urljoin(response.url, link_tag['href']).split('#', 1)[0]
            if is_same_domain(link_url, base_domain) and link_url not in visited:
                to_visit.append(link_url)

    return page_data

# =======================================
# STEP 1: Widgets for a user-friendly UI
# =======================================

# --- Drive mounting ---------------------------------------------------
# Button the user clicks to mount Google Drive.
mount_button = widgets.Button(
    icon='hdd',
    tooltip='Click to mount your Google Drive',
    button_style='info',
    description='Mount Drive',
)

# --- Destination folder -----------------------------------------------
# Free-text name of the Drive folder that receives the crawl results,
# plus the button that creates it when it is missing.
folder_text = widgets.Text(
    description='Folder:',
    placeholder='Folder in MyDrive to store crawl results',
    value='WebCrawlResults',
)

folder_button = widgets.Button(
    icon='folder-open',
    tooltip='Click to create folder if it does not exist',
    button_style='success',
    description='Create/Check Folder',
)

# --- Crawl target -----------------------------------------------------
# Address of the site to crawl; pre-filled with a default.
url_widget = widgets.Text(
    description='URL:',
    placeholder='Enter website URL here...',
    value='https://example.com',
)

# Button that starts the crawl and writes the text files.
crawl_button = widgets.Button(
    icon='search',
    tooltip='Crawl the website and save text files to the specified folder',
    button_style='warning',
    description='Crawl & Save',
)

# --- Log output -------------------------------------------------------
# Shared output area that the button handlers print into.
output_area = widgets.Output()

# =======================================
# STEP 2: Define the event handlers
# =======================================
@output_area.capture(clear_output=True)
def on_mount_button_click(b):
    """Click handler: mount Google Drive at /content/drive and confirm it.

    `b` is the argument supplied by the button's click event; it is unused.
    Anything printed here is captured into `output_area`, which is cleared
    first.
    """
    mount_point = '/content/drive'
    drive.mount(mount_point)
    print("Google Drive mounted successfully.")

@output_area.capture(clear_output=False)
def on_folder_button_click(b):
    """Create or check the specified folder in MyDrive.

    Reads the folder name from `folder_text`, resolves it to a path, creates
    the folder if it is missing, and publishes the path in the module-level
    global `save_folder_path` for the crawl handler to use.

    A relative name such as ``WebCrawlResults`` is placed under
    ``/content/drive/MyDrive``. An absolute path (for example a folder under
    ``/content/drive/Shareddrives/...``) is used exactly as entered.

    Args:
        b: Argument supplied by the button's click event; unused.
    """
    global save_folder_path
    folder_name = folder_text.value.strip()
    if not folder_name:
        print("Please provide a valid folder name.")
        return

    drive_root = '/content/drive'
    if os.path.isabs(folder_name):
        target_path = folder_name
    else:
        target_path = os.path.join(drive_root, 'MyDrive', folder_name)

    # Creating directories under the mount point before Drive is mounted
    # would write to local disk and leave files where the mount should go,
    # so refuse instead.
    under_drive = target_path.startswith(drive_root + '/')
    if under_drive and not os.path.isdir(drive_root):
        print("Please mount Google Drive before creating the folder.")
        return

    save_folder_path = target_path
    if not os.path.exists(save_folder_path):
        os.makedirs(save_folder_path, exist_ok=True)
        print(f"Folder created: {save_folder_path}")
    else:
        print(f"Folder already exists: {save_folder_path}")

@output_area.capture(clear_output=False)
def on_crawl_button_click(b):
    """Crawl the URL in url_widget and save results to the specified folder.

    Writes one text file per crawled page plus a single combined
    ``ALL_PAGES_<timestamp>.txt`` file into the folder stored in the
    module-level global `save_folder_path`. Progress and errors are printed
    (and captured into `output_area`).

    Per-page filenames are built from the sanitized page title and the run
    timestamp. When several pages share a title, later ones get a numeric
    suffix (``_2``, ``_3`` ...) so they do not overwrite each other.

    Args:
        b: Argument supplied by the button's click event; unused.
    """
    # The folder handler publishes its result as a module-level global, which
    # does not exist until that handler has run at least once.
    target_folder = globals().get('save_folder_path')
    if not target_folder or not os.path.exists(target_folder):
        print("Please create/check the folder before crawling.")
        return

    start_url = url_widget.value.strip()
    if not start_url.startswith(('http://', 'https://')):
        print("Please provide a valid URL (including http:// or https://).")
        return

    print(f"Starting crawl for: {start_url}\n")
    results = crawl_website(start_url, max_pages=None, delay=1)

    if not results:
        print("No pages were found or crawled.")
        return

    print(f"Found {len(results)} pages. Saving to:\n{target_folder}\n")

    # 1) Create a single "master" file with all pages combined
    now_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    master_filename = f"ALL_PAGES_{now_str}.txt"
    master_filepath = os.path.join(target_folder, master_filename)

    # Filenames already written in this run, lower-cased so names differing
    # only by case are also treated as collisions.
    used_names = set()

    with open(master_filepath, 'w', encoding='utf-8') as master_file:
        for url, (title, text) in results.items():
            # 2) Also create individual files for each page
            # Limit length to avoid OS issues; fall back to a fixed name when
            # the title sanitizes to an empty string.
            safe_title = clean_filename(title)[:100] or "untitled"
            page_filename = f"{safe_title}_{now_str}.txt"
            duplicate_no = 1
            while page_filename.lower() in used_names:
                duplicate_no += 1
                page_filename = f"{safe_title}_{duplicate_no}_{now_str}.txt"
            used_names.add(page_filename.lower())
            page_filepath = os.path.join(target_folder, page_filename)

            with open(page_filepath, 'w', encoding='utf-8') as f:
                f.write(f"Title: {title}\n")
                f.write(f"URL: {url}\n")
                f.write(f"Date: {now_str}\n")
                f.write("\n======================\n\n")
                f.write(text)

            print(f"Saved: {page_filepath}")

            # 3) Append the same content to the master file
            master_file.write("========================================\n")
            master_file.write(f"Title: {title}\n")
            master_file.write(f"URL: {url}\n")
            master_file.write(f"Date: {now_str}\n")
            master_file.write("\n----------------------\n\n")
            master_file.write(f"{text}\n\n")

    print(f"\nAll pages were also combined into: {master_filepath}")
    print("\nCrawl complete!")

# =======================================
# STEP 3: Assemble the UI layout
# =======================================
# One vertical column: a numbered heading for each step followed by that
# step's controls, with the shared log output at the bottom.
ui_box = widgets.VBox(children=[
    widgets.HTML("<h3>1) Mount Google Drive</h3>"),
    mount_button,

    widgets.HTML("<hr><h3>2) Specify a folder in Drive to store results</h3>"),
    folder_text,
    folder_button,

    widgets.HTML("<hr><h3>3) Enter the website URL to crawl</h3>"),
    url_widget,

    widgets.HTML("<hr><h3>4) Crawl the site & save text files</h3>"),
    crawl_button,

    widgets.HTML("<hr>"),
    output_area,
])

# =======================================
# STEP 4: Connect the buttons and show the UI
# =======================================
# Register each button's click handler before the panel is rendered.
crawl_button.on_click(on_crawl_button_click)
folder_button.on_click(on_folder_button_click)
mount_button.on_click(on_mount_button_click)

display(ui_box)



VBox(children=(HTML(value='<h3>1) Mount Google Drive</h3>'), Button(button_style='info', description='Mount Dr…