In [None]:
# Install Google Chrome (the browser that init_driver() launches via Selenium).
# NOTE: `dpkg -i` may stop on unmet dependencies; the following
# `apt-get -f install -y` is there to pull those in and finish the install.
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install -y

# Install the Python libraries imported below
!pip install selenium requests beautifulsoup4 webdriver-manager google.colab

import time
import csv
import re
import requests
import urllib.parse
from google.colab import files
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager  # Use webdriver-manager to install the correct version of ChromeDriver
from selenium.webdriver.common.by import By

# Your 2Captcha API key. solve_captcha() sends it with every request to the
# 2Captcha service, so avoid committing or sharing a notebook with a real key.
API_KEY = "YOUR_API_KEY_HERE"  # Replace with your actual 2Captcha API key

# Initialize Selenium WebDriver (headless Chrome) with ChromeDriverManager
def init_driver():
    """Create a headless Chrome WebDriver.

    The matching ChromeDriver binary is downloaded by webdriver-manager, so
    no driver path has to be configured by hand.

    Returns:
        A ``selenium.webdriver.Chrome`` instance; the caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    # Flags passed to Chrome: no visible window, no GPU, no sandbox,
    # and no reliance on /dev/shm.
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    # Let webdriver-manager fetch the ChromeDriver and hand its path to Selenium
    driver_path = ChromeDriverManager().install()
    service = Service(driver_path)
    return webdriver.Chrome(service=service, options=options)

# Function to solve CAPTCHA using 2Captcha
def solve_captcha(driver, site_key, url, poll_interval=5, max_wait=180):
    """Solve a reCAPTCHA through the 2Captcha HTTP API.

    Args:
        driver: Unused; kept so existing callers do not need to change.
        site_key: The reCAPTCHA site key found on the page. When falsy the
            function returns immediately without contacting 2Captcha.
        url: URL of the page that shows the CAPTCHA.
        poll_interval: Seconds to wait before each poll for the solution.
        max_wait: Upper bound, in seconds, on the total time spent polling.
            Prevents the crawl from hanging forever if no answer arrives.

    Returns:
        The solution token as a string, or ``None`` when there is no site
        key, the submission or polling fails, or ``max_wait`` elapses.
    """
    if not site_key:
        print("No site key found, skipping CAPTCHA solving.")
        return None

    # Submit the CAPTCHA job. HTTPS keeps the API key off the wire in clear
    # text, and the timeout stops a stalled connection from blocking forever.
    try:
        captcha_id_response = requests.post(
            "https://2captcha.com/in.php",
            data={
                'key': API_KEY,
                'method': 'userrecaptcha',
                'googlekey': site_key,
                'pageurl': url,
                'json': 1,
            },
            timeout=30,
        ).json()
    except (requests.RequestException, ValueError) as e:
        print("Error submitting CAPTCHA:", e)
        return None

    if captcha_id_response.get("status") != 1:
        print("Error submitting CAPTCHA:", captcha_id_response)
        return None

    captcha_id = captcha_id_response["request"]
    print("CAPTCHA sent for solving...")

    # Poll for the solution until it is ready, fails, or the deadline passes
    poll_params = {'key': API_KEY, 'action': 'get', 'id': captcha_id, 'json': 1}
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        time.sleep(poll_interval)  # Delay before each poll
        try:
            response = requests.get(
                "https://2captcha.com/res.php",
                params=poll_params,
                timeout=30,
            ).json()
        except (requests.RequestException, ValueError) as e:
            print("Error solving CAPTCHA:", e)
            return None

        if response.get("status") == 1:
            print("CAPTCHA solved")
            return response["request"]
        # "CAPCHA_NOT_READY" (sic) is the literal code compared against here
        if response.get("status") == 0 and response.get("request") == "CAPCHA_NOT_READY":
            print("Waiting for CAPTCHA to be solved...")
            continue
        print("Error solving CAPTCHA:", response)
        return None

    print(f"Timed out after {max_wait} seconds waiting for CAPTCHA solution")
    return None

# Function to extract the site_key automatically
def get_site_key(driver):
    """Find the reCAPTCHA site key in the page currently loaded in ``driver``.

    A ``<div class="g-recaptcha" data-sitekey="...">`` element is checked
    first; if none exists, each ``<script>`` tag is searched for a
    ``data-sitekey="..."`` attribute.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.

    Returns:
        The site key string, or ``None`` when nothing was found.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    widget = soup.find('div', {'class': 'g-recaptcha', 'data-sitekey': True})
    if widget:
        site_key = widget['data-sitekey']
    else:
        site_key = None
        # Fall back to scanning the markup of every script tag
        for script in soup.find_all('script'):
            script_markup = str(script)
            if 'data-sitekey' not in script_markup:
                continue
            found = re.search(r'data-sitekey="(.+?)"', script_markup)
            if found:
                site_key = found.group(1)
                break

    print(f"Found site key: {site_key}" if site_key else "Site key not found")
    return site_key

# Function to clean URLs
def clean_url(url):
    """Return ``url`` with repeated slashes in the path collapsed and the
    query string and fragment removed.

    Scheme, host and path parameters are left as they are.
    """
    parts = urllib.parse.urlparse(url)
    # Collapse any run of '/' in the path down to a single slash
    collapsed_path = re.sub(r'/+', '/', parts.path)
    stripped = parts._replace(path=collapsed_path, query='', fragment='')
    return urllib.parse.urlunparse(stripped)

# HTML chunking function without breaking tags
def split_html_into_chunks(html, max_chunk_size=2000):
    """Split ``html`` into consecutive chunks, cutting only right after a '>'.

    Segments (text up to and including the next '>') are packed greedily
    into a chunk until adding another one would exceed ``max_chunk_size``.
    A single segment longer than ``max_chunk_size`` is kept whole as its own
    chunk rather than being cut in the middle.

    Args:
        html: The markup to split.
        max_chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of non-empty strings whose concatenation equals ``html``;
        an empty list when ``html`` is empty.
    """
    if not html:
        return []

    # Every position just after a '>' is a safe place to cut
    boundaries = [m.end() for m in re.finditer(r'>', html)]
    if not boundaries or boundaries[-1] != len(html):
        boundaries.append(len(html))  # Keep any text after the last '>'

    chunks = []
    pieces = []      # segments of the chunk being built
    pieces_len = 0   # total length of `pieces`
    start = 0
    for end in boundaries:
        segment = html[start:end]
        start = end
        # Flush only when something has been collected, so an oversized
        # first segment never produces an empty chunk.
        if pieces and pieces_len + len(segment) > max_chunk_size:
            chunks.append(''.join(pieces))
            pieces = []
            pieces_len = 0
        pieces.append(segment)
        pieces_len += len(segment)

    if pieces:
        chunks.append(''.join(pieces))

    return chunks

# Function to extract raw HTML content from a webpage
def extract_content(driver, url):
    """Load ``url`` and return its raw HTML as a list of chunk rows.

    If a reCAPTCHA site key is found on the page, the CAPTCHA is sent to
    ``solve_captcha``; on success the token is written into the
    ``g-recaptcha-response`` element, the first form is submitted and the
    page is loaded again before its source is read.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to extract.

    Returns:
        A list of ``(url, chunk, 'HTML', position)`` tuples, where
        ``position`` starts at 1 and follows the order of the chunks.
    """
    driver.get(url)

    # Wait for the page to load
    time.sleep(2)

    # Automatically find the site_key for CAPTCHA
    site_key = get_site_key(driver)

    # If CAPTCHA detected, solve it
    if site_key:
        captcha_solution = solve_captcha(driver, site_key, url)
        if captcha_solution:
            # Pass the token as a script argument instead of splicing it into
            # the JavaScript source, so quotes or backslashes in the token
            # cannot break or alter the script.
            driver.execute_script(
                'document.getElementById("g-recaptcha-response").innerHTML = arguments[0];',
                captcha_solution,
            )
            driver.execute_script('document.querySelector("form").submit();')
            time.sleep(5)
            driver.get(url)

    # Get the raw HTML content
    html_content = driver.page_source

    # Split the HTML into chunks without breaking tags
    chunks = split_html_into_chunks(html_content, max_chunk_size=2000)

    return [
        (url, chunk, 'HTML', position)
        for position, chunk in enumerate(chunks, start=1)
    ]

# Function to get all internal links from a webpage
def get_all_links(driver, url, domain):
    """Collect the cleaned internal links found on the page at ``url``.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Page to load; also the base for resolving relative hrefs.
        domain: Network location a link must have to count as internal.

    Returns:
        A set of cleaned absolute URLs. If an error occurs it is printed and
        whatever was collected up to that point is returned.
    """
    internal = set()
    try:
        driver.get(url)
        time.sleep(2)
        page = BeautifulSoup(driver.page_source, 'html.parser')
        for anchor in page.find_all('a', href=True):
            absolute = urllib.parse.urljoin(url, anchor['href'])
            # Skip anything that does not point at the crawled domain
            if urllib.parse.urlparse(absolute).netloc != domain:
                continue
            internal.add(clean_url(absolute))
    except Exception as e:
        print(f"Error collecting links from {url}: {e}")
    return internal

# Function to crawl the website and collect all URLs
def crawl_website(driver, base_url):
    """Discover every reachable URL on the same domain as ``base_url``.

    Args:
        driver: Selenium WebDriver used to load each page.
        base_url: Starting point of the crawl; its network location defines
            which links count as internal.

    Returns:
        A list of the URLs visited, in the order they were crawled.
    """
    domain = urllib.parse.urlparse(base_url).netloc
    pending = {base_url}
    seen = set()
    crawled = []

    while pending:
        page_url = pending.pop()
        if page_url in seen:
            continue

        print(f"Crawling: {page_url}")
        seen.add(page_url)
        crawled.append(page_url)

        # Queue the internal links on this page that have not been visited
        discovered = get_all_links(driver, page_url, domain)
        pending |= discovered - seen

    return crawled

# Function to scrape all content from a list of URLs
def scrape_all_content(driver, urls):
    """Extract the chunked HTML content of every URL in ``urls``.

    Each URL is cleaned before it is scraped. A failure on one URL is
    printed and that URL is skipped; the others are still processed.

    Args:
        driver: Selenium WebDriver used to load each page.
        urls: Iterable of page addresses.

    Returns:
        A flat list of the rows returned by ``extract_content`` for all
        pages that were scraped successfully.
    """
    rows = []
    for raw_url in urls:
        target = clean_url(raw_url)
        print(f"Scraping content from: {target}")

        try:
            rows.extend(extract_content(driver, target))
        except Exception as e:
            print(f"Error scraping {target}: {e}")
        else:
            # Pause only after a successful scrape: 2 seconds between requests
            time.sleep(2)

    return rows

# Main execution
# Script entry point: crawl the site, scrape every page, export to CSV
if __name__ == "__main__":
    # Base URL of the website to crawl
    base_url = "https://WEBSITE_YOU_WANT_TO_CRAWL.com"  # Replace with your target website

    # Initialize the driver
    driver = init_driver()
    try:
        # Crawl the website to get all URLs
        all_urls = crawl_website(driver, base_url)
        print(f"Total URLs collected: {len(all_urls)}")

        # Scrape content from the collected URLs
        all_content = scrape_all_content(driver, all_urls)
    finally:
        # Always shut the browser down, even if crawling or scraping raised,
        # so no headless Chrome process is left running.
        driver.quit()

    # Export to CSV
    csv_filename = 'FILE_NAME.csv' # Replace with the name you want the exported file to have
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow(['url', 'content', 'type', 'position'])  # Header row
        writer.writerows(all_content)

    print(f"Website content has been saved to {csv_filename}")

    # Download the CSV file to the local device
    files.download(csv_filename)
