In [None]:
# Install Google Chrome
# NOTE: lines starting with `!` are notebook shell escapes, so this cell must be
# run inside an IPython/Jupyter/Colab kernel rather than as a plain Python script.
# Download the current stable Chrome .deb package (amd64 build).
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# Install the downloaded package; dpkg may leave dependencies unresolved.
!dpkg -i google-chrome-stable_current_amd64.deb
# Ask apt to fix (-f) any dependencies left broken by the dpkg step, non-interactively (-y).
!apt-get -f install -y

# Install Python libraries
# These are the third-party packages imported below (selenium, requests, bs4,
# webdriver_manager, google.colab).
!pip install selenium requests beautifulsoup4 webdriver-manager google.colab

import time
import csv
import re
import requests
import urllib.parse
from google.colab import files
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager  # Use webdriver-manager to install the correct version of ChromeDriver

# Your 2Captcha API key
# solve_captcha() sends this value as the `key` field of every 2Captcha request.
# NOTE(review): avoid committing or sharing a notebook that contains a real key.
API_KEY = "YOUR_KEY_GOES_HERE"  # Replace with your actual 2Captcha API key

# Initialize Selenium WebDriver (headless Chrome) with ChromeDriverManager
def init_driver():
    """Create and return a headless Chrome WebDriver.

    The matching ChromeDriver binary is resolved through ChromeDriverManager,
    so no driver path has to be configured by hand.

    Returns:
        A ``selenium.webdriver.Chrome`` instance configured with the flags below.
    """
    options = Options()
    # Flags applied in this order: no visible window, no GPU, no sandbox,
    # and no reliance on /dev/shm.
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    # ChromeDriverManager().install() returns the path of the driver binary.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

# Function to solve CAPTCHA using 2Captcha
def solve_captcha(driver, site_key, url, poll_interval=5, max_wait=300, request_timeout=30):
    """Solve a reCAPTCHA through the 2Captcha HTTP API.

    The task is submitted to ``in.php`` and ``res.php`` is then polled until a
    token is returned, an error is reported, or ``max_wait`` seconds elapse.
    Both calls use HTTPS so the API key is not sent in clear text.

    Args:
        driver: Unused; kept so existing callers do not have to change.
        site_key: The reCAPTCHA site key found on the page. Falsy values skip
            solving entirely.
        url: URL of the page that shows the CAPTCHA.
        poll_interval: Seconds to sleep before each poll of ``res.php``.
        max_wait: Upper bound, in seconds, on the total time spent polling.
        request_timeout: Timeout, in seconds, for each individual HTTP request.

    Returns:
        The solution token as a string, or ``None`` when there is no site key,
        the service reports an error, a request fails, or polling times out.
    """
    if not site_key:
        print("No site key found, skipping CAPTCHA solving.")
        return None

    # Send CAPTCHA request to 2Captcha
    try:
        captcha_id_response = requests.post(
            "https://2captcha.com/in.php",
            data={
                'key': API_KEY,
                'method': 'userrecaptcha',
                'googlekey': site_key,
                'pageurl': url,
                'json': 1,
            },
            timeout=request_timeout,
        ).json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a non-JSON body: report it like any other
        # submission error instead of raising.
        print("Error submitting CAPTCHA:", exc)
        return None

    if captcha_id_response.get("status") != 1:
        print("Error submitting CAPTCHA:", captcha_id_response)
        return None

    captcha_id = captcha_id_response["request"]
    print("CAPTCHA sent for solving...")

    # Let requests build the query string so every value is URL-encoded.
    poll_params = {'key': API_KEY, 'action': 'get', 'id': captcha_id, 'json': 1}
    deadline = time.monotonic() + max_wait

    # Poll for solution, but never longer than max_wait seconds in total.
    while time.monotonic() < deadline:
        time.sleep(poll_interval)  # Delay before each poll
        try:
            response = requests.get(
                "https://2captcha.com/res.php",
                params=poll_params,
                timeout=request_timeout,
            ).json()
        except (requests.RequestException, ValueError) as exc:
            print("Error solving CAPTCHA:", exc)
            return None

        if response.get("status") == 1:
            print("CAPTCHA solved")
            return response["request"]
        # "CAPCHA_NOT_READY" (sic) is the literal value the polling code waits on.
        if response.get("status") == 0 and response.get("request") == "CAPCHA_NOT_READY":
            print("Waiting for CAPTCHA to be solved...")
            continue

        print("Error solving CAPTCHA:", response)
        return None

    print(f"Error solving CAPTCHA: no solution after {max_wait} seconds")
    return None

# Function to extract the site_key automatically
def get_site_key(driver):
    """Look for a reCAPTCHA site key in the page currently loaded in ``driver``.

    A ``<div class="g-recaptcha" data-sitekey="...">`` element is checked first.
    If none exists, every ``<script>`` tag is scanned for a double-quoted
    ``data-sitekey="..."`` occurrence and the first match is used.

    Args:
        driver: Object exposing the current page HTML as ``page_source``.

    Returns:
        The site key string, or ``None`` if no key was found.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    widget = soup.find('div', {'class': 'g-recaptcha', 'data-sitekey': True})
    if widget:
        site_key = widget['data-sitekey']
    else:
        site_key = None
        # Fall back to scanning the serialized script tags.
        for script in soup.find_all('script'):
            markup = str(script)
            if 'data-sitekey' not in markup:
                continue
            found = re.search(r'data-sitekey="(.+?)"', markup)
            if found:
                site_key = found.group(1)
                break

    if not site_key:
        print("Site key not found")
    else:
        print(f"Found site key: {site_key}")
    return site_key

# Function to clean URLs
def clean_url(url):
    """Return ``url`` with repeated path slashes collapsed and no query or fragment.

    Args:
        url: The URL string to normalise.

    Returns:
        The rebuilt URL string. Scheme, host and path parameters are kept.
    """
    parts = urllib.parse.urlparse(url)
    # Collapse runs of two or more slashes in the path into a single slash.
    collapsed_path = re.sub(r'/{2,}', '/', parts.path)
    stripped = parts._replace(path=collapsed_path, query='', fragment='')
    return urllib.parse.urlunparse(stripped)

# Text chunking function
def split_text_into_chunks(text, max_chunk_size=2000):
    """Split ``text`` into chunks of at most ``max_chunk_size`` characters.

    The text is first split into sentences (after ``.``, ``!`` or ``?``
    followed by spaces). Sentences are then packed greedily into chunks,
    joined by single spaces. A sentence that is longer than the limit on its
    own is broken up, at a space when possible, so no chunk exceeds the limit.

    Args:
        text: The text to split. Leading and trailing whitespace is ignored.
        max_chunk_size: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings. Empty or whitespace-only input
        yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    # Split the text into sentences using regex that detects sentence endings.
    sentences = re.split(r'(?<=[.!?]) +', text.strip())

    chunks = []
    current_chunk = []
    current_len = 0  # Always equals len(' '.join(current_chunk)).

    for sentence in sentences:
        if not sentence:
            # Only happens for empty input; never emit an empty chunk.
            continue

        if len(sentence) > max_chunk_size:
            # The sentence cannot fit in any chunk: flush what we have, then
            # break the sentence itself into pieces that respect the limit.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk, current_len = [], 0
            pieces = _split_long_sentence(sentence, max_chunk_size)
            if pieces:
                chunks.extend(pieces[:-1])
                # The tail may still have room for the following sentences.
                current_chunk = [pieces[-1]]
                current_len = len(pieces[-1])
            continue

        # +1 accounts for the joining space when the chunk is not empty.
        added_len = len(sentence) + (1 if current_chunk else 0)
        if current_len + added_len <= max_chunk_size:
            current_chunk.append(sentence)
            current_len += added_len
        else:
            # If adding this sentence exceeds the chunk size, start a new chunk.
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_len = len(sentence)

    # Add the last chunk (if any).
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def _split_long_sentence(sentence, max_chunk_size):
    """Break one over-long sentence into pieces of at most ``max_chunk_size`` chars."""
    pieces = []
    remaining = sentence.strip()
    while len(remaining) > max_chunk_size:
        # Prefer the last space that keeps the piece within the limit.
        cut = remaining.rfind(' ', 0, max_chunk_size + 1)
        if cut <= 0:
            cut = max_chunk_size  # No usable space: hard cut.
        pieces.append(remaining[:cut].rstrip())
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces

# Updated extract_content function
def extract_content(driver, url, max_chunk_size=2000):
    """Load ``url`` and return its headings, paragraphs and images in page order.

    If a reCAPTCHA site key is found on the page, a solution is requested via
    ``solve_captcha``; when one is returned it is written into the
    ``g-recaptcha-response`` element, the first form is submitted and the page
    is loaded again.

    Text from ``h1``-``h6`` and ``p`` tags is accumulated and emitted in chunks
    produced by ``split_text_into_chunks``. Each ``img`` tag with a ``src``
    first flushes the pending text and is then emitted on its own. Tags whose
    ``class`` or ``id`` contains one of the boilerplate keywords are skipped.

    Args:
        driver: Selenium WebDriver used to load and read the page.
        url: Page URL; also used to resolve relative image URLs.
        max_chunk_size: Threshold at which accumulated text is flushed, and the
            chunk size passed to ``split_text_into_chunks``.

    Returns:
        A list of ``(url, content, type, position)`` tuples, where ``type`` is
        ``'text'`` or ``'image'`` and ``position`` counts up from 1.
    """
    driver.get(url)

    # Wait for the page to load
    time.sleep(2)

    # Automatically find the site_key for CAPTCHA
    site_key = get_site_key(driver)

    # If CAPTCHA detected, solve it
    if site_key:
        captcha_solution = solve_captcha(driver, site_key, url)
        if captcha_solution:
            # Pass the token as a script argument rather than splicing it into
            # the JavaScript source, so quotes in it cannot break the script.
            driver.execute_script(
                'document.getElementById("g-recaptcha-response").innerHTML = arguments[0];',
                captcha_solution,
            )
            driver.execute_script('document.querySelector("form").submit();')
            time.sleep(5)
            driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    content = []
    pending_text = []  # Text fragments waiting to be chunked.
    pending_len = 0    # Length of the fragments, counting one space per fragment.

    def flush_text():
        """Chunk the accumulated text and append the chunks to ``content``."""
        nonlocal pending_len
        text = ' '.join(pending_text).strip()
        pending_text.clear()
        pending_len = 0
        if not text:
            return
        for chunk in split_text_into_chunks(text, max_chunk_size):
            if chunk:  # Never record an empty text row.
                # Positions are 1-based and follow the order of ``content``.
                content.append((url, chunk, 'text', len(content) + 1))

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img']):
        # Skip excluded elements.
        # NOTE(review): only the tag's own class/id is inspected, not those of
        # its ancestors - confirm whether e.g. <p> inside <footer> should be kept.
        if _is_boilerplate(tag):
            continue

        if tag.name == 'img':
            # Before adding the image, process any accumulated text
            flush_text()
            img_url = tag.get('src')
            if img_url:
                img_url = urllib.parse.urljoin(url, img_url)
                content.append((url, img_url, 'image', len(content) + 1))
        else:
            text = tag.get_text(strip=True)
            if text:
                pending_text.append(text)
                pending_len += len(text) + 1
                # Flush once the accumulated text reaches the chunk size.
                if pending_len >= max_chunk_size:
                    flush_text()

    # After the loop, process any remaining text
    flush_text()

    return content


# Keywords that mark a class or id as page chrome rather than content.
# A single alternation replaces the per-call list of separately compiled patterns.
_EXCLUDE_PATTERN = re.compile(
    r'header|footer|navigation|nav|ads|sidebar|banner|popup|button|script|'
    r'style|widget|social|comment|search|breadcrumb',
    re.IGNORECASE,
)


def _is_boilerplate(tag):
    """Return True if any class name or the id of ``tag`` matches a keyword."""
    classes = tag.get('class') or []
    if isinstance(classes, str):
        classes = [classes]
    # The tag name itself is not tested: extract_content only selects
    # h1-h6, p and img, none of which can match a keyword.
    candidates = list(classes)
    candidates.append(tag.get('id') or '')
    return any(_EXCLUDE_PATTERN.search(name) for name in candidates)

# Function to scrape all content from a list of URLs
def scrape_all_content(driver, urls):
    """Scrape every URL in ``urls`` and return all extracted rows in one list.

    Each URL is cleaned with ``clean_url`` before being passed to
    ``extract_content``. A failure on one URL is printed and that URL is
    skipped; the remaining URLs are still processed.

    Args:
        driver: Selenium WebDriver shared across all pages.
        urls: Iterable of URL strings.

    Returns:
        A list with the rows returned by ``extract_content`` for each URL that
        was scraped successfully, in input order.
    """
    collected = []
    for raw_url in urls:
        url = clean_url(raw_url)
        print(f"Scraping content from: {url}")

        try:
            collected.extend(extract_content(driver, url))
        except Exception as e:
            print(f"Error scraping {url}: {e}")
        else:
            # Wait for 2 seconds between requests (only after a successful scrape).
            time.sleep(2)

    return collected

# Function to format multiline string of URLs
def format_urls(multiline_string):
    """Turn a newline-separated block of URLs into a list.

    Args:
        multiline_string: Text with one URL per line; blank lines are allowed.

    Returns:
        The non-blank lines, each stripped of surrounding whitespace, in order.
    """
    url_list = []
    for line in multiline_string.splitlines():
        candidate = line.strip()
        if candidate:
            url_list.append(candidate)
    return url_list

# Multiline list of URLs provided
multiline_urls = """
https://URL1.com
https://URL2.com
https://URL3.com
"""  # Add more URLs as needed

# Convert the multiline string to a list of URLs
urls = format_urls(multiline_urls)

# Initialize the driver and start scraping
driver = init_driver()
try:
    # Scrape content from the provided URLs
    all_content = scrape_all_content(driver, urls)
finally:
    # Clean up and close the driver even if scraping raises, and before the
    # export, so the browser is not left running.
    driver.quit()

# Export to CSV
csv_filename = 'EXPORT_FILE_NAME.csv' # Replace with the actual name you want to export the file as
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    # QUOTE_NONNUMERIC quotes every string field; the integer position is written bare.
    writer = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(['url', 'content', 'type', 'position'])  # Header row
    writer.writerows(all_content)

print(f"Website content has been saved to {csv_filename}")

# Download the CSV file to the local device
files.download(csv_filename)