In [None]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
from urllib3.util import Retry
from requests.adapters import HTTPAdapter
import csv
import re
import time
from google.colab import files

def clean_url(url):
    """Return *url* in a normalised form suitable for de-duplication.

    Runs of two or more consecutive slashes in the path are collapsed to a
    single slash, and the query string and fragment are removed. The scheme,
    host and path parameters are left as they are.
    """
    parts = urllib.parse.urlparse(url)
    # Only the path is rewritten; the "//" after the scheme lives outside it.
    collapsed_path = re.sub(r'\/{2,}', '/', parts.path)
    normalised = parts._replace(path=collapsed_path, query='', fragment='')
    return urllib.parse.urlunparse(normalised)

def get_all_links(url, domain, timeout=30):
    """Download *url* and return the same-site links found on the page.

    Args:
        url: Page to fetch; also used as the base for resolving relative
            ``href`` values.
        domain: Host (netloc) the crawl is restricted to. A link is kept
            when its host equals ``domain`` or is a subdomain of it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of absolute URLs, each normalised with ``clean_url``.

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, or an HTTP error status (4xx/5xx).
    """
    # Without a timeout a stalled server would block the crawl indefinitely.
    response = session.get(url, timeout=timeout)
    # Turn error statuses (e.g. 429, 404) into exceptions instead of
    # harvesting links from the error page; callers that catch
    # RequestException see these as HTTPError.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    wanted_host = domain.lower()
    links = set()
    for a_tag in soup.find_all('a', href=True):
        clean_link = clean_url(urllib.parse.urljoin(url, a_tag['href']))
        host = urllib.parse.urlparse(clean_link).netloc.lower()
        # Compare whole hosts: a plain substring test would also accept
        # look-alikes such as "notexample.com" or "example.com.evil.net".
        if host == wanted_host or host.endswith('.' + wanted_host):
            links.add(clean_link)

    return links

def get_all_urls(base_url, delay=15, rate_limit_delay=70, max_rate_limit_retries=3):
    """Crawl outward from *base_url* and return every URL that was visited.

    Links are gathered with ``get_all_links`` and restricted to the host of
    *base_url*. URLs whose fetch failed are still part of the result.

    Args:
        base_url: Starting point of the crawl.
        delay: Seconds to pause after each successfully processed page.
        rate_limit_delay: Seconds to pause after an HTTP 429 before the
            page is queued again.
        max_rate_limit_retries: How many times a single URL may be
            re-queued because of HTTP 429 before it is reported as failed.

    Returns:
        The set of visited URLs.
    """
    domain = urllib.parse.urlparse(base_url).netloc
    to_visit = {base_url}
    visited = set()
    rate_limit_hits = {}  # url -> number of 429-triggered re-queues so far

    while to_visit:
        current_url = to_visit.pop()
        if current_url in visited:
            continue

        visited.add(current_url)
        print(f"Collecting links from: {current_url}")

        try:
            # Get all links from the current page
            links = get_all_links(current_url, domain)
        except requests.exceptions.RequestException as e:
            # Compare against None explicitly: a requests Response is falsy
            # for 4xx/5xx statuses, so `if e.response and ...` can never be
            # true for a 429.
            status = e.response.status_code if e.response is not None else None
            attempts = rate_limit_hits.get(current_url, 0)
            if status == 429 and attempts < max_rate_limit_retries:
                rate_limit_hits[current_url] = attempts + 1
                print(f"Rate limit exceeded. Waiting for {rate_limit_delay} seconds before retrying...")
                time.sleep(rate_limit_delay)
                # The URL must leave `visited`, otherwise the check at the
                # top of the loop would skip it and no retry would happen.
                visited.discard(current_url)
                to_visit.add(current_url)
                continue
            print(f"Error collecting links from {current_url}: {e}")
            continue

        to_visit.update(links - visited)
        time.sleep(delay)  # Pause between requests (rate limiting)

    return visited

def extract_content(url, timeout=30):
    """Download *url* and return its headings, paragraphs and images in order.

    Args:
        url: Page to fetch; also the base for resolving relative image
            ``src`` values.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(url, value, kind, position)`` tuples in document order.
        ``kind`` is ``'image'`` (``value`` is the absolute image URL) or
        ``'text'`` (``value`` is the stripped text of an ``h1``-``h6`` or
        ``p`` tag). ``position`` counts emitted items starting at 1; tags
        without a ``src`` or without text are skipped and do not consume a
        position.

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, or an HTTP error status (4xx/5xx).
    """
    # Without a timeout a stalled server would block the scrape indefinitely.
    response = session.get(url, timeout=timeout)
    # Raise on error statuses so the text of an error page (404, 429, ...)
    # is not recorded as if it were the page's content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    content = []
    # Extract text and images based on tags
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img']):
        if tag.name == 'img':
            src = tag.get('src')
            if not src:
                continue
            value, kind = urllib.parse.urljoin(url, src), 'image'
        else:
            value, kind = tag.get_text(strip=True), 'text'
            if not value:
                continue
        # One position per emitted item, so the next position is always
        # the current length plus one.
        content.append((url, value, kind, len(content) + 1))

    return content

def scrape_all_content(urls, delay=15, rate_limit_delay=70):
    """Run ``extract_content`` on every URL and concatenate the results.

    A request that fails with HTTP 429 is retried once after
    *rate_limit_delay* seconds. Any other request failure is printed and
    the URL is skipped.

    Args:
        urls: Iterable of page URLs to scrape.
        delay: Seconds to pause after each URL.
        rate_limit_delay: Seconds to pause after an HTTP 429 before the
            single retry.

    Returns:
        A list with the rows returned by ``extract_content`` for each page,
        in the order the URLs were processed.
    """
    all_content = []
    for url in urls:
        print(f"Scraping content from: {url}")

        try:
            # Extract content from the current page
            all_content.extend(extract_content(url))
        except requests.exceptions.RequestException as e:
            # Compare against None explicitly: a requests Response is falsy
            # for 4xx/5xx statuses, so `if e.response and ...` can never be
            # true for a 429.
            if e.response is not None and e.response.status_code == 429:
                print(f"Rate limit exceeded. Waiting for {rate_limit_delay} seconds before retrying...")
                time.sleep(rate_limit_delay)
                try:
                    all_content.extend(extract_content(url))
                # Same exception type as the first attempt, so unrelated
                # bugs are not silently swallowed on the retry path only.
                except requests.exceptions.RequestException as retry_e:
                    print(f"Error scraping {url} after retry: {retry_e}")
            else:
                print(f"Error scraping {url}: {e}")

        time.sleep(delay)  # Pause between requests (rate limiting)

    return all_content

# Shared HTTP session: responses with status 500/502/503/504 are retried
# up to 5 times, for both plain and TLS connections.
session = requests.Session()
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
for scheme in ('http://', 'https://'):
    session.mount(scheme, HTTPAdapter(max_retries=retries))

# SET WEBSITE TO SCRAPE HERE
# Replace with your target site, e.g. 'https://example.com'; the crawl starts from this URL.
base_url = 'https://YOUR_URL_GOES_HERE.com'
all_urls = get_all_urls(base_url)
all_content = scrape_all_content(all_urls)

# Export to CSV
# Replace with the desired filename for the CSV output.
csv_filename = 'EXPORT_FILE_NAME.csv'
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['url', 'content', 'type', 'position'])  # Header row
    writer.writerows(all_content)  # One row per scraped item

print(f"Website content has been saved to {csv_filename}")

# Hand the CSV to the browser so it is downloaded to the local device (Colab only)
files.download(csv_filename)