# Notebook For Web Scraping


In [None]:
# Install the Python packages for the Selenium scraping cells (selenium, pandas, tqdm).
# NOTE(review): webdriver-manager is installed here but not imported by any cell visible
# in this notebook, and crawl4ai (imported by the last cell) is not installed here — confirm.
!pip install selenium pandas tqdm webdriver-manager

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading webdriver_manager-4.0.2-py2.py3-none-any.w

In [None]:
# Refresh the apt package index, then install the chromium-chromedriver package.
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin.
# NOTE(review): presumably so Selenium can locate it on PATH — confirm the source path
# exists on the runtime image being used.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/

0% [Working]            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,642 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,718 kB]
Get:13 https://r2u.st

In [None]:
import os
import time
import random
import re
import pandas as pd
from tqdm import tqdm
from urllib.parse import urljoin, urlparse

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException

In [None]:


class MATLABDocScraper:
    """Breadth-first Selenium crawler that collects MathWorks help pages.

    Starting from ``start_url``, pages are visited in FIFO order with a headless
    Chrome driver. For each page the title, visible text and outer HTML of the
    main content container are stored in ``self.docs``; same-domain help links
    found on the page are appended to ``self.to_visit``.

    Args:
        start_url: First page to visit; its host defines the allowed domain.
        delay_range: ``(min, max)`` seconds to sleep before each page load.
        output_dir: Directory that receives the progress and final CSV files.
    """

    # Extensions (matched against the URL path) that are never documentation pages.
    _SKIPPED_EXTENSIONS = (".pdf", ".zip", ".jpg", ".png", ".gif")

    # Candidate containers for the main article, tried in this order.
    _CONTENT_SELECTORS = (
        "div.body_content",
        "div#doc_center_content",
        "div.row-offcanvas",
        "section.content_container",
        "div.doc_content_container",
    )

    # Column layout of the frame returned by clean_data(), also used when it is empty.
    _OUTPUT_COLUMNS = ["url", "title", "content"]

    def __init__(self, start_url, delay_range=(2, 5), output_dir="matlab_docs"):
        self.start_url = start_url
        self.delay_range = delay_range
        self.visited_urls = set()
        self.to_visit = [start_url]
        self.base_domain = urlparse(start_url).netloc
        self.output_dir = output_dir
        self.docs = []

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Setup Chrome options for Colab
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')

        # Add realistic user agent
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36")

        # No explicit driver path is passed; Selenium locates chromedriver itself.
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 15)

    @staticmethod
    def _normalize_url(url):
        """Return ``url`` without its ``#fragment``.

        A fragment only selects a position inside a page, so two URLs that
        differ only in the fragment would otherwise be crawled as two pages.
        """
        return urlparse(url)._replace(fragment="").geturl()

    def is_valid_url(self, url):
        """Check if URL is a help page on the same domain as the start URL.

        Returns:
            ``False`` for empty URLs, other hosts, ``javascript:`` links,
            binary/asset downloads and anything outside ``mathworks.com/help``;
            ``True`` otherwise.
        """
        if not url:
            return False
        parsed = urlparse(url)
        if parsed.netloc != self.base_domain:
            return False
        if "javascript:" in url:
            return False
        # Test the path, not the whole URL, so "file.pdf?x=1" is rejected too.
        if parsed.path.lower().endswith(self._SKIPPED_EXTENSIONS):
            return False
        # Focus on help pages
        if "mathworks.com/help" not in url:
            return False
        return True

    def extract_content(self, url):
        """Store title, text and HTML of the currently loaded page in ``self.docs``.

        Args:
            url: URL recorded with the document (the page must already be loaded
                in ``self.driver``).

        Any error is reported with ``print`` and the page is skipped.
        """
        try:
            # Wait for the page body to be present
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            title = self.driver.title

            # Use the first known content container that exists on the page
            main_content = None
            for selector in self._CONTENT_SELECTORS:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception:
                    # Not a bare except: Ctrl-C must still interrupt the crawl.
                    continue
                if elements:
                    main_content = elements[0]
                    break

            # Fallback to body if specific containers aren't found
            if main_content is None:
                main_content = self.driver.find_element(By.TAG_NAME, "body")

            self.docs.append({
                "url": url,
                "title": title,
                "content": main_content.text,
                "html": main_content.get_attribute('outerHTML')
            })

            # Print success message for debugging
            print(f"Successfully scraped: {title[:50]}...")

        except Exception as e:
            print(f"Error extracting content from {url}: {str(e)}")

    def extract_links(self):
        """Collect new, valid links from the currently loaded page.

        Returns:
            List of fragment-free absolute URLs that pass ``is_valid_url`` and
            are neither visited nor queued. Each URL appears at most once.
        """
        links = []
        current_url = self.driver.current_url

        try:
            # Snapshot of everything already known, built once per page so the
            # per-link membership test is O(1) instead of a list scan.
            known = set(self.to_visit)
            known.update(self.visited_urls)

            for element in self.driver.find_elements(By.TAG_NAME, "a"):
                try:
                    href = element.get_attribute("href")
                    if not href:
                        continue
                    candidate = self._normalize_url(urljoin(current_url, href))
                    if candidate in known or not self.is_valid_url(candidate):
                        continue
                    # Remember it so a link repeated on this page is queued once.
                    known.add(candidate)
                    links.append(candidate)
                except StaleElementReferenceException:
                    continue
                except Exception:
                    # Best effort: one unreadable anchor must not lose the page.
                    continue

        except Exception as e:
            print(f"Error finding links on {current_url}: {str(e)}")

        return links

    def clean_data(self):
        """Return the scraped documents as a DataFrame with normalized text.

        Every run of whitespace (including newlines and tabs) in ``content`` is
        collapsed to a single space and the ends are stripped. The raw HTML is
        not included.

        Returns:
            ``pandas.DataFrame`` with columns ``url``, ``title``, ``content``;
            the columns are present even when no document was scraped.
        """
        cleaned_docs = [
            {
                "url": doc["url"],
                "title": doc["title"],
                "content": re.sub(r'\s+', ' ', doc["content"]).strip(),
            }
            for doc in self.docs
        ]
        return pd.DataFrame(cleaned_docs, columns=self._OUTPUT_COLUMNS)

    def crawl(self, max_pages=200):
        """Crawl up to ``max_pages`` pages and save the results as CSV.

        Progress is written to ``matlab_docs_progress.csv`` every 5 pages. The
        browser is always closed at the end, so the instance cannot crawl again.
        A ``KeyboardInterrupt`` stops the crawl but still saves what was scraped.

        Returns:
            The cleaned ``pandas.DataFrame`` (see ``clean_data``).
        """
        count = 0

        try:
            with tqdm(total=max_pages, desc="Scraping MATLAB docs") as pbar:
                while self.to_visit and count < max_pages:
                    # Next URL in FIFO order, without fragment so that
                    # "page.html" and "page.html#anchor" count as one page.
                    url = self._normalize_url(self.to_visit.pop(0))

                    # Skip if already visited
                    if url in self.visited_urls:
                        continue
                    self.visited_urls.add(url)

                    try:
                        # Random delay to avoid detection
                        time.sleep(random.uniform(*self.delay_range))

                        # Print current URL for debugging
                        print(f"Visiting: {url}")

                        self.driver.get(url)

                        # Check if we've been blocked
                        page_title = self.driver.title
                        if "Access Denied" in page_title or "Forbidden" in page_title:
                            print(f"Access denied for {url}, possibly blocked")
                            continue

                        self.extract_content(url)

                        # Extract links and add to queue
                        links = self.extract_links()
                        self.to_visit.extend(links)
                        print(f"Found {len(links)} new links")

                        count += 1
                        pbar.update(1)
                        pbar.set_postfix({"URL": url[-30:]})

                        # Periodically save progress
                        if count % 5 == 0:
                            temp_df = self.clean_data()
                            temp_df.to_csv(f"{self.output_dir}/matlab_docs_progress.csv", index=False)
                            print(f"Progress saved: {len(temp_df)} documents")

                    except Exception as e:
                        print(f"Error processing {url}: {str(e)}")

        except KeyboardInterrupt:
            print("Scraping interrupted by user")

        finally:
            # Close the browser
            self.driver.quit()

        # Save final results as CSV
        df = self.clean_data()
        if not df.empty:
            df.to_csv(f"{self.output_dir}/matlab_docs.csv", index=False)
            print(f"Scraped {len(df)} pages. Data saved to {self.output_dir}/matlab_docs.csv")
        else:
            print("No data was scraped.")

        return df

# Entry point: crawl the Simulink Real-Time troubleshooting docs (capped at 500 pages).
if __name__ == "__main__":
    start_url = "https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html"
    scraper = MATLABDocScraper(start_url=start_url)
    docs_df = scraper.crawl(500)

Scraping MATLAB docs:   0%|          | 0/500 [00:00<?, ?it/s]

Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html
Successfully scraped: Troubleshooting Basics - MATLAB &amp; Simulink...


Scraping MATLAB docs:   0%|          | 1/500 [00:06<57:29,  6.91s/it, URL=ug/troubleshooting-basics.html]

Found 41 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html#skip_link_anchor
Successfully scraped: Troubleshooting Basics - MATLAB &amp; Simulink...


Scraping MATLAB docs:   0%|          | 2/500 [00:11<46:18,  5.58s/it, URL=g-basics.html#skip_link_anchor]

Found 0 new links
Visiting: https://in.mathworks.com/help/?s_tid=user_nav_help
Successfully scraped: Help Center...


Scraping MATLAB docs:   1%|          | 3/500 [00:21<1:03:36,  7.68s/it, URL=.com/help/?s_tid=user_nav_help]

Found 201 new links
Visiting: https://in.mathworks.com/help/index.html?s_tid=CRUX_lftnav


Scraping MATLAB docs:   1%|          | 4/500 [00:26<52:52,  6.40s/it, URL=p/index.html?s_tid=CRUX_lftnav]

Successfully scraped: in.mathworks.com...
Found 0 new links
Visiting: https://in.mathworks.com/help/overview/real-time-simulation-and-testing.html?s_tid=hc_product_group_bc


Scraping MATLAB docs:   1%|          | 5/500 [00:29<43:16,  5.25s/it, URL=html?s_tid=hc_product_group_bc]

Successfully scraped: in.mathworks.com...
Found 0 new links
Progress saved: 5 documents
Visiting: https://in.mathworks.com/help/slrealtime/index.html?s_tid=CRUX_lftnav
Successfully scraped: Simulink Real-Time Documentation...


Scraping MATLAB docs:   1%|          | 6/500 [00:36<49:02,  5.96s/it, URL=e/index.html?s_tid=CRUX_lftnav]

Found 29 new links
Visiting: https://in.mathworks.com/help/slrealtime/troubleshooting-in-slrt-target.html?s_tid=CRUX_lftnav
Successfully scraped: Troubleshooting in Simulink Real-Time - MATLAB &am...


Scraping MATLAB docs:   1%|▏         | 7/500 [00:45<56:17,  6.85s/it, URL=-target.html?s_tid=CRUX_lftnav]

Found 5 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html#responsive_offcanvas
Successfully scraped: Troubleshooting Basics - MATLAB &amp; Simulink...


Scraping MATLAB docs:   2%|▏         | 8/500 [00:51<55:21,  6.75s/it, URL=sics.html#responsive_offcanvas]

Found 0 new links
Visiting: https://in.mathworks.com/help/slrealtime/troubleshooting-in-slrt-target.html?s_tid=CRUX_topnav
Successfully scraped: Troubleshooting in Simulink Real-Time - MATLAB &am...


Scraping MATLAB docs:   2%|▏         | 9/500 [00:59<56:39,  6.92s/it, URL=-target.html?s_tid=CRUX_topnav]

Found 1 new links
Visiting: https://in.mathworks.com/help/slrealtime/examples.html?s_tid=CRUX_topnav&category=troubleshooting-in-slrt-target
Successfully scraped: Example List - MATLAB &amp; Simulink...


Scraping MATLAB docs:   2%|▏         | 10/500 [01:06<58:27,  7.16s/it, URL=troubleshooting-in-slrt-target]

Found 3 new links
Progress saved: 10 documents
Visiting: https://in.mathworks.com/help/slrealtime/referencelist.html?type=function&s_tid=CRUX_topnav&category=troubleshooting-in-slrt-target


Scraping MATLAB docs:   2%|▏         | 11/500 [01:11<50:44,  6.23s/it, URL=troubleshooting-in-slrt-target]

Successfully scraped: in.mathworks.com...
Found 0 new links
Visiting: https://in.mathworks.com/help/slrealtime/referencelist.html?type=block&s_tid=CRUX_topnav&category=troubleshooting-in-slrt-target
Successfully scraped: Troubleshooting in Simulink Real-Time — Blocks...


Scraping MATLAB docs:   2%|▏         | 12/500 [01:20<58:27,  7.19s/it, URL=troubleshooting-in-slrt-target]

Found 10 new links
Visiting: https://in.mathworks.com/help/slrealtime/referencelist.html?type=app&s_tid=CRUX_topnav&category=troubleshooting-in-slrt-target


Scraping MATLAB docs:   3%|▎         | 13/500 [01:24<50:34,  6.23s/it, URL=troubleshooting-in-slrt-target]

Successfully scraped: in.mathworks.com...
Found 0 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-communication-failure-through-firewall.html
Successfully scraped: Troubleshoot Communication Failure Through Firewal...


Scraping MATLAB docs:   3%|▎         | 14/500 [01:35<1:03:20,  7.82s/it, URL=-failure-through-firewall.html]

Found 13 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-cannot-load-shared-object-on-target-computer.html
Successfully scraped: Troubleshoot Cannot Load Shared Object on Target C...


Scraping MATLAB docs:   3%|▎         | 15/500 [01:42<59:48,  7.40s/it, URL=object-on-target-computer.html]

Found 12 new links
Progress saved: 15 documents
Visiting: https://in.mathworks.com/help/slrealtime/ug/vector-canape-troubleshooting.html
Successfully scraped: Troubleshoot Vector CANape Operation - MATLAB &amp...


Scraping MATLAB docs:   3%|▎         | 16/500 [01:50<1:01:55,  7.68s/it, URL=or-canape-troubleshooting.html]

Found 14 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/etas-inca-troubleshooting.html
Successfully scraped: Troubleshoot ETAS Inca Operation - MATLAB &amp; Si...


Scraping MATLAB docs:   3%|▎         | 17/500 [01:58<1:00:54,  7.57s/it, URL=etas-inca-troubleshooting.html]

Found 13 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug_upgrade/troubleshoot-system-upgrade-to-r2020b.html
Successfully scraped: Troubleshoot System Upgrade for R2020b - MATLAB &a...


Scraping MATLAB docs:   4%|▎         | 18/500 [02:04<58:46,  7.32s/it, URL=-system-upgrade-to-r2020b.html]

Found 6 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-missing-real-time-tab.html
Successfully scraped: Troubleshoot Missing Real-Time Tab - MATLAB &amp; ...


Scraping MATLAB docs:   4%|▍         | 19/500 [02:11<57:05,  7.12s/it, URL=oot-missing-real-time-tab.html]

Found 6 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-folder-names-with-spaces-or-special-characters-halt-model-builds.html
Successfully scraped: Troubleshoot Folder Names with Spaces or Special C...


Scraping MATLAB docs:   4%|▍         | 20/500 [02:18<55:42,  6.96s/it, URL=racters-halt-model-builds.html]

Found 8 new links
Progress saved: 20 documents
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-model-links-to-static-libraries-or-shared-objects.html


Scraping MATLAB docs:   4%|▍         | 21/500 [02:21<46:56,  5.88s/it, URL=braries-or-shared-objects.html]

Successfully scraped: in.mathworks.com...
Found 0 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-build-error-for-accelerator-mode.html
Successfully scraped: Troubleshoot Build Error for Accelerator Mode - MA...


Scraping MATLAB docs:   4%|▍         | 22/500 [02:28<50:39,  6.36s/it, URL=rror-for-accelerator-mode.html]

Found 8 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-long-build-times-for-real-time-application.html
Successfully scraped: Troubleshoot Long Build Times for Real-Time Applic...


Scraping MATLAB docs:   5%|▍         | 23/500 [02:34<48:33,  6.11s/it, URL=for-real-time-application.html]

Found 5 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-working-with-persistent-variables.html


Scraping MATLAB docs:   5%|▍         | 24/500 [02:38<42:52,  5.40s/it, URL=with-persistent-variables.html]

Successfully scraped: in.mathworks.com...
Found 0 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug_upgrade/troubleshoot-model-upgrade-to-r2020b.html
Successfully scraped: Troubleshoot Model Upgrade for R2020b - MATLAB &am...


Scraping MATLAB docs:   5%|▌         | 25/500 [02:46<48:42,  6.15s/it, URL=t-model-upgrade-to-r2020b.html]

Found 7 new links
Progress saved: 25 documents
Visiting: https://in.mathworks.com/help/slrealtime/ug_upgrade/troubleshoot-s-function-build-upgrade-for-r2020b.html
Successfully scraped: Troubleshoot S-Function Build Upgrade for R2020b -...


Scraping MATLAB docs:   5%|▌         | 26/500 [02:53<50:42,  6.42s/it, URL=-build-upgrade-for-r2020b.html]

Found 7 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-parameters-not-accessible-by-name.html
Successfully scraped: Troubleshoot Parameters Not Accessible by Name - M...


Scraping MATLAB docs:   5%|▌         | 27/500 [03:00<51:58,  6.59s/it, URL=rs-not-accessible-by-name.html]

Found 11 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-signals-not-accessible-by-name.html
Successfully scraped: Troubleshoot Signals Not Accessible by Name - MATL...


Scraping MATLAB docs:   6%|▌         | 28/500 [03:09<57:59,  7.37s/it, URL=ls-not-accessible-by-name.html]

Found 16 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-signal-data-logging-from-nonvirtual-bus-fixed-point-and-multidimensional-signals.html
Successfully scraped: Troubleshoot Signal Data Logging from Nonvirtual B...


Scraping MATLAB docs:   6%|▌         | 29/500 [03:17<59:16,  7.55s/it, URL=-multidimensional-signals.html]

Found 31 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-signal-data-logging-from-inport-ref-model.html
Successfully scraped: Troubleshoot Signal Data Logging from Inport in Re...


Scraping MATLAB docs:   6%|▌         | 30/500 [03:26<1:03:59,  8.17s/it, URL=ing-from-inport-ref-model.html]

Found 9 new links
Progress saved: 30 documents
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-signal-data-logging-from-inport-ref-mode-test-harness.html
Successfully scraped: Troubleshoot Signal Data Logging from Inport in Re...


Scraping MATLAB docs:   6%|▌         | 31/500 [03:33<1:00:13,  7.70s/it, URL=ort-ref-mode-test-harness.html]

Found 6 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-signal-data-logging-from-send-and-receive-blocks.html
Successfully scraped: Troubleshoot Signal Data Logging from Send and Rec...


Scraping MATLAB docs:   6%|▋         | 32/500 [03:41<1:00:05,  7.70s/it, URL=m-send-and-receive-blocks.html]

Found 12 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-signals-for-streaming-or-file-log-logging.html
Successfully scraped: Troubleshoot Signals for Streaming or File Logging...


Scraping MATLAB docs:   7%|▋         | 33/500 [03:47<56:08,  7.21s/it, URL=aming-or-file-log-logging.html]

Found 8 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-unsatisfactory-real-time-performance.html
Successfully scraped: Troubleshoot Unsatisfactory Real-Time Performance ...


Scraping MATLAB docs:   7%|▋         | 34/500 [03:54<56:26,  7.27s/it, URL=ory-real-time-performance.html]

Found 29 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-overloaded-cpu-from-executing-real-time-application.html
Successfully scraped: Troubleshoot Overloaded CPU from Executing Real-Ti...


Scraping MATLAB docs:   7%|▋         | 35/500 [04:03<59:06,  7.63s/it, URL=ing-real-time-application.html]

Found 18 new links
Progress saved: 35 documents
Visiting: https://in.mathworks.com/help/slrealtime/ug/troubleshoot-gaps-in-streamed-data.html
Successfully scraped: Troubleshoot Gaps in Streamed Data - MATLAB &amp; ...


Scraping MATLAB docs:   7%|▋         | 36/500 [04:09<55:14,  7.14s/it, URL=oot-gaps-in-streamed-data.html]

Found 7 new links
Visiting: https://in.mathworks.com/help/slrealtime/ug_upgrade/troubleshoot-matlab-api-call-upgrade-for-r2020b.html


Scraping MATLAB docs:   7%|▋         | 36/500 [04:15<54:46,  7.08s/it, URL=oot-gaps-in-streamed-data.html]


Scraping interrupted by user
Scraped 36 pages. Data saved to matlab_docs/matlab_docs.csv


In [None]:
import os
import time
import random
import pandas as pd
from tqdm import tqdm
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

class MATLABLinkScraper:
    """Breadth-first Selenium crawler that records every link it encounters.

    Starting from ``start_url``, pages on the same host are visited in FIFO
    order with a headless Chrome driver. Every ``<a href>`` and ``<img src>``
    found is appended to ``self.links_data`` as one record; only same-host
    hyperlinks are queued for visiting.

    Args:
        start_url: First page to visit; its host defines the allowed domain.
        delay_range: ``(min, max)`` seconds to sleep before each page load.
        output_dir: Directory that receives ``matlab_links.csv``.
    """

    # Column layout of the link table, also used when no link was collected.
    _LINK_COLUMNS = ["source_url", "source_title", "link_url", "link_text", "link_type"]

    def __init__(self, start_url, delay_range=(1.5, 3), output_dir="matlab_links"):
        self.start_url = start_url
        self.delay_range = delay_range
        self.visited_urls = set()
        self.to_visit = [start_url]
        self.base_domain = urlparse(start_url).netloc
        self.output_dir = output_dir
        self.links_data = []  # One dict per link found, in discovery order

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Setup Chrome options for Colab
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')

        # Add realistic user agent
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36")

        # Initialize the Chrome driver
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)

    @staticmethod
    def _normalize_url(url):
        """Return ``url`` without its ``#fragment``.

        A fragment only selects a position inside a page, so two URLs that
        differ only in the fragment would otherwise be crawled as two pages.
        """
        return urlparse(url)._replace(fragment="").geturl()

    def is_valid_url(self, url):
        """Check if URL is non-empty, on the start URL's host and not ``javascript:``."""
        if not url:
            return False
        if urlparse(url).netloc != self.base_domain:
            return False
        return "javascript:" not in url

    def _record_link(self, source_url, source_title, link_url, link_text, link_type):
        """Append one link record to ``self.links_data``."""
        self.links_data.append({
            "source_url": source_url,
            "source_title": source_title,
            "link_url": link_url,
            "link_text": link_text,
            "link_type": link_type
        })

    def extract_all_links(self):
        """Record all links (a href and img src) of the currently loaded page.

        Every hyperlink and image is appended to ``self.links_data`` with its
        absolute URL as found on the page.

        Returns:
            List of fragment-free hyperlink URLs that pass ``is_valid_url`` and
            are neither visited nor queued. Each URL appears at most once.
        """
        current_url = self.driver.current_url
        page_links = []
        source_page_title = self.driver.title

        try:
            # Snapshot of everything already known, built once per page so the
            # per-link membership test is O(1) instead of a list scan.
            known = set(self.to_visit)
            known.update(self.visited_urls)

            # Extract regular hyperlinks
            for element in self.driver.find_elements(By.TAG_NAME, "a"):
                try:
                    href = element.get_attribute("href")
                    if not href:
                        continue
                    # Text is only fetched for anchors that are actually recorded.
                    text = element.text
                    absolute_link = urljoin(current_url, href)
                    self._record_link(current_url, source_page_title,
                                      absolute_link, text, "hyperlink")

                    # Queue the fragment-free form, once per crawl.
                    target = self._normalize_url(absolute_link)
                    if target not in known and self.is_valid_url(target):
                        known.add(target)
                        page_links.append(target)

                except StaleElementReferenceException:
                    continue
                except Exception:
                    # Best effort: one unreadable anchor must not lose the page.
                    continue

            # Extract image links (recorded only, never queued)
            for element in self.driver.find_elements(By.TAG_NAME, "img"):
                try:
                    src = element.get_attribute("src")
                    if not src:
                        continue
                    alt = element.get_attribute("alt") or ""
                    self._record_link(current_url, source_page_title,
                                      urljoin(current_url, src), alt, "image")

                except StaleElementReferenceException:
                    continue
                except Exception:
                    # Best effort, same as for hyperlinks.
                    continue

        except Exception as e:
            print(f"Error finding links on {current_url}: {str(e)}")

        return page_links

    def save_links_to_csv(self):
        """Write all collected links to ``<output_dir>/matlab_links.csv``.

        Returns:
            ``pandas.DataFrame`` of the link records; the columns are present
            (and the CSV gets a header) even when no link was collected.
        """
        df = pd.DataFrame(self.links_data, columns=self._LINK_COLUMNS)
        csv_path = f"{self.output_dir}/matlab_links.csv"
        df.to_csv(csv_path, index=False)
        return df

    def crawl(self, max_pages=200):
        """Crawl up to ``max_pages`` pages and save the link table as CSV.

        The CSV is rewritten every 10 pages. The browser is always closed at
        the end, so the instance cannot crawl again. A ``KeyboardInterrupt``
        stops the crawl but still saves what was collected.

        Returns:
            The ``pandas.DataFrame`` written by ``save_links_to_csv``.
        """
        count = 0

        try:
            with tqdm(total=max_pages, desc="Scraping MATLAB links") as pbar:
                while self.to_visit and count < max_pages:
                    # Next URL in FIFO order, without fragment so that
                    # "page.html" and "page.html#anchor" count as one page.
                    url = self._normalize_url(self.to_visit.pop(0))

                    # Skip if already visited
                    if url in self.visited_urls:
                        continue
                    self.visited_urls.add(url)

                    try:
                        # Random delay to avoid detection
                        time.sleep(random.uniform(*self.delay_range))

                        self.driver.get(url)

                        # Check if we've been blocked
                        page_title = self.driver.title
                        if "Access Denied" in page_title or "Forbidden" in page_title:
                            print(f"Access denied for {url}, possibly blocked")
                            continue

                        # Extract links and add to queue
                        self.to_visit.extend(self.extract_all_links())

                        count += 1
                        pbar.update(1)
                        pbar.set_postfix({"URL": url[-30:]})

                        # Periodically save progress
                        if count % 10 == 0:
                            temp_df = self.save_links_to_csv()
                            print(f"Progress saved: {len(temp_df)} links collected")

                    except Exception as e:
                        print(f"Error processing {url}: {str(e)}")

        except KeyboardInterrupt:
            print("Scraping interrupted by user")

        finally:
            # Close the browser
            self.driver.quit()

        # Save final results as CSV
        final_df = self.save_links_to_csv()
        print(f"Scraped {len(final_df)} links from {count} pages. Data saved to {self.output_dir}/matlab_links.csv")

        return final_df

# Entry point: harvest links starting at the Simulink Real-Time troubleshooting page.
if __name__ == "__main__":
    start_url = "https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html"
    scraper = MATLABLinkScraper(start_url=start_url)
    links_df = scraper.crawl(200)  # Page cap; adjust as needed

    # Show the first rows of the collected link table
    print("\nSample of scraped links:", links_df.head(), sep="\n")

Scraping MATLAB links:   5%|▌         | 10/200 [01:24<22:52,  7.23s/it, URL=html?s_tid=hc_product_group_bc]

Progress saved: 1929 links collected


Scraping MATLAB links:  10%|█         | 20/200 [02:47<31:54, 10.64s/it, URL=arget&page=1&s_tid=CRUX_topnav]

Progress saved: 3714 links collected


Scraping MATLAB links:  15%|█▌        | 30/200 [03:36<12:04,  4.26s/it, URL=for-real-time-application.html]

Progress saved: 4381 links collected


Scraping MATLAB links:  20%|██        | 40/200 [04:35<15:31,  5.82s/it, URL=aming-or-file-log-logging.html]

Progress saved: 5285 links collected


Scraping MATLAB links:  25%|██▌       | 50/200 [05:40<14:46,  5.91s/it, URL=ments/piracy.html?s_tid=gf_pir]

Progress saved: 6491 links collected


Scraping MATLAB links:  30%|███       | 60/200 [06:59<22:52,  9.80s/it, URL=nal-category:recwebinar&page=1]

Progress saved: 8385 links collected


Scraping MATLAB links:  35%|███▌      | 70/200 [08:32<17:14,  7.96s/it, URL=orks.html?s_tid=nav_company_dc]

Progress saved: 10520 links collected


Scraping MATLAB links:  40%|████      | 80/200 [10:06<20:19, 10.16s/it, URL=lligence.html?s_tid=hp_hero_ai]

Progress saved: 12992 links collected


Scraping MATLAB links:  45%|████▌     | 90/200 [11:16<13:18,  7.26s/it, URL=ml?s_tid=hp_solutions_robotics]

Progress saved: 14371 links collected


Scraping MATLAB links:  50%|█████     | 100/200 [12:27<12:11,  7.31s/it, URL=ml?s_tid=hp_teaching_resources]

Progress saved: 15763 links collected


Scraping MATLAB links:  52%|█████▏    | 103/200 [12:59<12:13,  7.56s/it, URL=t_sales.html?s_tid=hp_trial_cs]


Scraping interrupted by user
Scraped 16347 links from 103 pages. Data saved to matlab_links/matlab_links.csv

Sample of scraped links:
                                          source_url  \
0  https://in.mathworks.com/help/slrealtime/ug/tr...   
1  https://in.mathworks.com/help/slrealtime/ug/tr...   
2  https://in.mathworks.com/help/slrealtime/ug/tr...   
3  https://in.mathworks.com/help/slrealtime/ug/tr...   
4  https://in.mathworks.com/help/slrealtime/ug/tr...   

                                     source_title  \
0  Troubleshooting Basics - MATLAB &amp; Simulink   
1  Troubleshooting Basics - MATLAB &amp; Simulink   
2  Troubleshooting Basics - MATLAB &amp; Simulink   
3  Troubleshooting Basics - MATLAB &amp; Simulink   
4  Troubleshooting Basics - MATLAB &amp; Simulink   

                                            link_url           link_text  \
0  https://in.mathworks.com/help/slrealtime/ug/tr...     Skip to content   
1      https://in.mathworks.com/?s_tid=user_nav_logo     

Trying out Crawl4AI

In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

async def main():
    """Run a breadth-first deep crawl of https://example.com and print a summary.

    The crawl is configured with ``max_depth=2`` and ``include_external=False``.
    Prints the total number of results, then the URL and the ``depth`` metadata
    entry (0 when absent) of the first three results.
    """
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    async with AsyncWebCrawler() as crawler:
        pages = await crawler.arun("https://example.com", config=run_config)

        print(f"Crawled {len(pages)} pages in total")

        # Only the first three results are shown
        for page in pages[:3]:
            print(f"URL: {page.url}")
            print(f"Depth: {page.metadata.get('depth', 0)}")

if __name__ == "__main__":
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running (plain script): asyncio.run can own the loop.
        asyncio.run(main())
    else:
        # A notebook kernel already runs an event loop in this thread, where
        # asyncio.run() raises RuntimeError. Schedule the coroutine on that loop
        # instead; `await crawl_task` in a later cell waits for it to finish.
        crawl_task = running_loop.create_task(main())
