In [7]:
import requests
from bs4 import BeautifulSoup
# Normalize relative links into absolute URLs
from urllib.parse import urljoin, urlparse, urlunparse
import logging
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Class that recursively scrapes all internal links from a given base URL
class AdvancedWebScraper:
# Initialize base URL, depth limit, timeout, and link tracking
    def __init__(self, base_url, max_depth=5, timeout=15):
        """
        Initialize an advanced web scraper with comprehensive link extraction
        
        :param base_url: Starting URL to scrape
        :param max_depth: Maximum depth of recursive scraping
        :param timeout: Request timeout in seconds
        """
        # Normalize the base URL
        parsed_base = urlparse(base_url)
        self.base_domain = parsed_base.netloc.lower()
        self.base_scheme = parsed_base.scheme
# Define the base URL to start scraping (sensitive info redacted)
base_url = "<BASE_URL>"  # REPLACE with target domain
        
        self.max_depth = max_depth
        self.timeout = timeout
        self.visited_urls = set()
        self.all_links = set()
        
        # Configure logging
# Configure logging to capture status and errors
        logging.basicConfig(level=logging.INFO, 
                            format='%(asctime)s - %(levelname)s: %(message)s')
        self.logger = logging.getLogger(__name__)
        
        # Create a robust session with retry mechanism
# Create a requests session with retry configuration
        self.session = requests.Session()
# Set up HTTP retry strategy to handle connection errors gracefully
        retry_strategy = Retry(
            total=3,
            backoff_factor=0.3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=['GET', 'HEAD']
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
    
    def normalize_url(self, url):
        """
        Normalize URLs to remove tracking parameters, fragments, etc.
        
        :param url: URL to normalize
        :return: Normalized URL
        """
        try:
            parsed = urlparse(url)
            # Remove fragment and query parameters related to tracking
            cleaned_parsed = parsed._replace(
                fragment='',
                query=re.sub(r'(utm_\w+|ref|track)=[^&]*', '', parsed.query)
            )
            normalized = urlunparse(cleaned_parsed)
            
            # Remove trailing slash for consistency
            return normalized.rstrip('/')
        except Exception as e:
            self.logger.error(f"Error normalizing URL {url}: {e}")
            return url
    
    def is_valid_url(self, url):
        """
        Comprehensive URL validation
        
        :param url: URL to validate
        :return: Boolean indicating if URL is valid
        """
        try:
            parsed = urlparse(url)
            
            # Extensive filtering
            return (
                parsed.scheme in ['http', 'https'] and 
                parsed.netloc.lower() == self.base_domain and
                not any(ext in url.lower() for ext in [
                    '.pdf', '.jpg', '.jpeg', '.png', '.gif', 
                    '.css', '.js', '.xml', '.svg', '.ico', 
                    '.mp4', '.mp3', '.zip', '.csv', '.xls'
                ]) and
                not re.search(r'(#|@|\?)', url)  # Exclude anchors, emails, complex queries
            )
        except Exception as e:
            self.logger.error(f"Error validating URL {url}: {e}")
            return False
    
    def extract_links(self, html_content, base_url):
        """
        Advanced link extraction method
        
        :param html_content: HTML content of the page
        :param base_url: Base URL of the current page
        :return: Set of extracted links
        """
        links = set()
        
        try:
            # Use BeautifulSoup for parsing
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # Extract links from various sources
            link_sources = [
                soup.find_all('a', href=True),  # Standard links
                soup.find_all('link', href=True),  # Link tags
                soup.find_all(attrs={'data-href': True}),  # Custom data attributes
            ]
            
            for source in link_sources:
                for link in source:
                    # Get href or data-href attribute
                    href = link.get('href') or link.get('data-href', '')
                    
                    # Convert to absolute URL
# Normalize relative links into absolute URLs
                    full_url = urljoin(base_url, href)
                    
                    # Normalize and validate
                    normalized_url = self.normalize_url(full_url)
                    
                    if self.is_valid_url(normalized_url):
                        links.add(normalized_url)
        
        except Exception as e:
            self.logger.error(f"Error extracting links from {base_url}: {e}")
        
        return links
    
    def scrape_page(self, url, depth=0):
        """
        Recursive page scraping with advanced error handling
        
        :param url: URL to scrape
        :param depth: Current recursion depth
        """
        # Avoid exceeding max depth or revisiting URLs
        if (depth > self.max_depth or 
            url in self.visited_urls):
            return
        
        try:
            # Mark URL as visited
            self.visited_urls.add(url)
            self.logger.info(f"Scraping: {url} (Depth: {depth})")
            
            # Fetch webpage with robust session
            response = self.session.get(
                url, 
                timeout=self.timeout, 
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
                }
            )
            response.raise_for_status()
            
            # Extract and process links
            page_links = self.extract_links(response.text, url)
            
            for link in page_links:
                # Add unique links
                if link not in self.all_links:
                    self.all_links.add(link)
                    
                    # Recursive scraping
                    self.scrape_page(link, depth + 1)
        
        except requests.RequestException as e:
            self.logger.error(f"Request error scraping {url}: {e}")
        except Exception as e:
            self.logger.error(f"Unexpected error scraping {url}: {e}")
    
    def start_scraping(self):
        """
        Initiate the comprehensive web scraping process
        
        :return: Set of unique links within the domain
        """
        self.logger.info(f"Starting advanced domain-confined scraping from {self.base_url}")
        self.scrape_page(self.base_url)
        
        self.logger.info(f"Scraping complete. Total unique links found: {len(self.all_links)}")
        return self.all_links
# Example usage
def main():
    # Example website to scrape (replace with your target website)
# Define the base URL to start scraping (sensitive info redacted)
base_url = "<BASE_URL>"  # REPLACE with target domain
    
    # Create scraper instance
# Initialize scraper with base URL and desired depth
    scraper = AdvancedWebScraper(base_url, max_depth=50)
    
    # Start scraping and get links
    links = scraper.start_scraping()
    
    # Print all discovered links
    print("Discovered Links:")
    for link in sorted(links):
        print(link)
    
    # Print summary
    print(f"\nTotal unique links found: {len(links)}")
    print(links)

if __name__ == "__main__":
    main()

2025-02-24 23:14:00,750 - INFO: Starting advanced domain-confined scraping from ://
2025-02-24 23:14:00,752 - INFO: Scraping: :// (Depth: 0)
2025-02-24 23:14:00,758 - ERROR: Request error scraping ://: No connection adapters were found for '://'
2025-02-24 23:14:00,760 - INFO: Scraping complete. Total unique links found: 0


Discovered Links:

Total unique links found: 0
set()


In [12]:
import pandas as pd

def extract_excel_column(file_path, sheet_name, column_name):
    """
    Extract data from a specific column in an Excel file.
    
    Parameters:
    file_path (str): Path to the Excel file
    sheet_name (str): Name of the sheet to extract data from
    column_name (str): Name of the column to extract data from
    
    Returns:
    list: List containing the values from the specified column
    """
    try:
        # Read the Excel file
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        # Check if the column exists
        if column_name not in df.columns:
            print(f"Error: Column '{column_name}' not found in sheet '{sheet_name}'.")
            return []
        
        # Extract data from the specified column into a list
        column_data = df[column_name].tolist()
        
        return column_data
    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error: {str(e)}")
        return []

# Example usage
if __name__ == "__main__":
    file_path = "C:\\Users\\hahaha\\Downloads\\MVS Top 10 Instructions.xlsx"  # Replace with your Excel file path
    sheet_name = "Top 10"                 # Replace with your sheet name
    column_name = "URL"               # Replace with your column name
    
    urls = extract_excel_column(file_path, sheet_name, column_name)
    

In [13]:
len(urls)
print(urls)

['https://www.medviewsystems.com//freestyle-lite/', 'https://www.medviewsystems.com//iv-piggy-back-procedure/', 'https://www.medviewsystems.com//picc-line-dressing-change-with-biopatch-and-griplock/', 'https://www.medviewsystems.com//sapphire-pump-infusion-pump/', 'https://www.medviewsystems.com//cadd-solis-ambulatory-infusion-pump/', 'https://www.medviewsystems.com//freedom-60-infusionspumpe-syringe-infusion-system/', 'https://www.medviewsystems.com//invacare-g-series-bed/', 'https://www.medviewsystems.com//salter-aire-elite-compressor/', 'https://www.medviewsystems.com//airsense-11-resmed-cpap/', 'https://www.medviewsystems.com//nuvo-lite-mark-5-oxygen-concentrator-gce/']


In [14]:
import requests
import os

# List of URLs to send requests to
headers = {
    'Authorization': 'jina_e84e30931c6f42df9f2850db7d500612JKXdKvo9ldrvdDbYyEfI0kQSOb6A',
    'X-Return-Format': 'markdown'
}

# Path for the output text file
file_path = 'medviewrag_demo.txt'

# Open the file in append mode ('a') to add new data without overwriting existing content
with open(file_path, 'a') as txt_file:
    # Loop through each URL and make a GET request
    for url in urls:
        url = 'https://r.jina.ai/'+url
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            # If the request was successful, write the response text to the file
            # You can directly write the JSON string, or convert it to text (string format)
            txt_file.write(f"Response from {url}:\n")
            txt_file.write(response.text)  # Write the response as plain text
            txt_file.write("\n\n")  # Add some spacing between responses for readability
        else:
            # Handle failed requests by writing a message
            txt_file.write(f"Failed to retrieve data from {url} with status code {response.status_code}\n\n")

print(f"All responses have been appended to {file_path}")


All responses have been appended to medviewrag_demo.txt
