In [2]:
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser

import requests

def is_allowed_by_robots_txt(url, user_agent='*', *, timeout=10):
    """Return whether ``url`` may be crawled according to its site's robots.txt.

    The robots.txt file is downloaded from the root of the URL's host and
    evaluated with the standard library's ``urllib.robotparser``, so that
    user-agent groups, ``Allow`` lines, inline comments and empty
    ``Disallow:`` lines (which mean "nothing is disallowed") are handled
    by the stdlib parser instead of ad-hoc string matching.

    Args:
        url: Absolute URL whose crawlability should be checked.
        user_agent: Crawler name to evaluate the rules for. ``'*'`` selects
            the default (catch-all) group only.
        timeout: Seconds to wait for the robots.txt request before giving
            up. Passed straight through to ``requests.get``.

    Returns:
        ``True`` if the rules permit fetching ``url``; ``False`` if they
        forbid it or if robots.txt could not be retrieved (network error,
        timeout, or a 4xx/5xx response).
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # robots.txt always lives at the root of the host.
    robots_txt_url = urljoin(base_url, '/robots.txt')
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(robots_txt_url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
    except requests.RequestException as e:
        print(f"Error fetching {robots_txt_url}: {e}")
        return False  # Fail closed: assume disallowed if robots.txt cannot be fetched

    # Feed the already-downloaded text to the parser rather than letting it
    # fetch the file itself, so the request keeps using `requests` and the
    # error handling above stays in one place.
    parser = RobotFileParser()
    parser.parse(response.text.splitlines())
    return parser.can_fetch(user_agent, url)

# Example usage: runs only when the file is executed as a script, not on import.
if __name__ == "__main__":
    url = "https://github.com/hp1004/CoinWatch--Crypto-Search-and-Trade/tree/main/build"
    if is_allowed_by_robots_txt(url):
        print(f"The URL '{url}' is allowed to be crawled.")
    else:
        print(f"The URL '{url}' is NOT allowed to be crawled.")