<a href="https://colab.research.google.com/github/Nifty0x/web-scraper/blob/main/Insights_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Install required packages
!pip install undetected-chromedriver
!apt-get update
!apt install -y chromium-browser xvfb

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import time
import os

def setup_driver():
    """Create a headless Chrome driver through undetected-chromedriver.

    Returns:
        The object produced by ``uc.Chrome`` on success, or ``None`` when
        construction raises (the error is printed, not re-raised).
    """
    chrome_opts = uc.ChromeOptions()
    # Same three command-line flags, added in the original order.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_opts.add_argument(flag)

    try:
        return uc.Chrome(options=chrome_opts)
    except Exception as err:
        print(f"Error setting up driver: {err}")
    return None

def extract_claim_urls(base_url):
    """Collect claim-project links from a page rendered in headless Chrome.

    The page at ``base_url`` is loaded with the driver returned by
    ``setup_driver()``, given a fixed 15 seconds to render, and every ``<a>``
    element whose ``href`` contains ``'claim-project'`` is collected.

    Args:
        base_url: URL of the page to scan.

    Returns:
        A list of the matching href values in the order the driver returned
        the elements (duplicates are kept). The list is empty when the driver
        cannot be created; if loading or scanning fails part-way, whatever
        was collected up to that point is returned.
    """
    print("Initializing driver...")
    driver = setup_driver()

    if not driver:
        print("Failed to initialize driver")
        return []

    claim_urls = []

    try:
        print(f"\nAccessing {base_url}...")
        driver.get(base_url)

        # Fixed pause to give the page time to render before it is scanned.
        print("Waiting for page to load...")
        time.sleep(15)  # Increased wait time

        # Find all links
        print("Searching for links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        print(f"Found {len(links)} total links")

        # Filter for claim URLs
        for link in links:
            try:
                href = link.get_attribute('href')
                if href and 'claim-project' in href:
                    claim_urls.append(href)
                    print(f"Found claim URL: {href}")
            except Exception as e:
                # A single unreadable element must not abort the whole scan.
                print(f"Error processing link: {e}")

    except Exception as e:
        print(f"Error accessing page: {e}")

    finally:
        # Shutting the browser down stays best-effort, but only ordinary
        # errors are suppressed: a bare ``except`` would also swallow
        # KeyboardInterrupt / SystemExit and hide why the shutdown failed.
        try:
            driver.quit()
        except Exception as e:
            print(f"Error closing driver: {e}")

    return claim_urls

# Entry point for this cell: scan the site root and report what was found.
print("Setting up the environment...")
base_url = "https://insights.rwa.io"
print("\nStarting URL extraction...")

try:
    claim_urls = extract_claim_urls(base_url)
    print("\nSummary of found claim URLs:")
    if not claim_urls:
        print("No claim URLs found.")
    else:
        # One URL per line, in the order they were collected.
        for url in claim_urls:
            print(url)
except Exception as e:
    print(f"An error occurred: {e}")

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting websockets (from undetected-chromedriver)
  Downloading websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (164 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.1/164.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: undetected-chromedriver
  Building wheel for undetected-chromedriver (setup.py) ... [?25l[?25hdone
  Created w

In [7]:
# Install required packages
!pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import re
import time

def extract_claim_urls(base_url):
    """Fetch ``base_url`` and return the claim-project URLs found in its HTML.

    Two places are searched: the ``href`` of every ``<a>`` tag and the
    ``data-href`` attribute of any element. A value matches when it contains
    ``claim-project``. Relative values are resolved against the URL the
    response was actually served from, so links found on a sub-page are not
    glued onto that sub-page's path.

    Args:
        base_url: URL of the page to download and scan.

    Returns:
        A list of absolute URLs without duplicates, in discovery order.
        Request failures (including timeouts and non-2xx statuses) are
        printed and yield whatever was collected so far, normally ``[]``.
    """
    # Local import: the import lines of this cell do not provide urljoin.
    from urllib.parse import urljoin

    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    claim_urls = []

    try:
        print(f"Accessing {base_url}...")
        # A timeout keeps one unresponsive page from hanging the whole scan.
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes

        print("Parsing page content...")
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links
        links = soup.find_all('a', href=True)
        print(f"Found {len(links)} total links")

        # Pattern to match claim project URLs
        claim_pattern = re.compile(r'claim-project')

        # Filter for claim URLs
        for link in links:
            href = link['href']
            if claim_pattern.search(href):
                # urljoin leaves absolute URLs untouched and resolves
                # root-relative, page-relative and scheme-relative ones.
                full_url = urljoin(response.url, href)
                if full_url not in claim_urls:
                    claim_urls.append(full_url)
                    print(f"Found claim URL: {full_url}")

        # Also try to find any data-attributes or other elements that might contain the URLs
        elements = soup.find_all(attrs={"data-href": claim_pattern})
        for element in elements:
            href = element.get('data-href')
            if href:
                full_url = urljoin(response.url, href)
                if full_url not in claim_urls:
                    claim_urls.append(full_url)
                    print(f"Found claim URL from data attribute: {full_url}")

    except requests.exceptions.RequestException as e:
        print(f"Error accessing the page: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    return claim_urls

def scan_multiple_pages(base_url):
    """Run ``extract_claim_urls`` over a fixed set of site paths and merge the results.

    Args:
        base_url: Site root; each candidate path is appended to it.

    Returns:
        A list of the distinct URLs collected across all paths. It is built
        from a set, so its order is not defined.
    """
    # Candidate locations, starting with the root page.
    candidate_paths = (
        "",
        "/insights",
        "/project",
        "/list-your-project/claim-project",
        "/browse",
        "/search",
    )
    site_root = base_url.rstrip('/')
    unique_urls = set()  # a set drops repeats across pages

    for candidate in candidate_paths:
        page_url = f"{site_root}/{candidate.lstrip('/')}"
        print(f"\nChecking path: {page_url}")
        unique_urls.update(extract_claim_urls(page_url))
        time.sleep(2)  # short pause after every request

    return list(unique_urls)

# Main execution: scan the known paths and print every distinct claim URL.
print("Starting URL extraction...")
base_url = "https://insights.rwa.io"

try:
    claim_urls = scan_multiple_pages(base_url)

    print("\nSummary of all found claim URLs:")
    if claim_urls:
        for url in sorted(claim_urls):
            print(url)
        print(f"\nTotal unique claim URLs found: {len(claim_urls)}")
    else:
        print("No claim URLs found.")

        # Debugging aid: show the status code and the start of the raw page.
        print("\nTrying to fetch and print page source for debugging...")
        # Bounded wait so the debug fetch cannot hang the cell indefinitely;
        # a timeout is reported by the handler below like any other error.
        response = requests.get(base_url, timeout=10)
        print(f"Status Code: {response.status_code}")
        print("\nFirst 1000 characters of page source:")
        print(response.text[:1000])

except Exception as e:
    print(f"An error occurred: {e}")

Starting URL extraction...

Checking path: https://insights.rwa.io/
Accessing https://insights.rwa.io/...
Parsing page content...
Found 46 total links

Checking path: https://insights.rwa.io/insights
Accessing https://insights.rwa.io/insights...
Error accessing the page: 404 Client Error: Not Found for url: https://insights.rwa.io/insights

Checking path: https://insights.rwa.io/project
Accessing https://insights.rwa.io/project...
Error accessing the page: 404 Client Error: Not Found for url: https://insights.rwa.io/project

Checking path: https://insights.rwa.io/list-your-project/claim-project
Accessing https://insights.rwa.io/list-your-project/claim-project...
Parsing page content...
Found 1 total links

Checking path: https://insights.rwa.io/browse
Accessing https://insights.rwa.io/browse...
Error accessing the page: 404 Client Error: Not Found for url: https://insights.rwa.io/browse

Checking path: https://insights.rwa.io/search
Accessing https://insights.rwa.io/search...
Error acc

In [8]:
import requests
from bs4 import BeautifulSoup
import json

def get_project_claim_url(project_slug):
    """Build the query-string claim URL for a project slug.

    The slug is percent-encoded so characters such as spaces, ``&``, ``=`` or
    ``#`` cannot break or truncate the query string. Slugs made only of
    letters, digits, ``-``, ``.``, ``_`` and ``~`` come out unchanged.

    Args:
        project_slug: Project identifier to place in the ``slug`` parameter.

    Returns:
        The claim URL as a string.
    """
    # Local import: the import lines of this cell do not provide quote.
    from urllib.parse import quote

    # str() keeps non-string slugs working the way plain f-string
    # interpolation did; safe='' also encodes '/' inside the value.
    encoded_slug = quote(str(project_slug), safe='')
    return f"https://insights.rwa.io/list-your-project/claim-project?slug={encoded_slug}"

def check_url_exists(url, timeout=10):
    """Fetch ``url`` and report whether it answered with HTTP 200.

    Args:
        url: Address to request with a browser-like User-Agent.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            silent server cannot block the caller forever.

    Returns:
        ``(True, body_text)`` for a 200 response, ``(False, body_text)`` for
        any other status, and ``(False, None)`` when the request itself fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.status_code == 200, response.text
    except Exception:
        # Any request failure still means "not reachable", but unlike a bare
        # ``except`` this lets KeyboardInterrupt / SystemExit propagate.
        return False, None

def extract_project_slugs():
    """Extract project slugs from the main page's ``__NEXT_DATA__`` JSON.

    The page at https://insights.rwa.io is downloaded and every
    ``<script id="__NEXT_DATA__">`` tag is parsed as JSON. Slugs are read
    from ``props -> pageProps -> projects[*].slug``.

    Returns:
        A list of the distinct slugs found (built from a set, so the order is
        not defined). Empty when the page cannot be fetched or no slug is
        present.
    """
    url = "https://insights.rwa.io"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Bounded wait so an unresponsive server cannot hang the cell.
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for any script tags containing __NEXT_DATA__
        scripts = soup.find_all('script', {'id': '__NEXT_DATA__'})

        slugs = set()
        for script in scripts:
            try:
                data = json.loads(script.string)
                # NOTE(review): the props -> pageProps -> projects layout is an
                # assumption about the page's data structure — confirm against
                # the real payload and adjust if needed.
                if 'props' in data and 'pageProps' in data['props']:
                    projects_data = data['props']['pageProps'].get('projects', [])
                    for project in projects_data:
                        if 'slug' in project:
                            slugs.add(project['slug'])
            except (TypeError, ValueError, KeyError, AttributeError) as e:
                # Covers an empty script tag (json.loads(None)), invalid JSON
                # and an unexpected data shape. The script is still skipped,
                # but the reason is now reported instead of silently dropped.
                print(f"Skipping unreadable __NEXT_DATA__ script: {e}")
                continue

        return list(slugs)
    except Exception as e:
        print(f"Error extracting project slugs: {e}")
        return []

def main():
    """Probe claim URLs for the known 'domani' project and for any slugs found on the main page."""
    print("Starting URL extraction...")

    # Step 1: the project that is already known.
    known_url = get_project_claim_url("domani")
    known_ok, _ = check_url_exists(known_url)
    if known_ok:
        print(f"Found valid claim URL: {known_url}")

    # Step 2: whatever slugs the main page exposes.
    print("\nAttempting to extract other project slugs...")
    discovered = extract_project_slugs()

    if not discovered:
        print("No additional project slugs found")
    else:
        print(f"\nFound {len(discovered)} project slugs")
        for candidate in discovered:
            candidate_url = get_project_claim_url(candidate)
            if check_url_exists(candidate_url)[0]:
                print(f"Valid claim URL: {candidate_url}")

    # Step 3: both hand-written URL shapes for 'domani'.
    print("\nTrying direct URL construction for 'domani':")
    for direct_url in (
        "https://insights.rwa.io/list-your-project/claim-project?slug=domani",
        "https://insights.rwa.io/list-your-project/claim-project/domani",
    ):
        if check_url_exists(direct_url)[0]:
            print(f"Valid URL found: {direct_url}")

if __name__ == "__main__":
    main()

Starting URL extraction...
Found valid claim URL: https://insights.rwa.io/list-your-project/claim-project?slug=domani

Attempting to extract other project slugs...
No additional project slugs found

Trying direct URL construction for 'domani':
Valid URL found: https://insights.rwa.io/list-your-project/claim-project?slug=domani
Valid URL found: https://insights.rwa.io/list-your-project/claim-project/domani


In [9]:
import requests
import time
from urllib.parse import quote

def check_url_exists(url, timeout=10):
    """Fetch ``url`` and report whether it answered with HTTP 200.

    Args:
        url: Address to request with a browser-like User-Agent.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            silent server cannot stall the whole slug loop.

    Returns:
        ``(True, body_text)`` for a 200 response, ``(False, body_text)`` for
        any other status, and ``(False, None)`` when the request raises (the
        error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.status_code == 200, response.text
    except Exception as e:
        # Timeouts land here too and are reported like any other failure.
        print(f"Error checking URL {url}: {e}")
        return False, None

def get_claim_urls(slug):
    """Return the two candidate claim URLs for ``slug``.

    The slug is lower-cased and percent-encoded with ``quote`` before use.

    Args:
        slug: Project identifier.

    Returns:
        A two-item list: the ``?slug=`` query form first, then the
        path-segment form.
    """
    claim_root = "https://insights.rwa.io/list-your-project/claim-project"
    safe_slug = quote(slug.lower())
    query_form = f"{claim_root}?slug={safe_slug}"
    path_form = f"{claim_root}/{safe_slug}"
    return [query_form, path_form]

def check_project_slug(slug):
    """Try every candidate claim URL for ``slug`` and report each outcome.

    Args:
        slug: Project identifier handed to ``get_claim_urls``.

    Returns:
        The candidate URLs that ``check_url_exists`` reported as existing,
        in the order they were tried.
    """
    print(f"\nChecking URLs for project: {slug}")
    confirmed = []

    for candidate in get_claim_urls(slug):
        # Only the boolean half of the (exists, body) pair matters here.
        reachable = check_url_exists(candidate)[0]
        if not reachable:
            print(f"✗ Invalid URL: {candidate}")
            continue
        confirmed.append(candidate)
        print(f"✓ Valid URL found: {candidate}")

    return confirmed

def main():
    """Check a hand-picked list of slugs and print the claim URLs that respond."""
    # Candidate slugs: known tokens plus common naming variations.
    project_slugs = (
        "domani",
        "DEXTF",
        "dextf",
        "defi-exchange-traded-funds",
        "goldfinch",
        "centrifuge",
        "maple",
        "maple-finance",
        "rwa-market",
        "rwa",
        "blocktower",
        "blocktower-credit",
        "real-world-assets",
        "ondo",
        "ondo-finance",
        "mcp",
        "monetalis",
        "monetalis-clydesdale",
    )

    print("Starting comprehensive URL check...")
    hits_by_slug = {}

    for candidate in project_slugs:
        hits = check_project_slug(candidate)
        if hits:
            hits_by_slug[candidate] = hits
        time.sleep(1)  # one-second pause after each slug

    print("\n=== Summary of Results ===")
    if not hits_by_slug:
        print("No valid claim URLs found beyond the original domani project.")
    else:
        print(f"\nFound {len(hits_by_slug)} projects with valid claim URLs:")
        for name, hits in hits_by_slug.items():
            print(f"\nProject: {name}")
            for hit in hits:
                print(f"  {hit}")

    print("\nNote: This list may not be exhaustive. Consider checking specific project names or tokens you're interested in.")

if __name__ == "__main__":
    main()

Starting comprehensive URL check...

Checking URLs for project: domani
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project?slug=domani
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project/domani

Checking URLs for project: DEXTF
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project?slug=dextf
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project/dextf

Checking URLs for project: dextf
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project?slug=dextf
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project/dextf

Checking URLs for project: defi-exchange-traded-funds
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project?slug=defi-exchange-traded-funds
✓ Valid URL found: https://insights.rwa.io/list-your-project/claim-project/defi-exchange-traded-funds

Checking URLs for project: goldfinch
✓ Valid URL found: https://insights.rwa.io/list-your-pro

In [11]:
import requests
import time
import json
import xml.etree.ElementTree as ET
from xml.dom import minidom
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

def check_url_exists(url, timeout=10):
    """Return True when a GET of ``url`` answers with HTTP 200.

    Args:
        url: Address to request with a browser-like User-Agent.
        timeout: Passed to ``requests.get`` as its ``timeout`` argument.

    Returns:
        ``True`` for status code 200; ``False`` for any other status or when
        the request raises.
    """
    # NOTE(review): a 200 only shows the server answered; if the site serves
    # a generic page for unknown slugs this would still be True — confirm.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        reply = requests.get(url, headers=browser_headers, timeout=timeout)
    except Exception:
        return False
    return reply.status_code == 200

def check_project(slug):
    """Check both claim-URL patterns for a project.

    The slug is percent-encoded before it is placed in the URLs, so reserved
    characters cannot change the URL structure. Slugs made only of letters,
    digits, ``-``, ``.``, ``_`` and ``~`` produce the same URLs as before.

    Args:
        slug: Project identifier.

    Returns:
        A dict with three keys: ``'slug'`` (the value passed in, unencoded),
        ``'valid_urls'`` (the candidate URLs for which ``check_url_exists``
        returned a truthy value, query form first) and ``'valid'`` (True when
        that list is non-empty).
    """
    # Local import: the import lines of this cell do not provide quote.
    from urllib.parse import quote

    base_url = "https://insights.rwa.io/list-your-project/claim-project"
    # safe="" also encodes '/', since the slug is one query value / one path
    # segment; str() keeps non-string slugs working as f-strings did.
    encoded_slug = quote(str(slug), safe="")
    urls = [
        f"{base_url}?slug={encoded_slug}",
        f"{base_url}/{encoded_slug}"
    ]

    valid_urls = [url for url in urls if check_url_exists(url)]

    return {
        'slug': slug,
        'valid_urls': valid_urls,
        'valid': len(valid_urls) > 0
    }

def save_results_xml(results, timestamp):
    """Write the check results to ``rwa_claim_urls_<timestamp>.xml``.

    The document has a ``<metadata>`` element (timestamp, total and valid
    project counts) followed by a ``<projects>`` element with one
    ``<project>`` per result.

    Args:
        results: Sequence of dicts with ``'slug'``, ``'valid'`` and
            ``'valid_urls'`` keys.
        timestamp: String stored in the metadata and used in the file name.
    """
    root = ET.Element("rwa_claim_urls")

    # <metadata>: run timestamp plus the two counters.
    metadata = ET.SubElement(root, "metadata")
    ET.SubElement(metadata, "timestamp").text = timestamp
    ET.SubElement(metadata, "total_projects").text = str(len(results))
    valid_count = sum(1 for entry in results if entry['valid'])
    ET.SubElement(metadata, "valid_projects").text = str(valid_count)

    # <projects>: one <project> node per result, valid or not.
    projects = ET.SubElement(root, "projects")
    for entry in results:
        node = ET.SubElement(projects, "project")
        ET.SubElement(node, "slug").text = entry['slug']
        ET.SubElement(node, "valid").text = str(entry['valid'])
        url_list = ET.SubElement(node, "urls")
        for address in entry['valid_urls']:
            ET.SubElement(url_list, "url").text = address

    # Round-trip through minidom to get indented output.
    raw_xml = ET.tostring(root)
    pretty_xml = minidom.parseString(raw_xml).toprettyxml(indent="  ")

    filename = f"rwa_claim_urls_{timestamp}.xml"
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(pretty_xml)
    print(f"Results saved to {filename}")

def save_results_txt(results, timestamp):
    """Write a plain-text report to ``rwa_claim_urls_<timestamp>.txt``.

    The report has a header with the timestamp and the two counters, then
    one section per valid project listing its URLs.

    Args:
        results: Sequence of dicts with ``'slug'``, ``'valid'`` and
            ``'valid_urls'`` keys.
        timestamp: String printed in the header and used in the file name.
    """
    filename = f"rwa_claim_urls_{timestamp}.txt"
    with open(filename, 'w', encoding='utf-8') as out:
        emit = out.write

        # Header block.
        emit("RWA.io Claim URLs Results\n")
        emit("=" * 50 + "\n\n")
        emit(f"Timestamp: {timestamp}\n")
        emit(f"Total projects checked: {len(results)}\n")
        emit(f"Projects with valid claim URLs: {len([r for r in results if r['valid']])}\n\n")
        emit("Valid Claim URLs by Project:\n")
        emit("-" * 50 + "\n\n")

        # One section per valid project; invalid ones are left out.
        for entry in results:
            if not entry['valid']:
                continue
            emit(f"Project: {entry['slug']}\n")
            for address in entry['valid_urls']:
                emit(f"  {address}\n")
            emit("\n")
    print(f"Results saved to {filename}")

def process_slugs(slugs):
    """Run ``check_project`` for every slug on a five-worker thread pool.

    Results are collected in completion order. Progress is printed every 10
    results and both report files are rewritten every 20 results, as well as
    after the last one. A 0.5 s pause follows each collected result; it sits
    in the collection loop, not in the worker threads.

    Args:
        slugs: Sequence of project slugs to check.

    Returns:
        ``(results, timestamp)``: the list of ``check_project`` dicts and
        the ``YYYYMMDD_HHMMSS`` string used in the report file names.
    """
    total = len(slugs)
    results = []

    print(f"Starting to check {total} projects...")
    start_time = time.time()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = [pool.submit(check_project, slug) for slug in slugs]

        for done_count, finished in enumerate(as_completed(pending), start=1):
            results.append(finished.result())
            is_last = done_count == total

            if is_last or done_count % 10 == 0:
                print(f"Processed {done_count}/{total} projects ({(done_count/total)*100:.1f}%)")

            # Periodic checkpoint so partial results are on disk.
            if is_last or done_count % 20 == 0:
                save_results_xml(results, timestamp)
                save_results_txt(results, timestamp)

            time.sleep(0.5)

    elapsed = time.time() - start_time
    print(f"\nProcessing completed in {elapsed:.1f} seconds")
    return results, timestamp

def print_summary(results):
    """Print the totals and the valid claim URLs grouped by project.

    Args:
        results: Sequence of dicts with ``'slug'``, ``'valid'`` and
            ``'valid_urls'`` keys.
    """
    with_hits = list(filter(lambda entry: entry['valid'], results))

    print("\n=== Results Summary ===")
    print(f"Total projects checked: {len(results)}")
    print(f"Projects with valid claim URLs: {len(with_hits)}")

    print("\nValid claim URLs found:")
    for entry in with_hits:
        print(f"\nProject: {entry['slug']}")
        for address in entry['valid_urls']:
            print(f"  {address}")

def main():
    """Check every slug in the built-in list, save both reports and print a summary."""
    # Complete list of 206 slugs
    slugs = [
        "re.al", "bricksestate", "imo", "contracoin", "usp", "monerium-eur-emoney",
        "euroe-stablecoin", "stasis-euro", "gyen", "xsgd", "bilira", "gho",
        "binance-peg-busd", "gemini-dollar", "frax", "first-digital-usd", "usdc",
        "tether---usdt", "franklin-templeton-benji-investments", "midas", "opentrade",
        "yieldteq", "wisdomtree", "comtech-gold", "helix-finance", "vivacity-finance",
        "damm-finance", "union-protocol", "wildcat-protocol", "digift", "fortunafi",
        "lofty", "hashnote-usyc", "mountain-protocol", "superstate", "realt-tokens",
        "stusdt", "maker-rwa", "ethena", "solid-world", "tprotocol", "invar-finance",
        "sailing-protocol", "kuma-protocol", "danogo", "gold-dao", "binaryx-platform",
        "hiyield", "fundnel-alta", "merj", "1x.exchange", "hex-trust", "addx",
        "investa-x", "fusang", "upvest", "sygnum", "seba-bank", "mt-pelerin", "sdx",
        "stokr", "globacap", "copper", "21finance", "forge", "digishares", "tangany",
        "cashlink", "black-manta", "tokeny", "archax", "vertalo", "tzero",
        "texture-capital", "dtcc", "polymath", "fireblocks", "etana-custody",
        "consensys", "bitgo", "anchorage", "inx", "oasis-pro", "securitize", "hydrax",
        "apraemio", "redbelly-network", "crypto-autos.com", "diment.io",
        "gm-artification", "diamore", "emeraldco.io", "mantra", "particle",
        "yieldbricks", "dinari", "propbase", "realty-x", "rare-spirits", "cask-capital",
        "soil", "zoth.io", "cerchia", "bru.finance", "arca-labs", "alphaledger",
        "aktionariat", "carbify", "thovt", "plume-network", "xend-finance",
        "zambesi-gold", "decentralised-etf", "evident", "dovu", "futu", "bsos",
        "mansa-finance", "trendx", "the-rwax", "artory", "libertum", "tiamonds",
        "trakx", "caskcoin", "templedao", "pax-gold", "realio", "strikex",
        "naos-finance", "weset.io", "labs-group", "chintai", "etherland", "bixos",
        "brickken", "lumishare", "baxunited-states", "stima", "landshare",
        "blocksquare", "boson", "ix-swap", "domani", "propchain", "stobox",
        "galileo-protocol", "regen-network", "opulous", "tokenfi", "alliance-block",
        "pendle", "creditcoin", "polymesh", "aurus", "untangled", "unikura", "truefi",
        "toucan-protocol", "tangible", "swarm", "solv-finance", "ribbon-lend",
        "purplefi", "polytrade-finance", "pearl-exchange", "parcl", "parabol-finance",
        "openeden", "ondo-finance", "obligate", "meld-gold", "matrixdock", "maple",
        "makerdao", "landx", "klimadao", "impactmarket", "huma-finance", "homecoin",
        "goldfinch", "frigg", "florence-finance", "fabrica", "ensuro", "elysia",
        "defyca", "dexstar", "defactor", "cogito-protocol", "credix", "credifi",
        "clearpool", "citadao", "centrifuge", "canza-finance", "bluejay-finance",
        "blockcellar", "bitbond", "backed-finance", "altendis", "artrade", "anzen",
        "alta", "amfi", "aconomy"
    ]

    # Run the checks; the returned timestamp names the output files.
    outcomes, run_stamp = process_slugs(slugs)

    # Write the final XML report first, then the TXT report.
    for writer in (save_results_xml, save_results_txt):
        writer(outcomes, run_stamp)

    # Console recap, followed by the two file names.
    print_summary(outcomes)
    print("\nResults have been saved in both XML and TXT formats:")
    print(f"1. rwa_claim_urls_{run_stamp}.xml")
    print(f"2. rwa_claim_urls_{run_stamp}.txt")

if __name__ == "__main__":
    main()

Starting to check 206 projects...
Processed 10/206 projects (4.9%)
Processed 20/206 projects (9.7%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 30/206 projects (14.6%)
Processed 40/206 projects (19.4%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 50/206 projects (24.3%)
Processed 60/206 projects (29.1%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 70/206 projects (34.0%)
Processed 80/206 projects (38.8%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 90/206 projects (43.7%)
Processed 100/206 projects (48.5%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 110/206 projects (53.4%)
Processed 120/206 projects (58.3%)
Results saved to rwa_claim_urls_20241