<a href="https://colab.research.google.com/github/Nifty0x/web-scraper/blob/main/Insights_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import requests
import time
import json
import xml.etree.ElementTree as ET
from xml.dom import minidom
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

def check_url_exists(url, timeout=10):
    """Report whether a GET request to ``url`` is answered with HTTP 200.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as ``timeout``.

    Returns:
        True only when the response status code is exactly 200. Any other
        status code, or any exception raised while requesting, yields False.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        status = requests.get(url, headers=browser_headers, timeout=timeout).status_code
    except Exception:
        # Best-effort probe: every failure is treated as "URL does not exist".
        return False
    return status == 200

def check_project(slug):
    """Probe both claim-URL patterns for one project slug.

    The two candidates are the query-string form (``?slug=<slug>``) and the
    path form (``/<slug>``), checked in that order with ``check_url_exists``.

    Args:
        slug: Project identifier inserted into the claim URLs.

    Returns:
        A dict with keys ``'slug'`` (the input), ``'valid_urls'`` (the
        candidates that passed the check, in probe order) and ``'valid'``
        (True when at least one candidate passed).
    """
    claim_base = "https://insights.rwa.io/list-your-project/claim-project"
    candidates = (f"{claim_base}?slug={slug}", f"{claim_base}/{slug}")

    reachable = [candidate for candidate in candidates if check_url_exists(candidate)]

    return {'slug': slug, 'valid_urls': reachable, 'valid': bool(reachable)}

def save_results_xml(results, timestamp):
    """Write the check results to ``rwa_claim_urls_<timestamp>.xml``.

    The document has a ``metadata`` section (timestamp, total and valid
    project counts) followed by one ``project`` element per result holding
    its slug, validity flag and valid URLs. The file is written pretty-printed
    with two-space indentation, and the filename is printed afterwards.

    Args:
        results: Iterable-with-length of dicts carrying the keys ``'slug'``,
            ``'valid'`` and ``'valid_urls'``.
        timestamp: String stored in the metadata and used in the filename.
    """
    def add_text_child(parent, tag, text):
        # Create <tag>text</tag> under parent.
        child = ET.SubElement(parent, tag)
        child.text = text
        return child

    root = ET.Element("rwa_claim_urls")

    metadata = ET.SubElement(root, "metadata")
    add_text_child(metadata, "timestamp", timestamp)
    add_text_child(metadata, "total_projects", str(len(results)))
    valid_count = sum(1 for entry in results if entry['valid'])
    add_text_child(metadata, "valid_projects", str(valid_count))

    projects = ET.SubElement(root, "projects")
    for entry in results:
        project = ET.SubElement(projects, "project")
        add_text_child(project, "slug", entry['slug'])
        add_text_child(project, "valid", str(entry['valid']))
        url_container = ET.SubElement(project, "urls")
        for address in entry['valid_urls']:
            add_text_child(url_container, "url", address)

    # Round-trip through minidom purely to obtain indented output.
    serialized = ET.tostring(root)
    pretty_xml = minidom.parseString(serialized).toprettyxml(indent="  ")

    filename = f"rwa_claim_urls_{timestamp}.xml"
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.write(pretty_xml)
    print(f"Results saved to {filename}")

def save_results_txt(results, timestamp):
    """Write a plain-text report to ``rwa_claim_urls_<timestamp>.txt``.

    The report starts with a header (title, timestamp, total and valid
    project counts) and then lists, for every valid project, its slug and
    each of its valid URLs indented by two spaces. The filename is printed
    once the file has been written.

    Args:
        results: Iterable-with-length of dicts carrying the keys ``'slug'``,
            ``'valid'`` and ``'valid_urls'``.
        timestamp: String shown in the report and used in the filename.
    """
    def report_chunks():
        # Yield the report piece by piece, in the order it appears on disk.
        yield "RWA.io Claim URLs Results\n"
        yield "=" * 50 + "\n\n"
        yield f"Timestamp: {timestamp}\n"
        yield f"Total projects checked: {len(results)}\n"
        valid_count = sum(1 for entry in results if entry['valid'])
        yield f"Projects with valid claim URLs: {valid_count}\n\n"
        yield "Valid Claim URLs by Project:\n"
        yield "-" * 50 + "\n\n"

        for entry in results:
            if not entry['valid']:
                continue
            yield f"Project: {entry['slug']}\n"
            for address in entry['valid_urls']:
                yield f"  {address}\n"
            yield "\n"

    filename = f"rwa_claim_urls_{timestamp}.txt"
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(report_chunks())
    print(f"Results saved to {filename}")

def process_slugs(slugs, max_workers=5, delay=0.5):
    """Check every slug on a thread pool, throttling each worker.

    Progress is printed every 10 completed projects, and the XML and TXT
    result files are rewritten every 20 completed projects, plus once more
    when the last project finishes.

    Args:
        slugs: Sequence of project slugs to check.
        max_workers: Number of worker threads in the pool.
        delay: Seconds each worker pauses before checking a project. The
            pause runs inside the worker because all tasks are submitted up
            front; sleeping in the result-collection loop would only delay
            reading results, not the outgoing requests.

    Returns:
        A ``(results, timestamp)`` tuple. ``results`` holds the
        ``check_project`` dicts in completion order (not input order), and
        ``timestamp`` is the ``YYYYMMDD_HHMMSS`` string used in the
        checkpoint filenames.
    """
    progress_every = 10
    save_every = 20

    def throttled_check(slug):
        # Throttle at the point where requests are actually issued.
        if delay > 0:
            time.sleep(delay)
        return check_project(slug)

    results = []
    total = len(slugs)
    processed = 0

    print(f"Starting to check {total} projects...")
    start_time = time.time()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(throttled_check, slug) for slug in slugs]

        for future in as_completed(futures):
            results.append(future.result())
            processed += 1
            finished = processed == total

            if processed % progress_every == 0 or finished:
                print(f"Processed {processed}/{total} projects ({(processed/total)*100:.1f}%)")

            # Checkpoint periodically so a crash does not lose everything.
            if processed % save_every == 0 or finished:
                save_results_xml(results, timestamp)
                save_results_txt(results, timestamp)

    end_time = time.time()
    print(f"\nProcessing completed in {end_time - start_time:.1f} seconds")
    return results, timestamp

def print_summary(results):
    """Print overall counts followed by the valid URLs of each valid project.

    Args:
        results: Iterable-with-length of dicts carrying the keys ``'slug'``,
            ``'valid'`` and ``'valid_urls'``.
    """
    claimable = [entry for entry in results if entry['valid']]

    report = [
        "\n=== Results Summary ===",
        f"Total projects checked: {len(results)}",
        f"Projects with valid claim URLs: {len(claimable)}",
        "\nValid claim URLs found:",
    ]
    for entry in claimable:
        report.append(f"\nProject: {entry['slug']}")
        report.extend(f"  {address}" for address in entry['valid_urls'])

    print("\n".join(report))

def main():
    """Check the built-in slug list, save both reports and print a summary."""
    # Project slugs to probe (206 entries).
    slugs = [
        "re.al", "bricksestate", "imo", "contracoin", "usp", "monerium-eur-emoney",
        "euroe-stablecoin", "stasis-euro", "gyen", "xsgd", "bilira", "gho",
        "binance-peg-busd", "gemini-dollar", "frax", "first-digital-usd", "usdc",
        "tether---usdt", "franklin-templeton-benji-investments", "midas", "opentrade",
        "yieldteq", "wisdomtree", "comtech-gold", "helix-finance", "vivacity-finance",
        "damm-finance", "union-protocol", "wildcat-protocol", "digift", "fortunafi",
        "lofty", "hashnote-usyc", "mountain-protocol", "superstate", "realt-tokens",
        "stusdt", "maker-rwa", "ethena", "solid-world", "tprotocol", "invar-finance",
        "sailing-protocol", "kuma-protocol", "danogo", "gold-dao", "binaryx-platform",
        "hiyield", "fundnel-alta", "merj", "1x.exchange", "hex-trust", "addx",
        "investa-x", "fusang", "upvest", "sygnum", "seba-bank", "mt-pelerin", "sdx",
        "stokr", "globacap", "copper", "21finance", "forge", "digishares", "tangany",
        "cashlink", "black-manta", "tokeny", "archax", "vertalo", "tzero",
        "texture-capital", "dtcc", "polymath", "fireblocks", "etana-custody",
        "consensys", "bitgo", "anchorage", "inx", "oasis-pro", "securitize", "hydrax",
        "apraemio", "redbelly-network", "crypto-autos.com", "diment.io",
        "gm-artification", "diamore", "emeraldco.io", "mantra", "particle",
        "yieldbricks", "dinari", "propbase", "realty-x", "rare-spirits", "cask-capital",
        "soil", "zoth.io", "cerchia", "bru.finance", "arca-labs", "alphaledger",
        "aktionariat", "carbify", "thovt", "plume-network", "xend-finance",
        "zambesi-gold", "decentralised-etf", "evident", "dovu", "futu", "bsos",
        "mansa-finance", "trendx", "the-rwax", "artory", "libertum", "tiamonds",
        "trakx", "caskcoin", "templedao", "pax-gold", "realio", "strikex",
        "naos-finance", "weset.io", "labs-group", "chintai", "etherland", "bixos",
        "brickken", "lumishare", "baxunited-states", "stima", "landshare",
        "blocksquare", "boson", "ix-swap", "domani", "propchain", "stobox",
        "galileo-protocol", "regen-network", "opulous", "tokenfi", "alliance-block",
        "pendle", "creditcoin", "polymesh", "aurus", "untangled", "unikura", "truefi",
        "toucan-protocol", "tangible", "swarm", "solv-finance", "ribbon-lend",
        "purplefi", "polytrade-finance", "pearl-exchange", "parcl", "parabol-finance",
        "openeden", "ondo-finance", "obligate", "meld-gold", "matrixdock", "maple",
        "makerdao", "landx", "klimadao", "impactmarket", "huma-finance", "homecoin",
        "goldfinch", "frigg", "florence-finance", "fabrica", "ensuro", "elysia",
        "defyca", "dexstar", "defactor", "cogito-protocol", "credix", "credifi",
        "clearpool", "citadao", "centrifuge", "canza-finance", "bluejay-finance",
        "blockcellar", "bitbond", "backed-finance", "altendis", "artrade", "anzen",
        "alta", "amfi", "aconomy"
    ]

    # Run every check; the returned stamp names the output files.
    outcomes, run_stamp = process_slugs(slugs)

    # Write the final state of both report formats.
    save_results_xml(outcomes, run_stamp)
    save_results_txt(outcomes, run_stamp)

    # Console report, ending with the names of the files just written.
    print_summary(outcomes)
    print("\nResults have been saved in both XML and TXT formats:")
    for position, extension in enumerate(("xml", "txt"), start=1):
        print(f"{position}. rwa_claim_urls_{run_stamp}.{extension}")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Starting to check 206 projects...
Processed 10/206 projects (4.9%)
Processed 20/206 projects (9.7%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 30/206 projects (14.6%)
Processed 40/206 projects (19.4%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 50/206 projects (24.3%)
Processed 60/206 projects (29.1%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 70/206 projects (34.0%)
Processed 80/206 projects (38.8%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 90/206 projects (43.7%)
Processed 100/206 projects (48.5%)
Results saved to rwa_claim_urls_20241029_211501.xml
Results saved to rwa_claim_urls_20241029_211501.txt
Processed 110/206 projects (53.4%)
Processed 120/206 projects (58.3%)
Results saved to rwa_claim_urls_20241