# NetNeighbors: Domain Discovery Using CommonCrawl Webgraph

Discover related domains using link topology analysis from the CommonCrawl web graph.

**Run the cells below in order to set up and use the discovery tool.**

In [None]:
# Step 1: Check RAM and clone repository
import psutil
import os

ram_gb = psutil.virtual_memory().total / (1024**3)
print(f"Available RAM: {ram_gb:.1f} GB")

if ram_gb < 20:
    print("\n⚠️ WARNING: You need Colab Pro for this notebook!")
    print("   Required: 20GB+ RAM")
    print(f"   You have: {ram_gb:.1f} GB")
    print("\n   Please enable High-RAM runtime:")
    print("   Runtime → Change runtime type → Runtime shape: High-RAM")
    raise Exception("Insufficient RAM. Please upgrade runtime.")
else:
    print("✅ Sufficient RAM available\n")

# Clone NetNeighbors repo (contains setup scripts and utilities)
if not os.path.exists("NetNeighbors"):
    print("Cloning NetNeighbors repository...")
    !git clone --depth 1 https://github.com/PeterCarragher/NetNeighbors.git > /dev/null 2>&1
    print("✅ Repository cloned")
else:
    print("✅ NetNeighbors repository already exists")

### Step 2: Run Environment Setup

Installs Java 17, Maven, gcsfuse, and builds the discovery tools.

In [None]:
# Run the setup script shipped in the cloned repository. Per the step
# description it installs Java 17, Maven and gcsfuse and builds the discovery tools.
!bash NetNeighbors/scripts/setup.sh

### Step 3: Configure Storage (GCS)

Optionally mount a Google Cloud Storage bucket to cache the webgraph files between sessions.
Leave `GCS_BUCKET` empty to use local storage instead; the files are then downloaded again each session (~15 min).

In [None]:
import sys
sys.path.insert(0, '/content/NetNeighbors')
from utils import setup_storage, download_webgraph

# CommonCrawl webgraph release to work with; the available releases are
# listed at https://commoncrawl.org/web-graphs
VERSION = "cc-main-2025-26-nov-dec-jan"

# GCS bucket name, e.g. "my-webgraph-bucket".
# Keep the empty string to use local storage.
GCS_BUCKET = ""

# An unset bucket is passed to setup_storage as None rather than "".
WEBGRAPH_DIR = setup_storage(GCS_BUCKET or None)

### Step 4: Download CommonCrawl Webgraph (~10 minutes)

Downloads pre-built graph files directly from CommonCrawl (~23GB total).

In [None]:
# Download the webgraph release named by VERSION; WEBGRAPH_DIR is the
# location returned by setup_storage() in Step 3.
download_webgraph(WEBGRAPH_DIR, VERSION)

### Step 5: Verify Installation

In [None]:
import subprocess

# Argument-list form: the script path, graph location and release name are
# handed to bash as separate arguments rather than as one shell string.
verify_cmd = ['bash', 'NetNeighbors/scripts/verify.sh', WEBGRAPH_DIR, VERSION]

# NOTE(review): the script's exit status is not checked here; the returned
# CompletedProcess is only echoed as the cell's last expression.
subprocess.run(verify_cmd)

---

## Discovery Interface

Initialize the discovery tool and use the form below to find related domains.

In [None]:
from webgraph_discovery import WebgraphDiscovery

# Create the discovery object for the configured graph location and release,
# then echo the settings it was built with.
discovery = WebgraphDiscovery(WEBGRAPH_DIR, VERSION)
print(
    "✅ Discovery tools initialized",
    f"Graph location: {WEBGRAPH_DIR}",
    f"Version: {VERSION}",
    sep="\n",
)

In [None]:
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink, clear_output
import pandas as pd

# Build the form controls. The click handler defined further down in this
# cell reads these widgets by name, so the variable names are part of its contract.

# Free-text box: one seed domain per line.
domains_input = widgets.Textarea(
    value='',
    placeholder='\n'.join([
        'Enter seed domains, one per line:',
        'example.com',
        'test.org',
        'sample.net',
    ]),
    description='',
    layout=widgets.Layout(width='80%', height='200px'),
    style={'description_width': '0px'},
)

# Threshold passed to discovery as the minimum connection count (1-100, default 5).
min_conn_slider = widgets.IntSlider(
    value=5,
    min=1,
    max=100,
    step=1,
    description='Min Connections:',
    style={'description_width': '150px'},
    layout=widgets.Layout(width='60%'),
)

# Link direction; the second element of each pair is the value the handler receives.
direction_options = [
    ('Backlinks (who links TO seeds)', 'backlinks'),
    ('Outlinks (who seeds link TO)', 'outlinks'),
]
direction_radio = widgets.RadioButtons(
    options=direction_options,
    value='backlinks',
    description='Direction:',
    style={'description_width': '150px'},
)

run_button = widgets.Button(
    description='Run Discovery',
    button_style='success',
    layout=widgets.Layout(width='200px', height='40px'),
    tooltip='Click to discover related domains',
)

# Everything the click handler prints or displays is routed into this area.
output_area = widgets.Output()

# Render the form top to bottom: headings, inputs separated by line breaks,
# the run button, a rule, and finally the output area.
form_items = [
    HTML("<h3>Seed Domains</h3>"),
    HTML("<p>Enter one domain per line:</p>"),
    domains_input,
    HTML("<br>"),
    min_conn_slider,
    HTML("<br>"),
    direction_radio,
    HTML("<br>"),
    run_button,
    HTML("<hr>"),
    output_area,
]
for form_item in form_items:
    display(form_item)

# Button click handler
# Limits used by the click handler.
MAX_SEED_DOMAINS = 1000  # upper bound on seed domains accepted per run
MAX_ROWS_SHOWN = 100     # rows of the results table rendered inline


def _parse_seed_domains(text):
    """Turn the textarea contents into a clean list of seed domains.

    Every line is stripped of surrounding whitespace, blank lines are dropped
    and repeated entries are collapsed to their first occurrence, so a domain
    pasted twice is counted and validated only once.

    Args:
        text: Raw multi-line string taken from the seed-domain textarea.

    Returns:
        list[str]: Unique, non-empty entries in first-seen order.
    """
    stripped = (line.strip() for line in text.split('\n'))
    # dict keys keep insertion order, which gives an order-stable de-duplication.
    return list(dict.fromkeys(entry for entry in stripped if entry))


def _print_domain_sample(domains, limit):
    """Print at most ``limit`` domains as bullets, then how many were left out."""
    for domain in domains[:limit]:
        print(f"  - {domain}")
    if len(domains) > limit:
        print(f"  ... and {len(domains)-limit} more")


def _show_results(results_df, min_connections):
    """Render the discovery outcome into the currently active output.

    Args:
        results_df: Object returned by ``discovery.discover``; it is used as a
            pandas DataFrame with ``connections`` and ``percentage`` columns.
        min_connections: Threshold that was used for the run (for the summary line).
    """
    total = len(results_df)
    if total == 0:
        display(HTML("<h3>No Results Found</h3>"))
        print("No domains found matching the criteria.")
        print("\nTry:")
        print("  - Lowering the minimum connections threshold")
        print("  - Using different seed domains")
        print("  - Switching between backlinks and outlinks")
        return

    display(HTML(f"<h3>Found {total:,} Domains</h3>"))
    print(f"Discovered {total:,} domains with >= {min_connections} connections\n")

    # Only the first MAX_ROWS_SHOWN rows are styled and rendered inline.
    display(HTML("<h4>Top Results:</h4>"))
    styled_df = results_df.head(MAX_ROWS_SHOWN).style.format({
        'connections': '{:,.0f}',
        'percentage': '{:.2f}%'
    }).background_gradient(subset=['connections'], cmap='YlOrRd')
    display(styled_df)

    if total > MAX_ROWS_SHOWN:
        print(f"\n(Showing top {MAX_ROWS_SHOWN} of {total:,} results. Download CSV for full list.)")

    # Summary statistics over the full result set, not just the rows shown.
    connections = results_df['connections']
    print("\n" + "="*60)
    print("Summary Statistics:")
    print(f"  Total discovered: {total:,} domains")
    print(f"  Connections range: {connections.min():.0f} - {connections.max():.0f}")
    print(f"  Mean connections: {connections.mean():.1f}")
    print(f"  Median connections: {connections.median():.0f}")
    print("="*60)

    # NOTE(review): nothing in this cell writes /content/results.csv;
    # presumably discovery.discover() produces it - confirm, otherwise this
    # link points at a missing or stale file.
    display(HTML("<br><h4>Download Full Results</h4>"))
    display(FileLink('/content/results.csv', result_html_prefix="Click to download: "))
    print(f"\nCSV contains all {total:,} discovered domains")


def on_run_click(b):
    """Handle a click on the run button: validate input, run discovery, show results.

    All output is written into ``output_area``. The handler reads the seed
    list from ``domains_input`` and the settings from ``min_conn_slider`` and
    ``direction_radio``, validates the seeds with ``discovery.validate_seeds``
    and passes the ones that were found to ``discovery.discover``.

    Args:
        b: The argument supplied by the button's click callback (unused).
    """
    output_area.clear_output()

    with output_area:
        display(HTML("<h3>Processing...</h3>"))

        # Validate input
        seed_domains = _parse_seed_domains(domains_input.value)
        if not seed_domains:
            print("Error: Please enter at least one domain")
            return

        if len(seed_domains) > MAX_SEED_DOMAINS:
            print(f"Error: Maximum {MAX_SEED_DOMAINS} domains allowed")
            print(f"You entered: {len(seed_domains)} domains")
            return

        # Read the settings once so the configuration printed below is
        # exactly what gets passed to discover().
        direction = direction_radio.value
        min_connections = min_conn_slider.value

        # Validate seeds exist in webgraph
        print(f"Validating {len(seed_domains)} seed domains...")
        found, not_found = discovery.validate_seeds(seed_domains)

        if len(found) == 0:
            print("\nError: None of the seed domains were found in the webgraph")
            print("\nDomains not found:")
            _print_domain_sample(not_found, 10)
            return

        if len(not_found) > 0:
            print(f"\nWarning: {len(not_found)} domains not found in webgraph:")
            _print_domain_sample(not_found, 5)
            print(f"\nProceeding with {len(found)} valid domains\n")
        else:
            print(f"All {len(found)} seed domains found in webgraph\n")

        print("="*60)
        print("Configuration:")
        print(f"  Direction: {direction}")
        print(f"  Minimum connections: {min_connections}")
        print(f"  Valid seed domains: {len(found)}")
        print("="*60)

        try:
            # Run discovery
            results_df = discovery.discover(
                seed_domains=found,
                min_connections=min_connections,
                direction=direction
            )

            # Replace the progress and validation text with the results.
            clear_output(wait=True)
            _show_results(results_df, min_connections)

        except Exception as e:
            # Show a short, actionable message instead of a raw traceback.
            clear_output(wait=True)
            display(HTML("<h3>Error During Discovery</h3>"))
            print(f"Error: {str(e)}")
            print("\nTroubleshooting:")
            print("1. Check that all setup cells completed successfully")
            print("2. Verify you're using High-RAM runtime")
            print("3. Try restarting runtime: Runtime -> Restart runtime")
            print("4. Try with fewer seed domains")

# Register the handler so a click on the run button invokes on_run_click.
run_button.on_click(on_run_click)

# Usage hint, printed once when this cell is executed.
print("Tip: Start with 10-20 seed domains and min_connections=5 for fast results!")