# NetNeighbors: Domain Discovery Using CommonCrawl Webgraph

Discover related domains using link topology analysis from the CommonCrawl web graph.

This notebook uses py4j to maintain a persistent JVM with the graph loaded in memory.
After initial load (~5 seconds), queries are **nearly instant**.

**Run the cells below in order to set up and use the discovery tool.**

In [None]:
# Step 1: Check RAM and setup working directory
import psutil
import os

ram_gb = psutil.virtual_memory().total / (1024**3)
print(f"Available RAM: {ram_gb:.1f} GB")

if ram_gb < 20:
    print("\n‚ö†Ô∏è WARNING: You need Colab Pro for this notebook!")
    print("   Required: 20GB+ RAM")
    print(f"   You have: {ram_gb:.1f} GB")
    print("\n   Please enable High-RAM runtime:")
    print("   Runtime ‚Üí Change runtime type ‚Üí Runtime shape: High-RAM")
    raise Exception("Insufficient RAM. Please upgrade runtime.")
else:
    print("‚úÖ Sufficient RAM available\n")

# Determine NetNeighbors location and set as working directory
if os.path.exists("/content"):
    # Colab environment
    if not os.path.exists("/content/NetNeighbors"):
        print("Cloning NetNeighbors repository...")
        !git clone --depth 1 https://github.com/PeterCarragher/NetNeighbors.git /content/NetNeighbors > /dev/null 2>&1
        print("‚úÖ Repository cloned")
    else:
        print("‚úÖ NetNeighbors repository already exists")
    os.chdir("/content/NetNeighbors")
else:
    # Local environment
    if os.path.exists("src/DiscoveryTool.java"):
        print("‚úÖ Already in NetNeighbors directory")
    elif os.path.exists("NetNeighbors/src/DiscoveryTool.java"):
        os.chdir("NetNeighbors")
        print("‚úÖ Changed to NetNeighbors submodule")
    else:
        raise Exception("Cannot find NetNeighbors directory.")

print(f"Working directory: {os.getcwd()}")

### Step 2: Run Environment Setup

Installs Java 17, Maven, py4j, and Gradio, and builds the cc-webgraph tools.

In [None]:
!bash scripts/setup.sh

# Install py4j and gradio
!pip install -q py4j gradio
print("\n‚úÖ py4j and gradio installed")

### Step 3: Configure Storage and Download Webgraph

Downloads pre-built graph files from CommonCrawl (~23GB total).

In [None]:
from utils import setup_storage, download_webgraph

# Webgraph version - see https://commoncrawl.org/web-graphs for available versions
VERSION = "cc-main-2024-feb-apr-may"

# Storage configuration; both values are handed to setup_storage() below.
# Enter GCS bucket name (or leave empty for local storage)
GCS_BUCKET = None  # e.g., "my-webgraph-bucket"
LOCAL_PATH = None  # e.g., "/mnt/d/dev/data/cc/"

if GCS_BUCKET:
    # google.colab is imported only when a bucket is configured, so this cell
    # still runs outside Colab when GCS_BUCKET is left unset.
    from google.colab import auth
    auth.authenticate_user()

# WEBGRAPH_DIR is reused by the download cell and by GraphBridge.
WEBGRAPH_DIR = setup_storage(bucket_name=GCS_BUCKET, webgraph_dir=LOCAL_PATH)

In [None]:
# Download webgraph files (skip if already downloaded)
# NOTE(review): the skip-if-present behaviour is implemented inside
# download_webgraph (utils), not in this cell — confirm there.
download_webgraph(WEBGRAPH_DIR, VERSION)

### Step 4: Initialize Graph Bridge (JVM Backend)

This starts a persistent JVM and loads the graph into memory.
**Takes ~5 seconds**, but then all queries are nearly instant!

In [None]:
from graph_bridge import GraphBridge

# Initialize and load graph.
# `bridge` is a notebook-level global: the quick test, the discovery
# interface, and the direct-API cells all call methods on it.
bridge = GraphBridge(WEBGRAPH_DIR, VERSION)
bridge.load_graph()

print("\n" + "="*60)
print("🚀 Graph loaded! Queries are now instant.")
print("="*60)

### Step 5: Quick Test

Let's verify the bridge is working with a quick query.

In [None]:
import time

# Smoke test for the bridge: validate a handful of seeds and time the call.
# The last entry is deliberately one that should not be in the graph.
test_domains = [
    "cnn.com",
    "bbc.com",
    "foxnews.com",
    "nonexistent.tld",
]

start = time.time()
found, not_found = bridge.validate_seeds(test_domains)
elapsed = time.time() - start

# Assemble the report first, then emit it in one go.
report = [
    f"Validated {len(test_domains)} domains in {elapsed*1000:.1f}ms",
    f"Found: {found}",
]
if not_found:
    report.append(f"Not found: {not_found}")
print("\n".join(report))

---

## Discovery Interface

Use the Gradio interface below to discover related domains. Queries are **nearly instant** now that the graph is loaded!

In [None]:
import gradio as gr
import pandas as pd
import time
import os

def run_discovery(domains_text: str, min_connections: int, direction: str):
    """
    Run domain discovery for the Gradio UI and return its three outputs.

    Args:
        domains_text: Seed domains, one per line. Blank lines and surrounding
            whitespace are ignored; ``None`` is treated as empty input.
        min_connections: Minimum number of connections a domain needs in
            order to be reported. Coerced to ``int`` before it is handed to
            the bridge.
        direction: Direction label coming from the UI. A label containing
            "Backlinks" selects backlinks; anything else selects outlinks.

    Returns:
        A ``(table, status, csv_path)`` tuple: the first 500 result rows as a
        DataFrame, a multi-line status message, and the path of the CSV file
        holding the full result set. ``table`` and ``csv_path`` are ``None``
        whenever there is nothing to show (bad input, no valid seeds, a
        bridge error, or no matching domains).
    """
    # Parse input: one domain per line, ignoring blanks. `or ""` keeps a
    # None value from the UI from crashing before any message is returned.
    seed_domains = [
        line.strip() for line in (domains_text or "").splitlines() if line.strip()
    ]

    if not seed_domains:
        return None, "Please enter at least one domain", None

    if len(seed_domains) > 10000:
        return None, "Maximum 10,000 domains allowed", None

    # Validate seeds against the loaded graph
    found, not_found = bridge.validate_seeds(seed_domains)

    status_lines = []
    if not_found:
        status_lines.append(f"⚠️ {len(not_found)} domains not found in graph")
        # List at most five missing domains so the status box stays readable.
        shown = ', '.join(not_found[:5])
        if len(not_found) <= 5:
            status_lines.append(f"   Not found: {shown}")
        else:
            status_lines.append(f"   Not found: {shown}... and {len(not_found)-5} more")

    if len(found) == 0:
        # Keep the not-found details: this is the case where they matter most.
        status_lines.append("No valid domains found in graph")
        return None, "\n".join(status_lines), None

    status_lines.append(f"✅ {len(found)} valid seed domains")

    # Run discovery
    direction_value = "backlinks" if "Backlinks" in direction else "outlinks"

    start_time = time.time()
    try:
        results = bridge.discover(
            seed_domains=found,
            # The UI slider may deliver a float such as 3.0; coerce so the
            # bridge always receives an integer.
            # NOTE(review): presumably the Java side expects an int — confirm.
            min_connections=int(min_connections),
            direction=direction_value,
        )
    except Exception as e:
        # UI boundary: surface the failure in the status box rather than
        # letting it propagate into Gradio.
        return None, f"Error: {str(e)}", None

    elapsed = time.time() - start_time

    if len(results) == 0:
        status_lines.append(f"\n⏱️ Completed in {elapsed:.2f}s")
        status_lines.append("\nNo domains found. Try lowering min connections.")
        return None, "\n".join(status_lines), None

    # Convert to DataFrame
    df = pd.DataFrame(results)

    # Save the full result set to CSV (under /content when that directory exists)
    csv_path = "/content/results.csv" if os.path.exists("/content") else "results.csv"
    df.to_csv(csv_path, index=False)

    # Build status message
    status_lines.append(f"\n⏱️ Completed in {elapsed:.2f}s")
    status_lines.append(f"\n📊 Results: {len(df):,} domains")
    status_lines.append(f"   Connections range: {df['connections'].min():.0f} - {df['connections'].max():.0f}")
    status_lines.append(f"   Mean: {df['connections'].mean():.1f} | Median: {df['connections'].median():.0f}")
    status_lines.append(f"\n💾 Saved to {csv_path}")

    # Only the first 500 rows go to the table; the CSV holds everything.
    return df.head(500), "\n".join(status_lines), csv_path


# Create Gradio interface: inputs in the left column, outputs in the right.
with gr.Blocks(title="NetNeighbors - Domain Discovery") as demo:
    gr.Markdown("## 🔍 Domain Discovery")
    gr.Markdown("Find related domains using CommonCrawl webgraph link analysis.")

    with gr.Row():
        with gr.Column(scale=1):
            seeds_input = gr.Textbox(
                label="Seed Domains",
                placeholder="Enter domains, one per line:\ncnn.com\nbbc.com\nfoxnews.com",
                lines=10,
                max_lines=20
            )

            min_conn = gr.Slider(
                minimum=1,
                maximum=100,
                value=3,
                step=1,
                label="Minimum Connections",
                info="Only show domains connected to at least this many seeds"
            )

            # run_discovery picks the direction by looking for "Backlinks"
            # in the selected label, so that word must stay in the first choice.
            direction = gr.Radio(
                choices=["Backlinks (who links TO seeds)", "Outlinks (who seeds link TO)"],
                value="Backlinks (who links TO seeds)",
                label="Direction"
            )

            run_btn = gr.Button("🚀 Run Discovery", variant="primary")

        with gr.Column(scale=2):
            status_output = gr.Textbox(
                label="Status",
                lines=8,
                interactive=False
            )

            results_table = gr.Dataframe(
                label="Results (top 500)",
                headers=["domain", "connections", "percentage"],
                wrap=True
            )

            download_file = gr.File(label="Download Full Results")

    # Output order must match run_discovery's (table, status, csv_path) return.
    run_btn.click(
        fn=run_discovery,
        inputs=[seeds_input, min_conn, direction],
        outputs=[results_table, status_output, download_file]
    )

# Launch embedded in notebook
demo.launch(inline=True, share=False)

---

## Direct API Usage (Optional)

You can also use the GraphBridge API directly for programmatic access.

In [None]:
# Example: calling the bridge directly.
# `seeds` is also used by the fast-discovery example that follows.
seeds = [
    "cnn.com",
    "bbc.com",
    "foxnews.com",
]

# Backlink discovery; each result carries a domain and its connection count.
results = bridge.discover_backlinks(seeds, min_connections=3)
print(f"Found {len(results)} domains")

# Preview only the first ten entries.
preview = results[:10]
for entry in preview:
    print(f"  {entry['domain']}: {entry['connections']} connections")

In [None]:
# Example: fast discovery (no counts, Java-side filtering).
# Relies on `seeds` from the direct-API example cell.
domains = bridge.discover_fast(
    seeds,
    min_connections=3,
    direction="backlinks",
)
print(f"Found {len(domains)} domains (fast mode)")

# Show the first ten domains as a quick preview.
print(domains[:10])

---

## Cleanup

When done, shut down the JVM to free memory.

In [None]:
# Uncomment to shut down the JVM
# bridge.shutdown()
# print("JVM shutdown complete")