## Section 1: Environment Setup (Run Once)

### Check Available RAM

In [None]:
import psutil

# Total memory reported for this runtime, converted from bytes to GiB.
ram_gb = psutil.virtual_memory().total / 2**30
print(f"Available RAM: {ram_gb:.1f} GB")

if ram_gb >= 20:
    print(
        "‚úÖ Sufficient RAM available",
        "\nYou can proceed with setup!",
        sep="\n",
    )
else:
    # Below the 20 GB threshold: explain how to switch runtime, then stop
    # the cell so later setup steps are not attempted.
    print(
        "\n‚ö†Ô∏è WARNING: You need Colab Pro for this notebook!",
        "   Required: 20GB+ RAM",
        f"   You have: {ram_gb:.1f} GB",
        "\n   Please enable High-RAM runtime:",
        "   Runtime ‚Üí Change runtime type ‚Üí Runtime shape: High-RAM",
        sep="\n",
    )
    raise Exception("Insufficient RAM. Please upgrade runtime.")

### Mount Google Drive (Optional but Recommended)

In [None]:
from google.colab import drive
import os

# Let the user choose between a persistent (Google Drive) and a temporary
# (session-local) location for the webgraph files.
print(
    "Mount Google Drive to cache webgraph between sessions?",
    "This saves ~15 minutes on future runs.",
    "",
    sep="\n",
)
mount_choice = input("Mount Google Drive? (yes/no): ").lower().strip()

if mount_choice not in ('yes', 'y'):
    # Anything other than an explicit yes keeps the data on the session disk.
    WEBGRAPH_DIR = '/content/webgraph'
    print("\n‚ö†Ô∏è Webgraph will be downloaded each session (~15 min)")
    print(f"Stored temporarily in: {WEBGRAPH_DIR}")
else:
    drive.mount('/content/drive')
    WEBGRAPH_DIR = '/content/drive/MyDrive/Colab_Data/webgraph'
    print(f"\n‚úÖ Webgraph will be cached in: {WEBGRAPH_DIR}")
    print("This will persist across sessions!")

# Make sure the chosen directory exists (no error if it already does).
os.makedirs(WEBGRAPH_DIR, exist_ok=True)
print(f"\nDirectory created: {WEBGRAPH_DIR}")

### Install Java 17

In [None]:
%%bash
# Install the Java 17 JDK and Maven. apt output is suppressed to keep the
# notebook readable, but a failed install now aborts the cell instead of
# printing a success message.
echo "Installing Java 17..."

# A failed index refresh is not fatal by itself: the install below may still
# succeed from the cached package lists, so only warn here.
apt-get update -qq > /dev/null 2>&1 \
    || echo "Warning: 'apt-get update' failed; trying to install from cached package lists" >&2

if ! apt-get install -y -qq openjdk-17-jdk-headless maven > /dev/null 2>&1; then
    echo "ERROR: apt-get install failed; Java 17 / Maven were not installed" >&2
    exit 1
fi

echo "‚úÖ Java installation complete"
java -version

### Download & Build Tools

This cell clones two repositories:
1. **cc-webgraph** - CommonCrawl's webgraph processing tools (provides BVGraph library)
2. **NetNeighbors** - The discovery tool for this notebook

In [None]:
%%bash
# Clone and build cc-webgraph, then clone NetNeighbors and compile its
# DiscoveryTool against the cc-webgraph JAR.
#
# Every path below is relative to the directory the cell starts in. Each step
# is keyed on the artifact it produces (JAR / class file) rather than on the
# clone directory, so re-running the cell retries a build that failed earlier.
CC_JAR="cc-webgraph/target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar"
TOOL_CLASS="NetNeighbors/bin/DiscoveryTool.class"

# Clone and build cc-webgraph
if [ ! -d "cc-webgraph" ]; then
    echo "Cloning cc-webgraph repository..."
    git clone --depth 1 https://github.com/commoncrawl/cc-webgraph.git > /dev/null 2>&1 \
        || echo "git clone of cc-webgraph failed" >&2
fi

if [ -f "$CC_JAR" ]; then
    echo "‚úÖ cc-webgraph already exists"
else
    echo "Building cc-webgraph (this may take 1-2 minutes)..."
    # Build inside a subshell: the `cd` must not leak into the rest of the
    # script, otherwise the NetNeighbors steps below would run inside
    # cc-webgraph/ and every relative path would be wrong.
    if (cd cc-webgraph && mvn clean package -DskipTests -q); then
        echo "‚úÖ cc-webgraph built successfully"
    else
        echo "cc-webgraph build failed (see output above)" >&2
    fi
fi

# Clone NetNeighbors (contains the discovery tool)
if [ ! -d "NetNeighbors" ]; then
    echo ""
    echo "Cloning NetNeighbors discovery tool..."
    git clone --depth 1 https://github.com/PeterCarragher/NetNeighbors.git > /dev/null 2>&1 \
        || echo "git clone of NetNeighbors failed" >&2
fi

if [ -f "$TOOL_CLASS" ]; then
    echo "‚úÖ NetNeighbors already exists"
elif [ -f "NetNeighbors/src/DiscoveryTool.java" ]; then
    echo "Compiling DiscoveryTool..."
    mkdir -p NetNeighbors/bin
    if javac -cp "$CC_JAR" \
        -d NetNeighbors/bin \
        NetNeighbors/src/DiscoveryTool.java; then
        echo "‚úÖ NetNeighbors tools ready"
    else
        echo "DiscoveryTool compilation failed (see output above)" >&2
    fi
fi

# Verify JAR and class files exist; a missing artifact makes the cell fail
# visibly instead of finishing as if setup had succeeded.
echo ""
status=0
if [ -f "$CC_JAR" ]; then
    echo "‚úÖ cc-webgraph JAR found"
else
    echo "‚ùå cc-webgraph JAR not found"
    status=1
fi

if [ -f "$TOOL_CLASS" ]; then
    echo "‚úÖ DiscoveryTool compiled"
else
    echo "‚ùå DiscoveryTool not compiled"
    status=1
fi

exit "$status"

### Download CommonCrawl Webgraph (~10 minutes)

Downloads pre-built graph files directly from CommonCrawl (~23GB total):
- Domain vertices mapping
- Forward graph (BVGraph format) for outlinks
- Transpose graph for backlinks

In [None]:
import os
from tqdm.auto import tqdm
import urllib.request

VERSION = "cc-main-2025-26-nov-dec-jan"
BASE_URL = "https://data.commoncrawl.org/projects/hyperlinkgraph/" + VERSION + "/domain"

# Pre-built graph files from CommonCrawl (no need to build ourselves!).
# Each entry is (file name, approximate size shown to the user); the file
# name is the release VERSION followed by a fixed suffix.
files_to_download = [
    (VERSION + suffix, size_label)
    for suffix, size_label in (
        ("-domain-vertices.txt.gz", "871 MB"),  # domain mapping (required for lookups)
        ("-domain.graph", "10.9 GB"),           # pre-built BVGraph (for outlinks)
        ("-domain.properties", "1.3 KB"),
        ("-domain-t.graph", "11.2 GB"),         # transpose graph (for backlinks)
        ("-domain-t.properties", "1.3 KB"),
        ("-domain.stats", "788 B"),             # statistics (small, useful metadata)
    )
]

def download_with_progress(url, dest_path, expected_size=""):
    """Download ``url`` to ``dest_path`` with a progress bar.

    A file that already exists at ``dest_path`` is treated as complete and is
    not downloaded again. To make that assumption safe, data is first written
    to ``<dest_path>.part`` and only renamed to ``dest_path`` once the whole
    transfer has succeeded; an interrupted or failed download therefore never
    leaves a truncated file that a later run would mistake for a finished one.

    Args:
        url: Source URL.
        dest_path: Final location of the downloaded file.
        expected_size: Human-readable size, used only in the status message.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (for example
        ``urllib.error.URLError`` or ``ContentTooShortError``); the partial
        file is removed before the exception propagates.
    """
    if os.path.exists(dest_path):
        size_mb = os.path.getsize(dest_path) / (1024 * 1024)
        print(f"‚úÖ Already exists: {os.path.basename(dest_path)} ({size_mb:.1f} MB)")
        return

    print(f"Downloading: {os.path.basename(dest_path)} ({expected_size})")

    partial_path = os.fspath(dest_path) + ".part"

    def progress_hook(pbar):
        reported = 0  # bytes already pushed to the progress bar

        def update(block_num, block_size, total_size):
            nonlocal reported
            # urlretrieve calls the hook once before any data is read
            # (block_num == 0) and then after every block, so the amount
            # transferred so far is block_num * block_size, capped at the
            # total because the final block is usually short.
            transferred = block_num * block_size
            if total_size > 0:
                pbar.total = total_size
                transferred = min(transferred, total_size)
            if transferred > reported:
                pbar.update(transferred - reported)
                reported = transferred
        return update

    try:
        with tqdm(unit='B', unit_scale=True, unit_divisor=1024) as pbar:
            urllib.request.urlretrieve(url, partial_path, reporthook=progress_hook(pbar))
        # Publish the file under its final name only after a complete transfer.
        os.replace(partial_path, dest_path)
    except BaseException:
        # Includes KeyboardInterrupt (interrupting the cell): drop the partial
        # data so the next run starts this file again from scratch.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print(f"‚úÖ Downloaded: {os.path.basename(dest_path)}")

# Announce what is about to be fetched and where it will be stored.
print("Downloading CommonCrawl webgraph (pre-built graph files)...")
print(
    f"Destination: {WEBGRAPH_DIR}",
    "\nTotal download: ~23 GB (includes transpose graph for backlinks)",
    "=" * 60 + "\n",
    sep="\n",
)

# Fetch every listed file into WEBGRAPH_DIR; files already present are
# skipped inside download_with_progress.
for filename, size in files_to_download:
    url = "/".join((BASE_URL, filename))
    dest = os.path.join(WEBGRAPH_DIR, filename)
    download_with_progress(url, dest, expected_size=size)

print(
    "\n" + "=" * 60,
    "‚úÖ All graph files downloaded!",
    "\nGraph files are pre-built by CommonCrawl - no build step needed.",
    sep="\n",
)

### Verify Installation

In [None]:
import os
import subprocess
import gzip

print(
    "=" * 60,
    "           INSTALLATION VERIFICATION",
    "=" * 60 + "\n",
    sep="\n",
)

# Flipped to False by any failing check; read by the final verdict.
all_checks_passed = True

# Check Java: `java -version` must run and exit with status 0.
print("1. Java Runtime:")
try:
    result = subprocess.run(['java', '-version'], capture_output=True, text=True, timeout=5)
    if result.returncode != 0:
        print("   ‚ùå Java not working properly")
        all_checks_passed = False
    else:
        # The first line of the captured stderr is shown as the version banner.
        version_line = result.stderr.partition('\n')[0]
        print(f"   ‚úÖ {version_line}")
except Exception as e:
    print(f"   ‚ùå Java error: {e}")
    all_checks_passed = False

# Check cc-webgraph JAR
print("\n2. cc-webgraph Tools:")
jar_path = "/content/cc-webgraph/target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar"
if not os.path.exists(jar_path):
    print("   ‚ùå JAR file not found")
    all_checks_passed = False
else:
    size_mb = os.path.getsize(jar_path) / (1024 * 1024)
    print(f"   ‚úÖ JAR file found ({size_mb:.1f} MB)")

# Check DiscoveryTool
print("\n3. DiscoveryTool:")
tool_path = "/content/NetNeighbors/bin/DiscoveryTool.class"
if not os.path.exists(tool_path):
    print("   ‚ùå DiscoveryTool not found")
    all_checks_passed = False
else:
    print("   ‚úÖ DiscoveryTool compiled")

# Check webgraph data files
print("\n4. Webgraph Data Files:")
VERSION = "cc-main-2025-26-nov-dec-jan"

# Each entry is (file name, description); the file name is the release
# VERSION followed by a fixed suffix.
files_to_check = [
    (VERSION + suffix, label)
    for suffix, label in (
        ("-domain-vertices.txt.gz", "Vertices (domain mapping)"),
        ("-domain.graph", "Forward graph (outlinks)"),
        ("-domain.properties", "Forward graph properties"),
        ("-domain-t.graph", "Transpose graph (backlinks)"),
        ("-domain-t.properties", "Transpose graph properties"),
        ("-domain.stats", "Graph statistics"),
    )
]

for filename, description in files_to_check:
    filepath = os.path.join(WEBGRAPH_DIR, filename)
    if not os.path.exists(filepath):
        print(f"   ‚ùå {description}: MISSING")
        all_checks_passed = False
        continue

    # Report the size in the largest unit (KB / MB / GB) it strictly exceeds.
    size = os.path.getsize(filepath)
    if size <= 1024 ** 2:
        size_str = f"{size / 1024:.1f} KB"
    elif size <= 1024 ** 3:
        size_str = f"{size / (1024**2):.1f} MB"
    else:
        size_str = f"{size / (1024**3):.1f} GB"
    print(f"   ‚úÖ {description}: {size_str}")

# Graph statistics (informational only: problems here are reported but do
# not change all_checks_passed).
print("\n5. Graph Statistics:")
vertices_file = os.path.join(WEBGRAPH_DIR, f"{VERSION}-domain-vertices.txt.gz")
if os.path.exists(vertices_file):
    print("   Counting domains (this takes ~30 seconds)...")
    try:
        # One line per entry in the vertices file, so the line count is the
        # number reported as "Total domains".
        with gzip.open(vertices_file, 'rt', encoding='utf-8') as f:
            num_domains = sum(1 for _ in f)
        print(f"   ‚úÖ Total domains: {num_domains:,}")
    except Exception as e:
        print(f"   ‚ö†Ô∏è Could not count: {e}")

# Read stats file if available
stats_file = os.path.join(WEBGRAPH_DIR, f"{VERSION}-domain.stats")
if os.path.exists(stats_file):
    try:
        with open(stats_file, 'r') as f:
            stats = f.read()
        # Only the `nodes=` and `arcs=` lines are reported; other lines in
        # the stats file are ignored.
        for line in stats.strip().split('\n'):
            if line.startswith('nodes='):
                print(f"   ‚úÖ Nodes: {int(line.split('=', 1)[1]):,}")
            elif line.startswith('arcs='):
                print(f"   ‚úÖ Edges: {int(line.split('=', 1)[1]):,}")
    except (OSError, ValueError) as e:
        # Unreadable file or a non-integer value: say so instead of silently
        # printing nothing.
        print(f"   ‚ö†Ô∏è Could not read stats: {e}")

# Final verdict: summarise the checks recorded in all_checks_passed.
print("\n" + "=" * 60)
if not all_checks_passed:
    print(
        "‚ö†Ô∏è SETUP INCOMPLETE",
        "=" * 60,
        "\nPlease re-run the failed setup cells above.",
        sep="\n",
    )
else:
    print(
        "üéâ SETUP COMPLETE!",
        "=" * 60,
        "\nYou're ready to discover domains!",
        "Scroll down to Section 3: Discovery Interface",
        sep="\n",
    )

---

## Section 2: Helper Functions

These cells define the discovery functionality. You don't need to modify them.

In [None]:
import subprocess
import pandas as pd
import os
import gzip
from typing import List, Dict, Tuple

class WebgraphDiscovery:
    """
    Wrapper class for running webgraph discovery using the DiscoveryTool.

    The class builds the file paths for one webgraph release, validates seed
    domains against the release's vertices file, and runs the Java
    ``DiscoveryTool`` as a subprocess, returning its CSV output as a DataFrame.
    """

    def __init__(self, webgraph_dir: str, version: str, max_heap: str = "48g"):
        """
        Args:
            webgraph_dir: Directory containing the downloaded graph files.
            version: Release name used as the file-name prefix.
            max_heap: Value passed to the JVM as ``-Xmx<max_heap>``
                (default ``"48g"``).
        """
        self.webgraph_dir = webgraph_dir
        self.version = version
        self.max_heap = max_heap
        self.jar_path = "/content/cc-webgraph/target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar"
        self.tool_class_path = "/content/NetNeighbors/bin"
        self.graph_base = os.path.join(webgraph_dir, f"{version}-domain")
        self.vertices_file = os.path.join(webgraph_dir, f"{version}-domain-vertices.txt.gz")

        # Cache for domain validation (filled on first use)
        self._domain_set = None

    def _load_domain_set(self) -> set:
        """Load set of all domains in webgraph (for validation).

        The result is cached on the instance, so the vertices file is read
        at most once.
        """
        if self._domain_set is not None:
            return self._domain_set

        # NOTE(review): this keeps every domain of the release in memory at
        # once; confirm that fits alongside the Java heap on the target runtime.
        print("Loading domain list (one-time, ~30 seconds)...")
        domains = set()
        with gzip.open(self.vertices_file, 'rt', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # Lines with fewer than two tab-separated fields are skipped.
                if len(parts) >= 2:
                    reversed_domain = parts[1]
                    # Convert back to normal notation (com.example -> example.com)
                    domain = '.'.join(reversed(reversed_domain.split('.')))
                    domains.add(domain)

        self._domain_set = domains
        print(f"‚úÖ Loaded {len(domains):,} domains")
        return domains

    def validate_seeds(self, seed_domains: List[str]) -> Tuple[List[str], List[str]]:
        """Validate which seed domains exist in webgraph.

        Each seed is stripped and lower-cased before the lookup.

        Returns:
            ``(found, not_found)``: the normalised seeds that are / are not
            present in the vertices file, in input order.
        """
        domain_set = self._load_domain_set()

        found = []
        not_found = []

        for domain in seed_domains:
            domain_clean = domain.strip().lower()
            if domain_clean in domain_set:
                found.append(domain_clean)
            else:
                not_found.append(domain_clean)

        return found, not_found

    def discover(self,
                 seed_domains: List[str],
                 min_connections: int,
                 direction: str = 'backlinks') -> pd.DataFrame:
        """
        Run discovery algorithm using the DiscoveryTool.

        Args:
            seed_domains: List of seed domain names
            min_connections: Minimum number of connections to include in results
            direction: 'backlinks' (who links TO seeds) or 'outlinks' (who seeds link TO)

        Returns:
            DataFrame with columns: domain, connections, percentage

        Raises:
            Exception: if the tool times out, exits with a non-zero status,
                or any other error occurs while running it or reading its
                output. The original error is attached as ``__cause__``.
        """
        # Write seeds to file (UTF-8, matching how the vertices file is read)
        seeds_file = '/content/seeds.txt'
        with open(seeds_file, 'w', encoding='utf-8') as f:
            for domain in seed_domains:
                f.write(domain.strip().lower() + '\n')

        results_file = '/content/results.csv'

        # Build Java command
        cmd = [
            'java',
            f'-Xmx{self.max_heap}',  # JVM heap limit
            '-cp', f'{self.jar_path}:{self.tool_class_path}',
            'DiscoveryTool',
            '--graph', self.graph_base,
            '--vertices', self.vertices_file,
            '--seeds', seeds_file,
            '--output', results_file,
            '--min-connections', str(min_connections),
            '--direction', direction
        ]

        print(f"Running discovery ({direction}, min_connections={min_connections})...")
        print(f"Seed domains: {len(seed_domains)}")
        print()

        try:
            # Drop the output of any previous run first, so that a run which
            # writes nothing cannot be mistaken for one that produced the
            # older results.
            if os.path.exists(results_file):
                os.remove(results_file)

            # Run the discovery tool
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=600  # 10 minute timeout
            )

            # Print output
            if result.stdout:
                print(result.stdout)

            if result.returncode != 0:
                print("Error output:")
                print(result.stderr)
                raise Exception(f"Discovery failed with return code {result.returncode}")

            # Read results CSV
            if os.path.exists(results_file):
                return pd.read_csv(results_file)

            print("No results file generated")
            return pd.DataFrame(columns=['domain', 'connections', 'percentage'])

        except subprocess.TimeoutExpired as e:
            raise Exception("Discovery timed out (>10 minutes). Try fewer seed domains.") from e
        except Exception as e:
            raise Exception(f"Discovery error: {str(e)}") from e

# Create the shared discovery object used by the interface below, bound to
# the download directory and the graph release.
VERSION = "cc-main-2025-26-nov-dec-jan"
discovery = WebgraphDiscovery(webgraph_dir=WEBGRAPH_DIR, version=VERSION)

print("‚úÖ Discovery tools initialized")
print(
    f"Graph location: {WEBGRAPH_DIR}",
    f"Version: {VERSION}",
    sep="\n",
)

---

## Section 3: Discovery Interface 🎯

### Use this form to discover related domains!

In [None]:
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink, clear_output
import pandas as pd

# --- Input widgets ---------------------------------------------------------

# Free-text box for the seed domains (one per line).
domains_input = widgets.Textarea(
    value='',
    description='',
    placeholder='Enter seed domains, one per line:\nexample.com\ntest.org\nsample.net',
    style=dict(description_width='0px'),
    layout=widgets.Layout(width='80%', height='200px'),
)

# Minimum number of connections a discovered domain must have.
min_conn_slider = widgets.IntSlider(
    description='Min Connections:',
    value=5,
    min=1,
    max=100,
    step=1,
    layout=widgets.Layout(width='60%'),
    style=dict(description_width='150px'),
)

# Which direction of links to follow from the seeds.
direction_radio = widgets.RadioButtons(
    description='Direction:',
    options=[
        ('Backlinks (who links TO seeds)', 'backlinks'),
        ('Outlinks (who seeds link TO)', 'outlinks'),
    ],
    value='backlinks',
    style=dict(description_width='150px'),
)

run_button = widgets.Button(
    description='üîç Run Discovery',
    tooltip='Click to discover related domains',
    button_style='success',
    layout=widgets.Layout(width='200px', height='40px'),
)

# Everything the click handler prints or displays is rendered in this area.
output_area = widgets.Output()

# --- Form layout -----------------------------------------------------------
# display() renders its arguments one after another, top to bottom.
display(
    HTML("<h2>üìù Discovery Configuration</h2>"),
    HTML("<p><strong>Seed Domains</strong> (one per line):</p>"),
    domains_input,
    HTML("<br>"),
    min_conn_slider,
    HTML("<br>"),
    direction_radio,
    HTML("<br>"),
    run_button,
    HTML("<hr>"),
    output_area,
)

# Button click handler
def on_run_click(b):
    """Handle a click on the Run Discovery button.

    Reads the form widgets, validates the seed domains against the webgraph,
    runs the discovery and renders the results (or an error message) inside
    ``output_area``.

    Args:
        b: The object passed by the button's click callback (unused).
    """
    output_area.clear_output()

    with output_area:
        display(HTML("<h3>‚è≥ Processing...</h3>"))

        # Read the form once so every message below reports the same values.
        min_connections = min_conn_slider.value
        direction = direction_radio.value

        # Validate input: one domain per line, blank lines ignored
        seed_domains = [d.strip() for d in domains_input.value.split('\n') if d.strip()]

        if not seed_domains:
            print("‚ùå Error: Please enter at least one domain")
            return

        if len(seed_domains) > 1000:
            print("‚ùå Error: Maximum 1000 domains allowed")
            print(f"You entered: {len(seed_domains)} domains")
            return

        # Validate seeds exist in webgraph. Loading the vertices file can fail
        # (for example when setup did not complete), so report that as a
        # readable message rather than a raw traceback.
        print(f"Validating {len(seed_domains)} seed domains...")
        try:
            found, not_found = discovery.validate_seeds(seed_domains)
        except Exception as e:
            print(f"\n‚ùå Error: Could not validate seed domains: {e}")
            print("Check that all setup cells completed successfully")
            return

        if len(found) == 0:
            print("\n‚ùå Error: None of the seed domains were found in the webgraph")
            print("\nDomains not found:")
            for d in not_found[:10]:
                print(f"  ‚Ä¢ {d}")
            if len(not_found) > 10:
                print(f"  ... and {len(not_found)-10} more")
            return

        if len(not_found) > 0:
            print(f"\n‚ö†Ô∏è Warning: {len(not_found)} domains not found in webgraph:")
            for d in not_found[:5]:
                print(f"  ‚Ä¢ {d}")
            if len(not_found) > 5:
                print(f"  ... and {len(not_found)-5} more")
            print(f"\nProceeding with {len(found)} valid domains\n")
        else:
            print(f"‚úÖ All {len(found)} seed domains found in webgraph\n")

        print("="*60)
        print("Configuration:")
        print(f"  ‚Ä¢ Direction: {direction}")
        print(f"  ‚Ä¢ Minimum connections: {min_connections}")
        print(f"  ‚Ä¢ Valid seed domains: {len(found)}")
        print("="*60)

        try:
            # Run discovery
            results_df = discovery.discover(
                seed_domains=found,
                min_connections=min_connections,
                direction=direction
            )

            # Clear processing message
            clear_output(wait=True)

            # Display results
            if len(results_df) == 0:
                display(HTML("<h3>‚ùå No Results Found</h3>"))
                print("No domains found matching the criteria.")
                print("\nTry:")
                print("  ‚Ä¢ Lowering the minimum connections threshold")
                print("  ‚Ä¢ Using different seed domains")
                print("  ‚Ä¢ Switching between backlinks and outlinks")
            else:
                display(HTML(f"<h3>‚úÖ Found {len(results_df):,} Domains</h3>"))
                print(f"Discovered {len(results_df):,} domains with ‚â•{min_connections} connections\n")

                # Style and display dataframe (first 100 rows only)
                display(HTML("<h4>Top Results:</h4>"))

                styled_df = results_df.head(100).style.format({
                    'connections': '{:,.0f}',
                    'percentage': '{:.2f}%'
                }).background_gradient(subset=['connections'], cmap='YlOrRd')

                display(styled_df)

                if len(results_df) > 100:
                    print(f"\n(Showing top 100 of {len(results_df):,} results. Download CSV for full list.)")

                # Summary statistics
                print("\n" + "="*60)
                print("Summary Statistics:")
                print(f"  ‚Ä¢ Total discovered: {len(results_df):,} domains")
                print(f"  ‚Ä¢ Connections range: {results_df['connections'].min():.0f} - {results_df['connections'].max():.0f}")
                print(f"  ‚Ä¢ Mean connections: {results_df['connections'].mean():.1f}")
                print(f"  ‚Ä¢ Median connections: {results_df['connections'].median():.0f}")
                print("="*60)

                # Download link
                display(HTML("<br><h4>üíæ Download Full Results</h4>"))
                display(FileLink('/content/results.csv', result_html_prefix="üì• Click to download: "))
                print(f"\nCSV contains all {len(results_df):,} discovered domains")

        except Exception as e:
            clear_output(wait=True)
            display(HTML("<h3>‚ùå Error During Discovery</h3>"))
            print(f"Error: {str(e)}")
            print("\nüìù Troubleshooting:")
            print("1. Check that all setup cells completed successfully")
            print("2. Verify you're using High-RAM runtime")
            print("3. Try restarting runtime: Runtime ‚Üí Restart runtime")
            print("4. Try with fewer seed domains")

# Register on_run_click as the click callback of the Run Discovery button.
run_button.on_click(on_run_click)

# Usage hint printed below the form when this cell is run.
print("\nüí° Tip: Start with 10-20 seed domains and min_connections=5 for fast results!")