# URL Testing Notebook

This notebook provides an easy way to test individual URLs and view their responses using the Wikipedia Dead Link Finder package.

In [3]:
# Standard-library and third-party imports used by the helper cells below.
# NOTE(review): `sys`, `urlparse` and `HTML` are not referenced in the cells
# visible in this notebook — confirm before pruning.
import sys
import requests
from urllib.parse import urlparse
import pandas as pd
from IPython.display import display, HTML

# Link-checking helpers from the project's `check_links` module; the cells
# below call them directly to run and inspect individual checking stages.
from check_links import (
    check_link_status, 
    check_all_links_with_archives,
    is_likely_false_positive,
    validate_link_with_secondary_check,
    check_with_alternative_methods,
    validate_redirect_chain,
    print_link_summary
)

# Printed only if every import above succeeded.
print("✅ All modules imported successfully!")

✅ All modules imported successfully!


## Test Individual URL Function

In [4]:
def test_url(url, timeout=10.0, show_details=True):
    """
    Run the link-checking stages against one URL, printing each stage's outcome.

    The URL is first passed to ``check_link_status``. Elements 1 and 2 of that
    result are then handed to ``is_likely_false_positive``; only when that
    returns a truthy value is ``validate_link_with_secondary_check`` run, and
    its result replaces the initial one.

    Args:
        url: URL to test
        timeout: Request timeout in seconds, forwarded to every helper
        show_details: When true, also call ``show_detailed_response`` to dump
            raw request/response information

    Returns:
        The secondary-validation result when the URL was flagged as a likely
        false positive, otherwise the initial ``check_link_status`` result.
    """
    print(f"🔍 Testing URL: {url}")
    print("=" * 80)

    # Stage 1: plain status check
    print("\n📋 Step 1: Initial Check")
    first_pass = check_link_status(url, timeout)
    print(f"Result: {first_pass}")

    # Stage 2: ask the package whether the first verdict looks unreliable
    print("\n🔍 Step 2: False Positive Check")
    flagged = is_likely_false_positive(url, first_pass[1], first_pass[2])
    print(f"Is false positive: {flagged}")

    # Stage 3: re-validate only the flagged URLs; otherwise keep the first verdict
    verdict = first_pass
    if flagged:
        print("\n🔄 Step 3: Secondary Validation")
        verdict = validate_link_with_secondary_check(url, first_pass, timeout)
        print(f"Secondary result: {verdict}")

    # Stage 4: optional raw request dump for debugging
    if show_details:
        print("\n📄 Step 4: Detailed Response Analysis")
        show_detailed_response(url, timeout)

    # Stage 5: recap of the verdict that is returned
    print("\n📊 Summary")
    print(f"Final Status: {verdict[1]}")
    print(f"Status Code: {verdict[2]}")

    return verdict

def show_detailed_response(url, timeout=10.0):
    """
    Print raw diagnostics for ``url`` gathered through several request styles.

    Issues a HEAD and a GET with a browser-like User-Agent (following
    redirects), prints the GET response headers and up to 500 characters of
    its body, then prints the results of the package's
    ``check_with_alternative_methods`` and ``validate_redirect_chain``.
    Failures are reported via ``print`` rather than raised.
    """
    try:
        # Browser-like User-Agent sent with both direct requests
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        print("\n🔧 Testing with different methods:")

        # HEAD probe; a failure here must not stop the GET probe
        try:
            head_resp = requests.head(url, timeout=timeout, headers=browser_headers, allow_redirects=True)
            print(f"HEAD request: {head_resp.status_code} -> {head_resp.url}")
        except Exception as head_err:
            print(f"HEAD request failed: {head_err}")

        # GET probe, including headers and a short body preview
        try:
            get_resp = requests.get(url, timeout=timeout, headers=browser_headers, allow_redirects=True)
            print(f"GET request: {get_resp.status_code} -> {get_resp.url}")

            print("\n📋 Response Headers:")
            for name, content in get_resp.headers.items():
                print(f"  {name}: {content}")

            body = get_resp.text
            if body:
                print("\n📄 Content Preview (first 500 chars):")
                # Append an ellipsis only when the body was actually truncated
                if len(body) > 500:
                    print(body[:500] + "...")
                else:
                    print(body)
        except Exception as get_err:
            print(f"GET request failed: {get_err}")

        # Package-level fallback checks
        print("\n🔄 Package alternative methods:")
        alternative_outcome = check_with_alternative_methods(url, timeout)
        print(f"Alternative methods result: {alternative_outcome}")

        redirect_outcome = validate_redirect_chain(url, timeout)
        print(f"Redirect chain result: {redirect_outcome}")

    except Exception as err:
        print(f"Error in detailed analysis: {err}")

## Test Multiple URLs Function

In [5]:
def test_multiple_urls(urls, timeout=10.0, delay=0.1):
    """
    Test multiple URLs and display results in a table.

    The URLs are checked in one batch through
    ``check_all_links_with_archives`` (called with an empty dict as its second
    argument). The results are loaded into a DataFrame, each row is annotated
    with the verdict of ``is_likely_false_positive``, and the table, a
    per-status count and the list of dead links are printed.

    Args:
        urls: List of URLs to test
        timeout: Request timeout in seconds
        delay: Delay between requests

    Returns:
        pandas.DataFrame with columns ``URL``, ``Status``, ``Status Code`` and
        ``False Positive``.
    """
    print(f"🔍 Testing {len(urls)} URLs...")
    print("=" * 80)

    # Use the package's batch checking function
    results = check_all_links_with_archives(urls, {}, timeout=timeout, delay=delay)

    # Create a DataFrame for easy viewing
    df = pd.DataFrame(results, columns=['URL', 'Status', 'Status Code'])

    # Add a column for false positive check.
    # Built column-wise instead of with df.apply(axis=1): on an empty frame
    # apply() may return a DataFrame (or probe the callback with a placeholder
    # row), which makes the single-column assignment fail when no results come
    # back. A plain comprehension yields an empty list in that case.
    df['False Positive'] = [
        is_likely_false_positive(link, status, code)
        for link, status, code in zip(df['URL'], df['Status'], df['Status Code'])
    ]

    # Display results
    print("\n📊 Results:")
    display(df)

    # Show summary
    print("\n📈 Summary:")
    status_counts = df['Status'].value_counts()
    for status, count in status_counts.items():
        print(f"{status}: {count}")

    # Show dead links
    dead_links = df[df['Status'] == 'dead']
    if not dead_links.empty:
        print(f"\n❌ Dead links found ({len(dead_links)}):")
        for _, row in dead_links.iterrows():
            print(f"  - {row['URL']} (Status: {row['Status Code']})")

    return df

## Example Usage

### Test the URLs that were previously missed:

In [13]:
# Ad-hoc probe of a single article with a browser-like User-Agent. The
# Response is the cell's last expression, so its repr is shown as the output.
requests.get(
    'https://www.blackfilm.com/read/exclusive-malcolm-jamal-warner-talks-tnt-major-crimes/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    },
    timeout=10,
    allow_redirects=True,
)

<Response [200]>

In [None]:
# Test the URLs that were previously missed
test_urls = [
    'https://www.wsj.com/sports/olympics/scottie-scheffler-olympic-golf-jon-rahm-tommy-fleetwood-32d5aabe?mod=hp_major_pos3',
    'https://www.skysports.com/golf/news/12176/13105831/houston-open-scottie-schefflers-win-streak-ends-as-stephan-jaeger-lands-first-pga-tour-title',
    'https://www.skysports.com/golf/live-blog/14640/12900853/us-open-2023-live-latest-news-and-updates-as-scottie-scheffler-jon-rahm-and-rory-mcilroy-all-feature',
    'https://www.skysports.com/golf/news/12176/12594637/zurich-classic-scottie-scheffler-taking-perspective-from-early-struggles-after-masters-win',
    'https://www.usatoday.com/story/sports/golf/2022/03/29/column-schefflers-desire-to-compete-takes-him-to-the-top/49997601/',
    'https://www.wsj.com/sports/golf/scottie-scheffler-arrested-pga-championship-video-633dab6c',
    'https://www.skysports.com/golf/news/12176/13144958/scottie-scheffler-charges-dropped-against-world-no-1-golfer-after-pga-championship-arrest',
    'https://www.skysports.com/golf/news/12176/13143779/charles-schwab-challenge-davis-riley-takes-maiden-individual-pga-tour-title-on-sombre-day-at-colonial',
    'https://www.netflix.com/tudum/articles/full-swing-release-date-cast-news',
    'https://www.skysports.com/golf/news/12176/12945749/fedexcup-scottie-scheffler-viktor-hovland-rory-mcilroy-lead-chase-at-tour-championship-in-pga-tour-finale'
]

print("Testing the URLs that were False Positives:")
# The call was missing its closing parenthesis, which made the cell a SyntaxError.
results_df = test_multiple_urls(test_urls, timeout=10.0)

Testing the URLs that were False Positives:
🔍 Testing 1 URLs...


Checking links: 100%|██████████| 1/1 [00:07<00:00,  7.19s/link]


📊 Results:





Unnamed: 0,URL,Status,Status Code,False Positive
0,https://www.blackfilm.com/read/exclusive-malco...,alive,200,False



📈 Summary:
alive: 1


### Test individual URL with detailed analysis:

In [None]:
# Walk one URL through every stage of test_url, including the raw response dump
url_to_test = 'http://tvline.com/2015/05/13/american-crime-story-malcolm-jamal-warner-al-cowlings/'
result = test_url(url=url_to_test, timeout=10.0, show_details=True)

### Test with different types of URLs:

In [None]:
# Batch-check a mix of URLs chosen to cover different outcomes
various_urls = [
    # Working URL
    'https://httpstat.us/200',
    # Dead URL
    'https://httpstat.us/404',
    # Forbidden
    'https://httpstat.us/403',
    # Live site
    'https://google.com',
    # DNS failure
    'https://nonexistentdomain12345.com',
    # Known dead
    'https://www.essence.com/news_entertainment/entertainment/articles/flashback_fridays_malcolm_jamal_warner/',
]

print("Testing various types of URLs:")
results_df = test_multiple_urls(urls=various_urls, timeout=5.0)

## Interactive Testing

You can now easily test any URL by calling the functions above. For example:

```python
# Test a single URL
test_url('https://example.com', timeout=10.0, show_details=True)

# Test multiple URLs
urls = ['https://url1.com', 'https://url2.com', 'https://url3.com']
test_multiple_urls(urls, timeout=10.0)
```

## Debugging Functions

Additional functions for debugging specific issues:

In [None]:
def debug_redirect_chain(url, timeout=10.0):
    """
    Debug redirect chains for a URL.

    Performs a GET that follows up to 10 redirects and prints the final URL,
    the final status code, the number of redirects and each intermediate hop.
    Any exception is printed instead of raised.

    Args:
        url: URL whose redirect chain should be traced
        timeout: Request timeout in seconds
    """
    print(f"🔍 Debugging redirect chain for: {url}")
    print("=" * 60)

    try:
        # Use the session as a context manager so its pooled connections are
        # released even when the request raises.
        with requests.Session() as session:
            session.max_redirects = 10

            # Track redirects
            response = session.get(url, timeout=timeout, allow_redirects=True)

            print(f"Final URL: {response.url}")
            print(f"Final Status: {response.status_code}")
            print(f"Number of redirects: {len(response.history)}")

            if response.history:
                print("\n📋 Redirect chain:")
                # response.history holds the intermediate responses in order
                for hop_number, hop in enumerate(response.history, start=1):
                    print(f"  {hop_number}. {hop.url} -> {hop.status_code}")

    except Exception as e:
        print(f"Error: {e}")

def debug_false_positive_logic(url, timeout=10.0):
    """
    Print the inputs and outcome of the false-positive check for one URL.

    Runs ``check_link_status`` and feeds elements 1 and 2 of its result to
    ``is_likely_false_positive``. The "Reason" line reflects only whether
    element 2 is one of the redirect codes 301/302/303/307/308; it is printed
    regardless of the false-positive verdict.

    Args:
        url: URL to inspect
        timeout: Request timeout in seconds

    Returns:
        Tuple of (check_link_status result, is_likely_false_positive result).
    """
    print(f"🔍 Debugging false positive logic for: {url}")
    print("=" * 60)

    # Raw verdict from the package
    link_result = check_link_status(url, timeout)
    print(f"Initial result: {link_result}")

    # Package's opinion on whether that verdict is trustworthy
    flagged = is_likely_false_positive(url, link_result[1], link_result[2])
    print(f"Is false positive: {flagged}")

    # Report whether the status code is a redirect code
    redirect_codes = (301, 302, 303, 307, 308)
    reason = (
        "Reason: Status code indicates redirect"
        if link_result[2] in redirect_codes
        else "Reason: Status code does not indicate redirect"
    )
    print(reason)

    return link_result, flagged

### Example debugging usage:

In [None]:
# First trace the redirects for the article URL
debug_redirect_chain(
    'http://tvline.com/2015/05/13/american-crime-story-malcolm-jamal-warner-al-cowlings/'
)

# Visual separator between the two reports
print(f"\n{'=' * 80}\n")

# Then inspect the false-positive decision for the same URL; the returned
# tuple is the cell's last expression and is therefore displayed.
debug_false_positive_logic(
    'http://tvline.com/2015/05/13/american-crime-story-malcolm-jamal-warner-al-cowlings/'
)