In [15]:
import pandas as pd
import webbrowser
import time
import os

# Location of the scraped YellowPages data that the rest of the notebook reads.
csv_path = r"C:\Users\clint\Desktop\Geocoding_Task\YellowPages_scraper\yellowpages_robust_scraped_data.csv"

# Confirm the path points at a regular file before trying to load it.
# os.path.isfile is used instead of os.path.exists so that a directory with
# the same name is not reported as "File found".
if os.path.isfile(csv_path):
    print(f"File found: {csv_path}")
else:
    print(f"File not found: {csv_path}")
    print("Please check the file path.")

File found: C:\Users\clint\Desktop\Geocoding_Task\YellowPages_scraper\yellowpages_robust_scraped_data.csv


In [16]:
# Read the CSV file.
# Only the read itself is guarded: a failure there is reported as a read error,
# while the column inspection runs in the `else` branch so that an unrelated
# problem is not mislabelled as "Error reading CSV file".
try:
    df = pd.read_csv(csv_path)
except Exception as e:
    print(f"Error reading CSV file: {e}")
else:
    print(f"Successfully loaded CSV file with {len(df)} rows and {len(df.columns)} columns")

    # Display column names
    print("\nColumn names:")
    print(df.columns.tolist())

    # Check if BUSINESS_URL column exists
    if 'BUSINESS_URL' in df.columns:
        print(f"\nFound BUSINESS_URL column with {len(df['BUSINESS_URL'])} entries")

        # Display first few URLs
        print("\nFirst 5 URLs:")
        for i, url in enumerate(df['BUSINESS_URL'].head()):
            print(f"{i+1}. {url}")
    else:
        print("\nBUSINESS_URL column not found!")
        print("Available columns with 'URL' in name:")
        # str() keeps this working when a column label is not a string
        # (e.g. an integer header), which has no .upper() method.
        url_columns = [col for col in df.columns if 'URL' in str(col).upper()]
        print(url_columns)

Successfully loaded CSV file with 13 rows and 19 columns

Column names:
['ADDRESS', 'AKA', 'BUSINESS_NAME', 'BUSINESS_URL', 'CATEGORIES', 'EXTRA_PHONES', 'JSONLD_CITY_1', 'JSONLD_LAT_1', 'JSONLD_LNG_1', 'JSONLD_NAME_1', 'JSONLD_PHONE_1', 'JSONLD_STATE_1', 'JSONLD_STREET_1', 'JSONLD_ZIP_1', 'ORIGINAL_PHONE', 'PHONE', 'SCRAPED_AT', 'SEARCH_URL', 'WEBSITE']

Found BUSINESS_URL column with 13 entries

First 5 URLs:
1. https://www.yellowpages.com/westley-ca/mip/kgs-dhoot-associates-inc-566452440
2. https://www.yellowpages.com/westley-ca/mip/joes-travel-plaza-2460291
3. https://www.yellowpages.com/westley-ca/mip/joes-travel-plaza-2460291?lid=2460291#gallery
4. https://www.yellowpages.com/west-sacramento-ca/mip/ampm-561252172
5. https://www.yellowpages.com/west-sacramento-ca/mip/arco-521645288


In [17]:
# Function to open URLs with safety checks
def open_urls_from_csv(dataframe, url_column='BUSINESS_URL', delay=2, max_urls=None):
    """
    Open the URLs stored in a DataFrame column in the default web browser.

    Parameters:
    - dataframe: pandas DataFrame containing the URLs
    - url_column: name of the column containing URLs
    - delay: delay in seconds between opening URLs (to avoid overwhelming the browser)
    - max_urls: maximum number of URLs to open (None for all, 0 for none)

    Returns:
    - None. Progress lines and a final opened/failed summary are printed.

    Notes:
    - When more than 10 URLs are selected, the user is asked to confirm via
      input(); anything other than "y"/"yes" cancels the run.
    - NaN, empty and whitespace-only entries are skipped, not counted as failures.
    """

    if url_column not in dataframe.columns:
        print(f"Column '{url_column}' not found in the dataframe")
        return

    # Get URLs and remove any NaN values
    urls = dataframe[url_column].dropna()

    # Compare against None explicitly: a plain truthiness test would treat
    # max_urls=0 as "no limit" and open every URL.
    if max_urls is not None:
        urls = urls.head(max_urls)

    total = len(urls)
    print(f"Preparing to open {total} URLs...")

    # Ask for confirmation before opening many URLs
    if total > 10:
        response = input(f"You are about to open {total} URLs. This might open many browser tabs. Continue? (y/n): ")
        # Tolerate surrounding whitespace and a spelled-out "yes".
        if response.strip().lower() not in ('y', 'yes'):
            print("Operation cancelled.")
            return

    opened_count = 0
    failed_count = 0

    # `i` is the 1-based position within the selected URLs, not the DataFrame row label.
    for i, url in enumerate(urls, 1):
        try:
            if pd.isna(url) or (isinstance(url, str) and not url.strip()):
                print(f"Skipping empty URL at position {i}")
                continue

            print(f"Opening URL {i}/{total}: {url}")
            # webbrowser.open reports failure through its return value rather
            # than an exception, so the result has to be checked.
            if webbrowser.open(url):
                opened_count += 1
            else:
                print(f"Failed to open URL {i}: {url} - Error: no browser could be launched")
                failed_count += 1

            # Add delay between opening URLs
            if i < total:  # Don't delay after the last URL
                time.sleep(delay)

        except Exception as e:
            print(f"Failed to open URL {i}: {url} - Error: {e}")
            failed_count += 1

    print(f"\nSummary:")
    print(f"Successfully opened: {opened_count} URLs")
    print(f"Failed to open: {failed_count} URLs")

# Example usage - uncomment the line below to open URLs
# open_urls_from_csv(df, 'BUSINESS_URL', delay=2, max_urls=5)  # Opens first 5 URLs with 2-second delay

In [19]:
# Launch every link in the BUSINESS_URL column, pausing one second between tabs.
# WARNING: this opens ALL URLs in your browser. When more than 10 URLs are
# selected, the function asks for a y/n confirmation first.

open_urls_from_csv(df, url_column='BUSINESS_URL', delay=1)

# For a quick trial run, limit the number of tabs instead:
# open_urls_from_csv(df, 'BUSINESS_URL', delay=2, max_urls=5)

# For a faster run, shorten the delay (use with caution):
# open_urls_from_csv(df, 'BUSINESS_URL', delay=0.5)

Preparing to open 13 URLs...
Opening URL 1/13: https://www.yellowpages.com/westley-ca/mip/kgs-dhoot-associates-inc-566452440
Opening URL 1/13: https://www.yellowpages.com/westley-ca/mip/kgs-dhoot-associates-inc-566452440
Opening URL 2/13: https://www.yellowpages.com/westley-ca/mip/joes-travel-plaza-2460291
Opening URL 2/13: https://www.yellowpages.com/westley-ca/mip/joes-travel-plaza-2460291
Opening URL 3/13: https://www.yellowpages.com/westley-ca/mip/joes-travel-plaza-2460291?lid=2460291#gallery
Opening URL 3/13: https://www.yellowpages.com/westley-ca/mip/joes-travel-plaza-2460291?lid=2460291#gallery
Opening URL 4/13: https://www.yellowpages.com/west-sacramento-ca/mip/ampm-561252172
Opening URL 4/13: https://www.yellowpages.com/west-sacramento-ca/mip/ampm-561252172
Opening URL 5/13: https://www.yellowpages.com/west-sacramento-ca/mip/arco-521645288
Opening URL 5/13: https://www.yellowpages.com/west-sacramento-ca/mip/arco-521645288
Opening URL 6/13: https://www.yellowpages.com/west-sacr

In [None]:
# Additional utility functions for URL management

def preview_urls(dataframe, url_column='BUSINESS_URL', num_urls=10):
    """
    Print a numbered list of the leading non-null URLs in a column.

    Parameters:
    - dataframe: pandas DataFrame containing the URLs
    - url_column: name of the column containing URLs
    - num_urls: how many non-null entries to list

    Nothing is opened; output goes to stdout only and the function returns None.
    """
    column_present = url_column in dataframe.columns
    if not column_present:
        print(f"Column '{url_column}' not found")
        return

    # Drop missing entries first so the limit applies to usable values only.
    non_null = dataframe[url_column].dropna()
    sample = non_null.head(num_urls)

    print(f"Preview of first {len(sample)} URLs:")
    position = 0
    for link in sample:
        position += 1
        print(f"{position}. {link}")

def get_url_stats(dataframe, url_column='BUSINESS_URL'):
    """
    Print and return counts of usable versus empty entries in a URL column.

    Parameters:
    - dataframe: pandas DataFrame containing the URLs
    - url_column: name of the column containing URLs

    Returns:
    - dict with keys 'total', 'valid' and 'empty', or None when the column
      does not exist (a message is printed in that case).

    An entry counts as empty when it is NaN/None or, once converted to text,
    contains only whitespace. This matches the printed "Empty/NaN URLs" label.
    """
    if url_column not in dataframe.columns:
        print(f"Column '{url_column}' not found")
        return

    urls = dataframe[url_column]
    total_urls = len(urls)
    # dropna() removes NaN/None; the strip() test then excludes "" and
    # whitespace-only strings, which dropna() alone would count as valid.
    valid_urls = sum(1 for url in urls.dropna() if str(url).strip())
    empty_urls = total_urls - valid_urls

    print(f"URL Statistics:")
    print(f"Total entries: {total_urls}")
    print(f"Valid URLs: {valid_urls}")
    print(f"Empty/NaN URLs: {empty_urls}")

    return {'total': total_urls, 'valid': valid_urls, 'empty': empty_urls}

# Inspect the data before launching any browser tabs. The guard skips this
# step when no `df` variable is defined in the current namespace.
if 'df' in locals():
    preview_urls(dataframe=df, url_column='BUSINESS_URL', num_urls=10)
    get_url_stats(dataframe=df, url_column='BUSINESS_URL')