In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import os

# Load the dataframe
df = pd.read_csv(r'C:\Users\clint\Desktop\Geocoding_Task\Web_Scraping\2.csv')

print(f"Total rows to process: {len(df)}")

# Initialize additional fields we know about (Chain, Latitude, Longitude)
additional_fields = ['Chain', 'Latitude', 'Longitude']
for field in additional_fields:
    df[field] = None

# Function to extract data from a URL
def extract_locdetinfo_data(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the leftcol div and extract all locdetinfo data
        leftcol = soup.find('div', id='leftcol')
        data = {}
        
        if leftcol:
            locdetinfo_divs = leftcol.find_all('div', class_='locdetinfo')
            
            for div in locdetinfo_divs:
                div_text = div.get_text().strip()
                match = re.match(r'([^:]+):\s*(.+)', div_text)
                if match:
                    field_name = match.group(1).strip()
                    field_value = match.group(2).strip()
                    data[field_name] = field_value
        
        # Extract Chain information from popb_rotate div
        popb_rotate_div = soup.find('div', class_='popb_rotate')
        if popb_rotate_div:
            img_tag = popb_rotate_div.find('img')
            if img_tag and img_tag.get('src'):
                # Extract chain name from image source
                img_src = img_tag.get('src')
                # Extract filename from the URL
                filename = img_src.split('/')[-1]
                # Remove file extension and 'logo' if present
                chain_name = filename.replace('.jpg', '').replace('.png', '').replace('logo', '').strip()
                data['Chain'] = chain_name
        
        # Extract coordinates from Google Maps link
        maps_link = soup.find('a', href=lambda x: x and 'google.com/maps/place' in x if x else False)
        if maps_link:
            href = maps_link.get('href')
            # Extract coordinates from URL like: https://www.google.com/maps/place/33.644014,-85.50071
            coord_match = re.search(r'place/(-?\d+\.?\d*),(-?\d+\.?\d*)', href)
            if coord_match:
                data['Latitude'] = coord_match.group(1)
                data['Longitude'] = coord_match.group(2)
        
        return data
        
    except Exception as e:
        print(f"Error extracting data from {url}: {e}")
        return {}

# BATCH PROCESSING to handle large dataset efficiently
batch_size = 1000
total_rows = len(df)
processed_dfs = []
all_new_columns = set(additional_fields)

print(f"Processing {total_rows} rows in batches of {batch_size}")
print("=" * 80)

for batch_num in range(0, total_rows, batch_size):
    batch_end = min(batch_num + batch_size, total_rows)
    current_batch_num = (batch_num // batch_size) + 1
    total_batches = (total_rows - 1) // batch_size + 1
    
    print(f"\nBATCH {current_batch_num}/{total_batches}: Processing rows {batch_num} to {batch_end-1}")
    print("-" * 60)
    
    # Get current batch
    batch_df = df.iloc[batch_num:batch_end].copy()
    
    # Process each row in the batch
    batch_processed_count = 0
    for idx, (index, row) in enumerate(batch_df.iterrows()):
        url = row['full_url']
        batch_processed_count += 1
        overall_count = batch_num + batch_processed_count
        
        print(f"  Row {batch_processed_count}/{len(batch_df)} (Overall: {overall_count}/{total_rows}): {row['name']}")
        
        # Extract data for this row
        extracted_data = extract_locdetinfo_data(url)
        
        # Track new columns discovered
        for field_name in extracted_data.keys():
            if field_name not in all_new_columns:
                all_new_columns.add(field_name)
                print(f"    New field discovered: {field_name}")
        
        # Update the batch dataframe with extracted data
        for field_name, field_value in extracted_data.items():
            if field_name not in batch_df.columns:
                batch_df[field_name] = None
            batch_df.loc[index, field_name] = field_value
        
        # Add a small delay to be respectful to the server
        time.sleep(1)
        
        # Show progress every 10 rows within batch
        if batch_processed_count % 10 == 0:
            print(f"    Completed {batch_processed_count} rows in this batch...")
    
    # Ensure all discovered columns exist in this batch
    for col in all_new_columns:
        if col not in batch_df.columns:
            batch_df[col] = None
    
    # Save intermediate batch result as backup
    batch_filename = f'4_batch_{current_batch_num}.csv'
    batch_path = os.path.join(r'C:\Users\clint\Desktop\Geocoding_Task\Web_Scraping', batch_filename)
    batch_df.to_csv(batch_path, index=False)
    print(f"  Batch {current_batch_num} saved as: {batch_filename}")
    
    # Store processed batch
    processed_dfs.append(batch_df)
    
    print(f"  BATCH {current_batch_num} COMPLETED: Processed {len(batch_df)} rows")

print("\n" + "=" * 80)
print("COMBINING ALL BATCHES...")

# Ensure all dataframes have the same columns
for i, batch_df in enumerate(processed_dfs):
    for col in all_new_columns:
        if col not in batch_df.columns:
            batch_df[col] = None

# Combine all batches
final_df = pd.concat(processed_dfs, ignore_index=True)

# Save final result
output_path = r'C:\Users\clint\Desktop\Geocoding_Task\Web_Scraping\4.csv'
final_df.to_csv(output_path, index=False)

print(f"✅ SUCCESS! All {total_rows} rows processed and saved to: 4.csv")
print(f"📊 Final dataset shape: {final_df.shape}")
print(f"📋 Total columns: {len(final_df.columns)}")
print(f"🆕 New columns discovered: {sorted(list(all_new_columns - set(additional_fields)))}")

# Display final statistics
print("\n" + "=" * 80)
print("FINAL RESULTS SUMMARY:")
print("-" * 40)
for col in sorted(final_df.columns):
    non_null_count = final_df[col].notna().sum()
    percentage = (non_null_count / len(final_df)) * 100
    print(f"{col}: {non_null_count}/{len(final_df)} ({percentage:.1f}%)")

# Display first few rows
print(f"\nFirst 5 rows of final dataset:")
final_df.head()

Total rows to process: 15526
Processing 15526 rows in batches of 1000

BATCH 1/16: Processing rows 0 to 999
------------------------------------------------------------
  Row 1/1000 (Overall: 1/15526): 205 TRUCK CENTER
    New field discovered: Highway
    New field discovered: Exit
    New field discovered: Street Address
    New field discovered: City
    New field discovered: State
    New field discovered: Postal Code
    New field discovered: Phone
    New field discovered: Phone 2
    New field discovered: Fax
    New field discovered: Hours of Operation
    New field discovered: # of Parking Spots
    New field discovered: # of Reserved Parking Spots
    New field discovered: # of Paid Parking Spots
    New field discovered: # of Fuel Lanes
    New field discovered: # of Showers
    New field discovered: # of Truck Service Bays
    New field discovered: Highway
    New field discovered: Exit
    New field discovered: Street Address
    New field discovered: City
    New field di

Unnamed: 0,state_id,state,name,href,full_url,stop_type,Chain,Latitude,Longitude,Highway,...,Bulk Def,Propane,# of Men's Showers,Phone 4,Mailing Address,Phone 5,Road Name,https,htp,http
0,1,Alabama,205 TRUCK CENTER,location_details.php?id=171,https://www.truckstopsandservices.com/location...,Trucker,chevron,33.644014,-85.50071,I-20,...,,,,,,,,,,
1,1,Alabama,231 FUEL STOP,location_details.php?id=53886,https://www.truckstopsandservices.com/location...,Trucker,ind_dealer_1,32.258949,-86.155029,US 231,...,,,,,,,,,,
2,1,Alabama,4 WAY QUICK STOP,location_details.php?id=53890,https://www.truckstopsandservices.com/location...,Trucker,sunoco,34.28574,-86.586754,AL 69,...,,,,,,,,,,
3,1,Alabama,A. W. HERNDON OIL CO. INC.,location_details.php?id=54388,https://www.truckstopsandservices.com/location...,Trucker,citgo,31.36038,-85.327866,US 431,...,,,,,,,,,,
4,1,Alabama,ALLEN'S FOOD MART #9,location_details.php?id=10176,https://www.truckstopsandservices.com/location...,Trucker,exxon,33.165995,-86.277705,I-20,...,,,,,,,,,,
