In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Scrape the truck-stop directory, one listing page per state ID, and collect
# every location link into `all_data` (one dict per location).
all_data = []

# Seconds to wait on a single request; without a timeout one stalled
# connection would hang the whole loop indefinitely.
REQUEST_TIMEOUT = 30

# Column order of the resulting frame. Passing it explicitly keeps the columns
# present even when nothing was scraped, so the summary below cannot KeyError.
RECORD_COLUMNS = ['state_id', 'state', 'name', 'href', 'full_url', 'stop_type']

# Loop through all states (1 to 65)
for state_id in range(1, 66):
    print(f"Processing state ID: {state_id}")
    
    # Read the webpage for current state
    url = f"https://www.truckstopsandservices.com/listcatbusinesses.php?id=19&state={state_id}"
    
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors (4xx/5xx) instead of parsing an error page and
        # reporting it as "No data found".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract state from h1 element
        h1_element = soup.find('h1', class_='h2')
        state = ""
        if h1_element:
            h1_text = h1_element.get_text(strip=True)
            # Extract state from "Truckers Directory - Truck Stops in STATE"
            if "Truck Stops in" in h1_text:
                state = h1_text.split("Truck Stops in")[-1].strip()
        
        # Find the tbody element with class "nohover"
        tbody = soup.find('tbody', class_='nohover')
        
        if tbody:
            # Find all anchor tags with href attributes
            anchor_tags = tbody.find_all('a', href=True)
            
            state_count = 0
            for anchor in anchor_tags:
                href = anchor['href']
                name = anchor.get_text(strip=True)
                
                # Only include links that match the pattern "location_details.php?id="
                if 'location_details.php?id=' in href:
                    all_data.append({
                        'state_id': state_id,
                        'state': state,
                        'name': name,
                        'href': href,
                        'full_url': f"https://www.truckstopsandservices.com/{href}",
                        'stop_type': 'Trucker'
                    })
                    state_count += 1
            
            print(f"  State: {state} - Found {state_count} locations")
        else:
            print(f"  State ID {state_id}: No data found")
            
    except Exception as e:
        # Deliberately best-effort: a failure on one state is reported and the
        # loop moves on to the next state ID.
        print(f"  Error processing state ID {state_id}: {str(e)}")
    
    # Add small delay to be respectful to the server
    time.sleep(0.5)

# Create final DataFrame
df = pd.DataFrame(all_data, columns=RECORD_COLUMNS)

print(f"\nTotal locations found across all states: {len(df)}")
print(f"Number of states with data: {df['state'].nunique()}")
print("\nFirst 10 results:")
print(df.head(10))

Processing state ID: 1
  State: Alabama - Found 208 locations
  State: Alabama - Found 208 locations
Processing state ID: 2
Processing state ID: 2
  State: Arkansas - Found 160 locations
  State: Arkansas - Found 160 locations
Processing state ID: 3
Processing state ID: 3
  State: Arizona - Found 103 locations
  State: Arizona - Found 103 locations
Processing state ID: 4
Processing state ID: 4
  State: British Columbia - Found 46 locations
  State: British Columbia - Found 46 locations
Processing state ID: 5
Processing state ID: 5
  State: California - Found 235 locations
  State: California - Found 235 locations
Processing state ID: 6
Processing state ID: 6
  State: Colorado - Found 102 locations
  State: Colorado - Found 102 locations
Processing state ID: 7
Processing state ID: 7
  State: Connecticut - Found 27 locations
  State: Connecticut - Found 27 locations
Processing state ID: 8
Processing state ID: 8
  State: Delaware - Found 16 locations
  State: Delaware - Found 16 locations

In [2]:


# Headline figures for the scraped truck-stop frame `df`.
print("\n=== SUMMARY STATISTICS ===")
print(f"Total truck stops found: {len(df)}")
print(f"Number of unique states: {df['state'].nunique()}")
print(f"State IDs processed: {df['state_id'].min()} to {df['state_id'].max()}")

# Per-state tally: one row per (state_id, state) pair, largest count first.
print("\nTruck stops per state:")
state_counts = (
    df.groupby(['state_id', 'state'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(state_counts)



=== SUMMARY STATISTICS ===
Total truck stops found: 7928
Number of unique states: 59
State IDs processed: 1 to 63

Truck stops per state:
    state_id                      state  count
44        45                      Texas    878
48        49                  Wisconsin    328
12        13                   Illinois    328
9         10                    Georgia    304
34        35                       Ohio    273
20        21                   Michigan    272
38        39               Pennsylvania    255
16        17                  Louisiana    248
25        26             North Carolina    235
4          5                 California    235
13        14                    Indiana    223
22        23                   Missouri    221
0          1                    Alabama    208
35        36                   Oklahoma    207
41        42             South Carolina    202
8          9                    Florida    195
10        11                       Iowa    194
43        44   

In [3]:
# Explore the data further: most common states, a sample of names, and a
# quick check for missing or empty values.
print("Top 10 states with most truck stops:")
top_states = df['state'].value_counts().head(10)
print(top_states)

print("\nSample of truck stop names:")
print(df['name'].head(15).tolist())

# Check for any missing data
print("\nData quality check:")
print(f"Missing state names: {df['state'].isna().sum()}")
print(f"Missing truck stop names: {df['name'].isna().sum()}")
# The scraper stores "" (not NaN) when no state could be read from the page
# heading, so empty strings are counted separately from missing values.
print(f"Empty state names: {(df['state'] == '').sum()}")

# Show DataFrame info
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout and returns None; wrapping it
# in print() would append a stray "None" line, so it is called directly.
df.info()

Top 10 states with most truck stops:
state
Texas             878
Wisconsin         328
Illinois          328
Georgia           304
Ohio              273
Michigan          272
Pennsylvania      255
Louisiana         248
North Carolina    235
California        235
Name: count, dtype: int64

Sample of truck stop names:
['205 TRUCK CENTER', '231 FUEL STOP', '4 WAY QUICK STOP', 'A. W. HERNDON OIL CO. INC.', "ALLEN'S FOOD MART #9", "ARNOLD'S TRUCK STOP", 'BOAZ FOOD MART', 'BOLIGEE ONE STOP', 'BP', 'BROOKWOOD SHELL TRUCK STOP', 'CAFFEE JUNCTION CHEVRON', 'CEFCO #0485', 'CHEVRON', 'CHICASAW CORNER STOP', 'CIRCLE K']

Data quality check:
Missing state names: 0
Missing truck stop names: 0
Empty state names: 0

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7928 entries, 0 to 7927
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   state_id   7928 non-null   int64 
 1   state      7928 non-null   object
 2   name   

In [4]:
# Bare last expression: the notebook renders `df` as this cell's output.
df

Unnamed: 0,state_id,state,name,href,full_url,stop_type
0,1,Alabama,205 TRUCK CENTER,location_details.php?id=171,https://www.truckstopsandservices.com/location...,Trucker
1,1,Alabama,231 FUEL STOP,location_details.php?id=53886,https://www.truckstopsandservices.com/location...,Trucker
2,1,Alabama,4 WAY QUICK STOP,location_details.php?id=53890,https://www.truckstopsandservices.com/location...,Trucker
3,1,Alabama,A. W. HERNDON OIL CO. INC.,location_details.php?id=54388,https://www.truckstopsandservices.com/location...,Trucker
4,1,Alabama,ALLEN'S FOOD MART #9,location_details.php?id=10176,https://www.truckstopsandservices.com/location...,Trucker
...,...,...,...,...,...,...
7923,63,Alaska,MENTASTA LODGE,location_details.php?id=54692,https://www.truckstopsandservices.com/location...,Trucker
7924,63,Alaska,NORTHERN ENERGY CORP.,location_details.php?id=22842,https://www.truckstopsandservices.com/location...,Trucker
7925,63,Alaska,TESORO NORTH #78,location_details.php?id=22840,https://www.truckstopsandservices.com/location...,Trucker
7926,63,Alaska,TESORO TRUCK STOP #101,location_details.php?id=22818,https://www.truckstopsandservices.com/location...,Trucker


In [5]:
# Now scrape RV stops from rvandtravelers.com and combine them with the
# trucker rows already collected in `all_data`.
print("\n=== STARTING RV STOPS SCRAPING ===")

# Re-running this cell must not stack a second copy of the RV rows on top of
# the first, so drop any RVer rows left over from a previous run. Slice
# assignment keeps the same list object that earlier cells created.
all_data[:] = [row for row in all_data if row['stop_type'] != 'RVer']

# Seconds to wait on a single request; without a timeout one stalled
# connection would hang the whole loop indefinitely.
RV_REQUEST_TIMEOUT = 30

# Column order of the combined frame. Passing it explicitly keeps the columns
# present even when nothing was scraped, so the summary below cannot KeyError.
COMBINED_COLUMNS = ['state_id', 'state', 'name', 'href', 'full_url', 'stop_type']

# Matches "in" only where it starts a word and is followed by whitespace or
# directly by a capital letter. A plain split("in") also cuts inside state
# names ("Virginia" -> "ia", "Wisconsin" -> ""), which corrupts the state column.
HEADING_IN_RE = re.compile(r'\bin(?:\s+|(?=[A-Z]))')

# Loop through all states (1 to 65) for RV stops
for state_id in range(1, 66):
    print(f"Processing RV stops for state ID: {state_id}")
    
    # Read the webpage for current state (RV site)
    url = f"http://www.rvandtravelers.com/listcatbusinesses.php?id=19&state={state_id}"
    
    try:
        response = requests.get(url, timeout=RV_REQUEST_TIMEOUT)
        # Surface HTTP errors (4xx/5xx) instead of parsing an error page and
        # reporting it as "No RV data found".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract state from h1 element
        h1_element = soup.find('h1', class_='h2')
        state = ""
        if h1_element:
            h1_text = h1_element.get_text(strip=True)
            # The heading wording on this site is not assumed; the state is
            # taken as whatever follows the last standalone "in".
            parts = HEADING_IN_RE.split(h1_text)
            if len(parts) > 1:
                state = parts[-1].strip()
        
        # Find the tbody element with class "nohover"
        tbody = soup.find('tbody', class_='nohover')
        
        if tbody:
            # Find all anchor tags with href attributes
            anchor_tags = tbody.find_all('a', href=True)
            
            state_count = 0
            for anchor in anchor_tags:
                href = anchor['href']
                name = anchor.get_text(strip=True)
                
                # Only include links that match the pattern "location_details.php?id="
                if 'location_details.php?id=' in href:
                    all_data.append({
                        'state_id': state_id,
                        'state': state,
                        'name': name,
                        'href': href,
                        'full_url': f"http://www.rvandtravelers.com/{href}",
                        'stop_type': 'RVer'
                    })
                    state_count += 1
            
            print(f"  State: {state} - Found {state_count} RV locations")
        else:
            print(f"  State ID {state_id}: No RV data found")
            
    except Exception as e:
        # Deliberately best-effort: a failure on one state is reported and the
        # loop moves on to the next state ID.
        print(f"  Error processing RV state ID {state_id}: {str(e)}")
    
    # Add small delay to be respectful to the server
    time.sleep(0.5)

# Create updated DataFrame with both Trucker and RVer data
df_combined = pd.DataFrame(all_data, columns=COMBINED_COLUMNS)

print(f"\n=== COMBINED RESULTS ===")
print(f"Total locations found (Trucker + RVer): {len(df_combined)}")
print(f"Number of states with data: {df_combined['state'].nunique()}")

# Show breakdown by stop type
print("\nBreakdown by stop type:")
stop_type_counts = df_combined['stop_type'].value_counts()
print(stop_type_counts)

# Save the combined data to CSV
df_combined.to_csv('2.csv', index=False)
print(f"\nData saved to '2.csv'")
print(f"Total records saved: {len(df_combined)}")



=== STARTING RV STOPS SCRAPING ===
Processing RV stops for state ID: 1
  State: Alabama - Found 200 RV locations
  State: Alabama - Found 200 RV locations
Processing RV stops for state ID: 2
Processing RV stops for state ID: 2
  State: Arkansas - Found 154 RV locations
  State: Arkansas - Found 154 RV locations
Processing RV stops for state ID: 3
Processing RV stops for state ID: 3
  State: Arizona - Found 89 RV locations
  State: Arizona - Found 89 RV locations
Processing RV stops for state ID: 4
Processing RV stops for state ID: 4
  State: British Columbia - Found 47 RV locations
  State: British Columbia - Found 47 RV locations
Processing RV stops for state ID: 5
Processing RV stops for state ID: 5
  State: California - Found 220 RV locations
  State: California - Found 220 RV locations
Processing RV stops for state ID: 6
Processing RV stops for state ID: 6
  State: Colorado - Found 100 RV locations
  State: Colorado - Found 100 RV locations
Processing RV stops for state ID: 7
Proc