In [0]:
%python
%restart_python
%pip install bs4
%pip install html5lib
%pip install requests
%pip install pandas
%pip install webdriver_manager


import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [0]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Smoke test: confirm that a headless Chrome session can be started in this
# environment before running the real scrape.

# Set up Chrome options
options = Options()
options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")  # Required in many environments (especially Docker, cloud notebooks)

# Start Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Test it. The browser is closed in `finally` so a failed page load does not
# leave a headless Chrome process running.
try:
    driver.get("https://www.google.com")
    print(driver.title)
finally:
    driver.quit()


In [0]:
# Landing page from which the state links are collected.
base_url = "https://www.pawnfinders.com/"


try:
    # Setting up selenium chrome
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Launching a browser session shared by every scraping function below
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

except Exception as e:
    # Report the actual cause and stop: every later cell depends on `driver`,
    # so continuing without a live session only produces confusing
    # follow-on errors.
    print(f"Error making chrome session: {e}")
    raise

def extract_state_links(url):
    """Collect the state-level "pawn-shops" links from the page at `url`.

    The page is fetched with `requests` and parsed as static HTML. Every
    anchor whose href contains "pawn-shops" is kept, resolved against `url`
    so relative hrefs become absolute, and de-duplicated in page order.

    Args:
        url: Address of the landing page to scan.

    Returns:
        A list of absolute URL strings. The list is empty when the request
        fails, the response status is not 200, or no anchor matches. Errors
        are printed, not raised.
    """
    try:
        # Only the `requests` response is parsed, so no Selenium navigation
        # (and no fixed wait for it) is needed here.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch page. Status: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")

        # Get all of the a tags to retrieve the state links
        links = []
        seen = set()
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            if not href or "pawn-shops" not in href:
                continue
            # Absolute hrefs pass through urljoin unchanged; relative ones
            # are resolved so they can be opened directly later.
            absolute = urljoin(url, href)
            if absolute not in seen:
                seen.add(absolute)
                links.append(absolute)
        return links
    except Exception as e:
        print(f"Error: {e}")
    return []

In [0]:
def extract_shops(state_url, max_scrolls=20, scroll_pause_time=2, initial_wait=3):
    """Collect the individual shop links listed on one state page.

    Loads `state_url` in the shared Selenium `driver`, scrolls to the bottom
    repeatedly so that dynamically loaded listings are rendered, then parses
    the resulting page source for anchors whose href contains "/#!biz/id/".

    Args:
        state_url: URL of the state listing page.
        max_scrolls: Upper bound on scroll-to-bottom attempts.
        scroll_pause_time: Seconds to wait after each scroll before the page
            height is measured again.
        initial_wait: Seconds to wait after the initial page load.

    Returns:
        A list of unique shop URLs in page order, each resolved against
        `state_url`. Errors are printed rather than raised; whatever was
        collected before the error is returned.
    """
    shop_links = []
    seen = set()  # O(1) duplicate check; `shop_links` keeps the page order
    try:
        driver.get(state_url)
        time.sleep(initial_wait)  # initial load

        # Scroll through shops to dynamically load the content. Scrolling
        # stops early once the page height no longer grows.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # parse through the fully loaded page
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # find the specific shop links
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if "/#!biz/id/" not in href:
                continue
            # Resolve relative hrefs so the result can be passed straight to
            # driver.get(); absolute hrefs are left unchanged by urljoin.
            shop_url = urljoin(state_url, href.strip())
            if shop_url not in seen:
                seen.add(shop_url)
                shop_links.append(shop_url)

    except Exception as e:
        print(f"Error extracting shops from {state_url}: {e}")

    return shop_links


In [0]:
from urllib.parse import unquote

def location_info(location):
    """Scrape one shop page for its name and map coordinates.

    Opens `location` in the shared Selenium `driver`, waits three seconds,
    and parses the rendered page. The name is taken from the
    `<h3 itemprop="name">` element (empty string when absent). Coordinates
    are read from the `center=<lat>,<lon>` parameter of the first iframe
    whose src mentions "google"; both stay None when no such iframe or
    parameter is found.

    Returns a one-element list holding a dict with the keys "name",
    "latitude" and "longitude". If anything raises, the error is printed
    and an empty list is returned.
    """
    records = []

    try:
        driver.get(location)
        time.sleep(3)  # give the page time to render

        page = BeautifulSoup(driver.page_source, "html.parser")

        # Business name, stripped of surrounding whitespace
        heading = page.find("h3", itemprop="name")
        if heading:
            shop_name = heading.text.strip()
        else:
            shop_name = ""

        # src of the first iframe that points at Google, or "" if none does
        google_sources = (
            frame["src"]
            for frame in page.find_all("iframe", src=True)
            if "google" in frame["src"].lower()
        )
        embed_src = next(google_sources, "")

        latitude = longitude = None
        if embed_src:
            decoded = unquote(embed_src.replace("&amp;", "&"))

            # The map centre carries the shop's latitude and longitude
            found = re.search(r'center=([\-.\d]+),([\-.\d]+)', decoded)
            if found is None:
                print("No 'center=' parameter found in the Google Maps iframe src.")
            else:
                latitude = float(found.group(1))
                longitude = float(found.group(2))

        records.append({
            "name": shop_name,
            "latitude": latitude,
            "longitude": longitude,
        })

    except Exception as e:
        print(f"Error extracting location from {location}: {e}")

    return records

In [0]:
from urllib.parse import urlparse

# Driver cell: walk every state page, visit each shop page, and write the
# collected name/latitude/longitude records to a CSV file.
state_urls = extract_state_links(base_url)

print(f"\nFound {len(state_urls)} state URLs\n")

all_shops_info = []  # flat list of {"name", "latitude", "longitude"} dicts

try:
    # First iterate through the state urls retrieved
    for state_url in state_urls:
        print("extracting: " + (state_url))
        # extract the shops from each state url
        state_shops = extract_shops(state_url)
        for shop_url in state_shops:  # iterate through the shop urls
            loc_info = location_info(shop_url)  # retrieve information from each shop url
            if loc_info:
                # location_info returns a list of records; extend keeps
                # all_shops_info flat so each record becomes one DataFrame
                # row (appending the list itself would nest it).
                all_shops_info.extend(loc_info)
            else:
                print(f"Skipping shop due to missing info: {shop_url}")
finally:
    # Always close the browser, even if the scrape is interrupted or fails.
    driver.quit()

# Save only name, latitude, longitude
df = pd.DataFrame(all_shops_info, columns=["name", "latitude", "longitude"])
df.to_csv("pawn_shops_data.csv", index=False)

print("\nFinal CSV 'pawn_shops_data.csv' saved .")