In [1]:
import requests
import time
import pandas as pd
import re   
from bs4 import BeautifulSoup

In [3]:
# scrape_all_pages_Cash exchange.py
# Reuse existing notebook imports and helpers (scraper, extract_maprows_info, pd, time, re, max_retries, timeout)

# helper: normalize mapRows / brands HTML into a list of records
def extract_maprows_info(parsed):
    """
    Accepts parsed response (dict from .json() or raw HTML string) and returns
    a list of normalized record dicts. This is tolerant to several payload shapes:
      - dict with 'mapRows' -> {'data': [...]}
      - dict with 'brands' containing HTML
      - raw HTML containing brand blocks with data-brand attributes
    Returns [] on failure.
    """
    records = []
    try:
        # case: JSON dict with mapRows.data as list
        if isinstance(parsed, dict):
            mr = parsed.get("mapRows")
            if isinstance(mr, dict):
                data = mr.get("data")
                if isinstance(data, list) and data:
                    return data
            # sometimes there's a top-level 'data' list
            if isinstance(parsed.get("data"), list):
                return parsed.get("data")
            # try HTML inside 'brands' key
            brands_html = parsed.get("brands")
            if isinstance(brands_html, str):
                parsed = brands_html  # fall through to HTML parsing

        # case: parsed is HTML string -> use BeautifulSoup to extract blocks
        if isinstance(parsed, str):
            soup = BeautifulSoup(parsed, "html.parser")
            # common container class used on site: category__top__item (fallback to any element with data-brand)
            items = soup.find_all("div", class_=re.compile(r"category__top__item|category__item|category-item"))
            if not items:
                items = soup.find_all(attrs={"data-brand": True})
            for div in items:
                rec = {}
                rec["id"] = div.get("data-brand") or div.get("data-id") or None
                a = div.find("a")
                if a:
                    rec["url"] = a.get("href")
                    # try to get a title element if present
                    title = a.find(["h1", "h2", "h3", "h4", "h5"])
                    if title:
                        rec["name"] = title.get_text(strip=True)
                    else:
                        # fallback to anchor/text content
                        rec["name"] = a.get_text(separator=" ", strip=True)
                else:
                    rec["url"] = None
                    rec["name"] = div.get_text(separator=" ", strip=True)
                # optional image
                img = div.find("img")
                if img:
                    rec["image"] = img.get("src") or img.get("data-src")
                    if not rec.get("name"):
                        rec["name"] = img.get("alt")
                # include raw snippet for diagnostics
                rec["raw_html"] = str(div)
                records.append(rec)
            return records
    except Exception as e:
        # keep function safe for use in loops
        print("extract_maprows_info parse error:", e)
    return records


# ensure scraper is available
if "scraper" not in globals():
    try:
        scraper = cloudscraper.create_scraper()
    except Exception:
        raise RuntimeError("cloudscraper not available to create scraper")

# determine base URL (use existing `url` if present)
if "url" in globals() and isinstance(url, str):
    base_url = url
else:
    base_url = "https://www.wheree.com/get-brands?categories%5B%5D=248&page=1&category_id=245&category_slug=Automotive&location_id=231&location_level=0&location_slug=United_States"

# pages to fetch: 1..602
pages = list(range(1,603))

# retry/backoff settings (reuse if defined)
_retries = globals().get("max_retries", 5)
_timeout = globals().get("timeout", 60)
_backoff = 2

all_records = []

for p in pages:
    # build page URL by replacing page=... or appending if missing
    if re.search(r"page=\d+", base_url):
        page_url = re.sub(r"page=\d+", f"page={p}", base_url)
    else:
        sep = "&" if "?" in base_url else "?"
        page_url = f"{base_url}{sep}page={p}"

    resp = None
    for attempt in range(1, _retries + 1):
        try:
            resp = scraper.get(page_url, timeout=_timeout)
            resp.raise_for_status()
            break
        except Exception as e:
            print(f"[page {p}] attempt {attempt} failed: {e}")
            if attempt < _retries:
                time.sleep(_backoff * attempt)
            else:
                print(f"[page {p}] max retries reached, skipping page.")
    if resp is None:
        continue

    # try to parse JSON; prefer resp.json() but fall back to text for extract_maprows_info
    parsed = None
    try:
        parsed = resp.json()
    except Exception:
        parsed = resp.text

    # use existing extract_maprows_info (defined in notebook) to normalize records
    try:
        recs = extract_maprows_info(parsed)
    except Exception as e:
        print(f"[page {p}] extract_maprows_info failed: {e}")
        recs = []

    if not recs:
        print(f"[page {p}] no records extracted")
    else:
        all_records.extend(recs)
        print(f"[page {p}] extracted {len(recs)} records (total so far: {len(all_records)})")

# build dataframe and save
if not all_records:
    print("No records collected from pages 1..20.")
else:
    df_all = pd.DataFrame(all_records)
    out_fname = "gas stations.csv"
    df_all.to_csv(out_fname, index=False)
    display(df_all.head(20))
    print(f"Saved {len(df_all)} records to {out_fname}") 
# Reuse existing notebook imports and helpers (scraper, extract_maprows_info, pd, time, re, max_retries, timeout)



[page 1] extracted 10 records (total so far: 10)
[page 2] extracted 10 records (total so far: 20)
[page 3] extracted 10 records (total so far: 30)
[page 4] extracted 10 records (total so far: 40)
[page 5] extracted 10 records (total so far: 50)
[page 6] extracted 10 records (total so far: 60)
[page 7] extracted 10 records (total so far: 70)
[page 8] extracted 10 records (total so far: 80)
[page 9] extracted 10 records (total so far: 90)
[page 10] extracted 10 records (total so far: 100)
[page 11] extracted 10 records (total so far: 110)
[page 12] extracted 10 records (total so far: 120)
[page 13] extracted 10 records (total so far: 130)
[page 14] extracted 10 records (total so far: 140)
[page 15] extracted 10 records (total so far: 150)
[page 16] extracted 10 records (total so far: 160)
[page 17] extracted 10 records (total so far: 170)
[page 18] extracted 10 records (total so far: 180)
[page 19] extracted 10 records (total so far: 190)
[page 20] extracted 10 records (total so far: 200

Unnamed: 0,id,name,fulladdress,alias,ranking,rating,number_of_rates,price_level,created_at,longitude,...,city_id,category_id,category2_id,category3_id,short_description,image,openTime,sub_cat,affiliate,city
0,1313022,Petro Travel Center,"970 S Blake Ranch Rd, Kingman, AZ 86401, Unite...",petro-travel-center-1,1193306901,0.0,0,0,2025-01-15T00:33:56.000000Z,-113.788806,...,29,248,246,252,,https://static.where-e.com/United_States/Arizo...,"[{'day_in_week': 3, 'time_open': 60, 'time_clo...","[{'id': 248, 'name': 'Gas Stations ', 'slug': ...",,"{'id': 29, 'timezone': 'America/Phoenix'}"
1,1205397,AutoExpress Service Center-Beltsville Exxon,"11055 Baltimore Ave, Beltsville, MD 20705, Uni...",autoexpress-service-centerbeltsville-exxon,1075321725,0.0,0,0,2024-12-27T08:48:20.000000Z,-76.907348,...,55,246,247,248,,https://static.where-e.com/United_States/Maryl...,"[{'day_in_week': 3, 'time_open': 32400, 'time_...","[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 55, 'timezone': 'America/New_York'}"
2,193653,Bill Dodge Nissan Of Saco,"852 Portland Rd, Saco, ME 04072, United States",bill-dodge-nissan-of-saco,1030719166,0.0,0,0,2024-03-18T09:40:41.000000Z,-70.421528,...,75,246,247,248,,https://static.where-e.com/United_States/Maine...,,"[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 75, 'timezone': 'America/New_York'}"
3,121626,Famoso Dragstrip,"33559 Famoso Rd, McFarland, CA 93250, United S...",famoso-dragstrip,986049381,0.0,0,0,2024-03-18T08:30:19.000000Z,-119.135707,...,3,246,247,248,,https://static.where-e.com/United_States/Calif...,"[{'day_in_week': 3, 'time_open': 32400, 'time_...","[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 3, 'timezone': 'America/Los_Angeles'}"
4,1315628,Costco Gas Station,"3075 Hamrick Rd, Central Point, OR 97502, Unit...",costco-gas-station-1315628,944545774,0.0,0,0,2025-01-15T00:50:35.000000Z,-122.88593,...,56,248,255,274,,https://static.where-e.com/United_States/Orego...,"[{'day_in_week': 3, 'time_open': 21600, 'time_...","[{'id': 248, 'name': 'Gas Stations ', 'slug': ...",,"{'id': 56, 'timezone': 'America/Los_Angeles'}"
5,1192415,Costa Oil - 10 Minute Oil Change - Peachtree City,"100c Petrol Point, Peachtree City, GA 30269, U...",costa-oil-10-minute-oil-change-peachtree-city,910156250,0.0,0,0,2024-11-25T09:46:12.000000Z,-84.561916,...,46,256,246,248,,https://static.where-e.com/United_States/Georg...,"[{'day_in_week': 3, 'time_open': 32400, 'time_...","[{'id': 256, 'name': 'Oil Change Stations ', '...",,"{'id': 46, 'timezone': 'America/New_York'}"
6,215061,Spoken Moto,"310 SW Industrial Way, Bend, OR 97702, United ...",spoken-moto,878517421,0.0,0,2,2024-03-18T10:01:10.000000Z,-121.312584,...,56,246,248,250,,https://static.where-e.com/United_States/Orego...,,"[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 56, 'timezone': 'America/Los_Angeles'}"
7,326016,Edgewater Motor Sports Park,"4819 E Miami River Rd, Cleves, OH 45002, Unite...",edgewater-motor-sports-park,848300899,0.0,0,0,2024-03-18T11:51:53.000000Z,-84.743248,...,9,246,248,255,,https://static.where-e.com/United_States/Ohio/...,,"[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 9, 'timezone': 'America/New_York'}"
8,196347,Bostongarage,"145 Webster St, Hanover, MA 02339, United States",bostongarage,843750000,0.0,0,0,2024-03-18T09:41:15.000000Z,-70.85372,...,45,246,248,250,,https://static.where-e.com/United_States/Massa...,"[{'day_in_week': 3, 'time_open': 60, 'time_clo...","[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 45, 'timezone': 'America/New_York'}"
9,270441,Ridge Motorsports Park,"1060 W Eells Hill Rd, Shelton, WA 98584, Unite...",ridge-motorsports-park,812488723,0.0,0,0,2024-03-18T10:53:09.000000Z,-123.195717,...,41,246,248,0,,https://static.where-e.com/United_States/Washi...,"[{'day_in_week': 3, 'time_open': 28800, 'time_...","[{'id': 246, 'name': 'Auto Repair Services', '...",,"{'id': 41, 'timezone': 'America/Los_Angeles'}"


Saved 6013 records to gas stations.csv


## data Cleaning

In [4]:
import pandas as pd
import re

In [6]:
#cleang up
import ast  # safely evaluate string representation of list/dict


# Function to extract id and name
def extract_ids_and_names(detail_str):
    try:
        data_list = ast.literal_eval(detail_str)  # safely parse string to Python list
        return [(item['id'], item['name']) for item in data_list]
    except Exception as e:
        return None

# Apply the function to create new columns
df['extracted'] = df['sub_cat'].apply(extract_ids_and_names)

# Optional: separate into two new columns for clarity
df['ids'] = df['extracted'].apply(lambda x: [i[0] for i in x] if x else None)
df['names'] = df['extracted'].apply(lambda x: [i[1] for i in x] if x else None)

# Function to extract the first name
def extract_first_name(detail_str):
    try:
        data_list = ast.literal_eval(detail_str)  # convert string to list of dicts
        if isinstance(data_list, list) and len(data_list) > 0:
            return data_list[0].get('name')
    except Exception:
        return None
    return None

# Apply function
df['category'] = df['sub_cat'].apply(extract_first_name)

# Example pattern: captures two-letter state abbreviation before ZIP code
state_pattern = re.compile(r'\b([A-Z]{2})\b(?=\s*\d{5})')

def extract_state(address):
    """Extracts state abbreviation (e.g., FL, NY, CA) from address string."""
    if pd.isna(address):
        return None
    match = state_pattern.search(address)
    if match:
        return match.group(1)
    return None

# Apply function to your actual column
df['state'] = df['fulladdress'].apply(extract_state)




In [7]:
#cleaning data 
df.drop(columns=['alias', 'ranking', 'rating','city','number_of_rates', 'price_level','created_at','category_id', 'category2_id', 'category3_id','short_description', 'image', 'openTime', 'sub_cat', 'affiliate','extracted', 'ids'], inplace=True)
#RENAMING COLUMNS
df.rename(columns={
    
    'fulladdress': 'address',
    'names': 'categories',
    'category': 'sub_category',
    }, inplace=True)

df['categories'] = df['categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
df.head()

Unnamed: 0,id,name,address,longitude,latitude,city_id,categories,sub_category,state
0,1313022,Petro Travel Center,"970 S Blake Ranch Rd, Kingman, AZ 86401, Unite...",-113.788806,35.177913,29,"Gas Stations , Auto Repair Services, Parking A...",Gas Stations,AZ
1,1205397,AutoExpress Service Center-Beltsville Exxon,"11055 Baltimore Ave, Beltsville, MD 20705, Uni...",-76.907348,39.034232,55,"Auto Repair Services, Towing Services, Gas Sta...",Auto Repair Services,MD
2,193653,Bill Dodge Nissan Of Saco,"852 Portland Rd, Saco, ME 04072, United States",-70.421528,43.53865,75,"Auto Repair Services, Towing Services, Gas Sta...",Auto Repair Services,ME
3,121626,Famoso Dragstrip,"33559 Famoso Rd, McFarland, CA 93250, United S...",-119.135707,35.600261,3,"Auto Repair Services, Towing Services, Gas Sta...",Auto Repair Services,CA
4,1315628,Costco Gas Station,"3075 Hamrick Rd, Central Point, OR 97502, Unit...",-122.88593,42.372556,56,"Gas Stations , Auto Parts & Supplies Services,...",Gas Stations,OR


In [8]:
# copy
orgin_df = df.copy()

In [10]:
#filter the data sub_category
df = df[df['sub_category'].str.contains('Gas Stations', na=False)]
df.shape

(3188, 9)

In [11]:
df.head()

Unnamed: 0,id,name,address,longitude,latitude,city_id,categories,sub_category,state
0,1313022,Petro Travel Center,"970 S Blake Ranch Rd, Kingman, AZ 86401, Unite...",-113.788806,35.177913,29,"Gas Stations , Auto Repair Services, Parking A...",Gas Stations,AZ
4,1315628,Costco Gas Station,"3075 Hamrick Rd, Central Point, OR 97502, Unit...",-122.88593,42.372556,56,"Gas Stations , Auto Parts & Supplies Services,...",Gas Stations,OR
11,166452,Costco Gas Station,"1709 Automation Pkwy, San Jose, CA 95131, Unit...",-121.883386,37.388233,3,"Gas Stations , Auto Parts & Supplies Services",Gas Stations,CA
12,1321245,Costco Gas Station,"6720 NE 84th St, Vancouver, WA 98665, United S...",-122.604092,45.684195,41,"Gas Stations , Auto Parts & Supplies Services,...",Gas Stations,WA
15,169020,Costco Gas Station,"18109 33rd Ave W, Lynnwood, WA 98037, United S...",-122.275199,47.833774,41,"Gas Stations , Car Wash Services, Auto Parts &...",Gas Stations,WA


In [12]:
#create a copy of the df
df.to_csv("C:\\Users\\RodahNambuyaChepkori\\Documents\\Web Scrapping\\Data\\gas stations.csv", index=False)
print("✅ Changes appended (file updated successfully).")

✅ Changes appended (file updated successfully).


In [13]:
# save the original dataframe
orgin_df.to_csv("C:\\Users\\RodahNambuyaChepkori\\Documents\\Web Scrapping\\Data\\original gas stations.csv", index=False)
print("✅ Original data saved successfully.")

✅ Original data saved successfully.


# alternative link

url="https://gas-stations.cmac.ws/"


In [6]:
url="https://www.vaporana.com/directory/"
response = requests.get(url)
soup=BeautifulSoup(response.text, 'html.parser')
print(response.status_code)

200


In [15]:
detail=soup.find_all('p')
for d in detail:
    links=d.find_all('a')
    name=d.get_text()
print(name, links)

Vape shops typically carry a wide range of products including e-cigarettes, vape pens, pod systems, box mods, tanks, coils, batteries, chargers, and e-liquids in various flavors and nicotine strengths. Many shops also sell accessories like cases, drip tips, and replacement parts. Some shops may specialize in certain brands or product types, so it's worth calling ahead if you're looking for something specific. []
