In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [7]:
# Browser-like User-Agent sent with every request in this cell.
headers = {"User-Agent": "Mozilla/5.0"}

# Seconds to wait on a single HTTP request. Without a timeout, requests can
# block indefinitely on a stalled connection and hang the whole cell.
REQUEST_TIMEOUT = 10

# Step 1: Get the WeWork sitemap.
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page and reporting "Found 0 WeWork locations!".
sitemap_url = "https://www.wework.com/sitemap"
response = requests.get(sitemap_url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Step 2: Extract all WeWork location URLs.
# Only anchors whose href contains "/buildings/" are kept; the href is appended
# to the site root, so it is assumed to be site-relative.
location_urls = []
for link in soup.find_all("p", class_="ray-text--body-small smallgaps"):
    a_tag = link.find("a", href=True)
    if a_tag and "/buildings/" in a_tag["href"]:
        location_urls.append("https://www.wework.com" + a_tag["href"])

# Remove duplicates while keeping first-seen (sitemap) order, so the crawl
# order and the CSV row order are the same on every run.
location_urls = list(dict.fromkeys(location_urls))

print(f"Found {len(location_urls)} WeWork locations!")

# Step 3: Scrape the address from each location page.
# Each URL yields exactly one record: the address text, "Not Found" when the
# page loaded but has no address element, or "Error" when the request failed.
wework_data = []

for url in location_urls:
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx as a failure so an error page is not recorded as "Not Found".
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Extract address from <span data-address-anchor="location-anchor">
        address_element = soup.find("span", {"data-address-anchor": "location-anchor"})
        address = address_element.text.strip() if address_element else "Not Found"

        # Store data
        wework_data.append({"URL": url, "Address": address})

        print(f"Scraped: {url} → {address}")

    except Exception as e:
        # Best-effort crawl: log the failure, record it, and move on.
        print(f"Error scraping {url}: {e}")
        wework_data.append({"URL": url, "Address": "Error"})

    finally:
        # Pause after every attempt, including failed ones, so a run of
        # errors does not turn into back-to-back requests.
        time.sleep(1)

# Step 4: Convert to DataFrame & Save
df_wework = pd.DataFrame(wework_data)
df_wework.to_csv("wework_addresses.csv", index=False)

print("✅ Scraping complete! Data saved to wework_addresses.csv")


Found 436 WeWork locations!
Scraped: https://www.wework.com/buildings/seoul-square--seoul → 13F, 416 Hangang-daero Jung-gu Seoul, Seoul 04637
Scraped: https://www.wework.com/buildings/university-park--austin--TX → 3300 N Interstate 35 Suite 700 Austin, TX 78705
Scraped: https://www.wework.com/buildings/urban-escape--stockholm → Regeringsgatan 29, Stockholms Stockholm, Stockholms län 111 51
Scraped: https://www.wework.com/buildings/wallarkaden--cologne → Pilgrimstraße 6 Köln, Nordrhein-Westfalen 50674
Scraped: https://www.wework.com/buildings/25-king-street--brisbane--QLD → 25 King Street Bowen Hills Brisbane, QLD 4006
Scraped: https://www.wework.com/buildings/5-harcourt-road--dublin → 5 Harcourt Road Dublin, Ireland D02 FW64
Scraped: https://www.wework.com/buildings/dlf-cyber-city-block-10--chennai → Not Found
Scraped: https://www.wework.com/buildings/rajapushpa-summit--hyderabad → Not Found
Scraped: https://www.wework.com/buildings/hub71--abu-dhabi → Al Khatem Tower, ADGM Square, Al M

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# List of WeWork location URLs to scrape
wework_urls = [
    "https://www.wework.com/en-GB/buildings/corrientes-800--buenos-aires",
    "https://www.wework.com/buildings/seoul-square--seoul",
    "https://www.wework.com/buildings/university-park--austin--TX",
    "https://www.wework.com/buildings/urban-escape--stockholm",
    "https://www.wework.com/buildings/wallarkaden--cologne",
    "https://www.wework.com/buildings/25-king-street--brisbane--QLD",
    "https://www.wework.com/buildings/5-harcourt-road--dublin",
    "https://www.wework.com/buildings/dlf-cyber-city-block-10--chennai",
    "https://www.wework.com/buildings/rajapushpa-summit--hyderabad",
    "https://www.wework.com/buildings/hub71--abu-dhabi",
]

# Headers to mimic a real browser request
headers = {"User-Agent": "Mozilla/5.0"}

# Destination file, named once so the save call and the completion message
# cannot drift apart.
output_csv = "wework_addresses_v2.csv"

# Empty list to store the scraped data
wework_data = []

# Loop through each WeWork location URL and scrape the address.
# Each URL yields exactly one record: the address text, "Not Found" when the
# page loaded but has no matching element, or "Error" when the request failed.
for url in wework_urls:
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx as a failure so an error page is not recorded as "Not Found".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract address from <span class="title-section__address">
        address_element = soup.find("span", class_="title-section__address")
        address = address_element.text.strip() if address_element else "Not Found"

        # Store data in the list
        wework_data.append({"URL": url, "Address": address})

        print(f"Scraped: {url} → {address}")

    except Exception as e:
        # Best-effort crawl: log the failure, record it, and move on.
        print(f"Error scraping {url}: {e}")
        wework_data.append({"URL": url, "Address": "Error"})

    finally:
        # Pause after every attempt, including failed ones, so a run of
        # errors does not turn into back-to-back requests.
        time.sleep(1)

# Convert the scraped data into a DataFrame
df_wework_addresses = pd.DataFrame(wework_data)

# Save to CSV for further use
df_wework_addresses.to_csv(output_csv, index=False)

# Report the file that was actually written (previously the message named
# wework_addresses.csv while the data went to wework_addresses_v2.csv).
print(f"✅ Scraping complete! Data saved to {output_csv}")


Scraped: https://www.wework.com/en-GB/buildings/corrientes-800--buenos-aires → Avenida Corrientes 800 Buenos Aires, Buenos Aires C1008
Scraped: https://www.wework.com/buildings/seoul-square--seoul → 13F, 416 Hangang-daero Jung-gu Seoul, Seoul 04637
Scraped: https://www.wework.com/buildings/university-park--austin--TX → 3300 N Interstate 35 Suite 700 Austin, TX 78705
Scraped: https://www.wework.com/buildings/urban-escape--stockholm → Regeringsgatan 29, Stockholms Stockholm, Stockholms län 111 51
Scraped: https://www.wework.com/buildings/wallarkaden--cologne → Pilgrimstraße 6 Köln, Nordrhein-Westfalen 50674
Scraped: https://www.wework.com/buildings/25-king-street--brisbane--QLD → 25 King Street Bowen Hills Brisbane, QLD 4006
Scraped: https://www.wework.com/buildings/5-harcourt-road--dublin → 5 Harcourt Road Dublin, Ireland D02 FW64
Scraped: https://www.wework.com/buildings/dlf-cyber-city-block-10--chennai → Not Found
Scraped: https://www.wework.com/buildings/rajapushpa-summit--hyderabad 