In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
import re
import opencc

def fetch_data(url, timeout=30):
    """Download ``url`` with a browser-like User-Agent and parse it as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits forever on a stalled connection, which would
            hang the notebook cell.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [138]:
#TESTING AREA
#一把大
def fetch_data(url, timeout=30):
    """Fetch ``url`` and return the response body parsed as HTML.

    NOTE(review): this redefines the ``fetch_data`` from the first cell, so
    every cell executed after this one uses this variant (no custom
    User-Agent, TLS verification disabled).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the cell.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.HTTPError: On a 4xx/5xx response.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # SECURITY: verify=False disables TLS certificate validation. Kept as in
    # the original — TODO confirm the target site still requires it.
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def clean_business_name(name):
    """Return ``name`` without its leading run of digits and without surrounding whitespace."""
    leading_digits = re.match(r"^\d+", name)
    if leading_digits is not None:
        name = name[leading_digits.end():]
    return name.strip()

def format_phone_number(phone):
    """Normalise a phone number to ``XXX-XXX-XXXX``.

    Args:
        phone: Raw phone text; any non-digit characters are ignored.

    Returns:
        The dash-separated form when the text holds exactly ten digits (an
        eleven-digit number with a leading "1" has that prefix dropped first).
        Otherwise the bare digit string, or the stripped input when it
        contains no digits at all.
    """
    digits = ''.join(re.findall(r"\d", phone))
    # Drop the leading "1" so "1-626-..." formats the same as "626-...".
    if len(digits) == 11 and digits.startswith("1"):
        digits = digits[1:]
    if len(digits) != 10:
        # Slicing anything else would yield fragments such as "123-45-" or
        # "--", so return the value unformatted.
        return digits or phone.strip()
    return f"{digits[:3]}-{digits[3:6]}-{digits[6:]}"

def corrected_integrated_scraper(url):
    """Scrape every business entry on one yibada listing page.

    Args:
        url: Address of the listing page.

    Returns:
        A list of dicts with the keys "商家名字" (business name), "电话"
        (phone), "地址" (address) and "URL". A field that cannot be found on
        the page is filled with its "未知…" placeholder.
    """
    soup = fetch_data(url)
    extracted_data = []

    for store in soup.find_all("div", class_="list_store"):
        # Business name: text of the <h3>, cleaned by clean_business_name.
        business_name_tag = store.find("h3")
        business_name = clean_business_name(business_name_tag.get_text(strip=True)) if business_name_tag else "未知商家名字"

        # Detail-page URL: first link carrying an href inside the <h3>.
        business_url_tag = business_name_tag.find("a", href=True) if business_name_tag else None
        business_url = business_url_tag["href"] if business_url_tag else "未知URL"

        # Phone: text of the first "tel:" link, passed through format_phone_number.
        phone_tag = store.find("a", href=lambda x: x and x.startswith("tel:"))
        phone = format_phone_number(phone_tag.get_text(strip=True)) if phone_tag else "未知电话"

        # Address: first <p> inside the "add" div. The div lookup is checked
        # on its own because chaining .find("p") onto a missing div raised
        # AttributeError and aborted the whole page.
        address_box = store.find("div", class_="add")
        address_tag = address_box.find("p") if address_box else None
        address = address_tag.get_text(strip=True) if address_tag else "未知地址"

        extracted_data.append({
            "商家名字": business_name,
            "电话": phone,
            "地址": address,
            "URL": business_url
        })

    return extracted_data

# Run the scraper on a single listing page.
url = "https://la.yibada.com/business/list-6-34-0-0-0-0-1"
data = corrected_integrated_scraper(url)

# Print each record field by field, followed by a 50-dash separator.
for item in data:
    print(
        f"商家名字: {item['商家名字']}",
        f"电话: {item['电话']}",
        f"地址: {item['地址']}",
        f"URL: {item['URL']}",
        '-' * 50,
        sep="\n",
    )



商家名字: 西雅图资深地产经纪人-西雅图-西雅图房产开发商,华人全职经纪人-29年西雅图丰富经验
电话: 206-200-5087
地址: 西雅图地区, Los Angeles, CA 98004
URL: https://la.yibada.com/business/view_68010.html
--------------------------------------------------
商家名字: 愛家地產貸款─唐冠軍 A+ REALTY & MORTGAGE
电话: 626-533-2217
地址: 17462 Colima Rd., Rowland Heights, CA 91748
URL: https://la.yibada.com/business/view_22764.html
--------------------------------------------------
商家名字: 美聯地產─劉靜 RE/MAX 2000 REALTY - NANCY LIU
电话: 626-369-1628
地址: 17843 Colima Rd., Rowland Heights, CA 91748
URL: https://la.yibada.com/business/view_26920.html
--------------------------------------------------
商家名字: 世紀地產經紀─楊大衛 CENTURY 21 - DAVID YANG
电话: 949-439-8806
地址: 4000 Barranca Pkwy., #110, Irvine, CA 92604
URL: https://la.yibada.com/business/view_22876.html
--------------------------------------------------
商家名字: 陳政豐房地產經紀 COLDWELL BANKER TOP TEAM - ANDY CHEN
电话: 626-731-0622
地址: 15348 Central Ave., Chino, CA 91710
URL: https://la.yibada.com/business/view_24834.html
---------

In [12]:
#Construction contractor
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020901"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020901&PageSize=10&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('business_data.csv', index=False)
print("Data saved to business_data.csv")


Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Processing page 26...
Processing page 27...
Processing page 28...
Processing page 29...
Processing page 30...
Processing page 31...
Processing page 32...
Processing page 33...
Processing page 34...
Processing page 35...
Processing page 36...
Processing page 37...
Processing page 38...
Processing page 39...
Processing page 40...
Processing page 41...
Processing page 42...
Processing page 43...
Processing page 44...
Processing page 45...
Processing page 46.

In [14]:
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?LinkTreeID=S020302"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?LinkTreeID=S020302&PageSize=10&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('real_estate_agency.csv', index=False)
print("Data saved")

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Processing page 26...
Processing page 27...
Processing page 28...
Processing page 29...
Processing page 30...
Processing page 31...
Processing page 32...
Processing page 33...
Processing page 34...
Processing page 35...
Processing page 36...
Processing page 37...
Processing page 38...
Processing page 39...
Processing page 40...
Processing page 41...
Processing page 42...
Processing page 43...
Processing page 44...
Processing page 45...
Processing page 46.

In [18]:
#律师
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020607"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020607&PageSize=100&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('attorney.csv', index=False)
print("Data saved")

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Data saved


In [20]:
#会计
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020601"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020601&PageSize=100&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('accountant.csv', index=False)
print("Data saved")

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Data saved


In [22]:
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020606"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020606&PageSize=100&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('insurance.csv', index=False)
print("Data saved")

Processing page 1...
Processing page 2...
Processing page 3...
Data saved


In [24]:
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020403"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?&LinkTreeID=S020403&PageSize=100&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('clinic.csv', index=False)
print("Data saved")

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Data saved


In [25]:
def parse_page(data, df, names_set):
    """Collect the businesses on one cn411.ca listing page and append them to ``df``.

    Each ``<a class="tree12">`` link is treated as one business. Its detail
    page is fetched for the address and website; the phone number is read
    from the listing page itself.

    Args:
        data: Parsed listing page.
        df: Rows gathered so far (columns Name, Address, Phone, Website).
        names_set: Names already recorded. Updated in place; a link whose
            text is already in the set is skipped.

    Returns:
        ``df`` itself when the page adds nothing, otherwise a new DataFrame
        holding the old rows followed by the new ones.
    """
    base_url = "https://www.cn411.ca/main02view03.aspx?LinkTreeID=S021101"  # base for resolving relative links
    new_rows = []
    for anchor in data.find_all('a', class_='tree12'):
        name = anchor.text.strip()
        if name in names_set:
            continue
        names_set.add(name)

        # Resolve the relative link and fetch the detail page. An anchor
        # without href has no detail page, so its fields stay "Not Found".
        href = anchor.get('href')
        subpage_data = fetch_data(urllib.parse.urljoin(base_url, href)) if href else None

        address_span = subpage_data.find('span', id='ctl00_cphLeft_Views1_txtLinkAddress') if subpage_data is not None else None
        address = address_span.text.strip() if address_span else "Not Found"

        # The phone number is the node right after the phone icon on the
        # listing page. It is converted to a plain stripped str: the raw
        # BeautifulSoup node keeps surrounding whitespace and a reference to
        # the whole parse tree.
        phone_img = anchor.find_next('img', src='../images/phone.gif')
        phone_node = phone_img.next_sibling if phone_img else None
        if phone_node is None:
            phone = "Not Found"
        else:
            node_text = phone_node.get_text() if hasattr(phone_node, 'get_text') else str(phone_node)
            phone = node_text.strip() or "Not Found"

        # Website: text of the first <a class="text"> on the detail page.
        website_a = subpage_data.find('a', class_='text') if subpage_data is not None else None
        website = website_a.text.strip() if website_a else "Not Found"

        new_rows.append({'Name': name, 'Address': address, 'Phone': phone, 'Website': website})

    if not new_rows:
        return df
    # Concatenate once per page; growing the frame row by row copies it each time.
    page_df = pd.DataFrame(new_rows, columns=['Name', 'Address', 'Phone', 'Website'])
    return pd.concat([df, page_df], ignore_index=True)

# Start from an empty set of seen names and an empty result table.
names_set = set()
df = pd.DataFrame(columns=['Name', 'Address', 'Phone', 'Website'])

# Listing URL without the page number; pages are requested one after another.
base_url = "https://www.cn411.ca/main02view03.aspx?LinkTreeID=S021101&PageSize=100&PageID="
page_num = 0
has_next_page = True
while has_next_page:
    page_num += 1
    print(f"Processing page {page_num}...")
    url = base_url + str(page_num)
    data = fetch_data(url)
    df = parse_page(data, df, names_set)
    # A page without the "next" link is the last one.
    has_next_page = data.find('a', id='ctl00_cphRight_Main0102view_1_hlNext') is not None

# Write the collected rows to CSV.
df.to_csv('health_care.csv', index=False)
print("Data saved")

Processing page 1...
Processing page 2...
Data saved


In [28]:
#51黄页
url = "https://www.51.ca/service/categories/dentists"
# The timeout keeps a stalled connection from hanging the cell, and
# raise_for_status stops an error page from being saved as an empty CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Find every listing <div>
div_elements = soup.find_all('div', {'class': 'wg51__yp-listing-item'})

# Rows collected for the output table
data = []

for div in div_elements:
    # The fields are read from data-* attributes on the listing's first
    # <input>. A listing without an <input> is skipped, and a missing
    # attribute becomes an empty cell instead of raising.
    listing_input = div.input
    if listing_input is None:
        continue
    data_location = listing_input.get('data-locations')
    data_primary_phone = listing_input.get('data-primary-phone')
    data_title = listing_input.get('data-title')

    # Add this listing's row
    data.append([data_title, data_location, data_primary_phone])

# Build the DataFrame
df = pd.DataFrame(data, columns=['name', 'location', 'phone number'])

# Save the DataFrame as a CSV file
df.to_csv('canada_dentists.csv', index=False)
print('data saved')

data saved


In [53]:
#人在温哥华

def fetch_data(url, timeout=30):
    """Fetch ``url`` with a browser-like User-Agent and parse it as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the cell.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser``, or ``None`` when
        the server answers 404.

    Raises:
        requests.exceptions.HTTPError: On any other 4xx/5xx response.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # A page that does not exist is reported as None rather than as an error.
    if response.status_code == 404:
        return None
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def extract_info(soup):
    """Read the phone number and address from a parsed detail page.

    Args:
        soup: Parsed detail page, or ``None`` when the page was not available.

    Returns:
        A ``(phone, address)`` tuple; each value is 'N/A' when it cannot be found.
    """
    if soup is None:
        return 'N/A', 'N/A'

    # Phone: the <p> that follows the <b> labelled "联系电话".
    phone = 'N/A'
    phone_label = soup.find('b', text='联系电话')
    if phone_label:
        phone_para = phone_label.find_next_sibling('p')
        if phone_para:
            phone = phone_para.text.strip()

    # Address: the <p class="addrs"> inside <div class="addr_con">.
    address = 'N/A'
    address_box = soup.find('div', class_='addr_con')
    if address_box:
        address_para = address_box.find('p', class_='addrs')
        if address_para:
            address = address_para.text.strip()

    return phone, address

def parse_page(data):
    """Build a Name/Address/Phone table from one vanpeople listing page.

    Every ``<li class="list">`` is one entry: its name comes from the
    ``yp-list-title`` link, and its ``data-url`` attribute points at the
    detail page from which phone and address are read.

    Args:
        data: Parsed listing page, or ``None`` when the page was not available.

    Returns:
        A DataFrame with columns Name, Address, Phone (empty when ``data`` is
        ``None`` or has no usable entries).
    """
    columns = ['Name', 'Address', 'Phone']
    if data is None:
        # fetch_data reports a 404 as None; return an empty table instead of
        # failing on data.find_all.
        return pd.DataFrame(columns=columns)

    data_list = []
    for item in data.find_all('li', {'class': 'list'}):
        title_link = item.find('a', {'class': 'yp-list-title'})
        detail_path = item.get('data-url')
        if title_link is None or not detail_path:
            # Skip entries lacking a title link or a data-url; indexing them
            # directly raised and aborted the whole page.
            continue
        name = title_link.text.strip()
        second_level_url = urllib.parse.urljoin('https://c.vanpeople.com', detail_path)
        secondary_page_data = fetch_data(second_level_url)
        phone, address = extract_info(secondary_page_data)
        data_list.append({'Name': name, 'Address': address, 'Phone': phone})

    return pd.DataFrame(data_list, columns=columns)

# Fetch the listing page, scrape every entry on it, and write the table to CSV.
# NOTE(review): the URL path is "jiazheng" but the output file is named
# "va_daycare.csv" — confirm the file name matches the scraped category.
url = 'https://c.vanpeople.com/jiazheng/'
data = fetch_data(url)
df = parse_page(data)

df.to_csv('va_daycare.csv', index=False)
print('Done!')

Done!


In [22]:
#华人资讯网
# Module-level accumulators filled by parse_html(): one entry per listing in
# each list. They are turned into a DataFrame once all pages are parsed.
titles, addresses, phones, categories_list = [], [], [], []

def get_html(url, timeout=30):
    """Download ``url`` with a browser-like User-Agent and return the body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits forever on a stalled connection.

    Returns:
        The response body as a string (the HTTP status is not checked).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

def parse_html(html):
    """Parse one listing page and append each company's fields to the module-level lists.

    For every ``<dl>`` of class ``sponsor_company`` or ``regular_company``
    one value is appended to each of ``titles``, ``addresses``, ``phones``
    and ``categories_list``; "NA" stands in for a field that is missing.

    Args:
        html: Page source as a string.
    """
    soup = BeautifulSoup(html, 'lxml')
    for dl in soup.find_all('dl', class_=['sponsor_company', 'regular_company']):
        # Title: strip a leading "Ad" / digits / "推" marker, then a trailing
        # "竞价" / "点击查看" suffix.
        title_element = dl.find('dt', class_='tag_title')
        if title_element:
            title = title_element.text.strip()
        else:
            title = "NA"
        title = re.sub(r'^(Ad|\d+|推)', '', title).strip()
        title = re.sub(r'(竞价|点击查看)$', '', title).strip()

        # Address: text of the map link.
        address_element = dl.find('a', title='点击查看地图')
        if address_element:
            address = address_element.text.strip()
        else:
            address = "NA"

        # Phone: whatever follows the last ":" in the phone block.
        phone_element = dl.find('div', class_='list_phone')
        if phone_element:
            phone = phone_element.text.strip().split(':')[-1].strip()
        else:
            phone = "NA"

        # Categories: comma-separated texts of the category links.
        categories_element = dl.find('div', class_='list_category')
        if categories_element:
            categories = ', '.join(
                a.text for a in categories_element.find_all('a', class_='map_link')
            )
        else:
            categories = "NA"

        titles.append(title)
        addresses.append(address)
        phones.append(phone)
        categories_list.append(categories)

def main():
    """Download listing pages 1 through 9 and pass each one to parse_html()."""
    base_url = "https://www.sdchinaren.com/company/catid_29/task_list/%E5%9C%A3%E5%9C%B0%E4%BA%9A%E5%93%A5%E5%BB%BA%E7%AD%91%E8%A3%85%E4%BF%AE%E5%89%8D%E5%8D%81%E5%90%8D.html"
    for i in range(1, 10):
        # NOTE(review): base_url already ends in ".html", so page 1 becomes
        # "....html.html" and later pages "....html/page_N.html" — confirm
        # the site actually serves these addresses.
        suffix = ".html" if i == 1 else "/page_" + str(i) + ".html"
        html = get_html(base_url + suffix)
        parse_html(html)

# Run the crawl; parse_html() fills the module-level lists as a side effect.
main()

# Now that we have collected the data, we can convert it into a dataframe
# (columns: 标题 = title, 电话 = phone, 地址 = address, 分类 = categories).
df = pd.DataFrame({
    '标题': titles,
    '电话': phones,
    '地址': addresses,
    '分类': categories_list
})

# Save the dataframe into a csv file
df.to_csv('SD_construction.csv', index=False)
print('Done!')

Done!


In [130]:
#金海湾

def extract_info_from_url(url):
    """Scrape one jinbay yellow-pages listing page.

    Args:
        url: Address of the listing page.

    Returns:
        Four parallel lists ``(names, addresses, phone_numbers, urls)`` with
        one entry per ``<div class="service_list_left">``; "N/A" marks a
        field that could not be found.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within 30 seconds.
    """
    # SECURITY: verify=False disables TLS certificate validation. Kept as in
    # the original — TODO confirm the target site still requires it.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    base_host = "https://losangeles.jinbay.com"
    names = []
    addresses = []
    phone_numbers = []
    urls = []

    for div in soup.find_all('div', class_='service_list_left'):
        # Name and link both come from the <a> inside the first <strong>.
        # The <strong> is checked first: chaining .find('a') onto a missing
        # tag raised AttributeError.
        strong_tag = div.find('strong')
        name_tag = strong_tag.find('a') if strong_tag else None
        names.append(name_tag.text.strip() if name_tag else "N/A")

        href = name_tag.get('href') if name_tag else None
        urls.append(urllib.parse.urljoin(base_host, href) if href else "N/A")

        # Address: first child of <p class="text">. That child may be a
        # nested tag rather than plain text, so it is converted to text
        # before stripping.
        address_tag = div.find('p', class_='text')
        first_child = address_tag.contents[0] if address_tag and address_tag.contents else None
        if first_child is None:
            addresses.append("N/A")
        else:
            child_text = first_child.get_text() if hasattr(first_child, 'get_text') else str(first_child)
            addresses.append(child_text.strip())

        # Phone: <strong class="tel"> inside the sibling "service_list_right" div.
        right_div = div.find_next_sibling('div', class_='service_list_right')
        phone_tag = right_div.find('strong', class_='tel') if right_div else None
        raw_phone = phone_tag.text.strip() if phone_tag else "N/A"

        # Format as XXX-XXX-XXXX when a phone was found; otherwise keep "N/A".
        if raw_phone != "N/A":
            cleaned_phone = re.sub(r'\D', '', raw_phone)
            phone_numbers.append(f"{cleaned_phone[:3]}-{cleaned_phone[3:6]}-{cleaned_phone[6:]}")
        else:
            phone_numbers.append(raw_phone)

    return names, addresses, phone_numbers, urls

# Listing URL template; "{}" is the slot for the page number.
base_url = "https://losangeles.jinbay.com/yellowpages/360/yellowlist_a60300_sc0_sa0_k_l0_s0_p{}.html"
# Pages 1-10. Only formatted URLs are requested: the raw template, with its
# literal "{}", is not a page address and used to be fetched as well.
urls = [base_url.format(i) for i in range(1, 11)]

# Lists to store the entire dataset
all_names = []
all_addresses = []
all_phone_nums = []
all_business_urls = []

# Extract data for each URL
for url in urls:
    names, addresses, phone_nums, business_urls = extract_info_from_url(url)
    all_names.extend(names)
    all_addresses.extend(addresses)
    all_phone_nums.extend(phone_nums)
    all_business_urls.extend(business_urls)

# Create a DataFrame
df = pd.DataFrame({
    'name': all_names,
    'phone': all_phone_nums,
    'address': all_addresses,
    'url': all_business_urls
})

# Drop repeated addresses, keeping the first occurrence. Rows whose address
# is the "N/A" placeholder are exempt; otherwise every business without an
# address would collapse into a single row.
is_repeat = df.duplicated(subset='address') & (df['address'] != "N/A")
df = df[~is_repeat]

# Save the data to a CSV file
df.to_csv('business_data_with_pandas.csv', index=False)
print('done!')



done!


In [58]:
#华人工商网
# Site root, used by extract_data() to build absolute business URLs.
base_url = "https://www.ccyp.com"
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
converter = opencc.OpenCC('t2s')  # "t2s" = Traditional Chinese to Simplified Chinese

def extract_data(page_number):
    """Scrape one page of the ccyp.com listing into a list of records.

    Text fields are converted with the module-level ``converter``.

    Args:
        page_number: Value for the ``page`` query parameter.

    Returns:
        A list of dicts with the keys "Business Name", "Phone Number",
        "Address" and "URL"; a field that cannot be found is ``None``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within 30 seconds.
    """
    url = f"https://www.ccyp.com/subject_nca_area/list/D25/%E7%9C%BC%E7%A7%91%E8%A6%96%E5%AD%B8%E9%86%AB%E5%B8%AB?page={page_number}"
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    business_list = []

    for item in soup.select("a.title"):
        business_name = converter.convert(item.get_text(strip=True))

        # Resolve the link against the site root; None when the anchor has no href.
        href = item.get("href")
        business_link = urllib.parse.urljoin(base_url, href) if href else None

        # Each lookup is checked before use: an entry without a phone or
        # address element used to raise AttributeError and lose the page.
        phone_tag = item.find_next("strong", class_="list-data-phone")
        if phone_tag:
            phone_number = converter.convert(phone_tag.get_text(strip=True))
            formatted_phone = re.sub(r'[^0-9]', '', phone_number)
            formatted_phone = f"{formatted_phone[:3]}-{formatted_phone[3:6]}-{formatted_phone[6:]}"
        else:
            formatted_phone = None

        address_tag = item.find_next("div", class_="list-data-item full-address")
        formatted_address = converter.convert(address_tag.get_text(strip=True)) if address_tag else None

        business_list.append({
            "Business Name": business_name,
            "Phone Number": formatted_phone,
            "Address": formatted_address,
            "URL": business_link
        })

    return business_list

all_data = []

# Scrape pages 1-3 and report the running total after each page.
for i in range(1, 4):
    data_for_page = extract_data(i)
    all_data.extend(data_for_page)
    print(f"After page {i}, extracted {len(all_data)} businesses.")

# Convert the list of dictionaries to a DataFrame.
# A URL had been pasted directly after this statement, which made the cell a
# SyntaxError; it is kept here as a comment:
#   http://ny.ccyp.com/subject/list/D19/%E5%85%A7%E7%A7%91?page=5
df = pd.DataFrame(all_data)

# Save the DataFrame to a CSV file
csv_path = "business_data_with_pandas.csv"
df.to_csv(csv_path, index=False)

print(f"Data saved to {csv_path}")

After page 1, extracted 30 businesses.
After page 2, extracted 60 businesses.
After page 3, extracted 90 businesses.
Data saved to business_data_with_pandas.csv


In [66]:
#纽约华人工商网，diff html structure
# Site root, used by extract_data() to build absolute business URLs.
base_url = "http://tx.ccyp.com/"
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
converter = opencc.OpenCC('t2s')  # "t2s" = Traditional Chinese to Simplified Chinese

def extract_data(page_number):
    """Scrape one page of the tx.ccyp.com listing into a list of records.

    Each ``.row.noad`` container holding an ``a.title`` link is one business.
    Text fields are converted with the module-level ``converter``.

    Args:
        page_number: Value for the ``page`` query parameter.

    Returns:
        A list of dicts with the keys "Business Name", "Phone Number",
        "Address" and "URL"; a field that cannot be found is ``None``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within 30 seconds.
    """
    # The page number is passed as the "page" query parameter. The original
    # appended "=<n>" straight onto the last path segment.
    url = f"http://tx.ccyp.com/subject/list/D25/%E7%9C%BC%E7%A7%91%E8%A6%96%E5%AD%B8%E9%86%AB%E5%B8%AB?page={page_number}"
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    business_list = []

    for container in soup.select(".row.noad"):
        # Containers without a title link are not business entries.
        title_link = container.select_one("a.title")
        if not title_link:
            continue

        business_name = converter.convert(title_link.get_text(strip=True))

        # urljoin avoids the "//" that plain concatenation produced when
        # base_url ends with "/" and the href starts with "/".
        href = title_link.get("href")
        business_link = urllib.parse.urljoin(base_url, href) if href else None

        # Phone: first <strong> after the title, formatted as XXX-XXX-XXXX.
        phone_tag = title_link.find_next("strong")
        if phone_tag:
            phone_number = converter.convert(phone_tag.get_text(strip=True))
            formatted_phone = re.sub(r'[^0-9]', '', phone_number)
            formatted_phone = f"{formatted_phone[:3]}-{formatted_phone[3:6]}-{formatted_phone[6:]}"
        else:
            formatted_phone = None

        # Address: the <span> texts of the first <div> after the phone, comma-joined.
        address_div = phone_tag.find_next("div") if phone_tag else None
        if address_div:
            address_parts = [span.get_text(strip=True) for span in address_div.find_all("span")]
            formatted_address = converter.convert(", ".join(address_parts))
        else:
            formatted_address = None

        business_list.append({
            "Business Name": business_name,
            "Phone Number": formatted_phone,
            "Address": formatted_address,
            "URL": business_link
        })

    return business_list

all_data = []

# range(1, 2) covers page 1 only.
for i in range(1, 2):
    data_for_page = extract_data(i)
    all_data += data_for_page
    print(f"After page {i}, extracted {len(all_data)} businesses.")

# Turn the list of dictionaries into a table and write it out as CSV.
csv_path = "business_data_with_pandas.csv"
df = pd.DataFrame(all_data)
df.to_csv(csv_path, index=False)

print(f"Data saved to {csv_path}")

After page 1, extracted 20 businesses.
Data saved to business_data_with_pandas.csv


In [75]:
#YIBADA
converter = opencc.OpenCC('t2s')  # "t2s" = Traditional Chinese to Simplified Chinese

def fetch_data(url, timeout=30):
    """Fetch ``url`` and return the response body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the cell.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.HTTPError: On a 4xx/5xx response.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # SECURITY: verify=False disables TLS certificate validation. Kept as in
    # the original — TODO confirm the target site still requires it.
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def clean_business_name(name):
    """Strip the leading digits from ``name`` and trim surrounding whitespace."""
    leading_digits = re.compile(r"^\d+")
    without_prefix = leading_digits.sub("", name)
    return without_prefix.strip()

def format_phone_number(phone):
    """Normalise a phone number to ``XXX-XXX-XXXX``.

    Args:
        phone: Raw phone text; any non-digit characters are ignored.

    Returns:
        The dash-separated form when the text holds exactly ten digits (an
        eleven-digit number with a leading "1" has that prefix dropped first).
        Otherwise the bare digit string, or the stripped input when it
        contains no digits at all.
    """
    digits = ''.join(re.findall(r"\d", phone))
    # Drop the leading "1" so "1-626-..." formats the same as "626-...".
    if len(digits) == 11 and digits.startswith("1"):
        digits = digits[1:]
    if len(digits) != 10:
        # Slicing anything else would yield fragments such as "123-45-" or
        # "--", so return the value unformatted.
        return digits or phone.strip()
    return f"{digits[:3]}-{digits[3:6]}-{digits[6:]}"

def corrected_integrated_scraper(url):
    """Scrape every ``div.list_store`` entry from one listing page.

    Args:
        url: Address of the listing page to fetch via ``fetch_data``.

    Returns:
        list[dict]: One dict per store with the keys ``商家名字`` (business
        name), ``电话`` (phone), ``地址`` (address) and ``URL``. A field that
        cannot be found on the page is filled with its "unknown" placeholder
        string instead of raising.
    """
    soup = fetch_data(url)
    extracted_data = []

    for store in soup.find_all("div", class_="list_store"):
        # Business name and the link to its detail page both live in the <h3>.
        business_name_tag = store.find("h3")
        if business_name_tag:
            business_name = clean_business_name(business_name_tag.get_text(strip=True))
            business_url_tag = business_name_tag.find("a", href=True)
        else:
            business_name = "未知商家名字"
            business_url_tag = None
        business_url = business_url_tag["href"] if business_url_tag else "未知URL"

        # Phone number: the first anchor whose href is a tel: link.
        phone_tag = store.find("a", href=lambda x: x and x.startswith("tel:"))
        phone = format_phone_number(phone_tag.get_text(strip=True)) if phone_tag else "未知电话"

        # Address: the <p> inside div.add. The div itself may be absent, so it
        # is checked before descending; chaining .find() on None would raise
        # AttributeError and abort the whole page.
        address_container = store.find("div", class_="add")
        address_tag = address_container.find("p") if address_container else None
        address = address_tag.get_text(strip=True) if address_tag else "未知地址"

        extracted_data.append({
            "商家名字": business_name,
            "电话": phone,
            "地址": address,
            "URL": business_url
        })

    return extracted_data

# Run the scraper on the first listing page.
# NOTE(review): `data` is not used later in this cell; scraper_all_pages(1, 3)
# below fetches page 1 again.
url = "https://ny.yibada.com/business/list-16-154-0-0-0-0-1"
data = corrected_integrated_scraper(url)

def scraper_all_pages(start_page, end_page):
    """Scrape listing pages ``start_page`` through ``end_page`` (inclusive).

    Each page is fetched in ascending order with
    ``corrected_integrated_scraper`` and the per-page records are concatenated
    into one flat list, which is returned.
    """
    return [
        record
        for page_number in range(start_page, end_page + 1)
        for record in corrected_integrated_scraper(
            f"https://ny.yibada.com/business/list-16-154-0-0-0-0-{page_number}"
        )
    ]

def convert_to_simplified(data_list):
    """Run the name and address of every record through the module-level ``converter``.

    The dicts in ``data_list`` are modified in place (keys ``商家名字`` and
    ``地址``), and the same list object is returned for convenience.
    """
    for record in data_list:
        for field in ('商家名字', '地址'):
            record[field] = converter.convert(record[field])
    return data_list

# Scrape data from all pages (pages 1 through 3, inclusive)
all_data = scraper_all_pages(1, 3)

# Convert traditional Chinese to simplified Chinese
# convert_to_simplified() edits the dicts in place and returns the same list,
# so all_data and all_data_simplified refer to the same object.
all_data_simplified = convert_to_simplified(all_data)

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(all_data_simplified)

# Save the DataFrame to a CSV file
# NOTE(review): this is the same file name the preceding cell writes to, so
# running both cells overwrites the earlier output.
csv_file_path = "business_data_with_pandas.csv"
df.to_csv(csv_file_path, index=False)

print(f"Data has been saved to {csv_file_path}")



Data has been saved to business_data_with_pandas.csv


In [15]:
#加拿大，轻松加拿大
def extract_info_from_page(url):
    """Scrape business listings from one easyca.ca category page.

    Args:
        url: Address of the listing page to download.

    Returns:
        list[dict]: One dict per ``div.acadp-listings-title-block`` with the
        keys ``Business Name``, ``Phone``, ``Address`` and ``URL``; a value is
        ``None`` when the corresponding element is not found. If the response
        status is not 200, a message is printed and an empty list is returned.

    Relies on ``format_phone_number`` being defined by another cell.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    page = requests.get(url, headers=request_headers)

    # Bail out early on anything other than a plain 200 response.
    if page.status_code != 200:
        print(f"Failed to retrieve the webpage for URL: {url}")
        return []

    document = BeautifulSoup(page.text, 'html.parser')
    records = []

    for title_block in document.find_all('div', class_='acadp-listings-title-block'):
        # The business name and its detail-page link share the <a> inside the <h4>.
        heading = title_block.find('h4', class_='acadp-no-margin')
        title_link = heading.a if heading else None
        name = title_link.text.strip() if title_link else None

        # NOTE(review): find_next() searches forward through the rest of the
        # document, not just inside this listing, so a listing without its own
        # phone/address may pick up a later listing's value. Confirm against
        # the page structure.
        phone_span = title_block.find_next('span', class_='acadp-phone')
        phone_link = phone_span.a if phone_span else None
        phone = format_phone_number(phone_link.text.strip()) if phone_link else None

        # Prefer the street address (with its "地址: " label removed); otherwise
        # fall back to the text of the first listing-location link.
        street_span = title_block.find_next('span', class_='acadp-street-address')
        street_link = street_span.a if street_span else None
        if street_link:
            address = street_link.text.strip().replace('地址: ', '')
        else:
            location_link = title_block.find_next(
                'a', href=lambda href: href and 'listing-location' in href
            )
            address = location_link.text.strip() if location_link else None

        records.append({
            'Business Name': name,
            'Phone': phone,
            'Address': address,
            'URL': title_link['href'] if title_link else None
        })

    return records


# Main script
all_results = []
# The first page of the category has no /page/N suffix.
base_url = "https://easyca.ca/listing-category/113"
all_results.extend(extract_info_from_page(base_url))

# Pages 2 through 18 (range(2, 19) excludes 19)
for i in range(2, 19):
    url = f"{base_url}/page/{i}"
    all_results.extend(extract_info_from_page(url))

df = pd.DataFrame(all_results)

# Save to CSV
df.to_csv('business_listings.csv', index=False)
print('done')

done


In [10]:
#加拿大华人网
def format_phone(phone_str):
    """Normalise a phone string to ``XXX-XXX-XXXX`` style grouping.

    Every non-digit character is removed and the remaining digits are split
    into groups of 3, 3 and "the rest", joined with dashes.

    Args:
        phone_str: Raw text containing the phone number.

    Returns:
        str: The dash-separated digits. Empty groups are omitted, so text
        without digits yields ``""`` (instead of ``"--"``) and short input
        such as ``"12345"`` yields ``"123-45"`` (instead of ``"123-45-"``).
    """
    numbers = re.sub(r'\D', '', phone_str)
    groups = (numbers[:3], numbers[3:6], numbers[6:])
    # Skip empty groups so missing digits never produce dangling dashes.
    return "-".join(group for group in groups if group)

# URL template: the placeholder is empty for page 1 and "_N" for page N.
base_url = "http://www.sinoca.com/yp/health/index{}.html"
# Collected (name, phone, location, url) tuples across all pages.
business_info = []

# Pages 1 through 12 (range end is exclusive).
for i in range(1, 13):
    if i == 1:
        url = base_url.format('')
    else:
        url = base_url.format(f'_{i}')

    # NOTE(review): the HTTP status is not checked before parsing.
    response = requests.get(url)
    response.encoding = 'utf-8'  # force response.text to be decoded as UTF-8
    soup = BeautifulSoup(response.text, 'lxml')

    # Each match is an <a> element inside span.t_7 (the loop variable is
    # called `span` but holds the anchor).
    for span in soup.select('span.t_7 a'):
        name = span.text.strip()
        # First text node after the anchor that contains a 3-3-4 digit pattern;
        # format_phone() then keeps every digit found in that text node.
        phone_elem = span.find_next(string=re.compile(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'))
        phone = format_phone(phone_elem) if phone_elem else None
        # NOTE(review): find_next() scans the rest of the document, so an entry
        # lacking its own phone or location may pick up a later entry's value.
        location_elem = span.find_next("font", color="green")
        location = location_elem.text if location_elem else None
        business_url = span['href']
        business_info.append((name, phone, location, business_url))

# Build the DataFrame with pandas
df = pd.DataFrame(business_info, columns=['Name', 'Phone', 'Location', 'URL'])

# Export the data to a CSV file
df.to_csv('business_info.csv', index=False, encoding='utf-8-sig')
print('done')

done


In [7]:
# YorkBBS forum: direct POST to the listing API.
# NOTE(review): the recorded run of this cell returned HTTP 403
# ("Forbidden resource"). The Sign / Timestamp / Nonce-Str / Cookie values
# below are fixed strings, presumably copied from a captured browser request;
# confirm whether the API requires them to be generated per request.
url = "https://info.yorkbbs.ca/info/v1/get/post/list"

headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    "Appid": "100005",
    "Client": "pc",
    "Content-Type": "application/json;charset=UTF-8",
    "Cookie": "_ga=GA1.1.568928666.1692727553; _ss_s_uid=9a0c0781995d49c6eaa401a11a6d4d83; adv_display_times=%7B%22banner_info_992809550%22%3A1692727554%2C%22banner_info_992809558%22%3A1692727573%7D; _ga_4039BYD691=GS1.1.1692802423.4.1.1692805145.60.0.0",
    "Nonce-Str": "bugNaV5u4GSDMbi8fO_9GOVfsit69I6h",
    "Origin": "https://info.yorkbbs.ca",
    "Referer": "https://info.yorkbbs.ca/list/house?offset=1",
    "Sec-Ch-Ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "Windows",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Sign": "8D360CF0BAEDE5B1995B8616261422EC",
    "Timestamp": "1692805145706",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Uuid": "d47ced21df411564353daf5e252d28c2"
}

# JSON body of the request: first offset, 100 items, "house" topic.
payload = {
    "offset": 1,
    "size": 100,
    "topicEngName": "house",
    "options": {},
    "distance": 50000000
}

response = requests.post(url, headers=headers, json=payload)

# Check the response status
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Error: {response.status_code}. {response.text}")


Error: 403. {"origin":"https://info.yorkbbs.ca","referer":"https://info.yorkbbs.ca/list/house?offset=1","url":"172.17.0.1:9999/info/v1/get/post/list","code":-1,"msg":"Forbidden resource","timestamp":"2023-08-23T15:43:51.492Z"}


In [43]:
#约克论坛
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


# NOTE(review): these are absolute paths on one specific Windows machine;
# adjust them (or move them to a config cell) before running elsewhere.
chrome_driver_path = 'C:/Users/Max Xiang/Desktop/chromedriver-win64/chromedriver.exe'
chrome_test_version_path = 'C:\\Users\\Max Xiang\\Desktop\\chrome-win64\\chrome.exe'

# Configure Chrome to use the specified browser binary
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = chrome_test_version_path

# Start the browser driver
service = Service(executable_path=chrome_driver_path)
browser = webdriver.Chrome(service=service, options=chrome_options)

# Collected (shop name, phone, address, sub-page URL) tuples across all pages.
all_data = []

# try/finally guarantees the browser is closed even when a wait times out or
# an element lookup raises; otherwise the Chrome process would be left running.
try:
    # Open the listing page
    browser.get('https://info.yorkbbs.ca/list/insu2')

    wait = WebDriverWait(browser, 30)

    while True:
        # Wait until the listing items are present
        # NOTE(review): after clicking "next" the previous page's items may
        # still be in the DOM, so this wait may not prove that the new page
        # has loaded. Check the output for duplicated rows.
        wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'info-item')))

        items = browser.find_elements(By.CLASS_NAME, 'info-item')

        for item in items:
            # Prefer the image link; fall back to the title link
            links = item.find_elements(By.CSS_SELECTOR, 'a.image-fit.info-cover.animation')
            if not links:
                links = item.find_elements(By.CSS_SELECTOR, 'a.info-title__wrapper')

            # Skip the item when neither link exists
            if not links:
                continue

            # URL of the detail sub-page
            sub_page_url = links[0].get_attribute('href')

            # Look each optional element up once and reuse the result.
            shop_elements = item.find_elements(By.CLASS_NAME, 'info-shop')
            if shop_elements:
                shop_name = shop_elements[0].text
            else:
                shop_name = item.find_element(By.CLASS_NAME, 'info-title').text

            phone_elements = item.find_elements(By.CSS_SELECTOR, '.button-tel span')
            phone_number = phone_elements[-1].text if phone_elements else "NA"

            address_elements = item.find_elements(By.CLASS_NAME, 'info-address')
            address_text = address_elements[0].text if address_elements else "NA"

            all_data.append((shop_name, phone_number, address_text, sub_page_url))

        # Try to click the "next page" button; stop when it is missing or disabled
        try:
            next_button = browser.find_element(By.CSS_SELECTOR, '#app > div.page > div.page-wrapper.main > div.page-left > div.pagination > div > button.btn-next')
            if not next_button.is_enabled():
                break
            next_button.click()
        except NoSuchElementException:
            break
finally:
    # Close the browser
    browser.quit()

# Build the DataFrame and save it as a CSV file
df = pd.DataFrame(all_data, columns=['Shop Name', 'Phone Number', 'Address', 'Subpage URL'])
df.to_csv('yorkbbs_data.csv', index=False)


In [15]:
# NOTE(review): fetch_data is not defined in this cell; it relies on whichever
# definition was last executed in the kernel.
url = "https://info.yorkbbs.ca/list/house?offset=1"  # replace with the URL of your top-level listing page
data = fetch_data(url)

# Print the HTML of the top-level page
# NOTE(review): the recorded output shows a garbled <title>, which suggests
# the response text was decoded with the wrong character encoding.
print(data.prettify())

<!DOCTYPE html>
<html lang="zh-CN">
 <head>
  <meta charset="utf-8"/>
  <link href="/favicon.ico" rel="icon"/>
  <link href="/favicon.ico" rel="bookmark"/>
  <link href="/favicon.ico" rel="shortcut icon"/>
  <link href="https://media3.imgyb.xyz/media/v1/image/40b879e42a4649814598c47a4569a94d.png" rel="apple-touch-icon"/>
  <link href="https://media3.imgyb.xyz/media/v1/image/40b879e42a4649814598c47a4569a94d.png" rel="mask-icon"/>
  <meta content="https://media3.imgyb.xyz/media/v1/image/40b879e42a4649814598c47a4569a94d.png" name="msapplication-TileImage"/>
  <title>
   å¤ä¼¦å¤é»é¡µ - çº¦åè®ºå - å æ¿å¤§ç¬¬ä¸ä¸­æç½
  </title>
  <meta content="" name="keywords">
   <meta content="" name="description"/>
   <script>
    (function(w, d, s, l, i) {
        w[l] = w[l] || [];
        w[l].push({ 'gtm.start': new Date().getTime(), event: 'gtm.js' });
        var f = d.getElementsByTagName(s)[0],
          j = d.createElement(s),
          dl = l != 'dataLayer' ? '&l=' + l : '';
     

In [132]:
#禁用SSL验证后读取HTML
def fetch_data(url, timeout=30):
    """Download ``url`` with SSL verification disabled and parse it as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        BeautifulSoup: The page parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, verify=False, timeout=timeout)  # SSL verification disabled
    response.raise_for_status()  # raises if the request failed

    # When the Content-Type header carries no charset, requests falls back to
    # ISO-8859-1 for response.text, which garbles UTF-8 Chinese pages. In that
    # case hand the raw bytes to BeautifulSoup so it can detect the encoding
    # from the document itself (e.g. <meta charset="utf-8">).
    content_type = response.headers.get("Content-Type", "")
    if "charset" in content_type.lower():
        markup = response.text
    else:
        markup = response.content
    return BeautifulSoup(markup, 'html.parser')

# Fetch the first Los Angeles listing page and dump its HTML for inspection.
url = "https://la.yibada.com/business/list-6-34-0-0-0-0-1"
data = fetch_data(url)

print(data.prettify())



<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   洛杉矶房地产-洛杉矶地产经纪-洛杉矶房地产公司-房产居家-洛杉矶黄页 - 易八达全球华人第一资讯门户网站
  </title>
  <link href="https://la.yibada.com/business/list-6-34-0-0-0-0-1" rel="canonical"/>
  <meta content="洛杉矶房地产,洛杉矶华人地产经纪人,洛杉矶资深地产经纪人,华人资深地产经纪人,华人房地产经纪,洛杉矶房地产经纪人,洛杉矶资深地产经纪人,洛杉矶房屋买卖,洛杉矶房屋买卖,洛杉矶房地产,洛杉矶资深地产经纪人" name="keywords"/>
  <meta content="在这里您可以方便找到经验丰富的房地产经纪人。每一个经纪人都有自己熟悉的区域和擅长的业务。本目录方便您寻找合适的经纪人，方便你投资理财，买房置地。" name="description"/>
  <meta content="洛杉矶房地产-洛杉矶地产经纪-洛杉矶房地产公司-房产居家-洛杉矶黄页易八达全球华人第一资讯门户网站" property="og:title"/>
  <meta content="在这里您可以方便找到经验丰富的房地产经纪人。每一个经纪人都有自己熟悉的区域和擅长的业务。本目录方便您寻找合适的经纪人，方便你投资理财，买房置地。" property="og:description"/>
  <meta content="website" property="og:type"/>
  <meta content="https://la.yibada.com/business/list-6-34-0-0-0-0-1" property="og:url"/>

In [36]:
# Page 2 of a ccyp.com listing; the last path segment is a percent-encoded
# search term.
url = 'https://www.ccyp.com/subject_sca_area/list/A17/%E5%BE%8B%E5%B8%AB?page=2'

response = requests.get(url)
# Raw bytes are passed to the parser, leaving encoding detection to BeautifulSoup.
soup = BeautifulSoup(response.content, 'html.parser')

# Every element with class "title".
# NOTE(review): the recorded output contains repeated names, so this selector
# may match more than one element per listing (or listings repeat) - confirm.
titles = soup.select('.title')

for title in titles:
    print(title.get_text(strip=True))

新時代聯合律師事務所
喬靖凱‧吳慈音律師事務所
朱振宇律師事務所
西海岸移民法律中心─羅蘭崗
戴嘉慕律師事務所
安娜律師樓
安娜律師樓
鄧洪律師事務所
鄧洪律師事務所
李荻律師事務所
擎天律師樓
威信聯合事務所
方見堯律師事務所
方德律師事務所
加勒特和塔利聯合律師事務所
格蘭公民入籍‧移民律師
格蘭公民入籍‧移民律師
蕭婷丰律師事務所
蕭婷丰律師事務所
郝琦聯合律師事務所
郝琦聯合律師事務所
黃子虔移民律師事務所
何隱佳律師事務所
徐仲熙律師事務所
黃笑生律師事務所
黃笑生律師事務所
黃笑生律師事務所
黃健巍律師事務所
黃希鵬律師事務所
黃宇慶律師事務所


In [21]:
from selenium import webdriver
# Smoke test: start Chrome with default settings, open a page, then close the
# browser again.
driver = webdriver.Chrome()
driver.get("https://www.google.com")
driver.quit()