## Web scraping notebook for OpenRice

## 最普通的方式抓取（获取全部餐厅的数据，只有 250 条）

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Send a browser-like User-Agent header with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}


def _text_or_blank(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when `find` returned None."""
    return tag.text.strip() if tag is not None else ''


def _split_details(details):
    """Split the ' / '-separated details line into CSV fields.

    The first item is treated as the region, the last as the per-person
    price, and the items in between as food types (this layout is inferred
    from how the fields are consumed here -- confirm against the live page).

    Returns:
        (region, type1, type2, type3, person_price); missing fields are ''.
    """
    parts = [part.strip() for part in details.split(' / ')]
    # Pad to three items so region, type1 and price can always be taken.
    parts += [''] * (3 - len(parts))

    region = parts.pop(0)
    type1 = parts.pop(0)

    type2 = type3 = ''
    if len(parts) > 1:
        type2 = parts.pop(0)
        # A single item such as "a/b" (no spaces) carries two types.
        if '/' in type2:
            type2, type3 = type2.split('/', 1)

    person_price = parts.pop() if parts else ''

    # Previously a separately listed third type was silently dropped.
    if parts and not type3:
        type3 = parts.pop(0)

    return region, type1, type2, type3, person_price


def _parse_restaurant(restaurant):
    """Return the nine CSV fields for one listing cell; absent elements become ''."""
    name = _text_or_blank(restaurant.find('div', class_='text'))

    # Keep only the text before the first '/' and before the first newline.
    address_info = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info'))
    address = address_info.split('/')[0].split('\n')[0].strip()

    details = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info-details'))
    region, type1, type2, type3, person_price = _split_details(details)

    well_rating = _text_or_blank(restaurant.find('div', class_='smile icon-wrapper big-score'))
    bad_rating = _text_or_blank(restaurant.find('div', class_='cry icon-wrapper'))

    return [name, address, region, type1, type2, type3, person_price, well_rating, bad_rating]


# Create the CSV file and write rows into it page by page.
with open('restaurants.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['name', 'address', 'region', 'type1', 'type2', 'type3', 'person_price', 'well_rating', 'bad_rating'])  # header row

    # Walk listing pages 1..49.
    for page_num in range(1, 50):
        url = f'https://www.openrice.com/zh/hongkong/restaurants?page={page_num}'
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Skip the failed page instead of aborting or parsing an error page.
            print(f"第 {page_num} 页请求失败，已跳过：{exc}")
            continue
        print(f"正在获取第 {page_num} 页数据...")

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each restaurant sits in one 'poi-list-cell-desktop-right' div.
        for restaurant in soup.find_all('div', class_='poi-list-cell-desktop-right'):
            writer.writerow(_parse_restaurant(restaurant))

print("数据获取并写入CSV文件完成。")


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=1 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 1 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=2 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 2 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=3 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 3 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=4 HTTP/1.1" 200 None


正在获取第 4 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=5 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 5 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=6 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 6 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=7 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 7 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=8 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 8 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=9 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 9 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=10 HTTP/1.1" 200 None


正在获取第 10 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=11 HTTP/1.1" 200 None


正在获取第 11 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=12 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 12 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=13 HTTP/1.1" 200 None


正在获取第 13 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=14 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 14 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=15 HTTP/1.1" 200 None


正在获取第 15 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=16 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 16 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=17 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 17 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=18 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 18 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=19 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 19 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=20 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 20 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=21 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 21 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=22 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 22 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=23 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 23 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=24 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 24 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=25 HTTP/1.1" 200 None


正在获取第 25 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=26 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 26 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=27 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 27 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=28 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 28 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=29 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 29 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=30 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 30 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=31 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 31 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=32 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 32 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=33 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 33 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=34 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 34 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=35 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 35 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=36 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 36 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=37 HTTP/1.1" 200 None


正在获取第 37 页数据...


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443
DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=38 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 38 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=39 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 39 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=40 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 40 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=41 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 41 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=42 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 42 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=43 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 43 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=44 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 44 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=45 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 45 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=46 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 46 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=47 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 47 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=48 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.openrice.com:443


正在获取第 48 页数据...


DEBUG:urllib3.connectionpool:https://www.openrice.com:443 "GET /zh/hongkong/restaurants?page=49 HTTP/1.1" 200 None


正在获取第 49 页数据...
数据获取并写入CSV文件完成。


## 多线程获取

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import concurrent.futures

# Send a browser-like User-Agent header with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}


def _text_or_blank(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when `find` returned None."""
    return tag.text.strip() if tag is not None else ''


def _split_details(details):
    """Split the ' / '-separated details line into CSV fields.

    The first item is treated as the region, the last as the per-person
    price, and the items in between as food types (this layout is inferred
    from how the fields are consumed here -- confirm against the live page).

    Returns:
        (region, type1, type2, type3, person_price); missing fields are ''.
    """
    parts = [part.strip() for part in details.split(' / ')]
    # Pad to three items so region, type1 and price can always be taken.
    parts += [''] * (3 - len(parts))

    region = parts.pop(0)
    type1 = parts.pop(0)

    type2 = type3 = ''
    if len(parts) > 1:
        type2 = parts.pop(0)
        # A single item such as "a/b" (no spaces) carries two types.
        if '/' in type2:
            type2, type3 = type2.split('/', 1)

    person_price = parts.pop() if parts else ''

    # Previously a separately listed third type was silently dropped.
    if parts and not type3:
        type3 = parts.pop(0)

    return region, type1, type2, type3, person_price


def _parse_restaurant(restaurant):
    """Return the nine CSV fields for one listing cell; absent elements become ''."""
    name = _text_or_blank(restaurant.find('div', class_='text'))

    # Keep only the text before the first '/' and before the first newline.
    address_info = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info'))
    address = address_info.split('/')[0].split('\n')[0].strip()

    details = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info-details'))
    region, type1, type2, type3, person_price = _split_details(details)

    well_rating = _text_or_blank(restaurant.find('div', class_='smile icon-wrapper big-score'))
    bad_rating = _text_or_blank(restaurant.find('div', class_='cry icon-wrapper'))

    return [name, address, region, type1, type2, type3, person_price, well_rating, bad_rating]


def fetch_and_parse_data(page_num):
    """Download one listing page and return its restaurants as CSV rows.

    Args:
        page_num: Value placed in the `page` query parameter.

    Returns:
        A list of nine-field rows (name, address, region, type1, type2,
        type3, person_price, well_rating, bad_rating). An empty list is
        returned when the request fails, so one bad page does not abort
        the other worker threads' results.
    """
    url = f'https://www.openrice.com/zh/hongkong/restaurants?page={page_num}'
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"第 {page_num} 页请求失败，已跳过：{exc}")
        return []
    print(f"正在获取第 {page_num} 页数据...")

    soup = BeautifulSoup(response.text, 'html.parser')
    # Each restaurant sits in one 'poi-list-cell-desktop-right' div.
    cells = soup.find_all('div', class_='poi-list-cell-desktop-right')
    return [_parse_restaurant(cell) for cell in cells]

def write_to_csv(data):
    """Append every row in `data` to restaurants.csv (UTF-8, no header written here)."""
    with open('restaurants.csv', mode='a', encoding='utf-8', newline='') as out:
        csv.writer(out).writerows(data)

def main(last_page=499, max_workers=10):
    """Scrape listing pages 1..last_page concurrently into restaurants.csv.

    Args:
        last_page: Highest page number to request (inclusive). The default
            of 499 matches the previously hard-coded `range(1, 500)`; the
            old inline comment claimed 50 pages, which did not match the code.
        max_workers: Size of the thread pool used for downloading.

    The CSV is truncated and given a header first; page results are then
    appended in page order, because `executor.map` yields results in the
    order of its inputs.
    """
    with open('restaurants.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['name', 'address', 'region', 'type1', 'type2', 'type3', 'person_price', 'well_rating', 'bad_rating'])  # header row

    pages = range(1, last_page + 1)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only this thread writes to the file; workers just return rows.
        for result in executor.map(fetch_and_parse_data, pages):
            write_to_csv(result)

    print("数据获取并写入CSV文件完成。")


if __name__ == "__main__":
    main()


正在获取第 2 页数据...
正在获取第 7 页数据...
正在获取第 3 页数据...
正在获取第 8 页数据...
正在获取第 4 页数据...
正在获取第 9 页数据...
正在获取第 1 页数据...
正在获取第 10 页数据...
正在获取第 5 页数据...
正在获取第 6 页数据...
正在获取第 11 页数据...
正在获取第 16 页数据...
正在获取第 14 页数据...
正在获取第 18 页数据...
正在获取第 19 页数据...
正在获取第 13 页数据...
正在获取第 17 页数据...
正在获取第 15 页数据...
正在获取第 12 页数据...
正在获取第 20 页数据...
正在获取第 21 页数据...
正在获取第 22 页数据...
正在获取第 24 页数据...
正在获取第 23 页数据...
正在获取第 25 页数据...
正在获取第 26 页数据...
正在获取第 28 页数据...
正在获取第 27 页数据...
正在获取第 29 页数据...
正在获取第 30 页数据...
正在获取第 31 页数据...
正在获取第 33 页数据...
正在获取第 32 页数据...
正在获取第 34 页数据...
正在获取第 35 页数据...
正在获取第 37 页数据...
正在获取第 36 页数据...
正在获取第 38 页数据...
正在获取第 39 页数据...
正在获取第 40 页数据...
正在获取第 41 页数据...
正在获取第 42 页数据...
正在获取第 44 页数据...
正在获取第 43 页数据...
正在获取第 45 页数据...
正在获取第 46 页数据...
正在获取第 47 页数据...
正在获取第 48 页数据...
正在获取第 49 页数据...
正在获取第 50 页数据...
正在获取第 51 页数据...
正在获取第 52 页数据...
正在获取第 53 页数据...
正在获取第 54 页数据...
正在获取第 56 页数据...
正在获取第 57 页数据...
正在获取第 55 页数据...
正在获取第 58 页数据...
正在获取第 59 页数据...
正在获取第 60 页数据...
正在获取第 62 页数据...
正在获取第 61 页数据...
正在获取第 63 页数据...
正

In [None]:
# Variant of the threaded scraper that requests the listing through a
# different URL shape (sort / region / district query parameters).

# Send a browser-like User-Agent header with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Candidate listing URL shapes noted while exploring the site:
# https://www.openrice.com/en/hongkong/restaurants?page={}&searchSort=31&region=0&district_id={}
# https://www.openrice.com/en/hongkong/restaurants?cuisineId=3006&priceRangeId=3&tabIndex=0&tabType=
# https://www.openrice.com/en/hongkong/restaurants?sortBy=ORScoreDesc&cuisineId=3006&priceRangeId=3&tabIndex=0&tabType=
# https://www.openrice.com/en/hongkong/restaurants/cuisine/italian?page=1&searchSort=31&region=0&priceRangeId=3&tabIndex=0&tabType=


def _text_or_blank(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when `find` returned None."""
    return tag.text.strip() if tag is not None else ''


def _split_details(details):
    """Split the ' / '-separated details line into CSV fields.

    The first item is treated as the region, the last as the per-person
    price, and the items in between as food types (this layout is inferred
    from how the fields are consumed here -- confirm against the live page).

    Returns:
        (region, type1, type2, type3, person_price); missing fields are ''.
    """
    parts = [part.strip() for part in details.split(' / ')]
    # Pad to three items so region, type1 and price can always be taken.
    parts += [''] * (3 - len(parts))

    region = parts.pop(0)
    type1 = parts.pop(0)

    type2 = type3 = ''
    if len(parts) > 1:
        type2 = parts.pop(0)
        # A single item such as "a/b" (no spaces) carries two types.
        if '/' in type2:
            type2, type3 = type2.split('/', 1)

    person_price = parts.pop() if parts else ''

    # Previously a separately listed third type was silently dropped.
    if parts and not type3:
        type3 = parts.pop(0)

    return region, type1, type2, type3, person_price


def _parse_restaurant(restaurant):
    """Return the nine CSV fields for one listing cell; absent elements become ''."""
    name = _text_or_blank(restaurant.find('div', class_='text'))

    # Keep only the text before the first '/' and before the first newline.
    address_info = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info'))
    address = address_info.split('/')[0].split('\n')[0].strip()

    details = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info-details'))
    region, type1, type2, type3, person_price = _split_details(details)

    well_rating = _text_or_blank(restaurant.find('div', class_='smile icon-wrapper big-score'))
    bad_rating = _text_or_blank(restaurant.find('div', class_='cry icon-wrapper'))

    return [name, address, region, type1, type2, type3, person_price, well_rating, bad_rating]


def fetch_and_parse_data(page_num, district_id=''):
    """Download one sorted listing page and return its restaurants as CSV rows.

    Args:
        page_num: Value placed in the `page` query parameter.
        district_id: Value appended to the `district_id` query parameter.
            Defaults to '' (parameter sent empty), so existing one-argument
            callers such as `executor.map(fetch_and_parse_data, pages)`
            keep working.

    Returns:
        A list of nine-field rows (name, address, region, type1, type2,
        type3, person_price, well_rating, bad_rating), or an empty list
        when the request fails.
    """
    url_1 = 'https://www.openrice.com/zh/hongkong/restaurants?page='
    url_2 = '&searchSort=31&region=0&district_id='
    # The URL was never assembled before, so `url` was undefined here (or a
    # stale notebook global pointing at a single fixed page).
    url = f'{url_1}{page_num}{url_2}{district_id}'

    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"第 {page_num} 页请求失败，已跳过：{exc}")
        return []
    print(f"正在获取第 {page_num} 页数据...")

    soup = BeautifulSoup(response.text, 'html.parser')
    # Each restaurant sits in one 'poi-list-cell-desktop-right' div.
    cells = soup.find_all('div', class_='poi-list-cell-desktop-right')
    return [_parse_restaurant(cell) for cell in cells]

def write_to_csv(data):
    """Append the given rows, one CSV record each, to restaurants.csv."""
    with open('restaurants.csv', 'a', encoding='utf-8', newline='') as handle:
        row_writer = csv.writer(handle)
        for row in data:
            row_writer.writerow(row)

def main(last_page=499, max_workers=10):
    """Scrape listing pages 1..last_page concurrently into restaurants.csv.

    Args:
        last_page: Highest page number to request (inclusive). The default
            of 499 matches the previously hard-coded `range(1, 500)`; the
            old inline comment claimed 50 pages, which did not match the code.
        max_workers: Size of the thread pool used for downloading.

    The CSV is truncated and given a header first; page results are then
    appended in page order, because `executor.map` yields results in the
    order of its inputs.
    """
    with open('restaurants.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['name', 'address', 'region', 'type1', 'type2', 'type3', 'person_price', 'well_rating', 'bad_rating'])  # header row

    pages = range(1, last_page + 1)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only this thread writes to the file; workers just return rows.
        for result in executor.map(fetch_and_parse_data, pages):
            write_to_csv(result)

    print("数据获取并写入CSV文件完成。")


if __name__ == "__main__":
    main()


## 使用 Selenium 抓取（动态抓取不成功）

In [None]:
# Install Scrapy into the notebook runtime.
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.1-py2.py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.8/287.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.3.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.1.0-py3-none-any.whl (11 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.0-py2.py3-none-any.whl (17 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.1.0-py3-none-any.whl (12 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.1.2-py3-non

In [None]:
# Install Selenium into the notebook runtime.
!pip install selenium

In [None]:
# Refresh the apt package index, then install the Chromium WebDriver package.
!apt-get update
!apt install chromium-chromedriver

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
chromium-chromedriver is already the newest version (1:85.0.4183.83-0ubuntu2.22.04.1).
0 upgraded, 0 newly inst

In [None]:
# Print the path of the chromedriver executable found on PATH.
!which chromedriver

/usr/bin/chromedriver


In [1]:
import requests
# Print this runtime's public IP address as reported by api.ipify.org.
# The timeout keeps the cell from hanging if the service is unreachable.
print(requests.get('https://api.ipify.org', timeout=10).text)

158.247.226.12


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the Chrome options: headless (no GUI), plus the sandbox and
# /dev/shm switches passed to Chrome for this runtime.
chrome_options = Options()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

# Start a Chrome session with those options.
driver = webdriver.Chrome(options=chrome_options)

# Page to load; replace it to test a different site.
website_url = 'https://www.openrice.com/zh/hongkong/restaurants/'

try:
    # Load the page, then report success and the page title as a sanity check.
    driver.get(website_url)
    print('链接成功')
    print("Page Title:", driver.title)
finally:
    # Always shut the browser down, even when loading fails.
    driver.quit()


ModuleNotFoundError: No module named 'selenium'

## 导出为静态页面进行获取

In [None]:
from bs4 import BeautifulSoup
import csv

def _text_or_blank(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when `find` returned None."""
    return tag.text.strip() if tag is not None else ''


def _split_details(details):
    """Split the ' / '-separated details line into CSV fields.

    The first item is treated as the region, the last as the per-person
    price, and the items in between as food types (this layout is inferred
    from how the fields are consumed here -- confirm against the saved page).

    Returns:
        (region, type1, type2, type3, person_price); missing fields are ''.
    """
    parts = [part.strip() for part in details.split(' / ')]
    # Pad to three items so region, type1 and price can always be taken.
    parts += [''] * (3 - len(parts))

    region = parts.pop(0)
    type1 = parts.pop(0)

    type2 = type3 = ''
    if len(parts) > 1:
        type2 = parts.pop(0)
        # A single item such as "a/b" (no spaces) carries two types.
        if '/' in type2:
            type2, type3 = type2.split('/', 1)

    person_price = parts.pop() if parts else ''

    # Previously a separately listed third type was silently dropped.
    if parts and not type3:
        type3 = parts.pop(0)

    return region, type1, type2, type3, person_price


def _parse_restaurant(restaurant):
    """Return the nine CSV fields for one listing cell; absent elements become ''."""
    name = _text_or_blank(restaurant.find('div', class_='text'))

    # Keep only the text before the first '/' and before the first newline.
    address_info = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info'))
    address = address_info.split('/')[0].split('\n')[0].strip()

    details = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info-details'))
    region, type1, type2, type3, person_price = _split_details(details)

    # Listings without rating elements now yield '' instead of raising.
    well_rating = _text_or_blank(restaurant.find('div', class_='smile icon-wrapper big-score'))
    bad_rating = _text_or_blank(restaurant.find('div', class_='cry icon-wrapper'))

    return [name, address, region, type1, type2, type3, person_price, well_rating, bad_rating]


# Create the CSV file and fill it from the saved static page.
with open('restaurants_all.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['name', 'address', 'region', 'type1', 'type2', 'type3', 'person_price', 'well_rating', 'bad_rating'])  # header row

    # Read the saved HTML page.
    with open('22.html', 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Each restaurant sits in one 'poi-list-cell-desktop-right' div.
    for restaurant in soup.find_all('div', class_='poi-list-cell-desktop-right'):
        writer.writerow(_parse_restaurant(restaurant))

print("数据获取并写入CSV文件完成。")

数据获取并写入CSV文件完成。


In [None]:
from bs4 import BeautifulSoup
import csv

def _text_or_blank(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when `find` returned None."""
    return tag.text.strip() if tag is not None else ''


def _split_details(details):
    """Split the ' / '-separated details line into CSV fields.

    The first item is treated as the region, the last as the per-person
    price, and the items in between as food types (this layout is inferred
    from how the fields are consumed here -- confirm against the saved page).

    Returns:
        (region, type1, type2, type3, person_price); missing fields are ''.
    """
    parts = [part.strip() for part in details.split(' / ')]
    # Pad to three items so region, type1 and price can always be taken.
    parts += [''] * (3 - len(parts))

    region = parts.pop(0)
    type1 = parts.pop(0)

    type2 = type3 = ''
    if len(parts) > 1:
        type2 = parts.pop(0)
        # A single item such as "a/b" (no spaces) carries two types.
        if '/' in type2:
            type2, type3 = type2.split('/', 1)

    person_price = parts.pop() if parts else ''

    # Previously a separately listed third type was silently dropped.
    if parts and not type3:
        type3 = parts.pop(0)

    return region, type1, type2, type3, person_price


def _parse_restaurant(restaurant):
    """Return the nine CSV fields for one listing cell; absent elements become ''."""
    name = _text_or_blank(restaurant.find('div', class_='text'))

    # Keep only the text before the first '/' and before the first newline.
    address_info = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info'))
    address = address_info.split('/')[0].split('\n')[0].strip()

    details = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info-details'))
    region, type1, type2, type3, person_price = _split_details(details)

    # Listings without rating elements now yield '' instead of raising.
    well_rating = _text_or_blank(restaurant.find('div', class_='smile icon-wrapper big-score'))
    bad_rating = _text_or_blank(restaurant.find('div', class_='cry icon-wrapper'))

    return [name, address, region, type1, type2, type3, person_price, well_rating, bad_rating]


# Create the CSV file and fill it from the saved static page.
with open('restaurants_Italian.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['name', 'address', 'region', 'type1', 'type2', 'type3', 'person_price', 'well_rating', 'bad_rating'])  # header row

    # Read the saved HTML page.
    with open('11.html', 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Each restaurant sits in one 'poi-list-cell-desktop-right' div.
    for restaurant in soup.find_all('div', class_='poi-list-cell-desktop-right'):
        writer.writerow(_parse_restaurant(restaurant))

print("数据获取并写入CSV文件完成。")


数据获取并写入CSV文件完成。


In [None]:
from bs4 import BeautifulSoup
import csv

def _text_or_blank(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when `find` returned None."""
    return tag.text.strip() if tag is not None else ''


def _split_details(details):
    """Split the ' / '-separated details line into CSV fields.

    The first item is treated as the region, the last as the per-person
    price, and the items in between as food types (this layout is inferred
    from how the fields are consumed here -- confirm against the saved page).

    Returns:
        (region, type1, type2, type3, person_price); missing fields are ''.
    """
    parts = [part.strip() for part in details.split(' / ')]
    # Pad to three items so region, type1 and price can always be taken.
    parts += [''] * (3 - len(parts))

    region = parts.pop(0)
    type1 = parts.pop(0)

    type2 = type3 = ''
    if len(parts) > 1:
        type2 = parts.pop(0)
        # A single item such as "a/b" (no spaces) carries two types.
        if '/' in type2:
            type2, type3 = type2.split('/', 1)

    person_price = parts.pop() if parts else ''

    # Previously a separately listed third type was silently dropped.
    if parts and not type3:
        type3 = parts.pop(0)

    return region, type1, type2, type3, person_price


def _parse_restaurant(restaurant):
    """Return the nine CSV fields for one listing cell; absent elements become ''."""
    # Name, address and details get the same missing-element guard that the
    # ratings already had in this cell.
    name = _text_or_blank(restaurant.find('div', class_='text'))

    # Keep only the text before the first '/' and before the first newline.
    address_info = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info'))
    address = address_info.split('/')[0].split('\n')[0].strip()

    details = _text_or_blank(restaurant.find('div', class_='poi-list-cell-line-info-details'))
    region, type1, type2, type3, person_price = _split_details(details)

    # A listing without rating elements gets an empty string for that rating.
    well_rating = _text_or_blank(restaurant.find('div', class_='smile icon-wrapper big-score'))
    bad_rating = _text_or_blank(restaurant.find('div', class_='cry icon-wrapper'))

    return [name, address, region, type1, type2, type3, person_price, well_rating, bad_rating]


# Create the CSV file and fill it from the saved static page.
with open('restaurants_Italian100.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['name', 'address', 'region', 'type1', 'type2', 'type3', 'person_price', 'well_rating', 'bad_rating'])  # header row

    # Read the saved HTML page.
    with open('33.html', 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Each restaurant sits in one 'poi-list-cell-desktop-right' div.
    for restaurant in soup.find_all('div', class_='poi-list-cell-desktop-right'):
        writer.writerow(_parse_restaurant(restaurant))

print("数据获取并写入CSV文件完成。")


数据获取并写入CSV文件完成。


## Test

问题：
1. 网站阻止自动化工具：网站采取了措施阻止自动化工具（Selenium）访问其内容，可能是出于网站的安全策略或反爬虫措施。已尝试模拟真实用户行为（例如更改 User-Agent 头部信息、使用代理服务器），但仍然无法访问。
2. 通过静态网页获取的方式只能获取 250 条数据，这与我们自己访问网站时的情况一样——只能看到 250 家餐厅的数据。
3. OpenRice 没有开放 API，无法通过 API 方式访问。

In [None]:
# web scraping to openrice.com with scrapy